diff --git a/.drone.yml b/.drone.yml index d5e3bfe..3af0bcf 100644 --- a/.drone.yml +++ b/.drone.yml @@ -1,7 +1,7 @@ --- kind: pipeline type: docker -name: default +name: default-gcc steps: - name: submodules @@ -17,5 +17,25 @@ steps: - cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS_RELEASE="-march=native" -DCMAKE_C_FLAGS_RELEASE="-march=native" -DCMAKE_INSTALL_PREFIX="/usr" - make -j$(nproc) - make install +--- +kind: pipeline +type: docker +name: default-clang -... +steps: + - name: submodules + image: alpine/git + commands: + - git submodule update --init --recursive + - name: build + image: debian:bullseye + commands: + - DEBIAN_FRONTEND=noninteractive apt update + - DEBIAN_FRONTEND=noninteractive apt install -y clang cmake make + - update-alternatives --install /usr/bin/cc cc /usr/bin/clang 100 + - update-alternatives --install /usr/bin/c++ c++ /usr/bin/clang++ 100 + - mkdir build && cd build + - cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS_RELEASE="-march=native" -DCMAKE_C_FLAGS_RELEASE="-march=native" -DCMAKE_INSTALL_PREFIX="/usr" + - make -j$(nproc) + - make install +... 
\ No newline at end of file diff --git a/.gitignore b/.gitignore index 6c080b1..a734e00 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ /.idea /build -/cmake-build-debug \ No newline at end of file +/cmake-build-* \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c4c311..6d93b61 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -11,6 +11,8 @@ set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) cmake_policy(SET CMP0069 NEW) set(CMAKE_POLICY_DEFAULT_CMP0069 NEW) +cmake_policy(SET CMP0074 NEW) +set(CMAKE_POLICY_DEFAULT_CMP0074 NEW) if(NOT CMAKE_BUILD_TYPE) @@ -23,16 +25,25 @@ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -ggdb -O0 -fno-exceptions") if(EMSCRIPTEN) set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -O3 -ffast-math -fno-exceptions -fno-rtti") set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DNDEBUG -O3 -ffast-math -fno-exceptions") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -DNDEBUG -O3 -ffast-math -fno-exceptions -fno-rtti") + set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -DNDEBUG -O3 -ffast-math -fno-exceptions") else() set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -Ofast -fno-exceptions -fno-rtti") set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DNDEBUG -Ofast -fno-exceptions") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -DNDEBUG -Ofast -fno-exceptions -fno-rtti") + set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -DNDEBUG -Ofast -fno-exceptions") if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -frecord-gcc-switches -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -frecord-gcc-switches -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize") + 
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -frecord-gcc-switches -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize") elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fopt-info-all -frecord-gcc-switches") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fopt-info-all -frecord-gcc-switches") +# set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fipa-pta -ftree-loop-ivcanon -floop-nest-optimize -ftree-vectorize -ftree-loop-im -fgraphite-identity -floop-parallelize-all -fdevirtualize-speculatively -fdevirtualize-at-ltrans -ftree-parallelize-loops=4 -fopt-info-all -frecord-gcc-switches") +# set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fipa-pta -ftree-loop-ivcanon -floop-nest-optimize -ftree-vectorize -ftree-loop-im -fgraphite-identity -floop-parallelize-all -fdevirtualize-speculatively -fdevirtualize-at-ltrans -ftree-parallelize-loops=4 -fopt-info-all -frecord-gcc-switches") + + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fvect-cost-model=unlimited -fopt-info-all -frecord-gcc-switches") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fvect-cost-model=unlimited -fopt-info-all -frecord-gcc-switches") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -fvect-cost-model=unlimited -fopt-info-all -frecord-gcc-switches") endif() endif() @@ -51,7 +62,7 @@ include_directories(lib/gaborator) include_directories(lib/pffft) include_directories(lib/MIPP/src) -set(MIPP_ROOT "lib/MIPP/src") +set(MIPP_ROOT "${CMAKE_SOURCE_DIR}/lib/MIPP/src") add_subdirectory(lib/pffft EXCLUDE_FROM_ALL) add_executable(test test.cpp) diff --git a/cgaborator.cpp b/cgaborator.cpp index eb9e62c..8d72a48 100644 --- a/cgaborator.cpp +++ b/cgaborator.cpp @@ -5,6 +5,12 @@ #include #include +#ifdef __AVX2__ + +#include + +#endif + class Gaborator { public: @@ -137,15 +143,47 @@ private: }, min_band, INT_MAX, st0, st1, coefs); */ - 
gaborator::apply_to_slice(false, [&](int band, int64_t st, int time_step, unsigned len, const std::complex *p0){ - for (unsigned int i = 0; i < len; i++) { - gaborProcessEntry(band, st, std::abs(*p0++)); - st += time_step; + std::vector magnitudes; + gaborator::apply_to_slice(false, [&](int band, int64_t sampleIndex, int time_step, unsigned len, const std::complex *p0) { + + //process magnitudes beforehand for easier auto-vectorization + magnitudes.resize(len); + +#ifdef __AVX2__ + + int64_t i; + for (i = 0; i < (((int64_t)len) - 7); i += 8) { + // load 8 complex values (--> 16 floats overall) into two SIMD registers + __m256 inLo = _mm256_loadu_ps(reinterpret_cast (p0 + i )); + __m256 inHi = _mm256_loadu_ps(reinterpret_cast (p0 + i + 4)); + + // separates the real and imaginary part, however values are in a wrong order + __m256 re = _mm256_shuffle_ps (inLo, inHi, _MM_SHUFFLE(2, 0, 2, 0)); + __m256 im = _mm256_shuffle_ps (inLo, inHi, _MM_SHUFFLE(3, 1, 3, 1)); + + // do the heavy work on the unordered vectors + __m256 abs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_mul_ps(re, re), _mm256_mul_ps(im, im))); + + // reorder values prior to storing + __m256d ordered = _mm256_permute4x64_pd (_mm256_castps_pd(abs), _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_storeu_ps(magnitudes.data() + i, _mm256_castpd_ps(ordered)); + } + + for (int64_t j = i; j < len; j++) { +#else + for (unsigned int j = 0; j < len; j++) { +#endif + magnitudes[j] = std::abs(p0[j]); + } + + for(auto magnitude : magnitudes){ + gaborProcessEntry(band, sampleIndex, magnitude); + sampleIndex += time_step; } }, min_band, INT_MAX, st0, st1, coefs); } - inline void gaborProcessEntry(int band, int64_t sampleIndex, float coef) { + inline void gaborProcessEntry(int band, int64_t sampleIndex, float coefficient) { int64_t coefficientIndex = sampleIndex / frequencyBinTimeStepSize; int bandIndex = band - firstBandCache; @@ -171,7 +209,7 @@ private: // due to reduction in precision (from audio sample accuracy to steps) multiple // 
magnitudes could be placed in the same stepIndex, bandIndex pair. // We take the maximum magnitudes value. - currentCoefficient[bandIndex] = std::max(currentCoefficient[bandIndex], coef); + currentCoefficient[bandIndex] = std::max(currentCoefficient[bandIndex], coefficient); } } @@ -180,9 +218,13 @@ private: std::vector resultCache; + + //circular buffer with current coefficients std::vector> coefficients; int firstBandCache = -1; int numberOfBandsCache = 0; + + //The index of the most recent coefficient (in steps) int64_t mostRecentCoefficentIndex = 0; const int blockSize; diff --git a/cgaborator.pc.in b/cgaborator.pc.in index 0d7ab9e..d763bca 100644 --- a/cgaborator.pc.in +++ b/cgaborator.pc.in @@ -10,4 +10,4 @@ Requires: @pc_req_public@ Requires.private: @pc_req_private@ Cflags: -I"${includedir}" Libs: -L"${libdir}" -l@target1@ -Libs.private: -L"${libdir}" -l@target1@ -l@target2@ @pc_libs_private@ \ No newline at end of file +Libs.private: -L"${libdir}" -l@target1@ @pc_libs_private@ \ No newline at end of file