Add AVX2 std::abs, clang build
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
This commit is contained in:
parent
430ac5d89f
commit
13f6e179e3
24
.drone.yml
24
.drone.yml
|
@ -1,7 +1,7 @@
|
||||||
---
|
---
|
||||||
kind: pipeline
|
kind: pipeline
|
||||||
type: docker
|
type: docker
|
||||||
name: default
|
name: default-gcc
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: submodules
|
- name: submodules
|
||||||
|
@ -17,5 +17,25 @@ steps:
|
||||||
- cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS_RELEASE="-march=native" -DCMAKE_C_FLAGS_RELEASE="-march=native" -DCMAKE_INSTALL_PREFIX="/usr"
|
- cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS_RELEASE="-march=native" -DCMAKE_C_FLAGS_RELEASE="-march=native" -DCMAKE_INSTALL_PREFIX="/usr"
|
||||||
- make -j$(nproc)
|
- make -j$(nproc)
|
||||||
- make install
|
- make install
|
||||||
|
---
|
||||||
|
kind: pipeline
|
||||||
|
type: docker
|
||||||
|
name: default-clang
|
||||||
|
|
||||||
...
|
steps:
|
||||||
|
- name: submodules
|
||||||
|
image: alpine/git
|
||||||
|
commands:
|
||||||
|
- git submodule update --init --recursive
|
||||||
|
- name: build
|
||||||
|
image: debian:bullseye
|
||||||
|
commands:
|
||||||
|
- DEBIAN_FRONTEND=noninteractive apt update
|
||||||
|
- DEBIAN_FRONTEND=noninteractive apt install -y clang cmake make
|
||||||
|
- update-alternatives --install /usr/bin/cc cc /usr/bin/clang 100
|
||||||
|
- update-alternatives --install /usr/bin/c++ c++ /usr/bin/clang 100
|
||||||
|
- mkdir build && cd build
|
||||||
|
- cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS_RELEASE="-march=native" -DCMAKE_C_FLAGS_RELEASE="-march=native" -DCMAKE_INSTALL_PREFIX="/usr"
|
||||||
|
- make -j$(nproc)
|
||||||
|
- make install
|
||||||
|
...
|
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -1,3 +1,3 @@
|
||||||
/.idea
|
/.idea
|
||||||
/build
|
/build
|
||||||
/cmake-build-debug
|
/cmake-build-*
|
|
@ -11,6 +11,8 @@ set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
|
||||||
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
|
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
|
||||||
cmake_policy(SET CMP0069 NEW)
|
cmake_policy(SET CMP0069 NEW)
|
||||||
set(CMAKE_POLICY_DEFAULT_CMP0069 NEW)
|
set(CMAKE_POLICY_DEFAULT_CMP0069 NEW)
|
||||||
|
cmake_policy(SET CMP0074 NEW)
|
||||||
|
set(CMAKE_POLICY_DEFAULT_CMP0074 NEW)
|
||||||
|
|
||||||
|
|
||||||
if(NOT CMAKE_BUILD_TYPE)
|
if(NOT CMAKE_BUILD_TYPE)
|
||||||
|
@ -23,16 +25,25 @@ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -ggdb -O0 -fno-exceptions")
|
||||||
if(EMSCRIPTEN)
|
if(EMSCRIPTEN)
|
||||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -O3 -ffast-math -fno-exceptions -fno-rtti")
|
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -O3 -ffast-math -fno-exceptions -fno-rtti")
|
||||||
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DNDEBUG -O3 -ffast-math -fno-exceptions")
|
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DNDEBUG -O3 -ffast-math -fno-exceptions")
|
||||||
|
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -DNDEBUG -O3 -ffast-math -fno-exceptions -fno-rtti")
|
||||||
|
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -DNDEBUG -O3 -ffast-math -fno-exceptions")
|
||||||
else()
|
else()
|
||||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -Ofast -fno-exceptions -fno-rtti")
|
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -Ofast -fno-exceptions -fno-rtti")
|
||||||
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DNDEBUG -Ofast -fno-exceptions")
|
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DNDEBUG -Ofast -fno-exceptions")
|
||||||
|
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -DNDEBUG -Ofast -fno-exceptions -fno-rtti")
|
||||||
|
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -DNDEBUG -Ofast -fno-exceptions")
|
||||||
|
|
||||||
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
|
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
|
||||||
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -frecord-gcc-switches -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize")
|
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -frecord-gcc-switches -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize")
|
||||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -frecord-gcc-switches -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize")
|
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -frecord-gcc-switches -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize")
|
||||||
|
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -frecord-gcc-switches -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize")
|
||||||
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
|
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
|
||||||
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fopt-info-all -frecord-gcc-switches")
|
# set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fipa-pta -ftree-loop-ivcanon -floop-nest-optimize -ftree-vectorize -ftree-loop-im -fgraphite-identity -floop-parallelize-all -fdevirtualize-speculatively -fdevirtualize-at-ltrans -ftree-parallelize-loops=4 -fopt-info-all -frecord-gcc-switches")
|
||||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fopt-info-all -frecord-gcc-switches")
|
# set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fipa-pta -ftree-loop-ivcanon -floop-nest-optimize -ftree-vectorize -ftree-loop-im -fgraphite-identity -floop-parallelize-all -fdevirtualize-speculatively -fdevirtualize-at-ltrans -ftree-parallelize-loops=4 -fopt-info-all -frecord-gcc-switches")
|
||||||
|
|
||||||
|
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fvect-cost-model=unlimited -fopt-info-all -frecord-gcc-switches")
|
||||||
|
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fvect-cost-model=unlimited -fopt-info-all -frecord-gcc-switches")
|
||||||
|
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -fvect-cost-model=unlimited -fopt-info-all -frecord-gcc-switches")
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
@ -51,7 +62,7 @@ include_directories(lib/gaborator)
|
||||||
include_directories(lib/pffft)
|
include_directories(lib/pffft)
|
||||||
include_directories(lib/MIPP/src)
|
include_directories(lib/MIPP/src)
|
||||||
|
|
||||||
set(MIPP_ROOT "lib/MIPP/src")
|
set(MIPP_ROOT "${CMAKE_SOURCE_DIR}/lib/MIPP/src")
|
||||||
add_subdirectory(lib/pffft EXCLUDE_FROM_ALL)
|
add_subdirectory(lib/pffft EXCLUDE_FROM_ALL)
|
||||||
|
|
||||||
add_executable(test test.cpp)
|
add_executable(test test.cpp)
|
||||||
|
|
|
@ -5,6 +5,12 @@
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
|
||||||
|
#ifdef __AVX2__
|
||||||
|
|
||||||
|
#include <immintrin.h>
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
class Gaborator {
|
class Gaborator {
|
||||||
|
|
||||||
public:
|
public:
|
||||||
|
@ -137,15 +143,47 @@ private:
|
||||||
}, min_band, INT_MAX, st0, st1, coefs);
|
}, min_band, INT_MAX, st0, st1, coefs);
|
||||||
*/
|
*/
|
||||||
|
|
||||||
gaborator::apply_to_slice(false, [&](int band, int64_t st, int time_step, unsigned len, const std::complex<float> *p0){
|
std::vector<float> magnitudes;
|
||||||
for (unsigned int i = 0; i < len; i++) {
|
gaborator::apply_to_slice(false, [&](int band, int64_t sampleIndex, int time_step, unsigned len, const std::complex<float> *p0) {
|
||||||
gaborProcessEntry(band, st, std::abs(*p0++));
|
|
||||||
st += time_step;
|
//process magnitudes beforehand for easier auto-vectorization
|
||||||
|
magnitudes.resize(len);
|
||||||
|
|
||||||
|
#ifdef __AVX2__
|
||||||
|
|
||||||
|
int64_t i;
|
||||||
|
for (i = 0; i < (((int64_t)len) - 7); i += 8) {
|
||||||
|
// load 8 complex values (--> 16 floats overall) into two SIMD registers
|
||||||
|
__m256 inLo = _mm256_loadu_ps(reinterpret_cast<const float *> (p0 + i ));
|
||||||
|
__m256 inHi = _mm256_loadu_ps(reinterpret_cast<const float *> (p0 + i + 4));
|
||||||
|
|
||||||
|
// separate the real and imaginary parts; the shuffle works per 128-bit lane, so the results come out in the order 0,1,4,5,2,3,6,7
|
||||||
|
__m256 re = _mm256_shuffle_ps (inLo, inHi, _MM_SHUFFLE(2, 0, 2, 0));
|
||||||
|
__m256 im = _mm256_shuffle_ps (inLo, inHi, _MM_SHUFFLE(3, 1, 3, 1));
|
||||||
|
|
||||||
|
// do the heavy work on the unordered vectors
|
||||||
|
__m256 abs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_mul_ps(re, re), _mm256_mul_ps(im, im)));
|
||||||
|
|
||||||
|
// reorder values prior to storing
|
||||||
|
__m256d ordered = _mm256_permute4x64_pd (_mm256_castps_pd(abs), _MM_SHUFFLE(3, 1, 2, 0));
|
||||||
|
_mm256_storeu_ps(magnitudes.data() + i, _mm256_castpd_ps(ordered));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int64_t j = i; j < len; j++) {
|
||||||
|
#else
|
||||||
|
for (unsigned int j = 0; j < len; j++) {
|
||||||
|
#endif
|
||||||
|
magnitudes[j] = std::abs(p0[j]);
|
||||||
|
}
|
||||||
|
|
||||||
|
for(auto magnitude : magnitudes){
|
||||||
|
gaborProcessEntry(band, sampleIndex, magnitude);
|
||||||
|
sampleIndex += time_step;
|
||||||
}
|
}
|
||||||
}, min_band, INT_MAX, st0, st1, coefs);
|
}, min_band, INT_MAX, st0, st1, coefs);
|
||||||
}
|
}
|
||||||
|
|
||||||
inline void gaborProcessEntry(int band, int64_t sampleIndex, float coef) {
|
inline void gaborProcessEntry(int band, int64_t sampleIndex, float coefficient) {
|
||||||
int64_t coefficientIndex = sampleIndex / frequencyBinTimeStepSize;
|
int64_t coefficientIndex = sampleIndex / frequencyBinTimeStepSize;
|
||||||
int bandIndex = band - firstBandCache;
|
int bandIndex = band - firstBandCache;
|
||||||
|
|
||||||
|
@ -171,7 +209,7 @@ private:
|
||||||
// due to reduction in precision (from audio sample accuracy to steps) multiple
|
// due to reduction in precision (from audio sample accuracy to steps) multiple
|
||||||
// magnitudes could be placed in the same stepIndex, bandIndex pair.
|
// magnitudes could be placed in the same stepIndex, bandIndex pair.
|
||||||
// We take the maximum magnitudes value.
|
// We take the maximum magnitudes value.
|
||||||
currentCoefficient[bandIndex] = std::max(currentCoefficient[bandIndex], coef);
|
currentCoefficient[bandIndex] = std::max(currentCoefficient[bandIndex], coefficient);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -180,9 +218,13 @@ private:
|
||||||
|
|
||||||
|
|
||||||
std::vector<float> resultCache;
|
std::vector<float> resultCache;
|
||||||
|
|
||||||
|
//circular buffer with current coefficients
|
||||||
std::vector<std::vector<float>> coefficients;
|
std::vector<std::vector<float>> coefficients;
|
||||||
int firstBandCache = -1;
|
int firstBandCache = -1;
|
||||||
int numberOfBandsCache = 0;
|
int numberOfBandsCache = 0;
|
||||||
|
|
||||||
|
//The index of the most recent coefficient (in steps)
|
||||||
int64_t mostRecentCoefficentIndex = 0;
|
int64_t mostRecentCoefficentIndex = 0;
|
||||||
|
|
||||||
const int blockSize;
|
const int blockSize;
|
||||||
|
|
|
@ -10,4 +10,4 @@ Requires: @pc_req_public@
|
||||||
Requires.private: @pc_req_private@
|
Requires.private: @pc_req_private@
|
||||||
Cflags: -I"${includedir}"
|
Cflags: -I"${includedir}"
|
||||||
Libs: -L"${libdir}" -l@target1@
|
Libs: -L"${libdir}" -l@target1@
|
||||||
Libs.private: -L"${libdir}" -l@target1@ -l@target2@ @pc_libs_private@
|
Libs.private: -L"${libdir}" -l@target1@ @pc_libs_private@
|
Loading…
Reference in a new issue