Add AVX2 std::abs, clang build
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
This commit is contained in:
parent
430ac5d89f
commit
13f6e179e3
24
.drone.yml
24
.drone.yml
|
@ -1,7 +1,7 @@
|
|||
---
|
||||
kind: pipeline
|
||||
type: docker
|
||||
name: default
|
||||
name: default-gcc
|
||||
|
||||
steps:
|
||||
- name: submodules
|
||||
|
@ -17,5 +17,25 @@ steps:
|
|||
- cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS_RELEASE="-march=native" -DCMAKE_C_FLAGS_RELEASE="-march=native" -DCMAKE_INSTALL_PREFIX="/usr"
|
||||
- make -j$(nproc)
|
||||
- make install
|
||||
---
|
||||
kind: pipeline
|
||||
type: docker
|
||||
name: default-clang
|
||||
|
||||
...
|
||||
steps:
|
||||
- name: submodules
|
||||
image: alpine/git
|
||||
commands:
|
||||
- git submodule update --init --recursive
|
||||
- name: build
|
||||
image: debian:bullseye
|
||||
commands:
|
||||
- DEBIAN_FRONTEND=noninteractive apt update
|
||||
- DEBIAN_FRONTEND=noninteractive apt install -y clang cmake make
|
||||
- update-alternatives --install /usr/bin/cc cc /usr/bin/clang 100
|
||||
- update-alternatives --install /usr/bin/c++ c++ /usr/bin/clang 100
|
||||
- mkdir build && cd build
|
||||
- cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS_RELEASE="-march=native" -DCMAKE_C_FLAGS_RELEASE="-march=native" -DCMAKE_INSTALL_PREFIX="/usr"
|
||||
- make -j$(nproc)
|
||||
- make install
|
||||
...
|
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -1,3 +1,3 @@
|
|||
/.idea
|
||||
/build
|
||||
/cmake-build-debug
|
||||
/cmake-build-*
|
|
@ -11,6 +11,8 @@ set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
|
|||
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
|
||||
cmake_policy(SET CMP0069 NEW)
|
||||
set(CMAKE_POLICY_DEFAULT_CMP0069 NEW)
|
||||
cmake_policy(SET CMP0074 NEW)
|
||||
set(CMAKE_POLICY_DEFAULT_CMP0074 NEW)
|
||||
|
||||
|
||||
if(NOT CMAKE_BUILD_TYPE)
|
||||
|
@ -23,16 +25,25 @@ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -ggdb -O0 -fno-exceptions")
|
|||
if(EMSCRIPTEN)
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -O3 -ffast-math -fno-exceptions -fno-rtti")
|
||||
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DNDEBUG -O3 -ffast-math -fno-exceptions")
|
||||
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -DNDEBUG -O3 -ffast-math -fno-exceptions -fno-rtti")
|
||||
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -DNDEBUG -O3 -ffast-math -fno-exceptions")
|
||||
else()
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -Ofast -fno-exceptions -fno-rtti")
|
||||
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DNDEBUG -Ofast -fno-exceptions")
|
||||
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -DNDEBUG -Ofast -fno-exceptions -fno-rtti")
|
||||
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -DNDEBUG -Ofast -fno-exceptions")
|
||||
|
||||
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
|
||||
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -frecord-gcc-switches -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -frecord-gcc-switches -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize")
|
||||
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -frecord-gcc-switches -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize")
|
||||
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
|
||||
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fopt-info-all -frecord-gcc-switches")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fopt-info-all -frecord-gcc-switches")
|
||||
# set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fipa-pta -ftree-loop-ivcanon -floop-nest-optimize -ftree-vectorize -ftree-loop-im -fgraphite-identity -floop-parallelize-all -fdevirtualize-speculatively -fdevirtualize-at-ltrans -ftree-parallelize-loops=4 -fopt-info-all -frecord-gcc-switches")
|
||||
# set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fipa-pta -ftree-loop-ivcanon -floop-nest-optimize -ftree-vectorize -ftree-loop-im -fgraphite-identity -floop-parallelize-all -fdevirtualize-speculatively -fdevirtualize-at-ltrans -ftree-parallelize-loops=4 -fopt-info-all -frecord-gcc-switches")
|
||||
|
||||
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fvect-cost-model=unlimited -fopt-info-all -frecord-gcc-switches")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fvect-cost-model=unlimited -fopt-info-all -frecord-gcc-switches")
|
||||
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -fvect-cost-model=unlimited -fopt-info-all -frecord-gcc-switches")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
@ -51,7 +62,7 @@ include_directories(lib/gaborator)
|
|||
include_directories(lib/pffft)
|
||||
include_directories(lib/MIPP/src)
|
||||
|
||||
set(MIPP_ROOT "lib/MIPP/src")
|
||||
set(MIPP_ROOT "${CMAKE_SOURCE_DIR}/lib/MIPP/src")
|
||||
add_subdirectory(lib/pffft EXCLUDE_FROM_ALL)
|
||||
|
||||
add_executable(test test.cpp)
|
||||
|
|
|
@ -5,6 +5,12 @@
|
|||
#include <cmath>
|
||||
#include <memory>
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
#include <immintrin.h>
|
||||
|
||||
#endif
|
||||
|
||||
class Gaborator {
|
||||
|
||||
public:
|
||||
|
@ -137,15 +143,47 @@ private:
|
|||
}, min_band, INT_MAX, st0, st1, coefs);
|
||||
*/
|
||||
|
||||
gaborator::apply_to_slice(false, [&](int band, int64_t st, int time_step, unsigned len, const std::complex<float> *p0){
|
||||
for (unsigned int i = 0; i < len; i++) {
|
||||
gaborProcessEntry(band, st, std::abs(*p0++));
|
||||
st += time_step;
|
||||
std::vector<float> magnitudes;
|
||||
gaborator::apply_to_slice(false, [&](int band, int64_t sampleIndex, int time_step, unsigned len, const std::complex<float> *p0) {
|
||||
|
||||
//process magnitudes beforehand for easier auto-vectorization
|
||||
magnitudes.resize(len);
|
||||
|
||||
#ifdef __AVX2__
|
||||
|
||||
int64_t i;
|
||||
for (i = 0; i < (((int64_t)len) - 7); i += 8) {
|
||||
// load 8 complex values (--> 16 floats overall) into two SIMD registers
|
||||
__m256 inLo = _mm256_loadu_ps(reinterpret_cast<const float *> (p0 + i ));
|
||||
__m256 inHi = _mm256_loadu_ps(reinterpret_cast<const float *> (p0 + i + 4));
|
||||
|
||||
// separate the real and imaginary parts; the shuffle works per 128-bit lane, so the values end up in a permuted order
|
||||
__m256 re = _mm256_shuffle_ps (inLo, inHi, _MM_SHUFFLE(2, 0, 2, 0));
|
||||
__m256 im = _mm256_shuffle_ps (inLo, inHi, _MM_SHUFFLE(3, 1, 3, 1));
|
||||
|
||||
// do the heavy work on the unordered vectors
|
||||
__m256 abs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_mul_ps(re, re), _mm256_mul_ps(im, im)));
|
||||
|
||||
// reorder values prior to storing
|
||||
__m256d ordered = _mm256_permute4x64_pd (_mm256_castps_pd(abs), _MM_SHUFFLE(3, 1, 2, 0));
|
||||
_mm256_storeu_ps(magnitudes.data() + i, _mm256_castpd_ps(ordered));
|
||||
}
|
||||
|
||||
for (int64_t j = i; j < len; j++) {
|
||||
#else
|
||||
for (unsigned int j = 0; j < len; j++) {
|
||||
#endif
|
||||
magnitudes[j] = std::abs(p0[j]);
|
||||
}
|
||||
|
||||
for(auto magnitude : magnitudes){
|
||||
gaborProcessEntry(band, sampleIndex, magnitude);
|
||||
sampleIndex += time_step;
|
||||
}
|
||||
}, min_band, INT_MAX, st0, st1, coefs);
|
||||
}
|
||||
|
||||
inline void gaborProcessEntry(int band, int64_t sampleIndex, float coef) {
|
||||
inline void gaborProcessEntry(int band, int64_t sampleIndex, float coefficient) {
|
||||
int64_t coefficientIndex = sampleIndex / frequencyBinTimeStepSize;
|
||||
int bandIndex = band - firstBandCache;
|
||||
|
||||
|
@ -171,7 +209,7 @@ private:
|
|||
// due to reduction in precision (from audio sample accuracy to steps) multiple
|
||||
// magnitudes could be placed in the same stepIndex, bandIndex pair.
|
||||
// We take the maximum magnitudes value.
|
||||
currentCoefficient[bandIndex] = std::max(currentCoefficient[bandIndex], coef);
|
||||
currentCoefficient[bandIndex] = std::max(currentCoefficient[bandIndex], coefficient);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -180,9 +218,13 @@ private:
|
|||
|
||||
|
||||
std::vector<float> resultCache;
|
||||
|
||||
//circular buffer with current coefficients
|
||||
std::vector<std::vector<float>> coefficients;
|
||||
int firstBandCache = -1;
|
||||
int numberOfBandsCache = 0;
|
||||
|
||||
//The index of the most recent coefficient (in steps)
|
||||
int64_t mostRecentCoefficentIndex = 0;
|
||||
|
||||
const int blockSize;
|
||||
|
|
|
@ -10,4 +10,4 @@ Requires: @pc_req_public@
|
|||
Requires.private: @pc_req_private@
|
||||
Cflags: -I"${includedir}"
|
||||
Libs: -L"${libdir}" -l@target1@
|
||||
Libs.private: -L"${libdir}" -l@target1@ -l@target2@ @pc_libs_private@
|
||||
Libs.private: -L"${libdir}" -l@target1@ @pc_libs_private@
|
Loading…
Reference in a new issue