Add AVX2 std::abs, clang build
All checks were successful
continuous-integration/drone/push Build is passing

This commit is contained in:
DataHoarder 2022-07-15 14:07:27 +02:00
parent 430ac5d89f
commit 13f6e179e3
Signed by: DataHoarder
SSH key fingerprint: SHA256:OLTRf6Fl87G52SiR7sWLGNzlJt4WOX+tfI2yxo0z7xk
5 changed files with 86 additions and 13 deletions

View file

@ -1,7 +1,7 @@
---
kind: pipeline
type: docker
name: default
name: default-gcc
steps:
- name: submodules
@ -17,5 +17,25 @@ steps:
- cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS_RELEASE="-march=native" -DCMAKE_C_FLAGS_RELEASE="-march=native" -DCMAKE_INSTALL_PREFIX="/usr"
- make -j$(nproc)
- make install
---
kind: pipeline
type: docker
name: default-clang
...
steps:
- name: submodules
image: alpine/git
commands:
- git submodule update --init --recursive
- name: build
image: debian:bullseye
commands:
- DEBIAN_FRONTEND=noninteractive apt update
- DEBIAN_FRONTEND=noninteractive apt install -y clang cmake make
- update-alternatives --install /usr/bin/cc cc /usr/bin/clang 100
- update-alternatives --install /usr/bin/c++ c++ /usr/bin/clang 100
- mkdir build && cd build
- cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS_RELEASE="-march=native" -DCMAKE_C_FLAGS_RELEASE="-march=native" -DCMAKE_INSTALL_PREFIX="/usr"
- make -j$(nproc)
- make install
...

2
.gitignore vendored
View file

@ -1,3 +1,3 @@
/.idea
/build
/cmake-build-debug
/cmake-build-*

View file

@ -11,6 +11,8 @@ set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
cmake_policy(SET CMP0069 NEW)
set(CMAKE_POLICY_DEFAULT_CMP0069 NEW)
cmake_policy(SET CMP0074 NEW)
set(CMAKE_POLICY_DEFAULT_CMP0074 NEW)
if(NOT CMAKE_BUILD_TYPE)
@ -23,16 +25,25 @@ set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -ggdb -O0 -fno-exceptions")
if(EMSCRIPTEN)
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -O3 -ffast-math -fno-exceptions -fno-rtti")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DNDEBUG -O3 -ffast-math -fno-exceptions")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -DNDEBUG -O3 -ffast-math -fno-exceptions -fno-rtti")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -DNDEBUG -O3 -ffast-math -fno-exceptions")
else()
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -Ofast -fno-exceptions -fno-rtti")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DNDEBUG -Ofast -fno-exceptions")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -DNDEBUG -Ofast -fno-exceptions -fno-rtti")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -DNDEBUG -Ofast -fno-exceptions")
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -frecord-gcc-switches -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -frecord-gcc-switches -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -frecord-gcc-switches -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize")
elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fopt-info-all -frecord-gcc-switches")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fopt-info-all -frecord-gcc-switches")
# set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fipa-pta -ftree-loop-ivcanon -floop-nest-optimize -ftree-vectorize -ftree-loop-im -fgraphite-identity -floop-parallelize-all -fdevirtualize-speculatively -fdevirtualize-at-ltrans -ftree-parallelize-loops=4 -fopt-info-all -frecord-gcc-switches")
# set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fipa-pta -ftree-loop-ivcanon -floop-nest-optimize -ftree-vectorize -ftree-loop-im -fgraphite-identity -floop-parallelize-all -fdevirtualize-speculatively -fdevirtualize-at-ltrans -ftree-parallelize-loops=4 -fopt-info-all -frecord-gcc-switches")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fvect-cost-model=unlimited -fopt-info-all -frecord-gcc-switches")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fvect-cost-model=unlimited -fopt-info-all -frecord-gcc-switches")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -fvect-cost-model=unlimited -fopt-info-all -frecord-gcc-switches")
endif()
endif()
@ -51,7 +62,7 @@ include_directories(lib/gaborator)
include_directories(lib/pffft)
include_directories(lib/MIPP/src)
set(MIPP_ROOT "lib/MIPP/src")
set(MIPP_ROOT "${CMAKE_SOURCE_DIR}/lib/MIPP/src")
add_subdirectory(lib/pffft EXCLUDE_FROM_ALL)
add_executable(test test.cpp)

View file

@ -5,6 +5,12 @@
#include <cmath>
#include <memory>
#ifdef __AVX2__
#include <immintrin.h>
#endif
class Gaborator {
public:
@ -137,15 +143,47 @@ private:
}, min_band, INT_MAX, st0, st1, coefs);
*/
gaborator::apply_to_slice(false, [&](int band, int64_t st, int time_step, unsigned len, const std::complex<float> *p0){
for (unsigned int i = 0; i < len; i++) {
gaborProcessEntry(band, st, std::abs(*p0++));
st += time_step;
std::vector<float> magnitudes;
gaborator::apply_to_slice(false, [&](int band, int64_t sampleIndex, int time_step, unsigned len, const std::complex<float> *p0) {
//process magnitudes beforehand for easier auto-vectorization
magnitudes.resize(len);
#ifdef __AVX2__
int64_t i;
for (i = 0; i < (((int64_t)len) - 7); i += 8) {
// load 8 complex values (--> 16 floats overall) into two SIMD registers
__m256 inLo = _mm256_loadu_ps(reinterpret_cast<const float *> (p0 + i ));
__m256 inHi = _mm256_loadu_ps(reinterpret_cast<const float *> (p0 + i + 4));
// separate the real and imaginary parts; the resulting values are in a permuted order
__m256 re = _mm256_shuffle_ps (inLo, inHi, _MM_SHUFFLE(2, 0, 2, 0));
__m256 im = _mm256_shuffle_ps (inLo, inHi, _MM_SHUFFLE(3, 1, 3, 1));
// do the heavy work on the unordered vectors
__m256 abs = _mm256_sqrt_ps(_mm256_add_ps(_mm256_mul_ps(re, re), _mm256_mul_ps(im, im)));
// reorder values prior to storing
__m256d ordered = _mm256_permute4x64_pd (_mm256_castps_pd(abs), _MM_SHUFFLE(3, 1, 2, 0));
_mm256_storeu_ps(magnitudes.data() + i, _mm256_castpd_ps(ordered));
}
for (int64_t j = i; j < len; j++) {
#else
for (unsigned int j = 0; j < len; j++) {
#endif
magnitudes[j] = std::abs(p0[j]);
}
for(auto magnitude : magnitudes){
gaborProcessEntry(band, sampleIndex, magnitude);
sampleIndex += time_step;
}
}, min_band, INT_MAX, st0, st1, coefs);
}
inline void gaborProcessEntry(int band, int64_t sampleIndex, float coef) {
inline void gaborProcessEntry(int band, int64_t sampleIndex, float coefficient) {
int64_t coefficientIndex = sampleIndex / frequencyBinTimeStepSize;
int bandIndex = band - firstBandCache;
@ -171,7 +209,7 @@ private:
// due to reduction in precision (from audio sample accuracy to steps) multiple
// magnitudes could be placed in the same stepIndex, bandIndex pair.
// We take the maximum magnitude value.
currentCoefficient[bandIndex] = std::max(currentCoefficient[bandIndex], coef);
currentCoefficient[bandIndex] = std::max(currentCoefficient[bandIndex], coefficient);
}
}
@ -180,9 +218,13 @@ private:
std::vector<float> resultCache;
//circular buffer with current coefficients
std::vector<std::vector<float>> coefficients;
int firstBandCache = -1;
int numberOfBandsCache = 0;
//The index of the most recent coefficient (in steps)
int64_t mostRecentCoefficentIndex = 0;
const int blockSize;

View file

@ -10,4 +10,4 @@ Requires: @pc_req_public@
Requires.private: @pc_req_private@
Cflags: -I"${includedir}"
Libs: -L"${libdir}" -l@target1@
Libs.private: -L"${libdir}" -l@target1@ -l@target2@ @pc_libs_private@
Libs.private: -L"${libdir}" -l@target1@ @pc_libs_private@