From 6a9c6f7232afb1bc135544661d2fe9efa7ad789b Mon Sep 17 00:00:00 2001 From: WeebDataHoarder <57538841+WeebDataHoarder@users.noreply.github.com> Date: Wed, 13 Jul 2022 19:51:10 +0200 Subject: [PATCH] Better auto-vectorization results --- CMakeLists.txt | 8 ++--- cgaborator.cpp | 70 ++++++++++++++++++++++++++------------------ include/cgaborator.h | 2 +- 3 files changed, 47 insertions(+), 33 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2767ba3..77cf285 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,11 +17,11 @@ set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -ggdb -O0 -fno-exceptions -f set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -ggdb -O0 -fno-exceptions") if(EMSCRIPTEN) - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -fno-exceptions -fno-rtti") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3 -fno-exceptions") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -O3 -ffast-math -fno-exceptions -fno-rtti") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DNDEBUG -O3 -ffast-math -fno-exceptions") else() - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Ofast -fno-exceptions -fno-rtti") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Ofast -fno-exceptions") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -Ofast -fno-exceptions -fno-rtti") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DNDEBUG -Ofast -fno-exceptions") if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -frecord-gcc-switches -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize") diff --git a/cgaborator.cpp b/cgaborator.cpp index b665e62..923a97f 100644 --- a/cgaborator.cpp +++ b/cgaborator.cpp @@ -51,8 +51,8 @@ public: coefficientSize = (latency + 2*blockSize) / frequencyBinTimeStepSize; coefficients.resize(coefficientSize); - for (int i = 0; i < coefficients.size(); ++i){ - coefficients[i] = std::make_unique>(numberOfBandsCache); + for (auto & coefficient : coefficients){ + coefficient.resize(numberOfBandsCache); } assert(t_in == 0); @@ -60,7 +60,7 @@ public: } - float* gaborTransform(float* audio_block, int audio_block_length, size_t* return_size, size_t* slice_size) { + float* gaborTransform(float* audio_block, int64_t audio_block_length, size_t* return_size, size_t* slice_size) { resultCache.clear(); if (audio_block == nullptr || audio_block_length == 0) { //finish @@ -92,18 +92,15 @@ public: private: - void analyze(float* audio_block, int audio_block_length){ + void analyze(float* audio_block, int64_t audio_block_length){ analyzer.analyze(audio_block, t_in, t_in + audio_block_length, coefs); int64_t st0 = t_in - latency; int64_t st1 = t_in - latency + audio_block_length; - gaborator::process([&](int band, int64_t audioSampleIndex, std::complex& coef) { - gaborProcessEntry(band, int(audioSampleIndex), coef); - }, min_band, INT_MAX, st0, st1, coefs); + gaborApplySlice(st0, st1); - - t_in += (int64_t) audio_block_length; + t_in += audio_block_length; int64_t t_out = t_in - latency; @@ -115,34 +112,51 @@ private: int64_t st1 = t_in; //flush all till latency spot - gaborator::process([&](int band, int64_t audioSampleIndex, std::complex& coef) { - gaborProcessEntry(band, int(audioSampleIndex), coef); - }, min_band, INT_MAX, st0, st1, coefs); + gaborApplySlice(st0, st1); //flush remaining for (int i = 1; i < coefficientSize; ++i) { - int circularIndex = (mostRecentCoefficentIndex + i) % int(coefficientSize); + int64_t circularIndex = (mostRecentCoefficentIndex + i) % coefficientSize; - auto& currentCoefficient = *coefficients[circularIndex]; + auto& currentCoefficient = coefficients[circularIndex]; resultCache.insert(resultCache.end(), currentCoefficient.begin(), currentCoefficient.end()); - // fill the oldest with zeros - std::fill(currentCoefficient.begin(), currentCoefficient.end(), 0); + // fill the oldest with zeros, but only the first round + if(i <= coefficientSize) { + std::fill(currentCoefficient.begin(), currentCoefficient.end(), 0); + } } } - inline void gaborProcessEntry(int band, int sampleIndex, std::complex& coef) { - int coefficientIndex = sampleIndex / frequencyBinTimeStepSize; + inline void gaborApplySlice(int64_t st0, int64_t st1) { + /* + Following code is equivalent, but it has been inlined for performance + + gaborator::process([&](int band, int64_t audioSampleIndex, std::complex& coef) { + gaborProcessEntry(band, audioSampleIndex, coef); + }, min_band, INT_MAX, st0, st1, coefs); + */ + + gaborator::apply_to_slice(false, [&](int band, int64_t st, int time_step, unsigned len, const std::complex *p0){ + for (unsigned int i = 0; i < len; i++) { + gaborProcessEntry(band, st, std::abs(*p0++)); + st += time_step; + } + }, min_band, INT_MAX, st0, st1, coefs); + } + + inline void gaborProcessEntry(int band, int64_t sampleIndex, float coef) { + int64_t coefficientIndex = sampleIndex / frequencyBinTimeStepSize; int bandIndex = band - firstBandCache; - int circularIndex = coefficientIndex % int(coefficientSize); - - auto& currentCoefficient = *coefficients[circularIndex]; - // The first results have a negative audio sample index // ignore these if (coefficientIndex > 0 && bandIndex < numberOfBandsCache) { + int64_t circularIndex = coefficientIndex % coefficientSize; + + auto& currentCoefficient = coefficients[circularIndex]; + // If a new index is reached, save the old (fixed) coefficients in the history // Fill the array with zeros to get the max if (coefficientIndex > mostRecentCoefficentIndex && coefficientIndex > coefficientSize) { @@ -157,7 +171,7 @@ private: // due to reduction in precision (from audio sample accuracy to steps) multiple // magnitudes could be placed in the same stepIndex, bandIndex pair. // We take the maximum magnitudes value. - currentCoefficient[bandIndex] = std::max(currentCoefficient[bandIndex], std::abs(coef)); + currentCoefficient[bandIndex] = std::max(currentCoefficient[bandIndex], coef); } } @@ -166,13 +180,13 @@ private: std::vector resultCache; - std::vector>> coefficients; + std::vector> coefficients; int firstBandCache = -1; int numberOfBandsCache = 0; - int mostRecentCoefficentIndex = 0; + int64_t mostRecentCoefficentIndex = 0; const int blockSize; - const int frequencyBinTimeStepSize; + const int64_t frequencyBinTimeStepSize; int64_t t_in; int min_band; const int sample_rate; @@ -180,7 +194,7 @@ private: int64_t coefficientSize; private: - gaborator::parameters parameters; + const gaborator::parameters parameters; gaborator::analyzer analyzer; gaborator::coefs coefs; }; @@ -198,7 +212,7 @@ int gaborator_number_of_bands(uintptr_t ptr) { return reinterpret_cast(ptr)->numberOfBands(); } -float* gaborator_transform(uintptr_t ptr, float* audio_block, int audio_block_length, size_t* return_size, size_t* slice_size){ +float* gaborator_transform(uintptr_t ptr, float* audio_block, int64_t audio_block_length, size_t* return_size, size_t* slice_size){ return reinterpret_cast(ptr)->gaborTransform(audio_block, audio_block_length, return_size, slice_size); } diff --git a/include/cgaborator.h b/include/cgaborator.h index 0a96843..d447316 100644 --- a/include/cgaborator.h +++ b/include/cgaborator.h @@ -15,7 +15,7 @@ void gaborator_release(uintptr_t ptr); int64_t gaborator_analysis_support(uintptr_t ptr); int gaborator_number_of_bands(uintptr_t ptr); -float* gaborator_transform(uintptr_t ptr, float* audio_block, int audio_block_length, size_t* return_size, size_t* slice_size); +float* gaborator_transform(uintptr_t ptr, float* audio_block, int64_t audio_block_length, size_t* return_size, size_t* slice_size); #ifdef __cplusplus