Better auto-vectorization results

2022-07-13 19:51:10 +02:00 · 2022-07-13 19:51:10 +02:00 · 6a9c6f7232
parent 0d63ad8d32
commit 6a9c6f7232
3 changed files with 47 additions and 33 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -17,11 +17,11 @@ set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -ggdb -O0 -fno-exceptions -f
 set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -ggdb -O0 -fno-exceptions")

 if(EMSCRIPTEN)
-    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -fno-exceptions -fno-rtti")
-    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3 -fno-exceptions")
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -O3 -ffast-math -fno-exceptions -fno-rtti")
+    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DNDEBUG -O3 -ffast-math -fno-exceptions")
 else()
-    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Ofast -fno-exceptions -fno-rtti")
-    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Ofast -fno-exceptions")
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -Ofast -fno-exceptions -fno-rtti")
+    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DNDEBUG -Ofast -fno-exceptions")

    if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
        set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -frecord-gcc-switches -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize")
--- a/cgaborator.cpp
+++ b/cgaborator.cpp
@ -51,8 +51,8 @@ public:
        coefficientSize = (latency + 2*blockSize) / frequencyBinTimeStepSize;

        coefficients.resize(coefficientSize);
-        for (int i = 0; i < coefficients.size(); ++i){
-            coefficients[i] = std::make_unique<std::vector<float>>(numberOfBandsCache);
+        for (auto & coefficient : coefficients){
+            coefficient.resize(numberOfBandsCache);
        }

        assert(t_in == 0);
@ -60,7 +60,7 @@ public:
    }


-	float* gaborTransform(float* audio_block, int audio_block_length, size_t* return_size, size_t* slice_size) {
+	float* gaborTransform(float* audio_block, int64_t audio_block_length, size_t* return_size, size_t* slice_size) {
        resultCache.clear();

        if (audio_block == nullptr || audio_block_length == 0) { //finish
@ -92,18 +92,15 @@ public:


 private:
-    void analyze(float* audio_block, int audio_block_length){
+    void analyze(float* audio_block, int64_t audio_block_length){
        analyzer.analyze(audio_block, t_in, t_in + audio_block_length, coefs);

        int64_t st0 = t_in - latency;
        int64_t st1 = t_in - latency + audio_block_length;

-        gaborator::process([&](int band, int64_t audioSampleIndex, std::complex<float>& coef) {
-			gaborProcessEntry(band, int(audioSampleIndex), coef);
-        }, min_band, INT_MAX, st0, st1, coefs);
+		gaborApplySlice(st0, st1);

-
-        t_in += (int64_t) audio_block_length;
+        t_in += audio_block_length;

        int64_t t_out = t_in - latency;

@ -115,34 +112,51 @@ private:
        int64_t st1 = t_in;

        //flush all till latency spot
-        gaborator::process([&](int band, int64_t audioSampleIndex, std::complex<float>& coef) {
-			gaborProcessEntry(band, int(audioSampleIndex), coef);
-        }, min_band, INT_MAX, st0, st1, coefs);
+		gaborApplySlice(st0, st1);

        //flush remaining
        for (int i = 1; i < coefficientSize; ++i) {
-            int circularIndex = (mostRecentCoefficentIndex + i) % int(coefficientSize);
+            int64_t circularIndex = (mostRecentCoefficentIndex + i) % coefficientSize;

-            auto& currentCoefficient = *coefficients[circularIndex];
+            auto& currentCoefficient = coefficients[circularIndex];

            resultCache.insert(resultCache.end(), currentCoefficient.begin(), currentCoefficient.end());
-            // fill the oldest with zeros
-            std::fill(currentCoefficient.begin(), currentCoefficient.end(), 0);
+            // fill the oldest with zeros, but only the first round
+			if(i <= coefficientSize) {
+				std::fill(currentCoefficient.begin(), currentCoefficient.end(), 0);
+			}
        }
    }

-    inline void gaborProcessEntry(int band, int sampleIndex, std::complex<float>& coef) {
-        int coefficientIndex = sampleIndex / frequencyBinTimeStepSize;
+	inline void gaborApplySlice(int64_t st0, int64_t st1) {
+		/*
+		Passes the magnitude of every coefficient that apply_to_slice visits for (min_band, INT_MAX, st0, st1) in coefs to gaborProcessEntry.
+		The following code is equivalent, but it has been inlined for performance:
+		gaborator::process([&](int band, int64_t audioSampleIndex, std::complex<float>& coef) {
+			gaborProcessEntry(band, audioSampleIndex, std::abs(coef));
+		}, min_band, INT_MAX, st0, st1, coefs);
+		*/
+		// Callback: p0 points at len consecutive coefficients of band; the first has sample index st, each following one is time_step later.
+		gaborator::apply_to_slice(false, [&](int band, int64_t st, int time_step, unsigned len, const std::complex<float> *p0){ // NOTE(review): `false` is presumably the "create" flag - confirm in gaborator.h
+			for (unsigned int i = 0; i < len; i++) {
+				gaborProcessEntry(band, st, std::abs(*p0++)); // magnitude of the current coefficient, then advance the pointer
+				st += time_step; // sample index of the next coefficient
+			}
+		}, min_band, INT_MAX, st0, st1, coefs);
+	}
+
+    inline void gaborProcessEntry(int band, int64_t sampleIndex, float coef) {
+		int64_t coefficientIndex = sampleIndex / frequencyBinTimeStepSize;
        int bandIndex = band - firstBandCache;

-        int circularIndex = coefficientIndex % int(coefficientSize);
-
-        auto& currentCoefficient = *coefficients[circularIndex];
-
        // The first results have a negative audio sample index
        // ignore these
        if (coefficientIndex > 0 && bandIndex < numberOfBandsCache) {

+			int64_t circularIndex = coefficientIndex % coefficientSize;
+
+			auto& currentCoefficient = coefficients[circularIndex];
+
            // If a new index is reached, save the old (fixed) coefficients in the history
            // Fill the array with zeros to get the max
            if (coefficientIndex > mostRecentCoefficentIndex && coefficientIndex > coefficientSize) {
@ -157,7 +171,7 @@ private:
            // due to reduction in precision (from audio sample accuracy to steps) multiple
            // magnitudes could be placed in the same stepIndex, bandIndex pair.
            // We take the maximum magnitudes value.
-            currentCoefficient[bandIndex] = std::max(currentCoefficient[bandIndex], std::abs(coef));
+            currentCoefficient[bandIndex] = std::max(currentCoefficient[bandIndex], coef);
        }

    }
@ -166,13 +180,13 @@ private:


    std::vector<float> resultCache;
-    std::vector<std::unique_ptr<std::vector<float>>> coefficients;
+    std::vector<std::vector<float>> coefficients;
    int firstBandCache = -1;
    int numberOfBandsCache = 0;
-    int mostRecentCoefficentIndex = 0;
+    int64_t mostRecentCoefficentIndex = 0;

    const int blockSize;
-    const int frequencyBinTimeStepSize;
+    const int64_t frequencyBinTimeStepSize;
    int64_t t_in;
    int min_band;
    const int sample_rate;
@ -180,7 +194,7 @@ private:
    int64_t coefficientSize;

 private:
-    gaborator::parameters parameters;
+    const gaborator::parameters parameters;
    gaborator::analyzer<float> analyzer;
    gaborator::coefs<float> coefs;
 };
@ -198,7 +212,7 @@ int gaborator_number_of_bands(uintptr_t ptr) {
    return reinterpret_cast<Gaborator*>(ptr)->numberOfBands();
 }

-float* gaborator_transform(uintptr_t ptr, float* audio_block, int audio_block_length, size_t* return_size, size_t* slice_size){
+float* gaborator_transform(uintptr_t ptr, float* audio_block, int64_t audio_block_length, size_t* return_size, size_t* slice_size){
 	return reinterpret_cast<Gaborator*>(ptr)->gaborTransform(audio_block, audio_block_length, return_size, slice_size);
 }

--- a/include/cgaborator.h
+++ b/include/cgaborator.h
@ -15,7 +15,7 @@ void gaborator_release(uintptr_t ptr);
 int64_t gaborator_analysis_support(uintptr_t ptr);
 int gaborator_number_of_bands(uintptr_t ptr);

-float* gaborator_transform(uintptr_t ptr, float* audio_block, int audio_block_length, size_t* return_size, size_t* slice_size);
+float* gaborator_transform(uintptr_t ptr, float* audio_block, int64_t audio_block_length, size_t* return_size, size_t* slice_size);


 #ifdef __cplusplus