Adjust cmake flags

Optimize circular ring buffer for coefficients
Move gaborProcessEntry preconditions to gaborApplySlice
2022-07-15 17:24:41 +02:00 · 2022-07-15 16:47:34 +02:00 · 2022-07-15 14:42:30 +02:00
2 changed files with 55 additions and 53 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -41,9 +41,9 @@ else()
 #        set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fipa-pta -ftree-loop-ivcanon -floop-nest-optimize -ftree-vectorize -ftree-loop-im -fgraphite-identity -floop-parallelize-all -fdevirtualize-speculatively -fdevirtualize-at-ltrans -ftree-parallelize-loops=4 -fopt-info-all -frecord-gcc-switches")
 #        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fipa-pta -ftree-loop-ivcanon -floop-nest-optimize -ftree-vectorize -ftree-loop-im -fgraphite-identity -floop-parallelize-all -fdevirtualize-speculatively -fdevirtualize-at-ltrans -ftree-parallelize-loops=4 -fopt-info-all -frecord-gcc-switches")

-        set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fvect-cost-model=unlimited -fopt-info-all -frecord-gcc-switches")
-        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fvect-cost-model=unlimited -fopt-info-all -frecord-gcc-switches")
-        set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -fvect-cost-model=unlimited -fopt-info-all -frecord-gcc-switches")
+        set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fvect-cost-model=unlimited -fipa-pta -ftree-loop-ivcanon -ftree-loop-im -fdevirtualize-speculatively -fdevirtualize-at-ltrans -fopt-info-all -frecord-gcc-switches")
+        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fvect-cost-model=unlimited -fipa-pta -ftree-loop-ivcanon -ftree-loop-im -fdevirtualize-speculatively -fdevirtualize-at-ltrans -fopt-info-all -frecord-gcc-switches")
+        set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -fvect-cost-model=unlimited -fipa-pta -ftree-loop-ivcanon -ftree-loop-im -fdevirtualize-speculatively -fdevirtualize-at-ltrans -fopt-info-all -frecord-gcc-switches")
    endif()
 endif()

--- a/cgaborator.cpp
+++ b/cgaborator.cpp
@@ -56,10 +56,8 @@ public:

        coefficientSize = (latency + 2*blockSize) / frequencyBinTimeStepSize;

-        coefficients.resize(coefficientSize);
-        for (auto & coefficient : coefficients){
-            coefficient.resize(numberOfBandsCache);
-        }
+		//Allocate ring buffer and members in a contiguous array
+		coefficients = static_cast<float *>(calloc(coefficientSize * numberOfBandsCache, sizeof(float)));

        assert(t_in == 0);

@@ -94,7 +92,9 @@ public:
        return numberOfBandsCache;
    }

-    ~Gaborator()= default;
+    ~Gaborator() {
+		free(coefficients);
+	}


 private:
@@ -122,32 +122,42 @@ private:

        //flush remaining
        for (int i = 1; i < coefficientSize; ++i) {
-            int64_t circularIndex = (mostRecentCoefficentIndex + i) % coefficientSize;
+            float* currentCoefficient = &coefficients[((mostRecentCoefficentIndex + i) % coefficientSize) * numberOfBandsCache];

-            auto& currentCoefficient = coefficients[circularIndex];
-
-            resultCache.insert(resultCache.end(), currentCoefficient.begin(), currentCoefficient.end());
+            resultCache.insert(resultCache.end(), currentCoefficient, currentCoefficient + numberOfBandsCache);
            // fill the oldest with zeros, but only the first round
 			if(i <= coefficientSize) {
-				std::fill(currentCoefficient.begin(), currentCoefficient.end(), 0);
+				std::fill(currentCoefficient, currentCoefficient + numberOfBandsCache, 0);
 			}
        }
    }

 	inline void gaborApplySlice(int64_t st0, int64_t st1) {
+		//Adjust start to match gaborProcessEntry requirements
+		if((st0 / frequencyBinTimeStepSize) <= 0){
+			st0 = frequencyBinTimeStepSize;
+		}
+
+		//Skip if nothing to process, the first results have a negative audio sample index
+		if(st0 > st1){
+			return;
+		}
+
+		int b0 = min_band;
+		int b1 = numberOfBandsCache + firstBandCache;
+
 		/*
 		Following code is equivalent, but it has been inlined for performance

 		gaborator::process([&](int band, int64_t audioSampleIndex, std::complex<float>& coef) {
 			gaborProcessEntry(band, audioSampleIndex, coef);
-		}, min_band, INT_MAX, st0, st1, coefs);
+		}, b0, b1, st0, st1, coefs);
 		*/

-		std::vector<float> magnitudes;
 		gaborator::apply_to_slice(false, [&](int band, int64_t sampleIndex, int time_step, unsigned len, const std::complex<float> *p0) {

 			//process magnitudes beforehand for easier auto-vectorization
-			magnitudes.resize(len);
+			magnitudeCache.resize(len);

 #ifdef __AVX2__

@@ -166,52 +176,43 @@ private:

 				// reorder values prior to storing
 				__m256d ordered = _mm256_permute4x64_pd (_mm256_castps_pd(abs), _MM_SHUFFLE(3, 1, 2, 0));
-				_mm256_storeu_ps(magnitudes.data() + i, _mm256_castpd_ps(ordered));
+				_mm256_storeu_ps(magnitudeCache.data() + i, _mm256_castpd_ps(ordered));
 			}

 			for (int64_t j = i; j < len; j++) {
 #else
-			for (unsigned int j = 0; j < len; j++) {
+			for (int64_t j = 0; j < len; j++) {
 #endif
-				magnitudes[j] = std::abs(p0[j]);
+				magnitudeCache[j] = std::abs(p0[j]);
 			}

-			for(auto magnitude : magnitudes){
-				gaborProcessEntry(band, sampleIndex, magnitude);
-				sampleIndex += time_step;
+
+			int bandIndex = band - firstBandCache;
+			for (unsigned int j = 0; j < len; j++) {
+				gaborProcessEntry(bandIndex, (sampleIndex + time_step * j) / frequencyBinTimeStepSize, magnitudeCache[j]);
 			}
-		}, min_band, INT_MAX, st0, st1, coefs);
+		}, b0, b1, st0, st1, coefs);
 	}

-    inline void gaborProcessEntry(int band, int64_t sampleIndex, float coefficient) {
-		int64_t coefficientIndex = sampleIndex / frequencyBinTimeStepSize;
-        int bandIndex = band - firstBandCache;
+    inline void gaborProcessEntry(int bandIndex, int64_t coefficientIndex, float coefficient) {
+		float* currentCoefficient = &coefficients[(coefficientIndex % coefficientSize) * numberOfBandsCache];

-        // The first results have a negative audio sample index
-        // ignore these
-        if (coefficientIndex > 0 && bandIndex < numberOfBandsCache) {
-
-			int64_t circularIndex = coefficientIndex % coefficientSize;
-
-			auto& currentCoefficient = coefficients[circularIndex];
-
-            // If a new index is reached, save the old (fixed) coefficients in the history
-            // Fill the array with zeros to get the max
-            if (coefficientIndex > mostRecentCoefficentIndex && coefficientIndex > coefficientSize) {
-                // keep the new maximum
-                mostRecentCoefficentIndex = coefficientIndex;
-                // "copy" the oldest data to the history
-                // the slice can be reused thanks to the oldest being filled with zeros just after
-                resultCache.insert(resultCache.end(), currentCoefficient.begin(), currentCoefficient.end());
-                // fill the oldest with zeros
-                std::fill(currentCoefficient.begin(), currentCoefficient.end(), 0);
-            }
-            // due to reduction in precision (from audio sample accuracy to steps) multiple
-            // magnitudes could be placed in the same stepIndex, bandIndex pair.
-            // We take the maximum magnitudes value.
-            currentCoefficient[bandIndex] = std::max(currentCoefficient[bandIndex], coefficient);
-        }
+		// If a new index is reached, save the old (fixed) coefficients in the history
+		// Fill the array with zeros to get the max
+		if (coefficientIndex > mostRecentCoefficentIndex && coefficientIndex > coefficientSize) {
+			// keep the new maximum
+			mostRecentCoefficentIndex = coefficientIndex;
+			// "copy" the oldest data to the history
+			// the slice can be reused thanks to the oldest being filled with zeros just after
+			resultCache.insert(resultCache.end(), currentCoefficient, currentCoefficient + numberOfBandsCache);
+			// fill the oldest with zeros
+			std::fill(currentCoefficient, currentCoefficient + numberOfBandsCache, 0);
+		}

+		// due to reduction in precision (from audio sample accuracy to steps) multiple
+		// magnitudes could be placed in the same stepIndex, bandIndex pair.
+		// We take the maximum magnitudes value.
+		currentCoefficient[bandIndex] = std::max(currentCoefficient[bandIndex], coefficient);
    }

 private:
@@ -219,15 +220,16 @@ private:

    std::vector<float> resultCache;

-	//circular buffer with current coefficents
-    std::vector<std::vector<float>> coefficients;
+	//circular buffer with current coefficients
+    float* coefficients = nullptr;
    int firstBandCache = -1;
    int numberOfBandsCache = 0;

-	//The index of the most recent coefficent (in steps)
+	//The index of the most recent coefficient (in steps)
    int64_t mostRecentCoefficentIndex = 0;

    const int blockSize;
+	std::vector<float> magnitudeCache;
    const int64_t frequencyBinTimeStepSize;
    int64_t t_in;
    int min_band;
Author	SHA1	Message	Date
DataHoarder	b77b6d919c	Adjust cmake flags All checks were successful continuous-integration/drone/push Build is passing Details	2022-07-15 17:24:41 +02:00
DataHoarder	57709448de	Optimize circular ring buffer for coefficients	2022-07-15 16:47:34 +02:00
DataHoarder	c20789e8ce	Move gaborProcessEntry preconditions to gaborApplySlice	2022-07-15 14:42:30 +02:00