Better auto-vectorization results
All checks were successful
continuous-integration/drone/push Build is passing

This commit is contained in:
DataHoarder 2022-07-13 19:51:10 +02:00
parent 0d63ad8d32
commit 6a9c6f7232
Signed by: DataHoarder
SSH key fingerprint: SHA256:OLTRf6Fl87G52SiR7sWLGNzlJt4WOX+tfI2yxo0z7xk
3 changed files with 47 additions and 33 deletions

View file

@ -17,11 +17,11 @@ set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -ggdb -O0 -fno-exceptions -f
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -ggdb -O0 -fno-exceptions") set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -ggdb -O0 -fno-exceptions")
if(EMSCRIPTEN) if(EMSCRIPTEN)
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -fno-exceptions -fno-rtti") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -O3 -ffast-math -fno-exceptions -fno-rtti")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3 -fno-exceptions") set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DNDEBUG -O3 -ffast-math -fno-exceptions")
else() else()
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Ofast -fno-exceptions -fno-rtti") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DNDEBUG -Ofast -fno-exceptions -fno-rtti")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Ofast -fno-exceptions") set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DNDEBUG -Ofast -fno-exceptions")
if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -frecord-gcc-switches -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -frecord-gcc-switches -Rpass=loop-vectorize -Rpass-missed=loop-vectorize -Rpass-analysis=loop-vectorize")

View file

@ -51,8 +51,8 @@ public:
coefficientSize = (latency + 2*blockSize) / frequencyBinTimeStepSize; coefficientSize = (latency + 2*blockSize) / frequencyBinTimeStepSize;
coefficients.resize(coefficientSize); coefficients.resize(coefficientSize);
for (int i = 0; i < coefficients.size(); ++i){ for (auto & coefficient : coefficients){
coefficients[i] = std::make_unique<std::vector<float>>(numberOfBandsCache); coefficient.resize(numberOfBandsCache);
} }
assert(t_in == 0); assert(t_in == 0);
@ -60,7 +60,7 @@ public:
} }
float* gaborTransform(float* audio_block, int audio_block_length, size_t* return_size, size_t* slice_size) { float* gaborTransform(float* audio_block, int64_t audio_block_length, size_t* return_size, size_t* slice_size) {
resultCache.clear(); resultCache.clear();
if (audio_block == nullptr || audio_block_length == 0) { //finish if (audio_block == nullptr || audio_block_length == 0) { //finish
@ -92,18 +92,15 @@ public:
private: private:
void analyze(float* audio_block, int audio_block_length){ void analyze(float* audio_block, int64_t audio_block_length){
analyzer.analyze(audio_block, t_in, t_in + audio_block_length, coefs); analyzer.analyze(audio_block, t_in, t_in + audio_block_length, coefs);
int64_t st0 = t_in - latency; int64_t st0 = t_in - latency;
int64_t st1 = t_in - latency + audio_block_length; int64_t st1 = t_in - latency + audio_block_length;
gaborator::process([&](int band, int64_t audioSampleIndex, std::complex<float>& coef) { gaborApplySlice(st0, st1);
gaborProcessEntry(band, int(audioSampleIndex), coef);
}, min_band, INT_MAX, st0, st1, coefs);
t_in += audio_block_length;
t_in += (int64_t) audio_block_length;
int64_t t_out = t_in - latency; int64_t t_out = t_in - latency;
@ -115,34 +112,51 @@ private:
int64_t st1 = t_in; int64_t st1 = t_in;
//flush all till latency spot //flush all till latency spot
gaborator::process([&](int band, int64_t audioSampleIndex, std::complex<float>& coef) { gaborApplySlice(st0, st1);
gaborProcessEntry(band, int(audioSampleIndex), coef);
}, min_band, INT_MAX, st0, st1, coefs);
//flush remaining //flush remaining
for (int i = 1; i < coefficientSize; ++i) { for (int i = 1; i < coefficientSize; ++i) {
int circularIndex = (mostRecentCoefficentIndex + i) % int(coefficientSize); int64_t circularIndex = (mostRecentCoefficentIndex + i) % coefficientSize;
auto& currentCoefficient = *coefficients[circularIndex]; auto& currentCoefficient = coefficients[circularIndex];
resultCache.insert(resultCache.end(), currentCoefficient.begin(), currentCoefficient.end()); resultCache.insert(resultCache.end(), currentCoefficient.begin(), currentCoefficient.end());
// fill the oldest with zeros // fill the oldest with zeros, but only the first round
std::fill(currentCoefficient.begin(), currentCoefficient.end(), 0); if(i <= coefficientSize) {
std::fill(currentCoefficient.begin(), currentCoefficient.end(), 0);
}
} }
} }
inline void gaborProcessEntry(int band, int sampleIndex, std::complex<float>& coef) { inline void gaborApplySlice(int64_t st0, int64_t st1) {
int coefficientIndex = sampleIndex / frequencyBinTimeStepSize; /*
Following code is equivalent, but it has been inlined for performance
gaborator::process([&](int band, int64_t audioSampleIndex, std::complex<float>& coef) {
gaborProcessEntry(band, audioSampleIndex, coef);
}, min_band, INT_MAX, st0, st1, coefs);
*/
gaborator::apply_to_slice(false, [&](int band, int64_t st, int time_step, unsigned len, const std::complex<float> *p0){
for (unsigned int i = 0; i < len; i++) {
gaborProcessEntry(band, st, std::abs(*p0++));
st += time_step;
}
}, min_band, INT_MAX, st0, st1, coefs);
}
inline void gaborProcessEntry(int band, int64_t sampleIndex, float coef) {
int64_t coefficientIndex = sampleIndex / frequencyBinTimeStepSize;
int bandIndex = band - firstBandCache; int bandIndex = band - firstBandCache;
int circularIndex = coefficientIndex % int(coefficientSize);
auto& currentCoefficient = *coefficients[circularIndex];
// The first results have a negative audio sample index // The first results have a negative audio sample index
// ignore these // ignore these
if (coefficientIndex > 0 && bandIndex < numberOfBandsCache) { if (coefficientIndex > 0 && bandIndex < numberOfBandsCache) {
int64_t circularIndex = coefficientIndex % coefficientSize;
auto& currentCoefficient = coefficients[circularIndex];
// If a new index is reached, save the old (fixed) coefficients in the history // If a new index is reached, save the old (fixed) coefficients in the history
// Fill the array with zeros to get the max // Fill the array with zeros to get the max
if (coefficientIndex > mostRecentCoefficentIndex && coefficientIndex > coefficientSize) { if (coefficientIndex > mostRecentCoefficentIndex && coefficientIndex > coefficientSize) {
@ -157,7 +171,7 @@ private:
// due to reduction in precision (from audio sample accuracy to steps) multiple // due to reduction in precision (from audio sample accuracy to steps) multiple
// magnitudes could be placed in the same stepIndex, bandIndex pair. // magnitudes could be placed in the same stepIndex, bandIndex pair.
// We take the maximum magnitudes value. // We take the maximum magnitudes value.
currentCoefficient[bandIndex] = std::max(currentCoefficient[bandIndex], std::abs(coef)); currentCoefficient[bandIndex] = std::max(currentCoefficient[bandIndex], coef);
} }
} }
@ -166,13 +180,13 @@ private:
std::vector<float> resultCache; std::vector<float> resultCache;
std::vector<std::unique_ptr<std::vector<float>>> coefficients; std::vector<std::vector<float>> coefficients;
int firstBandCache = -1; int firstBandCache = -1;
int numberOfBandsCache = 0; int numberOfBandsCache = 0;
int mostRecentCoefficentIndex = 0; int64_t mostRecentCoefficentIndex = 0;
const int blockSize; const int blockSize;
const int frequencyBinTimeStepSize; const int64_t frequencyBinTimeStepSize;
int64_t t_in; int64_t t_in;
int min_band; int min_band;
const int sample_rate; const int sample_rate;
@ -180,7 +194,7 @@ private:
int64_t coefficientSize; int64_t coefficientSize;
private: private:
gaborator::parameters parameters; const gaborator::parameters parameters;
gaborator::analyzer<float> analyzer; gaborator::analyzer<float> analyzer;
gaborator::coefs<float> coefs; gaborator::coefs<float> coefs;
}; };
@ -198,7 +212,7 @@ int gaborator_number_of_bands(uintptr_t ptr) {
return reinterpret_cast<Gaborator*>(ptr)->numberOfBands(); return reinterpret_cast<Gaborator*>(ptr)->numberOfBands();
} }
float* gaborator_transform(uintptr_t ptr, float* audio_block, int audio_block_length, size_t* return_size, size_t* slice_size){ float* gaborator_transform(uintptr_t ptr, float* audio_block, int64_t audio_block_length, size_t* return_size, size_t* slice_size){
return reinterpret_cast<Gaborator*>(ptr)->gaborTransform(audio_block, audio_block_length, return_size, slice_size); return reinterpret_cast<Gaborator*>(ptr)->gaborTransform(audio_block, audio_block_length, return_size, slice_size);
} }

View file

@ -15,7 +15,7 @@ void gaborator_release(uintptr_t ptr);
int64_t gaborator_analysis_support(uintptr_t ptr); int64_t gaborator_analysis_support(uintptr_t ptr);
int gaborator_number_of_bands(uintptr_t ptr); int gaborator_number_of_bands(uintptr_t ptr);
float* gaborator_transform(uintptr_t ptr, float* audio_block, int audio_block_length, size_t* return_size, size_t* slice_size); float* gaborator_transform(uintptr_t ptr, float* audio_block, int64_t audio_block_length, size_t* return_size, size_t* slice_size);
#ifdef __cplusplus #ifdef __cplusplus