312 lines
8.8 KiB
C++
312 lines
8.8 KiB
C++
//
|
|
// Vector math operations
|
|
//
|
|
// Copyright (C) 2016-2021 Andreas Gustafsson. This file is part of
|
|
// the Gaborator library source distribution. See the file LICENSE at
|
|
// the top level of the distribution for license information.
|
|
//
|
|
|
|
#ifndef _GABORATOR_VECTOR_MATH_H
|
|
#define _GABORATOR_VECTOR_MATH_H
|
|
|
|
#include <assert.h>
|
|
|
|
#if GABORATOR_USE_SSE3_INTRINSICS
|
|
#include <pmmintrin.h>
|
|
#endif
|
|
|
|
#include <complex>
|
|
|
|
// Use __restrict__ where supported
|
|
#if defined(__GNUC__) || defined(__clang__)
|
|
#define GABORATOR_RESTRICT __restrict__
|
|
#else
|
|
#define GABORATOR_RESTRICT
|
|
#endif
|
|
|
|
namespace gaborator {
|
|
|
|
// The _naive versions are used when SSE is not available, and as
|
|
// references when testing the SSE versions
|
|
|
|
// Naive or not, this is faster than the macOS std::complex
|
|
// multiplication operator, which checks for infinities even with
|
|
// -ffast-math.
|
|
|
|
template <class T>
|
|
std::complex<T> complex_mul_naive(std::complex<T> a,
|
|
std::complex<T> b)
|
|
{
|
|
return std::complex<T>(a.real() * b.real() - a.imag() * b.imag(),
|
|
a.real() * b.imag() + a.imag() * b.real());
|
|
}
|
|
|
|
#if GABORATOR_USE_SSE3_INTRINSICS
|
|
// Note: this is sometimes slower than the naive code.
|
|
static inline
|
|
std::complex<double> complex_mul(std::complex<double> a_,
|
|
std::complex<double> b_)
|
|
{
|
|
__m128d a = _mm_setr_pd(a_.real(), a_.imag());
|
|
__m128d b = _mm_setr_pd(b_.real(), b_.imag());
|
|
__m128d as = (__m128d) _mm_shuffle_pd(a, a, 0x1);
|
|
__m128d t0 = _mm_mul_pd(a, _mm_shuffle_pd(b, b, 0x0));
|
|
__m128d t1 = _mm_mul_pd(as, _mm_shuffle_pd(b, b, 0x3));
|
|
__m128d c = _mm_addsub_pd(t0, t1); // SSE3
|
|
return std::complex<double>(c[0], c[1]);
|
|
}
|
|
#else
|
|
static inline
|
|
std::complex<double> complex_mul(std::complex<double> a_,
|
|
std::complex<double> b_)
|
|
{
|
|
return complex_mul_naive(a_, b_);
|
|
}
|
|
#endif
|
|
|
|
static inline
|
|
std::complex<float> complex_mul(std::complex<float> a_,
|
|
std::complex<float> b_)
|
|
{
|
|
return complex_mul_naive(a_, b_);
|
|
}
|
|
|
|
template <class T, class U, class V>
|
|
static inline void
|
|
elementwise_product_naive(T *GABORATOR_RESTRICT r,
|
|
U *GABORATOR_RESTRICT a,
|
|
V *GABORATOR_RESTRICT b,
|
|
size_t n)
|
|
{
|
|
for (size_t i = 0; i < n; i++)
|
|
r[i] = complex_mul(a[i], b[i]);
|
|
}
|
|
|
|
template <class T, class U, class V, class S>
|
|
static inline void
|
|
elementwise_product_times_scalar_naive(T *GABORATOR_RESTRICT r,
|
|
U *GABORATOR_RESTRICT a,
|
|
V *GABORATOR_RESTRICT b,
|
|
S s,
|
|
size_t n)
|
|
{
|
|
for (size_t i = 0; i < n; i++)
|
|
r[i] = a[i] * b[i] * s;
|
|
}
|
|
|
|
// I is the input complex data type, O is the output data type
|
|
template <class I, class O>
|
|
static inline void
|
|
complex_magnitude_naive(I *GABORATOR_RESTRICT inv,
|
|
O *GABORATOR_RESTRICT outv,
|
|
size_t n)
|
|
{
|
|
for (size_t i = 0; i < n; i++)
|
|
outv[i] = std::sqrt(inv[i].real() * inv[i].real() +
|
|
inv[i].imag() * inv[i].imag());
|
|
}
|
|
|
|
#if GABORATOR_USE_SSE3_INTRINSICS
|
|
|
|
#include <pmmintrin.h>
|
|
|
|
// Perform two complex float multiplies in parallel
|
|
|
|
static inline
|
|
__m128 complex_mul_vec2(__m128 aa, __m128 bb) {
|
|
__m128 aas =_mm_shuffle_ps(aa, aa, 0xb1);
|
|
__m128 t0 = _mm_mul_ps(aa, _mm_moveldup_ps(bb));
|
|
__m128 t1 = _mm_mul_ps(aas, _mm_movehdup_ps(bb));
|
|
return _mm_addsub_ps(t0, t1); // SSE3
|
|
}
|
|
|
|
// Calculate the elementwise product of a complex float vector
|
|
// and another complex float vector.
|
|
|
|
static inline void
|
|
elementwise_product(std::complex<float> *cv,
|
|
const std::complex<float> *av,
|
|
const std::complex<float> *bv,
|
|
size_t n)
|
|
{
|
|
assert((n & 1) == 0);
|
|
n >>= 1;
|
|
__m128 *c = (__m128 *) cv;
|
|
const __m128 *a = (const __m128 *) av;
|
|
const __m128 *b = (const __m128 *) bv;
|
|
while (n--) {
|
|
__m128 aa = *a++;
|
|
__m128 bb = *b++;
|
|
*c++ = complex_mul_vec2(aa, bb);
|
|
}
|
|
}
|
|
|
|
// Calculate the elementwise product of a complex float vector
|
|
// and real float vector.
|
|
//
|
|
// The input "a" may be unaligned; the output "c" must be aligned.
|
|
|
|
static inline void
|
|
elementwise_product(std::complex<float> *cv,
|
|
const std::complex<float> *av,
|
|
const float *bv,
|
|
size_t n)
|
|
{
|
|
assert((n & 3) == 0);
|
|
n >>= 2;
|
|
__m128 *c = (__m128 *) cv;
|
|
const __m128 *a = (const __m128 *) av;
|
|
const __m128 *b = (const __m128 *) bv;
|
|
while (n--) {
|
|
__m128 a0 = (__m128) _mm_loadu_si128((const __m128i *) a++);
|
|
__m128 a1 = (__m128) _mm_loadu_si128((const __m128i *) a++);
|
|
__m128 bb = *b++;
|
|
*c++ = _mm_mul_ps(a0, _mm_unpacklo_ps(bb, bb));
|
|
*c++ = _mm_mul_ps(a1, _mm_unpackhi_ps(bb, bb));
|
|
}
|
|
}
|
|
|
|
static inline void
|
|
elementwise_product_times_scalar(std::complex<float> *cv,
|
|
const std::complex<float> *av,
|
|
const std::complex<float> *bv,
|
|
std::complex<float> d,
|
|
size_t n)
|
|
{
|
|
assert((n & 1) == 0);
|
|
n >>= 1;
|
|
const __m128 *a = (const __m128 *) av;
|
|
const __m128 *b = (const __m128 *) bv;
|
|
const __m128 dd = (__m128) { d.real(), d.imag(), d.real(), d.imag() };
|
|
__m128 *c = (__m128 *) cv;
|
|
while (n--) {
|
|
__m128 aa = *a++;
|
|
__m128 bb = *b++;
|
|
*c++ = complex_mul_vec2(complex_mul_vec2(aa, bb), dd);
|
|
}
|
|
}
|
|
|
|
// XXX arguments reversed wrt others
|
|
|
|
static inline void
|
|
complex_magnitude(std::complex<float> *inv,
|
|
float *outv,
|
|
size_t n)
|
|
{
|
|
// Processes four complex values (32 bytes) at a time ,
|
|
// outputting four scalar magnitudes (16 bytes) at a time.
|
|
while ((((uintptr_t) inv) & 0x1F) && n) {
|
|
std::complex<float> v = *inv++;
|
|
*outv++ = std::sqrt(v.real() * v.real() + v.imag() * v.imag());
|
|
n--;
|
|
}
|
|
const __m128 *in = (const __m128 *) inv;
|
|
__m128 *out = (__m128 *) outv;
|
|
while (n >= 4) {
|
|
__m128 aa = *in++; // c0re c0im c1re c1im
|
|
__m128 aa2 = _mm_mul_ps(aa, aa); // c0re^2 c0im^2 c1re^2 c1im^2
|
|
__m128 bb = *in++; // c2re c2im c3re c3im
|
|
__m128 bb2 = _mm_mul_ps(bb, bb); // etc
|
|
// Gather the real parts: x0 x2 y0 y2
|
|
// 10 00 10 00 = 0x88
|
|
__m128 re2 =_mm_shuffle_ps(aa2, bb2, 0x88);
|
|
__m128 im2 =_mm_shuffle_ps(aa2, bb2, 0xdd);
|
|
__m128 mag2 = _mm_add_ps(re2, im2);
|
|
__m128 mag = _mm_sqrt_ps(mag2);
|
|
// Unaligned store
|
|
_mm_storeu_si128((__m128i *)out, (__m128i)mag);
|
|
out++;
|
|
n -= 4;
|
|
}
|
|
inv = (std::complex<float> *) in;
|
|
outv = (float *)out;
|
|
while (n) {
|
|
std::complex<float> v = *inv++;
|
|
*outv++ = std::sqrt(v.real() * v.real() + v.imag() * v.imag());
|
|
n--;
|
|
}
|
|
}
|
|
|
|
// Double-precision version is unoptimized
|
|
|
|
static inline void
|
|
elementwise_product(std::complex<double> *c,
|
|
const std::complex<double> *a,
|
|
const std::complex<double> *b,
|
|
size_t n)
|
|
{
|
|
elementwise_product_naive(c, a, b, n);
|
|
}
|
|
|
|
static inline void
|
|
elementwise_product(std::complex<double> *c,
|
|
const std::complex<double> *a,
|
|
const double *b,
|
|
size_t n)
|
|
{
|
|
elementwise_product_naive(c, a, b, n);
|
|
}
|
|
|
|
template <class T, class U, class V, class S>
|
|
static inline void
|
|
elementwise_product_times_scalar(T *r,
|
|
U *a,
|
|
V *b,
|
|
S s,
|
|
size_t n)
|
|
{
|
|
elementwise_product_times_scalar_naive(r, a, b, s, n);
|
|
}
|
|
|
|
template <class O>
|
|
static inline void
|
|
complex_magnitude(std::complex<double> *inv,
|
|
O *outv,
|
|
size_t n)
|
|
{
|
|
complex_magnitude_naive(inv, outv, n);
|
|
}
|
|
|
|
#else // ! GABORATOR_USE_SSE3_INTRINSICS
|
|
|
|
// Forward to the naive implementations. These are inline functions
|
|
// rather than #defines to avoid namespace pollution.
|
|
|
|
template <class T, class U, class V>
|
|
static inline void
|
|
elementwise_product(T *r,
|
|
U *a,
|
|
V *b,
|
|
size_t n)
|
|
{
|
|
elementwise_product_naive(r, a, b, n);
|
|
}
|
|
|
|
template <class T, class U, class V, class S>
|
|
static inline void
|
|
elementwise_product_times_scalar(T *r,
|
|
U *a,
|
|
V *b,
|
|
S s,
|
|
size_t n)
|
|
{
|
|
elementwise_product_times_scalar_naive(r, a, b, s, n);
|
|
}
|
|
|
|
template <class I, class O>
|
|
static inline void
|
|
complex_magnitude(I *inv,
|
|
O *outv,
|
|
size_t n)
|
|
{
|
|
complex_magnitude_naive(inv, outv, n);
|
|
}
|
|
|
|
#endif // ! USE_SSE3_INTINSICS
|
|
|
|
} // namespace
|
|
|
|
#undef GABORATOR_RESTRICT
|
|
|
|
#endif
|