Use standard intrinsic types; replace __builtin_ia32_* calls with the equivalent _mm_* intrinsics
This commit is contained in:
parent
cfe987a51d
commit
f86f6d213a
|
@ -47,12 +47,12 @@ static inline
|
|||
std::complex<double> complex_mul(std::complex<double> a_,
|
||||
std::complex<double> b_)
|
||||
{
|
||||
__v2df a = _mm_setr_pd(a_.real(), a_.imag());
|
||||
__v2df b = _mm_setr_pd(b_.real(), b_.imag());
|
||||
__v2df as = (__v2df) _mm_shuffle_pd(a, a, 0x1);
|
||||
__v2df t0 = _mm_mul_pd(a, _mm_shuffle_pd(b, b, 0x0));
|
||||
__v2df t1 = _mm_mul_pd(as, _mm_shuffle_pd(b, b, 0x3));
|
||||
__v2df c = __builtin_ia32_addsubpd(t0, t1); // SSE3
|
||||
__m128d a = _mm_setr_pd(a_.real(), a_.imag());
|
||||
__m128d b = _mm_setr_pd(b_.real(), b_.imag());
|
||||
__m128d as = (__m128d) _mm_shuffle_pd(a, a, 0x1);
|
||||
__m128d t0 = _mm_mul_pd(a, _mm_shuffle_pd(b, b, 0x0));
|
||||
__m128d t1 = _mm_mul_pd(as, _mm_shuffle_pd(b, b, 0x3));
|
||||
__m128d c = _mm_addsub_pd(t0, t1); // SSE3
|
||||
return std::complex<double>(c[0], c[1]);
|
||||
}
|
||||
#else
|
||||
|
@ -113,11 +113,11 @@ complex_magnitude_naive(I *GABORATOR_RESTRICT inv,
|
|||
// Perform two complex float multiplies in parallel
|
||||
|
||||
static inline
|
||||
__v4sf complex_mul_vec2(__v4sf aa, __v4sf bb) {
|
||||
__v4sf aas =_mm_shuffle_ps(aa, aa, 0xb1);
|
||||
__v4sf t0 = _mm_mul_ps(aa, _mm_moveldup_ps(bb));
|
||||
__v4sf t1 = _mm_mul_ps(aas, _mm_movehdup_ps(bb));
|
||||
return __builtin_ia32_addsubps(t0, t1); // SSE3
|
||||
__m128 complex_mul_vec2(__m128 aa, __m128 bb) {
|
||||
__m128 aas =_mm_shuffle_ps(aa, aa, 0xb1);
|
||||
__m128 t0 = _mm_mul_ps(aa, _mm_moveldup_ps(bb));
|
||||
__m128 t1 = _mm_mul_ps(aas, _mm_movehdup_ps(bb));
|
||||
return _mm_addsub_ps(t0, t1); // SSE3
|
||||
}
|
||||
|
||||
// Calculate the elementwise product of a complex float vector
|
||||
|
@ -131,12 +131,12 @@ elementwise_product(std::complex<float> *cv,
|
|||
{
|
||||
assert((n & 1) == 0);
|
||||
n >>= 1;
|
||||
__v4sf *c = (__v4sf *) cv;
|
||||
const __v4sf *a = (const __v4sf *) av;
|
||||
const __v4sf *b = (const __v4sf *) bv;
|
||||
__m128 *c = (__m128 *) cv;
|
||||
const __m128 *a = (const __m128 *) av;
|
||||
const __m128 *b = (const __m128 *) bv;
|
||||
while (n--) {
|
||||
__v4sf aa = *a++;
|
||||
__v4sf bb = *b++;
|
||||
__m128 aa = *a++;
|
||||
__m128 bb = *b++;
|
||||
*c++ = complex_mul_vec2(aa, bb);
|
||||
}
|
||||
}
|
||||
|
@ -154,13 +154,13 @@ elementwise_product(std::complex<float> *cv,
|
|||
{
|
||||
assert((n & 3) == 0);
|
||||
n >>= 2;
|
||||
__v4sf *c = (__v4sf *) cv;
|
||||
const __v4sf *a = (const __v4sf *) av;
|
||||
const __v4sf *b = (const __v4sf *) bv;
|
||||
__m128 *c = (__m128 *) cv;
|
||||
const __m128 *a = (const __m128 *) av;
|
||||
const __m128 *b = (const __m128 *) bv;
|
||||
while (n--) {
|
||||
__v4sf a0 = (__v4sf) _mm_loadu_si128((const __m128i *) a++);
|
||||
__v4sf a1 = (__v4sf) _mm_loadu_si128((const __m128i *) a++);
|
||||
__v4sf bb = *b++;
|
||||
__m128 a0 = (__m128) _mm_loadu_si128((const __m128i *) a++);
|
||||
__m128 a1 = (__m128) _mm_loadu_si128((const __m128i *) a++);
|
||||
__m128 bb = *b++;
|
||||
*c++ = _mm_mul_ps(a0, _mm_unpacklo_ps(bb, bb));
|
||||
*c++ = _mm_mul_ps(a1, _mm_unpackhi_ps(bb, bb));
|
||||
}
|
||||
|
@ -175,13 +175,13 @@ elementwise_product_times_scalar(std::complex<float> *cv,
|
|||
{
|
||||
assert((n & 1) == 0);
|
||||
n >>= 1;
|
||||
const __v4sf *a = (const __v4sf *) av;
|
||||
const __v4sf *b = (const __v4sf *) bv;
|
||||
const __v4sf dd = (__v4sf) { d.real(), d.imag(), d.real(), d.imag() };
|
||||
__v4sf *c = (__v4sf *) cv;
|
||||
const __m128 *a = (const __m128 *) av;
|
||||
const __m128 *b = (const __m128 *) bv;
|
||||
const __m128 dd = (__m128) { d.real(), d.imag(), d.real(), d.imag() };
|
||||
__m128 *c = (__m128 *) cv;
|
||||
while (n--) {
|
||||
__v4sf aa = *a++;
|
||||
__v4sf bb = *b++;
|
||||
__m128 aa = *a++;
|
||||
__m128 bb = *b++;
|
||||
*c++ = complex_mul_vec2(complex_mul_vec2(aa, bb), dd);
|
||||
}
|
||||
}
|
||||
|
@ -200,19 +200,19 @@ complex_magnitude(std::complex<float> *inv,
|
|||
*outv++ = std::sqrt(v.real() * v.real() + v.imag() * v.imag());
|
||||
n--;
|
||||
}
|
||||
const __v4sf *in = (const __v4sf *) inv;
|
||||
__v4sf *out = (__v4sf *) outv;
|
||||
const __m128 *in = (const __m128 *) inv;
|
||||
__m128 *out = (__m128 *) outv;
|
||||
while (n >= 4) {
|
||||
__v4sf aa = *in++; // c0re c0im c1re c1im
|
||||
__v4sf aa2 = _mm_mul_ps(aa, aa); // c0re^2 c0im^2 c1re^2 c1im^2
|
||||
__v4sf bb = *in++; // c2re c2im c3re c3im
|
||||
__v4sf bb2 = _mm_mul_ps(bb, bb); // etc
|
||||
__m128 aa = *in++; // c0re c0im c1re c1im
|
||||
__m128 aa2 = _mm_mul_ps(aa, aa); // c0re^2 c0im^2 c1re^2 c1im^2
|
||||
__m128 bb = *in++; // c2re c2im c3re c3im
|
||||
__m128 bb2 = _mm_mul_ps(bb, bb); // etc
|
||||
// Gather the real parts: x0 x2 y0 y2
|
||||
// 10 00 10 00 = 0x88
|
||||
__v4sf re2 =_mm_shuffle_ps(aa2, bb2, 0x88);
|
||||
__v4sf im2 =_mm_shuffle_ps(aa2, bb2, 0xdd);
|
||||
__v4sf mag2 = _mm_add_ps(re2, im2);
|
||||
__v4sf mag = __builtin_ia32_sqrtps(mag2);
|
||||
__m128 re2 =_mm_shuffle_ps(aa2, bb2, 0x88);
|
||||
__m128 im2 =_mm_shuffle_ps(aa2, bb2, 0xdd);
|
||||
__m128 mag2 = _mm_add_ps(re2, im2);
|
||||
__m128 mag = _mm_sqrt_ps(mag2);
|
||||
// Unaligned store
|
||||
_mm_storeu_si128((__m128i *)out, (__m128i)mag);
|
||||
out++;
|
||||
|
|
Loading…
Reference in a new issue