S.O.N.G/libtta
Archived
3
0
Fork 0
This repository has been archived on 2022-02-26. You can view files and clone it, but cannot push or open issues or pull requests.
libtta/filter.h
2015-02-20 10:41:16 +00:00

210 lines
7 KiB
C

/*
* filter.h
*
* Description: TTA hybrid filter functions
* Copyright (c) 1999-2015 Aleksander Djuric. All rights reserved.
* SSE4 optimization copyright (c) 2008 Kazuki Oikawa
* Distributed under the GNU Lesser General Public License (LGPL).
* The complete text of the license can be found in the COPYING
* file included in the distribution.
*
*/
#ifndef _FILTER_H
#define _FILTER_H
#if defined(CPU_ARM) && defined(ENABLE_ASM) // implements in filter_arm.S
extern int hybrid_filter_dec(TTA_fltst *fs, int *in);
extern int hybrid_filter_enc(TTA_fltst *fs, int *in);
#elif defined(ENABLE_SSE2) || defined(ENABLE_SSE4)
#if defined(ENABLE_SSE4)
#include <smmintrin.h>
#define mullo_epi32(a, b) _mm_mullo_epi32(a, b)
#else // ENABLE_SSE2
#include <emmintrin.h>
#define mullo_epi32(a, b) \
_mm_unpacklo_epi32(_mm_shuffle_epi32(_mm_mul_epu32(a, b), 0xd8), \
_mm_shuffle_epi32(_mm_mul_epu32(_mm_shuffle_epi32(a, 0xb1), \
_mm_shuffle_epi32(b, 0xb1)), 0xd8))
#endif // ENABLE_SSE4
////////////////////////// hybrid_filter_sse4_dec ///////////////////////////
/////////////////////////////////////////////////////////////////////////////
static __inline void hybrid_filter_dec(TTA_fltst *fs, TTAint32 *in) {
register TTAint32 *pA = fs->dl;
register TTAint32 *pB = fs->qm;
register TTAint32 *pM = fs->dx;
register TTAint32 sum = fs->round;
register __m128i xmA1, xmA2, xmB1, xmB2, xmM1, xmM2, xmDP;
xmA1 = _mm_load_si128((__m128i*)pA);
xmA2 = _mm_load_si128((__m128i*)(pA + 4));
xmB1 = _mm_load_si128((__m128i*)pB);
xmB2 = _mm_load_si128((__m128i*)(pB + 4));
xmM1 = _mm_load_si128((__m128i*)pM);
xmM2 = _mm_load_si128((__m128i*)(pM + 4));
if (fs->error < 0) {
xmB1 = _mm_sub_epi32(xmB1, xmM1);
xmB2 = _mm_sub_epi32(xmB2, xmM2);
_mm_store_si128((__m128i*)pB, xmB1);
_mm_store_si128((__m128i*)(pB + 4), xmB2);
} else if (fs->error > 0) {
xmB1 = _mm_add_epi32(xmB1, xmM1);
xmB2 = _mm_add_epi32(xmB2, xmM2);
_mm_store_si128((__m128i*)pB, xmB1);
_mm_store_si128((__m128i*)(pB + 4), xmB2);
}
xmDP = _mm_add_epi32(mullo_epi32(xmA1, xmB1), mullo_epi32(xmA2, xmB2));
xmDP = _mm_add_epi32(xmDP, _mm_unpackhi_epi64(xmDP, xmDP));
sum += _mm_cvtsi128_si32(xmDP) + _mm_cvtsi128_si32(_mm_shuffle_epi32(xmDP, 1));
xmM1 = _mm_or_si128(_mm_srli_si128(xmM1, 4), _mm_slli_si128(xmM2, 12));
xmA1 = _mm_or_si128(_mm_srli_si128(xmA1, 4), _mm_slli_si128(xmA2, 12));
xmM2 = _mm_andnot_si128(_mm_setr_epi32(0, 1, 1, 3),
_mm_or_si128(_mm_srai_epi32(xmA2, 30), _mm_setr_epi32(1, 2, 2, 4)));
_mm_store_si128((__m128i*)pA, xmA1);
_mm_store_si128((__m128i*)pM, xmM1);
_mm_store_si128((__m128i*)(pM + 4), xmM2);
fs->error = *in;
*in += (sum >> fs->shift);
pA[4] = -pA[5]; pA[5] = -pA[6];
pA[6] = *in - pA[7]; pA[7] = *in;
pA[5] += pA[6]; pA[4] += pA[5];
} // hybrid_filter_dec
////////////////////////// hybrid_filter_sse4_enc ///////////////////////////
/////////////////////////////////////////////////////////////////////////////
static __inline void hybrid_filter_enc(TTA_fltst *fs, TTAint32 *in) {
register TTAint32 *pA = fs->dl;
register TTAint32 *pB = fs->qm;
register TTAint32 *pM = fs->dx;
register TTAint32 sum = fs->round;
register __m128i xmA1, xmA2, xmB1, xmB2, xmM1, xmM2, xmDP;
xmA1 = _mm_load_si128((__m128i*)pA);
xmA2 = _mm_load_si128((__m128i*)(pA + 4));
xmB1 = _mm_load_si128((__m128i*)pB);
xmB2 = _mm_load_si128((__m128i*)(pB + 4));
xmM1 = _mm_load_si128((__m128i*)pM);
xmM2 = _mm_load_si128((__m128i*)(pM + 4));
if (fs->error < 0) {
xmB1 = _mm_sub_epi32(xmB1, xmM1);
xmB2 = _mm_sub_epi32(xmB2, xmM2);
_mm_store_si128((__m128i*)pB, xmB1);
_mm_store_si128((__m128i*)(pB + 4), xmB2);
} else if (fs->error > 0) {
xmB1 = _mm_add_epi32(xmB1, xmM1);
xmB2 = _mm_add_epi32(xmB2, xmM2);
_mm_store_si128((__m128i*)pB, xmB1);
_mm_store_si128((__m128i*)(pB + 4), xmB2);
}
xmDP = _mm_add_epi32(mullo_epi32(xmA1, xmB1), mullo_epi32(xmA2, xmB2));
xmDP = _mm_add_epi32(xmDP, _mm_unpackhi_epi64(xmDP, xmDP));
sum += _mm_cvtsi128_si32(xmDP) + _mm_cvtsi128_si32(_mm_shuffle_epi32(xmDP, 1));
xmM1 = _mm_or_si128(_mm_srli_si128(xmM1, 4), _mm_slli_si128(xmM2, 12));
xmA1 = _mm_or_si128(_mm_srli_si128(xmA1, 4), _mm_slli_si128(xmA2, 12));
xmM2 = _mm_andnot_si128(_mm_setr_epi32(0, 1, 1, 3),
_mm_or_si128(_mm_srai_epi32(xmA2, 30), _mm_setr_epi32(1, 2, 2, 4)));
_mm_store_si128((__m128i*)pA, xmA1);
_mm_store_si128((__m128i*)pM, xmM1);
_mm_store_si128((__m128i*)(pM + 4), xmM2);
pA[4] = -pA[5]; pA[5] = -pA[6];
pA[6] = *in - pA[7]; pA[7] = *in;
pA[5] += pA[6]; pA[4] += pA[5];
*in -= (sum >> fs->shift);
fs->error = *in;
} // hybrid_filter_enc
#else // PORTABLE
///////////////////////// hybrid_filter_compat_dec //////////////////////////
/////////////////////////////////////////////////////////////////////////////
static __inline void hybrid_filter_dec(TTA_fltst *fs, TTAint32 *in) {
register TTAint32 *pA = fs->dl;
register TTAint32 *pB = fs->qm;
register TTAint32 *pM = fs->dx;
register TTAint32 sum = fs->round;
if (fs->error < 0) {
pB[0] -= pM[0]; pB[1] -= pM[1]; pB[2] -= pM[2]; pB[3] -= pM[3];
pB[4] -= pM[4]; pB[5] -= pM[5]; pB[6] -= pM[6]; pB[7] -= pM[7];
} else if (fs->error > 0) {
pB[0] += pM[0]; pB[1] += pM[1]; pB[2] += pM[2]; pB[3] += pM[3];
pB[4] += pM[4]; pB[5] += pM[5]; pB[6] += pM[6]; pB[7] += pM[7];
}
sum += pA[0] * pB[0] + pA[1] * pB[1] + pA[2] * pB[2] + pA[3] * pB[3] +
pA[4] * pB[4] + pA[5] * pB[5] + pA[6] * pB[6] + pA[7] * pB[7];
pM[0] = pM[1]; pM[1] = pM[2]; pM[2] = pM[3]; pM[3] = pM[4];
pA[0] = pA[1]; pA[1] = pA[2]; pA[2] = pA[3]; pA[3] = pA[4];
pM[4] = ((pA[4] >> 30) | 1);
pM[5] = ((pA[5] >> 30) | 2) & ~1;
pM[6] = ((pA[6] >> 30) | 2) & ~1;
pM[7] = ((pA[7] >> 30) | 4) & ~3;
fs->error = *in;
*in += (sum >> fs->shift);
pA[4] = -pA[5]; pA[5] = -pA[6];
pA[6] = *in - pA[7]; pA[7] = *in;
pA[5] += pA[6]; pA[4] += pA[5];
} // hybrid_filter_dec
///////////////////////// hybrid_filter_compat_enc //////////////////////////
/////////////////////////////////////////////////////////////////////////////
static __inline void hybrid_filter_enc(TTA_fltst *fs, TTAint32 *in) {
register TTAint32 *pA = fs->dl;
register TTAint32 *pB = fs->qm;
register TTAint32 *pM = fs->dx;
register TTAint32 sum = fs->round;
if (fs->error < 0) {
pB[0] -= pM[0]; pB[1] -= pM[1]; pB[2] -= pM[2]; pB[3] -= pM[3];
pB[4] -= pM[4]; pB[5] -= pM[5]; pB[6] -= pM[6]; pB[7] -= pM[7];
} else if (fs->error > 0) {
pB[0] += pM[0]; pB[1] += pM[1]; pB[2] += pM[2]; pB[3] += pM[3];
pB[4] += pM[4]; pB[5] += pM[5]; pB[6] += pM[6]; pB[7] += pM[7];
}
sum += pA[0] * pB[0] + pA[1] * pB[1] + pA[2] * pB[2] + pA[3] * pB[3] +
pA[4] * pB[4] + pA[5] * pB[5] + pA[6] * pB[6] + pA[7] * pB[7];
pM[0] = pM[1]; pM[1] = pM[2]; pM[2] = pM[3]; pM[3] = pM[4];
pA[0] = pA[1]; pA[1] = pA[2]; pA[2] = pA[3]; pA[3] = pA[4];
pM[4] = ((pA[4] >> 30) | 1);
pM[5] = ((pA[5] >> 30) | 2) & ~1;
pM[6] = ((pA[6] >> 30) | 2) & ~1;
pM[7] = ((pA[7] >> 30) | 4) & ~3;
pA[4] = -pA[5]; pA[5] = -pA[6];
pA[6] = *in - pA[7]; pA[7] = *in;
pA[5] += pA[6]; pA[4] += pA[5];
*in -= (sum >> fs->shift);
fs->error = *in;
} // hybrid_filter_enc
#endif // PORTABLE
#endif // _FILTER_H