audio-resampler: Improve taps memory layout

Rearrange the oversampled taps in memory to make it easier to use
SIMD instructions on them. This simplifies some SSE code.
Add some more optimizations.
This commit is contained in:
Wim Taymans 2016-02-11 11:57:26 +01:00
parent e9fc039bb1
commit f6e0481ab5
2 changed files with 245 additions and 80 deletions

View file

@ -45,23 +45,15 @@ inner_product_gfloat_linear_1_sse (gfloat * o, const gfloat * a,
const gfloat * b, gint len, const gfloat * icoeff, gint oversample) const gfloat * b, gint len, const gfloat * icoeff, gint oversample)
{ {
gint i = 0; gint i = 0;
__m128 sum = _mm_setzero_ps (), t, b0; __m128 sum = _mm_setzero_ps (), t;
__m128 f = _mm_loadu_ps(icoeff); __m128 f = _mm_loadu_ps(icoeff);
for (; i < len; i += 4) { for (; i < len; i += 4) {
t = _mm_loadu_ps (a + i); t = _mm_loadu_ps (a + i);
sum = _mm_add_ps (sum, _mm_mul_ps (_mm_unpacklo_ps (t, t),
b0 = _mm_loadh_pi (b0, (__m64 *) (b + (i+0)*oversample)); _mm_load_ps (b + 2 * (i + 0))));
b0 = _mm_loadl_pi (b0, (__m64 *) (b + (i+1)*oversample)); sum = _mm_add_ps (sum, _mm_mul_ps (_mm_unpackhi_ps (t, t),
_mm_load_ps (b + 2 * (i + 2))));
sum =
_mm_add_ps (sum, _mm_mul_ps (_mm_unpacklo_ps (t, t), b0));
b0 = _mm_loadh_pi (b0, (__m64 *) (b + (i+2)*oversample));
b0 = _mm_loadl_pi (b0, (__m64 *) (b + (i+3)*oversample));
sum =
_mm_add_ps (sum, _mm_mul_ps (_mm_unpackhi_ps (t, t), b0));
} }
sum = _mm_mul_ps (sum, f); sum = _mm_mul_ps (sum, f);
sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum)); sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum));
@ -79,9 +71,9 @@ inner_product_gfloat_cubic_1_sse (gfloat * o, const gfloat * a,
for (; i < len; i += 2) { for (; i < len; i += 2) {
sum = _mm_add_ps (sum, _mm_mul_ps (_mm_load1_ps (a + i + 0), sum = _mm_add_ps (sum, _mm_mul_ps (_mm_load1_ps (a + i + 0),
_mm_loadu_ps (b + (i + 0) * oversample))); _mm_load_ps (b + 4 * (i + 0))));
sum = _mm_add_ps (sum, _mm_mul_ps (_mm_load1_ps (a + i + 1), sum = _mm_add_ps (sum, _mm_mul_ps (_mm_load1_ps (a + i + 1),
_mm_loadu_ps (b + (i + 1) * oversample))); _mm_load_ps (b + 4 * (i + 1))));
} }
sum = _mm_mul_ps (sum, f); sum = _mm_mul_ps (sum, f);
sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum)); sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum));
@ -118,9 +110,10 @@ inner_product_gfloat_none_2_sse (gfloat * o, const gfloat * a,
} }
MAKE_RESAMPLE_FUNC (gfloat, none, 1, sse); MAKE_RESAMPLE_FUNC (gfloat, none, 1, sse);
MAKE_RESAMPLE_FUNC (gfloat, none, 2, sse);
MAKE_RESAMPLE_FUNC (gfloat, linear, 1, sse); MAKE_RESAMPLE_FUNC (gfloat, linear, 1, sse);
MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, sse); MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, sse);
MAKE_RESAMPLE_FUNC (gfloat, none, 2, sse);
#endif #endif
#if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__) #if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__)
@ -154,6 +147,94 @@ inner_product_gint16_none_1_sse2 (gint16 * o, const gint16 * a,
*o = _mm_extract_epi16 (sum, 0); *o = _mm_extract_epi16 (sum, 0);
} }
/*
 * SSE2 inner product for one channel of gint16 samples using linearly
 * interpolated filter taps.
 *
 * @o:          receives the single gint16 result
 * @a:          input samples; read 8 at a time with an unaligned load
 * @b:          taps, two gint16 values per input sample (b[2 * i] and
 *              b[2 * i + 1]); read with aligned 128-bit loads, so @b must be
 *              16-byte aligned
 * @len:        number of input samples; consumed in steps of 8, the loop has
 *              no scalar tail
 * @icoeff:     four gint16 interpolation weights, one per 32-bit accumulator
 *              lane
 * @oversample: unused by this implementation
 */
static inline void
inner_product_gint16_linear_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint oversample)
{
  gint i = 0;
  __m128i sum, t, ta, tb, m1, m2;
  /* Load the four 16-bit weights into the low 64 bits (upper half zeroed).
   * _mm_loadl_epi64 is used instead of _mm_cvtsi64_si128 on a dereferenced
   * (long long *): that intrinsic only exists on x86-64 targets, and the
   * pointer cast read gint16 storage through an incompatible type. */
  __m128i f = _mm_loadl_epi64 ((const __m128i *) icoeff);

  sum = _mm_setzero_si128 ();
  /* interleave with zero (sum is still all-zero here): every 32-bit lane now
   * holds one weight in its low 16 bits and 0 in its high 16 bits */
  f = _mm_unpacklo_epi16 (f, sum);

  for (; i < len; i += 8) {
    t = _mm_loadu_si128 ((const __m128i *) (a + i));

    /* samples 0..3, each duplicated so it lines up with its pair of taps */
    ta = _mm_unpacklo_epi16 (t, t);
    tb = _mm_load_si128 ((const __m128i *) (b + 2 * i + 0));
    /* high and low halves of the 16x16 products, recombined below into
     * full 32-bit products */
    m1 = _mm_mulhi_epi16 (ta, tb);
    m2 = _mm_mullo_epi16 (ta, tb);
    sum = _mm_add_epi32 (sum, _mm_unpacklo_epi16 (m2, m1));
    sum = _mm_add_epi32 (sum, _mm_unpackhi_epi16 (m2, m1));

    /* samples 4..7, same scheme with the next 8 taps */
    ta = _mm_unpackhi_epi16 (t, t);
    tb = _mm_load_si128 ((const __m128i *) (b + 2 * i + 8));
    m1 = _mm_mulhi_epi16 (ta, tb);
    m2 = _mm_mullo_epi16 (ta, tb);
    sum = _mm_add_epi32 (sum, _mm_unpacklo_epi16 (m2, m1));
    sum = _mm_add_epi32 (sum, _mm_unpackhi_epi16 (m2, m1));
  }

  /* scale the accumulators down, then weight them: madd multiplies the low
   * 16 bits of every lane by its weight and the high 16 bits by zero */
  sum = _mm_srai_epi32 (sum, PRECISION_S16);
  sum = _mm_madd_epi16 (sum, f);

  /* horizontal add of the four lanes into lane 0 */
  sum =
      _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, 3)));
  sum =
      _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1, 1)));

  /* round, scale back, saturate to 16 bits and store lane 0 */
  sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum = _mm_srai_epi32 (sum, PRECISION_S16);
  sum = _mm_packs_epi32 (sum, sum);
  *o = _mm_extract_epi16 (sum, 0);
}
/*
 * SSE2 inner product for one channel of gint16 samples using cubically
 * interpolated filter taps.
 *
 * @o:          receives the single gint16 result
 * @a:          input samples; two are consumed per iteration
 * @b:          taps, four gint16 values per input sample (b[4 * i] ..
 *              b[4 * i + 3]); read with aligned 128-bit loads, so @b must be
 *              16-byte aligned
 * @len:        number of input samples; consumed in steps of 2, the loop has
 *              no scalar tail
 * @icoeff:     four gint16 interpolation weights, one per 32-bit accumulator
 *              lane
 * @oversample: unused by this implementation
 */
static inline void
inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint oversample)
{
  gint i = 0;
  __m128i sum, ta, tb, m1, m2;
  /* Load the four 16-bit weights into the low 64 bits (upper half zeroed).
   * _mm_loadl_epi64 is used instead of _mm_cvtsi64_si128 on a dereferenced
   * (long long *): that intrinsic only exists on x86-64 targets, and the
   * pointer cast read gint16 storage through an incompatible type. */
  __m128i f = _mm_loadl_epi64 ((const __m128i *) icoeff);

  sum = _mm_setzero_si128 ();
  /* interleave with zero (sum is still all-zero here): every 32-bit lane now
   * holds one weight in its low 16 bits and 0 in its high 16 bits */
  f = _mm_unpacklo_epi16 (f, sum);

  for (; i < len; i += 2) {
    /* fetch two adjacent samples with one 32-bit read.
     * NOTE(review): this reads gint16 data through a gint32 pointer; kept
     * as-is to preserve the generated load, consider a memcpy-based load. */
    ta = _mm_cvtsi32_si128 (*(gint32 *) (a + i));
    /* two unpacks broadcast sample 0 to the low four 16-bit slots and
     * sample 1 to the high four, matching the 4 taps stored per sample */
    ta = _mm_unpacklo_epi16 (ta, ta);
    ta = _mm_unpacklo_epi16 (ta, ta);
    tb = _mm_load_si128 ((const __m128i *) (b + 4 * i + 0));
    /* high and low halves of the 16x16 products, recombined below into
     * full 32-bit products */
    m1 = _mm_mulhi_epi16 (ta, tb);
    m2 = _mm_mullo_epi16 (ta, tb);
    sum = _mm_add_epi32 (sum, _mm_unpacklo_epi16 (m2, m1));
    sum = _mm_add_epi32 (sum, _mm_unpackhi_epi16 (m2, m1));
  }

  /* scale the accumulators down, then weight them: madd multiplies the low
   * 16 bits of every lane by its weight and the high 16 bits by zero */
  sum = _mm_srai_epi32 (sum, PRECISION_S16);
  sum = _mm_madd_epi16 (sum, f);

  /* horizontal add of the four lanes into lane 0 */
  sum =
      _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, 3)));
  sum =
      _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1, 1)));

  /* round, scale back, saturate to 16 bits and store lane 0 */
  sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum = _mm_srai_epi32 (sum, PRECISION_S16);
  sum = _mm_packs_epi32 (sum, sum);
  *o = _mm_extract_epi16 (sum, 0);
}
static inline void static inline void
inner_product_gdouble_none_1_sse2 (gdouble * o, const gdouble * a, inner_product_gdouble_none_1_sse2 (gdouble * o, const gdouble * a,
const gdouble * b, gint len, const gdouble * icoeff, gint oversample) const gdouble * b, gint len, const gdouble * icoeff, gint oversample)
@ -179,6 +260,51 @@ inner_product_gdouble_none_1_sse2 (gdouble * o, const gdouble * a,
_mm_store_sd (o, sum); _mm_store_sd (o, sum);
} }
/*
 * SSE2 inner product for one channel of gdouble samples using linearly
 * interpolated filter taps.
 *
 * @o:          receives the single gdouble result
 * @a:          input samples, four consumed per iteration
 * @b:          taps, two gdouble values per input sample; read with aligned
 *              128-bit loads, so @b must be 16-byte aligned
 * @len:        number of input samples; consumed in steps of 4, the loop has
 *              no scalar tail
 * @icoeff:     interpolation weights; the first two are read (unaligned)
 * @oversample: unused by this implementation
 */
static inline void
inner_product_gdouble_linear_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint oversample)
{
  gint n;
  __m128d acc = _mm_setzero_pd ();
  const __m128d weights = _mm_loadu_pd (icoeff);
  __m128d scaled;

  for (n = 0; n < len; n += 4) {
    const gdouble *in = a + n;
    const gdouble *taps = b + 2 * n;

    /* every sample is broadcast to both lanes and multiplied by its own
     * pair of taps; the accumulation order is kept strictly sequential */
    acc = _mm_add_pd (acc, _mm_mul_pd (_mm_load1_pd (in), _mm_load_pd (taps)));
    acc = _mm_add_pd (acc,
        _mm_mul_pd (_mm_load1_pd (in + 1), _mm_load_pd (taps + 2)));
    acc = _mm_add_pd (acc,
        _mm_mul_pd (_mm_load1_pd (in + 2), _mm_load_pd (taps + 4)));
    acc = _mm_add_pd (acc,
        _mm_mul_pd (_mm_load1_pd (in + 3), _mm_load_pd (taps + 6)));
  }

  /* weight both lanes, then fold the high lane into the low one */
  scaled = _mm_mul_pd (acc, weights);
  scaled = _mm_add_sd (scaled, _mm_unpackhi_pd (scaled, scaled));
  _mm_store_sd (o, scaled);
}
/*
 * SSE2 inner product for one channel of gdouble samples using cubically
 * interpolated filter taps.
 *
 * @o:          receives the single gdouble result
 * @a:          input samples, two consumed per iteration
 * @b:          taps, four gdouble values per input sample; read with aligned
 *              128-bit loads, so @b must be 16-byte aligned
 * @len:        number of input samples; consumed in steps of 2, the loop has
 *              no scalar tail
 * @icoeff:     four interpolation weights (read unaligned)
 * @oversample: unused by this implementation
 */
static inline void
inner_product_gdouble_cubic_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint oversample)
{
  gint n;
  /* acc_lo collects taps 0/1 of every sample, acc_hi collects taps 2/3 */
  __m128d acc_lo = _mm_setzero_pd ();
  __m128d acc_hi = _mm_setzero_pd ();
  const __m128d w_lo = _mm_loadu_pd (icoeff);
  const __m128d w_hi = _mm_loadu_pd (icoeff + 2);
  __m128d total;

  for (n = 0; n < len; n += 2) {
    const gdouble *taps = b + 4 * n;
    const __m128d s0 = _mm_load1_pd (a + n);
    const __m128d s1 = _mm_load1_pd (a + n + 1);

    acc_lo = _mm_add_pd (acc_lo, _mm_mul_pd (s0, _mm_load_pd (taps)));
    acc_hi = _mm_add_pd (acc_hi, _mm_mul_pd (s0, _mm_load_pd (taps + 2)));
    acc_lo = _mm_add_pd (acc_lo, _mm_mul_pd (s1, _mm_load_pd (taps + 4)));
    acc_hi = _mm_add_pd (acc_hi, _mm_mul_pd (s1, _mm_load_pd (taps + 6)));
  }

  /* weight each accumulator pair, merge them, then fold the high lane into
   * the low one */
  total = _mm_add_pd (_mm_mul_pd (acc_lo, w_lo), _mm_mul_pd (acc_hi, w_hi));
  total = _mm_add_sd (total, _mm_unpackhi_pd (total, total));
  _mm_store_sd (o, total);
}
static inline void static inline void
inner_product_gint16_none_2_sse2 (gint16 * o, const gint16 * a, inner_product_gint16_none_2_sse2 (gint16 * o, const gint16 * a,
const gint16 * b, gint len, const gint16 * icoeff, gint oversample) const gint16 * b, gint len, const gint16 * icoeff, gint oversample)
@ -239,9 +365,16 @@ inner_product_gdouble_none_2_sse2 (gdouble * o, const gdouble * a,
} }
MAKE_RESAMPLE_FUNC (gint16, none, 1, sse2); MAKE_RESAMPLE_FUNC (gint16, none, 1, sse2);
MAKE_RESAMPLE_FUNC (gint16, linear, 1, sse2);
MAKE_RESAMPLE_FUNC (gint16, cubic, 1, sse2);
MAKE_RESAMPLE_FUNC (gdouble, none, 1, sse2); MAKE_RESAMPLE_FUNC (gdouble, none, 1, sse2);
MAKE_RESAMPLE_FUNC (gdouble, linear, 1, sse2);
MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, sse2);
MAKE_RESAMPLE_FUNC (gint16, none, 2, sse2); MAKE_RESAMPLE_FUNC (gint16, none, 2, sse2);
MAKE_RESAMPLE_FUNC (gdouble, none, 2, sse2); MAKE_RESAMPLE_FUNC (gdouble, none, 2, sse2);
#endif #endif
#if defined (HAVE_SMMINTRIN_H) && defined(__SSE4_1__) #if defined (HAVE_SMMINTRIN_H) && defined(__SSE4_1__)
@ -295,21 +428,29 @@ audio_resampler_check_x86 (const gchar *option)
#if defined (HAVE_XMMINTRIN_H) && defined(__SSE__) #if defined (HAVE_XMMINTRIN_H) && defined(__SSE__)
GST_DEBUG ("enable SSE optimisations"); GST_DEBUG ("enable SSE optimisations");
resample_gfloat_none_1 = resample_gfloat_none_1_sse; resample_gfloat_none_1 = resample_gfloat_none_1_sse;
resample_gfloat_none_2 = resample_gfloat_none_2_sse;
resample_gfloat_linear_1 = resample_gfloat_linear_1_sse; resample_gfloat_linear_1 = resample_gfloat_linear_1_sse;
resample_gfloat_cubic_1 = resample_gfloat_cubic_1_sse; resample_gfloat_cubic_1 = resample_gfloat_cubic_1_sse;
resample_gfloat_none_2 = resample_gfloat_none_2_sse;
#endif #endif
} else if (!strcmp (option, "sse2")) { } else if (!strcmp (option, "sse2")) {
#if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__) #if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__)
GST_DEBUG ("enable SSE2 optimisations"); GST_DEBUG ("enable SSE2 optimisations");
resample_gint16_none_1 = resample_gint16_none_1_sse2; resample_gint16_none_1 = resample_gint16_none_1_sse2;
resample_gint16_linear_1 = resample_gint16_linear_1_sse2;
resample_gint16_cubic_1 = resample_gint16_cubic_1_sse2;
resample_gfloat_none_1 = resample_gfloat_none_1_sse; resample_gfloat_none_1 = resample_gfloat_none_1_sse;
resample_gfloat_none_2 = resample_gfloat_none_2_sse;
resample_gdouble_none_1 = resample_gdouble_none_1_sse2;
resample_gint16_none_2 = resample_gint16_none_2_sse2;
resample_gdouble_none_2 = resample_gdouble_none_2_sse2;
resample_gfloat_linear_1 = resample_gfloat_linear_1_sse; resample_gfloat_linear_1 = resample_gfloat_linear_1_sse;
resample_gfloat_cubic_1 = resample_gfloat_cubic_1_sse; resample_gfloat_cubic_1 = resample_gfloat_cubic_1_sse;
resample_gdouble_none_1 = resample_gdouble_none_1_sse2;
resample_gdouble_linear_1 = resample_gdouble_linear_1_sse2;
resample_gdouble_cubic_1 = resample_gdouble_cubic_1_sse2;
resample_gint16_none_2 = resample_gint16_none_2_sse2;
resample_gfloat_none_2 = resample_gfloat_none_2_sse;
resample_gdouble_none_2 = resample_gdouble_none_2_sse2;
#endif #endif
} else if (!strcmp (option, "sse41")) { } else if (!strcmp (option, "sse41")) {
#if defined (HAVE_SMMINTRIN_H) && defined(__SSE4_1__) #if defined (HAVE_SMMINTRIN_H) && defined(__SSE4_1__)

View file

@ -390,6 +390,27 @@ MAKE_CONVERT_TAPS_INT_FUNC (gint32, PRECISION_S32);
MAKE_CONVERT_TAPS_FLOAT_FUNC (gfloat); MAKE_CONVERT_TAPS_FLOAT_FUNC (gfloat);
MAKE_CONVERT_TAPS_FLOAT_FUNC (gdouble); MAKE_CONVERT_TAPS_FLOAT_FUNC (gdouble);
/* Generates extract_taps_<type>(): rearranges the flat array @tmpcoeff into
 * one row per oversampling phase inside resampler->coeff.
 *
 * Row `phase` starts resampler->cstride bytes after the previous one. For
 * every tap index `tap` it receives @mult consecutive source values starting
 * at tmpcoeff[phase + tap * oversample], stored back to back, so a row holds
 * n_taps groups of @mult values. */
#define MAKE_EXTRACT_TAPS_FUNC(type)                                    \
static inline void                                                      \
extract_taps_##type (GstAudioResampler * resampler, type *tmpcoeff,     \
    gint n_taps, gint oversample, gint mult)                            \
{                                                                       \
  gint phase;                                                           \
                                                                        \
  for (phase = 0; phase < oversample; phase++) {                        \
    type *row = (type *) ((gint8 *) resampler->coeff +                  \
        phase * resampler->cstride);                                    \
    gint tap;                                                           \
                                                                        \
    for (tap = 0; tap < n_taps; tap++) {                                \
      const type *src = tmpcoeff + phase + tap * oversample;            \
      gint k;                                                           \
                                                                        \
      for (k = 0; k < mult; k++)                                        \
        row[tap * mult + k] = src[k];                                   \
    }                                                                   \
  }                                                                     \
}
MAKE_EXTRACT_TAPS_FUNC (gint16);
MAKE_EXTRACT_TAPS_FUNC (gint32);
MAKE_EXTRACT_TAPS_FUNC (gfloat);
MAKE_EXTRACT_TAPS_FUNC (gdouble);
#define GET_TAPS_NONE_FUNC(type) \ #define GET_TAPS_NONE_FUNC(type) \
static inline gpointer \ static inline gpointer \
get_taps_##type##_none (GstAudioResampler * resampler, \ get_taps_##type##_none (GstAudioResampler * resampler, \
@ -421,12 +442,19 @@ get_taps_##type##_none (GstAudioResampler * resampler,
} \ } \
return res; \ return res; \
} }
GET_TAPS_NONE_FUNC (gint16); GET_TAPS_NONE_FUNC (gint16);
GET_TAPS_NONE_FUNC (gint32); GET_TAPS_NONE_FUNC (gint32);
GET_TAPS_NONE_FUNC (gfloat); GET_TAPS_NONE_FUNC (gfloat);
GET_TAPS_NONE_FUNC (gdouble); GET_TAPS_NONE_FUNC (gdouble);
/* Generates make_coeff_<type>_linear(): fixed-point weights for linear
 * interpolation between two adjacent filter phases.
 *
 * x = (frac << prec) / out_rate is the fractional position in fixed point
 * with @prec fractional bits. icoeff[0] and icoeff[2] receive x, icoeff[1]
 * and icoeff[3] receive (1 << prec) - x.
 *
 * The constant 1 is widened to @type2 before shifting. A plain `1L << prec`
 * is evaluated in `long`, which is undefined once @prec reaches the width of
 * a 32-bit `long` minus one (assumes PRECISION_S32 can be that large - it is
 * defined outside this view). */
#define MAKE_COEFF_LINEAR_INT_FUNC(type,type2,prec)                     \
static inline void                                                      \
make_coeff_##type##_linear (gint frac, gint out_rate, type *icoeff)     \
{                                                                       \
  type x = ((type2)frac << prec) / out_rate;                            \
  icoeff[0] = icoeff[2] = x;                                            \
  icoeff[1] = icoeff[3] = ((type2) 1 << (prec)) - x;                    \
}
#define MAKE_COEFF_LINEAR_FLOAT_FUNC(type) \ #define MAKE_COEFF_LINEAR_FLOAT_FUNC(type) \
static inline void \ static inline void \
make_coeff_##type##_linear (gint frac, gint out_rate, type *icoeff) \ make_coeff_##type##_linear (gint frac, gint out_rate, type *icoeff) \
@ -435,30 +463,11 @@ make_coeff_##type##_linear (gint frac, gint out_rate, type *icoeff) \
icoeff[0] = icoeff[2] = x; \ icoeff[0] = icoeff[2] = x; \
icoeff[1] = icoeff[3] = 1.0 - x; \ icoeff[1] = icoeff[3] = 1.0 - x; \
} }
#define MAKE_COEFF_LINEAR_INT_FUNC(type,type2,prec) \
static inline void \
make_coeff_##type##_linear (gint frac, gint out_rate, type *icoeff) \
{ \
type x = ((type2)frac << prec) / out_rate; \
icoeff[0] = icoeff[2] = x; \
icoeff[1] = icoeff[3] = (1 << prec) - x; \
}
MAKE_COEFF_LINEAR_INT_FUNC (gint16, gint32, PRECISION_S16); MAKE_COEFF_LINEAR_INT_FUNC (gint16, gint32, PRECISION_S16);
MAKE_COEFF_LINEAR_INT_FUNC (gint32, gint64, PRECISION_S32); MAKE_COEFF_LINEAR_INT_FUNC (gint32, gint64, PRECISION_S32);
MAKE_COEFF_LINEAR_FLOAT_FUNC (gfloat); MAKE_COEFF_LINEAR_FLOAT_FUNC (gfloat);
MAKE_COEFF_LINEAR_FLOAT_FUNC (gdouble); MAKE_COEFF_LINEAR_FLOAT_FUNC (gdouble);
#define MAKE_COEFF_CUBIC_FLOAT_FUNC(type) \
static inline void \
make_coeff_##type##_cubic (gint frac, gint out_rate, type *icoeff) \
{ \
type x = (type) frac / out_rate, x2 = x * x, x3 = x2 * x; \
icoeff[0] = 0.16667f * (x3 - x); \
icoeff[1] = x + 0.5f * (x2 - x3); \
icoeff[3] = -0.33333f * x + 0.5f * x2 - 0.16667f * x3; \
icoeff[2] = 1. - icoeff[0] - icoeff[1] - icoeff[3]; \
}
#define MAKE_COEFF_CUBIC_INT_FUNC(type,type2,prec) \ #define MAKE_COEFF_CUBIC_INT_FUNC(type,type2,prec) \
static inline void \ static inline void \
make_coeff_##type##_cubic (gint frac, gint out_rate, type *icoeff) \ make_coeff_##type##_cubic (gint frac, gint out_rate, type *icoeff) \
@ -473,7 +482,16 @@ make_coeff_##type##_cubic (gint frac, gint out_rate, type *icoeff) \
(x2 >> 1) - ((((type2) x3 << prec) / 6) >> prec); \ (x2 >> 1) - ((((type2) x3 << prec) / 6) >> prec); \
icoeff[2] = one - icoeff[0] - icoeff[1] - icoeff[3]; \ icoeff[2] = one - icoeff[0] - icoeff[1] - icoeff[3]; \
} }
/* Generates make_coeff_<type>_cubic(): four floating-point weights for cubic
 * interpolation at the fractional position t = frac / out_rate.
 *
 * icoeff[0], icoeff[1] and icoeff[3] are cubic polynomials in t; icoeff[2]
 * is computed last as 1 minus the other three. */
#define MAKE_COEFF_CUBIC_FLOAT_FUNC(type)                               \
static inline void                                                      \
make_coeff_##type##_cubic (gint frac, gint out_rate, type *icoeff)      \
{                                                                       \
  type t = (type) frac / out_rate;                                      \
  type t2 = t * t;                                                      \
  type t3 = t2 * t;                                                     \
                                                                        \
  icoeff[0] = 0.16667f * (t3 - t);                                      \
  icoeff[1] = t + 0.5f * (t2 - t3);                                     \
  icoeff[3] = -0.33333f * t + 0.5f * t2 - 0.16667f * t3;                \
  icoeff[2] = 1. - icoeff[0] - icoeff[1] - icoeff[3];                   \
}
MAKE_COEFF_CUBIC_INT_FUNC (gint16, gint32, PRECISION_S16); MAKE_COEFF_CUBIC_INT_FUNC (gint16, gint32, PRECISION_S16);
MAKE_COEFF_CUBIC_INT_FUNC (gint32, gint64, PRECISION_S32); MAKE_COEFF_CUBIC_INT_FUNC (gint32, gint64, PRECISION_S32);
MAKE_COEFF_CUBIC_FLOAT_FUNC (gfloat); MAKE_COEFF_CUBIC_FLOAT_FUNC (gfloat);
@ -488,12 +506,13 @@ get_taps_##type##_##inter (GstAudioResampler * resampler, \
gint out_rate = resampler->out_rate; \ gint out_rate = resampler->out_rate; \
gint offset, frac, pos; \ gint offset, frac, pos; \
gint oversample = resampler->oversample; \ gint oversample = resampler->oversample; \
gint cstride = resampler->cstride; \
\ \
pos = *samp_phase * oversample; \ pos = *samp_phase * oversample; \
offset = (oversample - 1) - (pos / out_rate); \ offset = (oversample - 1) - (pos / out_rate); \
frac = pos % out_rate; \ frac = pos % out_rate; \
\ \
res = (type *)resampler->coeff + offset; \ res = (gint8 *) resampler->coeff + offset * cstride; \
make_coeff_##type##_##inter (frac, out_rate, icoeff); \ make_coeff_##type##_##inter (frac, out_rate, icoeff); \
\ \
*samp_index += resampler->samp_inc; \ *samp_index += resampler->samp_inc; \
@ -526,7 +545,7 @@ inner_product_##type##_none_1_c (type * o, const type * a, \
for (i = 0; i < len; i++) \ for (i = 0; i < len; i++) \
res += (type2) a[i] * (type2) b[i]; \ res += (type2) a[i] * (type2) b[i]; \
\ \
res = (res + (1 << ((prec) - 1))) >> (prec); \ res = (res + (1L << ((prec) - 1))) >> (prec); \
*o = CLAMP (res, -(limit), (limit) - 1); \ *o = CLAMP (res, -(limit), (limit) - 1); \
} }
@ -542,12 +561,12 @@ inner_product_##type##_linear_1_c (type * o, const type * a, \
type2 res[2] = { 0, 0 }; \ type2 res[2] = { 0, 0 }; \
\ \
for (i = 0; i < len; i++) { \ for (i = 0; i < len; i++) { \
res[0] += (type2) a[i] * (type2) b[i * oversample + 0]; \ res[0] += (type2) a[i] * (type2) b[2 * i + 0]; \
res[1] += (type2) a[i] * (type2) b[i * oversample + 1]; \ res[1] += (type2) a[i] * (type2) b[2 * i + 1]; \
} \ } \
res[0] = (res[0] >> (prec)) * ic[0] + \ res[0] = (res[0] >> (prec)) * (type2) ic[0] + \
(res[1] >> (prec)) * ic[1]; \ (res[1] >> (prec)) * (type2) ic[1]; \
res[0] = (res[0] + (1 << ((prec) - 1))) >> (prec); \ res[0] = (res[0] + (1L << ((prec) - 1))) >> (prec); \
*o = CLAMP (res[0], -(limit), (limit) - 1); \ *o = CLAMP (res[0], -(limit), (limit) - 1); \
} }
@ -563,16 +582,16 @@ inner_product_##type##_cubic_1_c (type * o, const type * a, \
type2 res[4] = { 0, 0, 0, 0 }; \ type2 res[4] = { 0, 0, 0, 0 }; \
\ \
for (i = 0; i < len; i++) { \ for (i = 0; i < len; i++) { \
res[0] += (type2) a[i] * (type2) b[i * oversample + 0]; \ res[0] += (type2) a[i] * (type2) b[4 * i + 0]; \
res[1] += (type2) a[i] * (type2) b[i * oversample + 1]; \ res[1] += (type2) a[i] * (type2) b[4 * i + 1]; \
res[2] += (type2) a[i] * (type2) b[i * oversample + 2]; \ res[2] += (type2) a[i] * (type2) b[4 * i + 2]; \
res[3] += (type2) a[i] * (type2) b[i * oversample + 3]; \ res[3] += (type2) a[i] * (type2) b[4 * i + 3]; \
} \ } \
res[0] = (res[0] >> (prec)) * ic[0] + \ res[0] = (res[0] >> (prec)) * (type2) ic[0] + \
(res[1] >> (prec)) * ic[1] + \ (res[1] >> (prec)) * (type2) ic[1] + \
(res[2] >> (prec)) * ic[2] + \ (res[2] >> (prec)) * (type2) ic[2] + \
(res[3] >> (prec)) * ic[3]; \ (res[3] >> (prec)) * (type2) ic[3]; \
res[0] = (res[0] + (1 << ((prec) - 1))) >> (prec); \ res[0] = (res[0] + (1L << ((prec) - 1))) >> (prec); \
*o = CLAMP (res[0], -(limit), (limit) - 1); \ *o = CLAMP (res[0], -(limit), (limit) - 1); \
} }
@ -605,8 +624,8 @@ inner_product_##type##_linear_1_c (type * o, const type * a, \
type res[2] = { 0.0, 0.0 }; \ type res[2] = { 0.0, 0.0 }; \
\ \
for (i = 0; i < len; i++) { \ for (i = 0; i < len; i++) { \
res[0] += a[i] * b[i * oversample + 0]; \ res[0] += a[i] * b[2 * i + 0]; \
res[1] += a[i] * b[i * oversample + 1]; \ res[1] += a[i] * b[2 * i + 1]; \
} \ } \
*o = res[0] * ic[0] + res[1] * ic[1]; \ *o = res[0] * ic[0] + res[1] * ic[1]; \
} }
@ -622,10 +641,10 @@ inner_product_##type##_cubic_1_c (type * o, const type * a, \
type res[4] = { 0.0, 0.0, 0.0, 0.0 }; \ type res[4] = { 0.0, 0.0, 0.0, 0.0 }; \
\ \
for (i = 0; i < len; i++) { \ for (i = 0; i < len; i++) { \
res[0] += a[i] * b[i * oversample + 0]; \ res[0] += a[i] * b[4 * i + 0]; \
res[1] += a[i] * b[i * oversample + 1]; \ res[1] += a[i] * b[4 * i + 1]; \
res[2] += a[i] * b[i * oversample + 2]; \ res[2] += a[i] * b[4 * i + 2]; \
res[3] += a[i] * b[i * oversample + 3]; \ res[3] += a[i] * b[4 * i + 3]; \
} \ } \
*o = res[0] * ic[0] + res[1] * ic[1] + \ *o = res[0] * ic[0] + res[1] * ic[1] + \
res[2] * ic[2] + res[3] * ic[3]; \ res[2] * ic[2] + res[3] * ic[3]; \
@ -659,9 +678,10 @@ resample_ ##type## _ ##inter## _ ##channels## _ ##arch (GstAudioResampler * resa
\ \
ipp = &ip[samp_index * channels]; \ ipp = &ip[samp_index * channels]; \
\ \
taps = get_taps_ ##type##_##inter (resampler, &samp_index, &samp_phase, icoeff); \ taps = get_taps_ ##type##_##inter \
\ (resampler, &samp_index, &samp_phase, icoeff); \
inner_product_ ##type##_##inter##_##channels##_##arch (op, ipp, taps, n_taps, icoeff, oversample); \ inner_product_ ##type##_##inter##_##channels##_##arch \
(op, ipp, taps, n_taps, icoeff, oversample); \
op += ostride; \ op += ostride; \
} \ } \
memmove (ip, &ip[samp_index * channels], \ memmove (ip, &ip[samp_index * channels], \
@ -802,10 +822,10 @@ deinterleave_ ##type (GstAudioResampler * resampler, gpointer sbuf[], \
} \ } \
} }
MAKE_DEINTERLEAVE_FUNC (gdouble);
MAKE_DEINTERLEAVE_FUNC (gfloat);
MAKE_DEINTERLEAVE_FUNC (gint32);
MAKE_DEINTERLEAVE_FUNC (gint16); MAKE_DEINTERLEAVE_FUNC (gint16);
MAKE_DEINTERLEAVE_FUNC (gint32);
MAKE_DEINTERLEAVE_FUNC (gfloat);
MAKE_DEINTERLEAVE_FUNC (gdouble);
static DeinterleaveFunc deinterleave_funcs[] = { static DeinterleaveFunc deinterleave_funcs[] = {
deinterleave_gint16, deinterleave_gint16,
@ -875,7 +895,7 @@ calculate_kaiser_params (GstAudioResampler * resampler)
static void static void
alloc_coeff_mem (GstAudioResampler * resampler, gint bps, gint n_taps, alloc_coeff_mem (GstAudioResampler * resampler, gint bps, gint n_taps,
gint n_phases) gint n_phases, gint n_mult)
{ {
if (resampler->alloc_taps >= n_taps && resampler->alloc_phases >= n_phases) if (resampler->alloc_taps >= n_taps && resampler->alloc_phases >= n_phases)
return; return;
@ -883,7 +903,8 @@ alloc_coeff_mem (GstAudioResampler * resampler, gint bps, gint n_taps,
resampler->tmpcoeff = resampler->tmpcoeff =
g_realloc_n (resampler->tmpcoeff, n_taps, sizeof (gdouble)); g_realloc_n (resampler->tmpcoeff, n_taps, sizeof (gdouble));
resampler->cstride = GST_ROUND_UP_32 (bps * (n_taps + TAPS_OVERREAD)); resampler->cstride =
GST_ROUND_UP_32 (bps * (n_mult * n_taps + TAPS_OVERREAD));
g_free (resampler->coeffmem); g_free (resampler->coeffmem);
resampler->coeffmem = g_malloc0 (n_phases * resampler->cstride + ALIGN - 1); resampler->coeffmem = g_malloc0 (n_phases * resampler->cstride + ALIGN - 1);
resampler->coeff = MEM_ALIGN (resampler->coeffmem, ALIGN); resampler->coeff = MEM_ALIGN (resampler->coeffmem, ALIGN);
@ -983,7 +1004,7 @@ resampler_calculate_taps (GstAudioResampler * resampler)
} }
if (interpolate) { if (interpolate) {
gint otaps; gint otaps, mult;
gpointer coeff; gpointer coeff;
gdouble x, weight, *tmpcoeff; gdouble x, weight, *tmpcoeff;
GstAudioResamplerFilterInterpolation filter_interpolation = GstAudioResamplerFilterInterpolation filter_interpolation =
@ -995,37 +1016,40 @@ resampler_calculate_taps (GstAudioResampler * resampler)
else else
resampler->filter_interpolation = filter_interpolation; resampler->filter_interpolation = filter_interpolation;
otaps = oversample * n_taps;
switch (resampler->filter_interpolation) { switch (resampler->filter_interpolation) {
default: default:
case GST_AUDIO_RESAMPLER_FILTER_INTERPOLATION_LINEAR: case GST_AUDIO_RESAMPLER_FILTER_INTERPOLATION_LINEAR:
otaps += 1; mult = 2;
break; break;
case GST_AUDIO_RESAMPLER_FILTER_INTERPOLATION_CUBIC: case GST_AUDIO_RESAMPLER_FILTER_INTERPOLATION_CUBIC:
otaps += 3; mult = 4;
break; break;
} }
otaps = oversample * n_taps + mult - 1;
alloc_coeff_mem (resampler, bps, otaps, 1); alloc_coeff_mem (resampler, bps, otaps, oversample, mult);
coeff = resampler->coeff; coeff = tmpcoeff = resampler->tmpcoeff;
tmpcoeff = resampler->tmpcoeff;
x = 1.0 - n_taps / 2; x = 1.0 - n_taps / 2;
weight = fill_taps (resampler, tmpcoeff, x, otaps, oversample); weight = fill_taps (resampler, tmpcoeff, x, otaps, oversample);
switch (resampler->format) { switch (resampler->format) {
case GST_AUDIO_FORMAT_S16: case GST_AUDIO_FORMAT_S16:
convert_taps_gint16 (tmpcoeff, coeff, weight / oversample, otaps); convert_taps_gint16 (tmpcoeff, coeff, weight / oversample, otaps);
extract_taps_gint16 (resampler, coeff, n_taps, oversample, mult);
break; break;
case GST_AUDIO_FORMAT_S32: case GST_AUDIO_FORMAT_S32:
convert_taps_gint32 (tmpcoeff, coeff, weight / oversample, otaps); convert_taps_gint32 (tmpcoeff, coeff, weight / oversample, otaps);
extract_taps_gint32 (resampler, coeff, n_taps, oversample, mult);
break; break;
case GST_AUDIO_FORMAT_F32: case GST_AUDIO_FORMAT_F32:
convert_taps_gfloat (tmpcoeff, coeff, weight / oversample, otaps); convert_taps_gfloat (tmpcoeff, coeff, weight / oversample, otaps);
extract_taps_gfloat (resampler, coeff, n_taps, oversample, mult);
break; break;
default: default:
case GST_AUDIO_FORMAT_F64: case GST_AUDIO_FORMAT_F64:
convert_taps_gdouble (tmpcoeff, coeff, weight / oversample, otaps); convert_taps_gdouble (tmpcoeff, coeff, weight / oversample, otaps);
extract_taps_gdouble (resampler, coeff, n_taps, oversample, mult);
break; break;
} }
} else { } else {
@ -1033,7 +1057,7 @@ resampler_calculate_taps (GstAudioResampler * resampler)
GST_AUDIO_RESAMPLER_FILTER_INTERPOLATION_NONE; GST_AUDIO_RESAMPLER_FILTER_INTERPOLATION_NONE;
resampler->taps = g_realloc_n (resampler->taps, out_rate, sizeof (Tap)); resampler->taps = g_realloc_n (resampler->taps, out_rate, sizeof (Tap));
memset (resampler->taps, 0, sizeof (Tap) * out_rate); memset (resampler->taps, 0, sizeof (Tap) * out_rate);
alloc_coeff_mem (resampler, bps, n_taps, out_rate); alloc_coeff_mem (resampler, bps, n_taps, out_rate, 1);
} }
resampler->samp_inc = in_rate / out_rate; resampler->samp_inc = in_rate / out_rate;