audio-resampler: remove stereo optimizations

The stereo optimizations don't give enough benefit.
Rename none to full to make it clear that we use a full filter instead
of an interpolated one.
This commit is contained in:
Wim Taymans 2016-02-19 10:40:03 +01:00
parent b820074a49
commit 107f53ea0a
3 changed files with 65 additions and 209 deletions

View file

@ -18,7 +18,7 @@
*/
static inline void
inner_product_gint16_none_1_neon (gint16 * o, const gint16 * a,
inner_product_gint16_full_1_neon (gint16 * o, const gint16 * a,
const gint16 * b, gint len, const gint16 * icoeff)
{
uint32_t remainder = len % 16;
@ -146,7 +146,7 @@ inner_product_gint16_cubic_1_neon (gint16 * o, const gint16 * a,
}
static inline void
inner_product_gint32_none_1_neon (gint32 * o, const gint32 * a,
inner_product_gint32_full_1_neon (gint32 * o, const gint32 * a,
const gint32 * b, gint len, const gint32 * icoeff)
{
uint32_t remainder = len % 8;
@ -264,7 +264,7 @@ inner_product_gint32_cubic_1_neon (gint32 * o, const gint32 * a,
}
static inline void
inner_product_gfloat_none_1_neon (gfloat * o, const gfloat * a,
inner_product_gfloat_full_1_neon (gfloat * o, const gfloat * a,
const gfloat * b, gint len, const gfloat * icoeff)
{
uint32_t remainder = len % 16;
@ -385,15 +385,15 @@ inner_product_gfloat_cubic_1_neon (gfloat * o, const gfloat * a,
"q9", "q10", "q11", "memory");
}
MAKE_RESAMPLE_FUNC (gint16, none, 1, neon);
MAKE_RESAMPLE_FUNC (gint16, full, 1, neon);
MAKE_RESAMPLE_FUNC (gint16, linear, 1, neon);
MAKE_RESAMPLE_FUNC (gint16, cubic, 1, neon);
MAKE_RESAMPLE_FUNC (gint32, none, 1, neon);
MAKE_RESAMPLE_FUNC (gint32, full, 1, neon);
MAKE_RESAMPLE_FUNC (gint32, linear, 1, neon);
MAKE_RESAMPLE_FUNC (gint32, cubic, 1, neon);
MAKE_RESAMPLE_FUNC (gfloat, none, 1, neon);
MAKE_RESAMPLE_FUNC (gfloat, full, 1, neon);
MAKE_RESAMPLE_FUNC (gfloat, linear, 1, neon);
MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, neon);
@ -402,15 +402,15 @@ audio_resampler_check_neon (const gchar *target_name, const gchar *option)
{
if (!strcmp (target_name, "neon")) {
GST_DEBUG ("enable NEON optimisations");
resample_gint16_none_1 = resample_gint16_none_1_neon;
resample_gint16_full_1 = resample_gint16_full_1_neon;
resample_gint16_linear_1 = resample_gint16_linear_1_neon;
resample_gint16_cubic_1 = resample_gint16_cubic_1_neon;
resample_gint32_none_1 = resample_gint32_none_1_neon;
resample_gint32_full_1 = resample_gint32_full_1_neon;
resample_gint32_linear_1 = resample_gint32_linear_1_neon;
resample_gint32_cubic_1 = resample_gint32_cubic_1_neon;
resample_gfloat_none_1 = resample_gfloat_none_1_neon;
resample_gfloat_full_1 = resample_gfloat_full_1_neon;
resample_gfloat_linear_1 = resample_gfloat_linear_1_neon;
resample_gfloat_cubic_1 = resample_gfloat_cubic_1_neon;
}

View file

@ -21,7 +21,7 @@
#include <xmmintrin.h>
static inline void
inner_product_gfloat_none_1_sse (gfloat * o, const gfloat * a,
inner_product_gfloat_full_1_sse (gfloat * o, const gfloat * a,
const gfloat * b, gint len, const gfloat * icoeff)
{
gint i = 0;
@ -82,46 +82,16 @@ inner_product_gfloat_cubic_1_sse (gfloat * o, const gfloat * a,
_mm_store_ss (o, sum);
}
/* Inner product of `len` filter taps in `b` against `a`, producing two
 * gfloat results that are written to o[0] and o[1].
 *
 * Every tap is duplicated with _mm_unpacklo_ps/_mm_unpackhi_ps so that tap
 * b[k] multiplies both a[2 * k] and a[2 * k + 1]; even-indexed products
 * accumulate into o[0] and odd-indexed products into o[1].
 * (Presumably `a` holds interleaved 2-channel frames - confirm with caller.)
 *
 * o:      destination for the two accumulated sums (no alignment required).
 * a:      input samples, read with unaligned loads.
 * b:      filter taps, read with _mm_load_ps and therefore required to be
 *         16-byte aligned.
 * len:    number of taps; the loop consumes 8 taps (16 values of `a`) per
 *         iteration, so reads extend to the next multiple of 8 -
 *         NOTE(review): assumes the buffers are padded accordingly.
 * icoeff: unused by this variant.
 */
static inline void
inner_product_gfloat_none_2_sse (gfloat * o, const gfloat * a,
    const gfloat * b, gint len, const gfloat * icoeff)
{
  gint i = 0;
  __m128 sum = _mm_setzero_ps (), t;

  for (; i < len; i += 8) {
    t = _mm_load_ps (b + i);
    sum =
        _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + 2 * i + 0),
            _mm_unpacklo_ps (t, t)));
    sum =
        _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + 2 * i + 4),
            _mm_unpackhi_ps (t, t)));

    t = _mm_load_ps (b + i + 4);
    sum =
        _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + 2 * i + 8),
            _mm_unpacklo_ps (t, t)));
    sum =
        _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + 2 * i + 12),
            _mm_unpackhi_ps (t, t)));
  }

  /* Fold lanes 2/3 onto lanes 0/1 so the low pair holds the two totals. */
  sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum));

  /* Store the low two floats with an SSE1 instruction.  The previous
   * _mm_cvtsi128_si64 ((__m128i) sum) needed SSE2 and a 64-bit target even
   * though this code is only guarded by __SSE__, and it wrote through a
   * gint64 pointer aliasing the gfloat output. */
  _mm_storel_pi ((__m64 *) o, sum);
}
MAKE_RESAMPLE_FUNC (gfloat, none, 1, sse);
MAKE_RESAMPLE_FUNC (gfloat, full, 1, sse);
MAKE_RESAMPLE_FUNC (gfloat, linear, 1, sse);
MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, sse);
MAKE_RESAMPLE_FUNC (gfloat, none, 2, sse);
#endif
#if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__)
#include <emmintrin.h>
static inline void
inner_product_gint16_none_1_sse2 (gint16 * o, const gint16 * a,
inner_product_gint16_full_1_sse2 (gint16 * o, const gint16 * a,
const gint16 * b, gint len, const gint16 * icoeff)
{
gint i = 0;
@ -230,7 +200,7 @@ inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a,
}
static inline void
inner_product_gdouble_none_1_sse2 (gdouble * o, const gdouble * a,
inner_product_gdouble_full_1_sse2 (gdouble * o, const gdouble * a,
const gdouble * b, gint len, const gdouble * icoeff)
{
gint i = 0;
@ -299,76 +269,14 @@ inner_product_gdouble_cubic_1_sse2 (gdouble * o, const gdouble * a,
_mm_store_sd (o, sum1);
}
/* Fixed-point inner product of `len` 16-bit taps in `b` against `a`,
 * emitting two saturated gint16 results that are written together as one
 * 32-bit store at `o`.
 *
 * Each iteration loads 8 taps from `b` (aligned load) and 16 values from `a`
 * (unaligned loads), duplicates every tap into two adjacent 16-bit lanes and
 * accumulates _mm_madd_epi16 products into four 32-bit lanes.
 *
 * NOTE(review): _mm_madd_epi16 adds the products of adjacent 16-bit lanes,
 * so every 32-bit lane holds a[2k] * b[k] + a[2k + 1] * b[k]; confirm that
 * combining neighbouring samples of `a` this way is what callers expect.
 *
 * icoeff is unused by this variant.
 */
static inline void
inner_product_gint16_none_2_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff)
{
  __m128i acc = _mm_setzero_si128 ();
  gint pos;

  for (pos = 0; pos < len; pos += 8) {
    const __m128i taps = _mm_load_si128 ((__m128i *) (b + pos));
    const __m128i in_lo = _mm_loadu_si128 ((__m128i *) (a + 2 * pos));
    const __m128i in_hi = _mm_loadu_si128 ((__m128i *) (a + 2 * pos + 8));

    acc = _mm_add_epi32 (acc,
        _mm_madd_epi16 (in_lo, _mm_unpacklo_epi16 (taps, taps)));
    acc = _mm_add_epi32 (acc,
        _mm_madd_epi16 (in_hi, _mm_unpackhi_epi16 (taps, taps)));
  }

  /* Horizontal fold: lane 0 becomes s0 + s3 and lane 1 becomes s1 + s2. */
  acc = _mm_add_epi32 (acc,
      _mm_shuffle_epi32 (acc, _MM_SHUFFLE (2, 3, 2, 3)));

  /* Round to nearest, then drop the PRECISION_S16 fractional bits. */
  acc = _mm_add_epi32 (acc, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  acc = _mm_srai_epi32 (acc, PRECISION_S16);

  /* Saturate to 16 bits; the low 32 bits now carry the two results. */
  acc = _mm_packs_epi32 (acc, acc);
  *(gint32*)o = _mm_cvtsi128_si32 (acc);
}
/* Inner product of `len` double-precision taps in `b` against `a`,
 * producing two gdouble results stored at o[0] and o[1].
 *
 * Each tap b[k] is broadcast to both lanes (_mm_unpacklo_pd/_mm_unpackhi_pd)
 * and multiplied with the pair a[2 * k], a[2 * k + 1]; lane 0 accumulates
 * into o[0] and lane 1 into o[1].
 * (Presumably `a` holds interleaved 2-channel frames - confirm with caller.)
 *
 * `b` is read with _mm_load_pd and `o` is written with _mm_store_pd, so both
 * must be 16-byte aligned; `a` is read with unaligned loads.  The loop
 * consumes 4 taps (8 values of `a`) per iteration.  icoeff is unused.
 */
static inline void
inner_product_gdouble_none_2_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff)
{
  __m128d acc = _mm_setzero_pd ();
  gint pos;

  for (pos = 0; pos < len; pos += 4) {
    const gdouble *frames = a + 2 * pos;
    __m128d taps;

    /* taps pos and pos + 1 */
    taps = _mm_load_pd (b + pos);
    acc = _mm_add_pd (acc,
        _mm_mul_pd (_mm_loadu_pd (frames), _mm_unpacklo_pd (taps, taps)));
    acc = _mm_add_pd (acc,
        _mm_mul_pd (_mm_loadu_pd (frames + 2), _mm_unpackhi_pd (taps, taps)));

    /* taps pos + 2 and pos + 3 */
    taps = _mm_load_pd (b + pos + 2);
    acc = _mm_add_pd (acc,
        _mm_mul_pd (_mm_loadu_pd (frames + 4), _mm_unpacklo_pd (taps, taps)));
    acc = _mm_add_pd (acc,
        _mm_mul_pd (_mm_loadu_pd (frames + 6), _mm_unpackhi_pd (taps, taps)));
  }

  _mm_store_pd (o, acc);
}
MAKE_RESAMPLE_FUNC (gint16, none, 1, sse2);
MAKE_RESAMPLE_FUNC (gint16, full, 1, sse2);
MAKE_RESAMPLE_FUNC (gint16, linear, 1, sse2);
MAKE_RESAMPLE_FUNC (gint16, cubic, 1, sse2);
MAKE_RESAMPLE_FUNC (gdouble, none, 1, sse2);
MAKE_RESAMPLE_FUNC (gdouble, full, 1, sse2);
MAKE_RESAMPLE_FUNC (gdouble, linear, 1, sse2);
MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, sse2);
MAKE_RESAMPLE_FUNC (gint16, none, 2, sse2);
MAKE_RESAMPLE_FUNC (gdouble, none, 2, sse2);
static void
interpolate_gdouble_linear_sse2 (gdouble * o, const gdouble * a,
gint len, const gdouble * icoeff)
@ -414,7 +322,7 @@ interpolate_gdouble_cubic_sse2 (gdouble * o, const gdouble * a,
#include <smmintrin.h>
static inline void
inner_product_gint32_none_1_sse41 (gint32 * o, const gint32 * a,
inner_product_gint32_full_1_sse41 (gint32 * o, const gint32 * a,
const gint32 * b, gint len, const gint32 * icoeff)
{
gint i = 0;
@ -540,7 +448,7 @@ inner_product_gint32_cubic_1_sse41 (gint32 * o, const gint32 * a,
*o = CLAMP (res, -(1L << 31), (1L << 31) - 1);
}
MAKE_RESAMPLE_FUNC (gint32, none, 1, sse41);
MAKE_RESAMPLE_FUNC (gint32, full, 1, sse41);
MAKE_RESAMPLE_FUNC (gint32, linear, 1, sse41);
MAKE_RESAMPLE_FUNC (gint32, cubic, 1, sse41);
#endif
@ -551,11 +459,9 @@ audio_resampler_check_x86 (const gchar *target_name, const gchar *option)
if (!strcmp (target_name, "sse")) {
#if defined (HAVE_XMMINTRIN_H) && defined(__SSE__)
GST_DEBUG ("enable SSE optimisations");
resample_gfloat_none_1 = resample_gfloat_none_1_sse;
resample_gfloat_full_1 = resample_gfloat_full_1_sse;
resample_gfloat_linear_1 = resample_gfloat_linear_1_sse;
resample_gfloat_cubic_1 = resample_gfloat_cubic_1_sse;
resample_gfloat_none_2 = resample_gfloat_none_2_sse;
#else
GST_DEBUG ("SSE optimisations not enabled");
#endif
@ -563,17 +469,14 @@ audio_resampler_check_x86 (const gchar *target_name, const gchar *option)
if (!strcmp (option, "sse2")) {
#if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__)
GST_DEBUG ("enable SSE2 optimisations");
resample_gint16_none_1 = resample_gint16_none_1_sse2;
resample_gint16_full_1 = resample_gint16_full_1_sse2;
resample_gint16_linear_1 = resample_gint16_linear_1_sse2;
resample_gint16_cubic_1 = resample_gint16_cubic_1_sse2;
resample_gdouble_none_1 = resample_gdouble_none_1_sse2;
resample_gdouble_full_1 = resample_gdouble_full_1_sse2;
resample_gdouble_linear_1 = resample_gdouble_linear_1_sse2;
resample_gdouble_cubic_1 = resample_gdouble_cubic_1_sse2;
resample_gint16_none_2 = resample_gint16_none_2_sse2;
resample_gdouble_none_2 = resample_gdouble_none_2_sse2;
interpolate_gdouble_linear = interpolate_gdouble_linear_sse2;
interpolate_gdouble_cubic = interpolate_gdouble_cubic_sse2;
#else
@ -582,7 +485,7 @@ audio_resampler_check_x86 (const gchar *target_name, const gchar *option)
} else if (!strcmp (option, "sse41")) {
#if defined (HAVE_SMMINTRIN_H) && defined(__SSE4_1__)
GST_DEBUG ("enable SSE41 optimisations");
resample_gint32_none_1 = resample_gint32_none_1_sse41;
resample_gint32_full_1 = resample_gint32_full_1_sse41;
resample_gint32_linear_1 = resample_gint32_linear_1_sse41;
resample_gint32_cubic_1 = resample_gint32_cubic_1_sse41;
#else

View file

@ -556,9 +556,9 @@ fill_taps (GstAudioResampler * resampler,
return res;
}
#define GET_TAPS_NONE_FUNC(type) \
#define GET_TAPS_FULL_FUNC(type) \
static inline gpointer \
get_taps_##type##_none (GstAudioResampler * resampler, \
get_taps_##type##_full (GstAudioResampler * resampler, \
gint *samp_index, gint *samp_phase, type icoeff[4]) \
{ \
gpointer res; \
@ -589,10 +589,10 @@ get_taps_##type##_none (GstAudioResampler * resampler,
} \
return res; \
}
GET_TAPS_NONE_FUNC (gint16);
GET_TAPS_NONE_FUNC (gint32);
GET_TAPS_NONE_FUNC (gfloat);
GET_TAPS_NONE_FUNC (gdouble);
GET_TAPS_FULL_FUNC (gint16);
GET_TAPS_FULL_FUNC (gint32);
GET_TAPS_FULL_FUNC (gfloat);
GET_TAPS_FULL_FUNC (gdouble);
#define GET_TAPS_INTERPOLATE_FUNC(type,inter) \
static inline gpointer \
@ -631,9 +631,9 @@ GET_TAPS_INTERPOLATE_FUNC (gint32, cubic);
GET_TAPS_INTERPOLATE_FUNC (gfloat, cubic);
GET_TAPS_INTERPOLATE_FUNC (gdouble, cubic);
#define INNER_PRODUCT_INT_NONE_FUNC(type,type2,prec,limit) \
#define INNER_PRODUCT_INT_FULL_FUNC(type,type2,prec,limit) \
static inline void \
inner_product_##type##_none_1_c (type * o, const type * a, \
inner_product_##type##_full_1_c (type * o, const type * a, \
const type * b, gint len, const type *ic) \
{ \
gint i; \
@ -650,8 +650,8 @@ inner_product_##type##_none_1_c (type * o, const type * a, \
*o = CLAMP (res[0], -(limit), (limit) - 1); \
}
INNER_PRODUCT_INT_NONE_FUNC (gint16, gint32, PRECISION_S16, (gint32) 1 << 15);
INNER_PRODUCT_INT_NONE_FUNC (gint32, gint64, PRECISION_S32, (gint64) 1 << 31);
INNER_PRODUCT_INT_FULL_FUNC (gint16, gint32, PRECISION_S16, (gint32) 1 << 15);
INNER_PRODUCT_INT_FULL_FUNC (gint32, gint64, PRECISION_S32, (gint64) 1 << 31);
#define INNER_PRODUCT_INT_LINEAR_FUNC(type,type2,prec,limit) \
static inline void \
@ -703,9 +703,9 @@ inner_product_##type##_cubic_1_c (type * o, const type * a, \
INNER_PRODUCT_INT_CUBIC_FUNC (gint16, gint32, PRECISION_S16, (gint32) 1 << 15);
INNER_PRODUCT_INT_CUBIC_FUNC (gint32, gint64, PRECISION_S32, (gint64) 1 << 31);
#define INNER_PRODUCT_FLOAT_NONE_FUNC(type) \
#define INNER_PRODUCT_FLOAT_FULL_FUNC(type) \
static inline void \
inner_product_##type##_none_1_c (type * o, const type * a, \
inner_product_##type##_full_1_c (type * o, const type * a, \
const type * b, gint len, const type *ic) \
{ \
gint i; \
@ -720,8 +720,8 @@ inner_product_##type##_none_1_c (type * o, const type * a, \
*o = res[0] + res[1] + res[2] + res[3]; \
}
INNER_PRODUCT_FLOAT_NONE_FUNC (gfloat);
INNER_PRODUCT_FLOAT_NONE_FUNC (gdouble);
INNER_PRODUCT_FLOAT_FULL_FUNC (gfloat);
INNER_PRODUCT_FLOAT_FULL_FUNC (gdouble);
#define INNER_PRODUCT_FLOAT_LINEAR_FUNC(type) \
static inline void \
@ -804,10 +804,10 @@ resample_ ##type## _ ##inter## _ ##channels## _ ##arch (GstAudioResampler * resa
resampler->samp_phase = samp_phase; \
}
MAKE_RESAMPLE_FUNC (gint16, none, 1, c);
MAKE_RESAMPLE_FUNC (gint32, none, 1, c);
MAKE_RESAMPLE_FUNC (gfloat, none, 1, c);
MAKE_RESAMPLE_FUNC (gdouble, none, 1, c);
MAKE_RESAMPLE_FUNC (gint16, full, 1, c);
MAKE_RESAMPLE_FUNC (gint32, full, 1, c);
MAKE_RESAMPLE_FUNC (gfloat, full, 1, c);
MAKE_RESAMPLE_FUNC (gdouble, full, 1, c);
MAKE_RESAMPLE_FUNC (gint16, linear, 1, c);
MAKE_RESAMPLE_FUNC (gint32, linear, 1, c);
@ -820,52 +820,36 @@ MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, c);
MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, c);
static ResampleFunc resample_funcs[] = {
resample_gint16_none_1_c,
resample_gint32_none_1_c,
resample_gfloat_none_1_c,
resample_gdouble_none_1_c,
NULL,
NULL,
NULL,
NULL,
resample_gint16_full_1_c,
resample_gint32_full_1_c,
resample_gfloat_full_1_c,
resample_gdouble_full_1_c,
resample_gint16_linear_1_c,
resample_gint32_linear_1_c,
resample_gfloat_linear_1_c,
resample_gdouble_linear_1_c,
NULL,
NULL,
NULL,
NULL,
resample_gint16_cubic_1_c,
resample_gint32_cubic_1_c,
resample_gfloat_cubic_1_c,
resample_gdouble_cubic_1_c,
NULL,
NULL,
NULL,
NULL,
};
#define resample_gint16_none_1 resample_funcs[0]
#define resample_gint32_none_1 resample_funcs[1]
#define resample_gfloat_none_1 resample_funcs[2]
#define resample_gdouble_none_1 resample_funcs[3]
#define resample_gint16_none_2 resample_funcs[4]
#define resample_gint32_none_2 resample_funcs[5]
#define resample_gfloat_none_2 resample_funcs[6]
#define resample_gdouble_none_2 resample_funcs[7]
#define resample_gint16_full_1 resample_funcs[0]
#define resample_gint32_full_1 resample_funcs[1]
#define resample_gfloat_full_1 resample_funcs[2]
#define resample_gdouble_full_1 resample_funcs[3]
#define resample_gint16_linear_1 resample_funcs[8]
#define resample_gint32_linear_1 resample_funcs[9]
#define resample_gfloat_linear_1 resample_funcs[10]
#define resample_gdouble_linear_1 resample_funcs[11]
#define resample_gint16_linear_1 resample_funcs[4]
#define resample_gint32_linear_1 resample_funcs[5]
#define resample_gfloat_linear_1 resample_funcs[6]
#define resample_gdouble_linear_1 resample_funcs[7]
#define resample_gint16_cubic_1 resample_funcs[16]
#define resample_gint32_cubic_1 resample_funcs[17]
#define resample_gfloat_cubic_1 resample_funcs[18]
#define resample_gdouble_cubic_1 resample_funcs[19]
#define resample_gint16_cubic_1 resample_funcs[8]
#define resample_gint32_cubic_1 resample_funcs[9]
#define resample_gfloat_cubic_1 resample_funcs[10]
#define resample_gdouble_cubic_1 resample_funcs[11]
#if defined HAVE_ORC && !defined DISABLE_ORC
# if defined (__ARM_NEON__)
@ -952,25 +936,6 @@ static DeinterleaveFunc deinterleave_funcs[] = {
deinterleave_gdouble
};
/* Append `in_frames` frames of input to each of the resampler's block
 * buffers without rearranging samples.
 *
 * For every block c in [0, resampler->blocks) the data is written to sbuf[c]
 * at a byte offset of samples_avail * bps * inc.  When `in` is NULL the
 * region is zero-filled instead of copied from in[c].
 */
static void
deinterleave_copy (GstAudioResampler * resampler, gpointer sbuf[],
    gpointer in[], gsize in_frames)
{
  gsize frame_size, offset, n_bytes;
  gint blk;

  /* bytes per frame, bytes already queued, bytes to append */
  frame_size = resampler->bps * resampler->inc;
  offset = resampler->samples_avail * frame_size;
  n_bytes = in_frames * frame_size;

  for (blk = 0; blk < resampler->blocks; blk++) {
    gint8 *dest = (gint8 *) sbuf[blk] + offset;

    if (G_UNLIKELY (in == NULL)) {
      memset (dest, 0, n_bytes);
      continue;
    }
    memcpy (dest, in[blk], n_bytes);
  }
}
static void
calculate_kaiser_params (GstAudioResampler * resampler)
{
@ -1062,11 +1027,10 @@ static void
setup_functions (GstAudioResampler * resampler)
{
gboolean non_interleaved;
gint n_taps, index;
gint index;
DeinterleaveFunc deinterleave;
ResampleFunc resample, resample_2;
ResampleFunc resample;
n_taps = resampler->n_taps;
non_interleaved =
(resampler->flags & GST_AUDIO_RESAMPLER_FLAG_NON_INTERLEAVED);
@ -1104,11 +1068,11 @@ setup_functions (GstAudioResampler * resampler)
switch (resampler->filter_interpolation) {
case GST_AUDIO_RESAMPLER_FILTER_INTERPOLATION_LINEAR:
GST_DEBUG ("using linear interpolation filter function");
index += 8;
index += 4;
break;
case GST_AUDIO_RESAMPLER_FILTER_INTERPOLATION_CUBIC:
GST_DEBUG ("using cubic interpolation filter function");
index += 16;
index += 8;
break;
default:
break;
@ -1116,23 +1080,12 @@ setup_functions (GstAudioResampler * resampler)
break;
}
resample = resample_funcs[index];
resample_2 = resample_funcs[index + 4];
if (!non_interleaved && resampler->channels == 2 && n_taps >= 4 && resample_2) {
/* we resample 2 channels in parallel */
resampler->resample = resample_2;
resampler->deinterleave = deinterleave_copy;
resampler->blocks = 1;
resampler->inc = resampler->channels;;
GST_DEBUG ("resample 2 channels at a time");
} else {
/* we resample each channel separately */
resampler->resample = resample;
resampler->deinterleave = deinterleave;
resampler->blocks = resampler->channels;
resampler->inc = 1;
GST_DEBUG ("resample 1 channel at a time");
}
/* we resample each channel separately */
resampler->resample = resample;
resampler->deinterleave = deinterleave;
resampler->blocks = resampler->channels;
resampler->inc = 1;
}
static void
@ -1308,7 +1261,7 @@ G_STMT_START { \
type icoeff[4]; \
gint samp_index = 0, samp_phase = i; \
\
taps = get_taps_##type##_none (resampler, &samp_index,\
taps = get_taps_##type##_full (resampler, &samp_index,\
&samp_phase, icoeff); \
\
for (j = 0; j < n_taps; j++) { \