audio-resampler: optimizations

Improve int16 resampling by using pmaddwd
Use intrinsics to scale and pack int16 samples
Align the coefficients so that we can use aligned loads
Add padding to taps and samples so that we don't have to use partial
loads for the remainder of the loops.
Remove copy_n, we can reuse the plain copy function with some new
parameters.
Align and pad the sample array.
This commit is contained in:
Wim Taymans 2016-01-13 17:44:39 +01:00
parent f55a67ca7c
commit 71871c5048
2 changed files with 118 additions and 157 deletions

View file

@ -19,171 +19,118 @@
#ifdef HAVE_EMMINTRIN_H
#include <emmintrin.h>
#endif
#ifdef HAVE_EMMINTRIN_H
static inline void
inner_product_gint16_1_sse (gint16 * o, const gint16 * a, const gint16 * b, gint len)
inner_product_gint16_1_sse2 (gint16 * o, const gint16 * a, const gint16 * b, gint len)
{
gint i = 0;
gint32 res = 0;
__m128i sum[2], ta, tb;
__m128i t1[2];
__m128i sum, ta, tb;
sum[0] = _mm_setzero_si128 ();
sum[1] = _mm_setzero_si128 ();
sum = _mm_setzero_si128 ();
for (; i < len - 7; i += 8) {
for (; i < len; i += 8) {
ta = _mm_loadu_si128 ((__m128i *) (a + i));
tb = _mm_loadu_si128 ((__m128i *) (b + i));
tb = _mm_load_si128 ((__m128i *) (b + i));
t1[0] = _mm_mullo_epi16 (ta, tb);
t1[1] = _mm_mulhi_epi16 (ta, tb);
sum[0] = _mm_add_epi32 (sum[0], _mm_unpacklo_epi16 (t1[0], t1[1]));
sum[1] = _mm_add_epi32 (sum[1], _mm_unpackhi_epi16 (t1[0], t1[1]));
sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, tb));
}
sum[0] = _mm_add_epi32 (sum[0], sum[1]);
sum[0] =
_mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
sum =
_mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2,
3)));
sum[0] =
_mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1,
sum =
_mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1,
1)));
res = _mm_cvtsi128_si32 (sum[0]);
for (; i < len; i++)
res += (gint32) a[i] * (gint32) b[i];
res = (res + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16;
*o = CLAMP (res, -(1L << 15), (1L << 15) - 1);
sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
sum = _mm_srai_epi32 (sum, PRECISION_S16);
sum = _mm_packs_epi32 (sum, sum);
*o = _mm_extract_epi16 (sum, 0);
}
#endif
#ifdef HAVE_EMMINTRIN_H
static inline void
inner_product_gfloat_1_sse (gfloat * o, const gfloat * a, const gfloat * b, gint len)
{
gint i = 0;
gfloat res;
__m128 sum = _mm_setzero_ps ();
for (; i < len - 7; i += 8) {
for (; i < len; i += 8) {
sum =
_mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 0),
_mm_loadu_ps (b + i + 0)));
_mm_load_ps (b + i + 0)));
sum =
_mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 4),
_mm_loadu_ps (b + i + 4)));
_mm_load_ps (b + i + 4)));
}
sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum));
sum = _mm_add_ss (sum, _mm_shuffle_ps (sum, sum, 0x55));
_mm_store_ss (&res, sum);
for (; i < len; i++)
res += a[i] * b[i];
*o = res;
_mm_store_ss (o, sum);
}
#endif
#ifdef HAVE_EMMINTRIN_H
static inline void
inner_product_gdouble_1_sse (gdouble * o, const gdouble * a, const gdouble * b,
inner_product_gdouble_1_sse2 (gdouble * o, const gdouble * a, const gdouble * b,
gint len)
{
gint i = 0;
gdouble res;
__m128d sum = _mm_setzero_pd ();
for (; i < len - 7; i += 8) {
for (; i < len; i += 8) {
sum =
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0),
_mm_loadu_pd (b + i + 0)));
_mm_load_pd (b + i + 0)));
sum =
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2),
_mm_loadu_pd (b + i + 2)));
_mm_load_pd (b + i + 2)));
sum =
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 4),
_mm_loadu_pd (b + i + 4)));
_mm_load_pd (b + i + 4)));
sum =
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6),
_mm_loadu_pd (b + i + 6)));
_mm_load_pd (b + i + 6)));
}
sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum));
_mm_store_sd (&res, sum);
for (; i < len; i++)
res += a[i] * b[i];
*o = res;
_mm_store_sd (o, sum);
}
#endif
#ifdef HAVE_EMMINTRIN_H
static inline void
inner_product_gint16_2_sse (gint16 * o, const gint16 * a, const gint16 * b, gint len)
inner_product_gint16_2_sse2 (gint16 * o, const gint16 * a, const gint16 * b, gint len)
{
gint i = 0;
gint32 r[2];
guint64 r64;
__m128i sum[2], ta, tb;
__m128i t1[2];
__m128i sum, ta, tb, t1;
sum[0] = _mm_setzero_si128 ();
sum[1] = _mm_setzero_si128 ();
sum = _mm_setzero_si128 ();
for (; i < len - 7; i += 8) {
tb = _mm_loadu_si128 ((__m128i *) (b + i));
t1[1] = _mm_unpacklo_epi16 (tb, tb);
for (; i < len; i += 8) {
tb = _mm_load_si128 ((__m128i *) (b + i));
t1 = _mm_unpacklo_epi16 (tb, tb);
ta = _mm_loadu_si128 ((__m128i *) (a + 2 * i));
t1[0] = _mm_mullo_epi16 (ta, t1[1]);
t1[1] = _mm_mulhi_epi16 (ta, t1[1]);
sum[0] = _mm_add_epi32 (sum[0], _mm_unpacklo_epi16 (t1[0], t1[1]));
sum[1] = _mm_add_epi32 (sum[1], _mm_unpackhi_epi16 (t1[0], t1[1]));
t1[1] = _mm_unpackhi_epi16 (tb, tb);
sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, t1));
t1 = _mm_unpackhi_epi16 (tb, tb);
ta = _mm_loadu_si128 ((__m128i *) (a + 2 * i + 8));
t1[0] = _mm_mullo_epi16 (ta, t1[1]);
t1[1] = _mm_mulhi_epi16 (ta, t1[1]);
sum[0] = _mm_add_epi32 (sum[0], _mm_unpacklo_epi16 (t1[0], t1[1]));
sum[1] = _mm_add_epi32 (sum[1], _mm_unpackhi_epi16 (t1[0], t1[1]));
sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, t1));
}
sum[0] = _mm_add_epi32 (sum[0], sum[1]);
sum[0] =
_mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
sum =
_mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2,
3)));
r64 = _mm_cvtsi128_si64 (sum[0]);
r[0] = r64 >> 32;
r[1] = r64 & 0xffffffff;
for (; i < len; i++) {
r[0] += (gint32) a[2 * i] * (gint32) b[i];
r[1] += (gint32) a[2 * i + 1] * (gint32) b[i];
}
r[0] = (r[0] + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16;
r[1] = (r[1] + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16;
o[0] = CLAMP (r[0], -(1L << 15), (1L << 15) - 1);
o[1] = CLAMP (r[1], -(1L << 15), (1L << 15) - 1);
sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
sum = _mm_srai_epi32 (sum, PRECISION_S16);
sum = _mm_packs_epi32 (sum, sum);
*(gint32*)o = _mm_cvtsi128_si32 (sum);
}
#endif
#ifdef HAVE_EMMINTRIN_H
static inline void
inner_product_gdouble_2_sse (gdouble * o, const gdouble * a, const gdouble * b,
inner_product_gdouble_2_sse2 (gdouble * o, const gdouble * a, const gdouble * b,
gint len)
{
gint i = 0;
gdouble r[2];
__m128d sum = _mm_setzero_pd (), t;
for (; i < len - 3; i += 4) {
t = _mm_loadu_pd (b + i);
for (; i < len; i += 4) {
t = _mm_load_pd (b + i);
sum =
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i),
_mm_unpacklo_pd (t, t)));
@ -191,7 +138,7 @@ inner_product_gdouble_2_sse (gdouble * o, const gdouble * a, const gdouble * b,
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 2),
_mm_unpackhi_pd (t, t)));
t = _mm_loadu_pd (b + i + 2);
t = _mm_load_pd (b + i + 2);
sum =
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 4),
_mm_unpacklo_pd (t, t)));
@ -199,34 +146,29 @@ inner_product_gdouble_2_sse (gdouble * o, const gdouble * a, const gdouble * b,
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 6),
_mm_unpackhi_pd (t, t)));
}
_mm_store_pd (r, sum);
for (; i < len; i++) {
r[0] += a[2 * i] * b[i];
r[1] += a[2 * i + 1] * b[i];
}
o[0] = r[0];
o[1] = r[1];
_mm_store_pd (o, sum);
}
#endif
#ifdef HAVE_EMMINTRIN_H
MAKE_RESAMPLE_FUNC (gint16, 1, sse);
MAKE_RESAMPLE_FUNC (gint16, 1, sse2);
MAKE_RESAMPLE_FUNC (gfloat, 1, sse);
MAKE_RESAMPLE_FUNC (gdouble, 1, sse);
MAKE_RESAMPLE_FUNC (gint16, 2, sse);
MAKE_RESAMPLE_FUNC (gdouble, 2, sse);
MAKE_RESAMPLE_FUNC (gdouble, 1, sse2);
MAKE_RESAMPLE_FUNC (gint16, 2, sse2);
MAKE_RESAMPLE_FUNC (gdouble, 2, sse2);
#endif
static void
audio_resampler_check_x86 (const gchar *option)
{
if (!strcmp (option, "sse2")) {
GST_DEBUG ("enable SSE2 optimisations");
resample_gint16_1 = resample_gint16_1_sse;
#ifdef HAVE_EMMINTRIN_H
if (!strcmp (option, "sse")) {
GST_DEBUG ("enable SSE optimisations");
resample_gfloat_1 = resample_gfloat_1_sse;
resample_gdouble_1 = resample_gdouble_1_sse;
resample_gint16_2 = resample_gint16_2_sse;
resample_gdouble_2 = resample_gdouble_2_sse;
} else if (!strcmp (option, "sse2")) {
GST_DEBUG ("enable SSE2 optimisations");
resample_gint16_1 = resample_gint16_1_sse2;
resample_gdouble_1 = resample_gdouble_1_sse2;
resample_gint16_2 = resample_gint16_2_sse2;
resample_gdouble_2 = resample_gdouble_2_sse2;
}
#endif
}

View file

@ -48,6 +48,8 @@ typedef void (*DeinterleaveFunc) (GstAudioResampler * resampler,
gpointer * sbuf, gpointer in[], gsize in_frames);
typedef void (*MirrorFunc) (GstAudioResampler * resampler, gpointer * sbuf);
#define MEM_ALIGN(m,a) ((gint8 *)((guintptr)((gint8 *)(m) + ((a)-1)) & ~((a)-1)))
struct _GstAudioResampler
{
GstAudioResamplerMethod method;
@ -68,6 +70,8 @@ struct _GstAudioResampler
guint n_taps;
Tap *taps;
gpointer coeff;
gpointer coeffmem;
gsize cstride;
gpointer tmpcoeff;
DeinterleaveFunc deinterleave;
@ -75,6 +79,7 @@ struct _GstAudioResampler
ResampleFunc resample;
guint blocks;
guint inc;
gboolean filling;
gint samp_inc;
gint samp_frac;
@ -256,7 +261,7 @@ get_kaiser_tap (GstAudioResampler * resampler, gdouble x)
#define CONVERT_TAPS(type, precision) \
G_STMT_START { \
type *taps = t->taps = (type *) resampler->coeff + j * n_taps; \
type *taps = t->taps = (type *) ((gint8*)resampler->coeff + j * resampler->cstride); \
gdouble multiplier = (1 << precision); \
gint i, j; \
gdouble offset, l_offset, h_offset; \
@ -338,14 +343,16 @@ make_taps (GstAudioResampler * resampler, Tap * t, gint j)
switch (resampler->format) {
case GST_AUDIO_FORMAT_F64:
{
gdouble *taps = t->taps = (gdouble *) resampler->coeff + j * n_taps;
gdouble *taps = t->taps =
(gdouble *) ((gint8 *) resampler->coeff + j * resampler->cstride);
for (l = 0; l < n_taps; l++)
taps[l] = tmpcoeff[l] / weight;
break;
}
case GST_AUDIO_FORMAT_F32:
{
gfloat *taps = t->taps = (gfloat *) resampler->coeff + j * n_taps;
gfloat *taps = t->taps =
(gfloat *) ((gint8 *) resampler->coeff + j * resampler->cstride);
for (l = 0; l < n_taps; l++)
taps[l] = tmpcoeff[l] / weight;
break;
@ -593,28 +600,18 @@ static void
deinterleave_copy (GstAudioResampler * resampler, gpointer sbuf[],
gpointer in[], gsize in_frames)
{
gsize samples_avail = resampler->samples_avail;
gint bpf = resampler->bpf;
guint c, blocks = resampler->blocks;
gsize bytes_avail, in_bytes, bpf;
if (in == NULL)
memset ((guint8 *) sbuf[0] + samples_avail * bpf, 0, in_frames * bpf);
else
memcpy ((guint8 *) sbuf[0] + samples_avail * bpf, in[0], in_frames * bpf);
}
bpf = resampler->bps * resampler->inc;
bytes_avail = resampler->samples_avail * bpf;
in_bytes = in_frames * bpf;
static void
deinterleave_copy_n (GstAudioResampler * resampler, gpointer sbuf[],
gpointer in[], gsize in_frames)
{
guint c, channels = resampler->channels;
gsize samples_avail = resampler->samples_avail;
gint bps = resampler->bps;
for (c = 0; c < channels; c++) {
for (c = 0; c < blocks; c++) {
if (in == NULL)
memset ((guint8 *) sbuf[c] + samples_avail * bps, 0, in_frames * bps);
memset ((guint8 *) sbuf[c] + bytes_avail, 0, in_bytes);
else
memcpy ((guint8 *) sbuf[c] + samples_avail * bps, in[c], in_frames * bps);
memcpy ((guint8 *) sbuf[c] + bytes_avail, in[c], in_bytes);
}
}
@ -623,10 +620,10 @@ deinterleave_copy_n (GstAudioResampler * resampler, gpointer sbuf[],
static void \
mirror_ ##type (GstAudioResampler * resampler, gpointer sbuf[]) \
{ \
guint i, c, channels = resampler->channels; \
guint i, c, blocks = resampler->blocks; \
gint si = resampler->n_taps / 2; \
gint n_taps = resampler->n_taps; \
for (c = 0; c < channels; c++) { \
for (c = 0; c < blocks; c++) { \
type *s = sbuf[c]; \
for (i = 0; i < si; i++) \
s[i] = -s[n_taps - i]; \
@ -686,6 +683,7 @@ resampler_calculate_taps (GstAudioResampler * resampler)
gint n_taps;
gint out_rate;
gint in_rate;
gboolean non_interleaved;
switch (resampler->method) {
case GST_AUDIO_RESAMPLER_METHOD_NEAREST:
@ -729,7 +727,12 @@ resampler_calculate_taps (GstAudioResampler * resampler)
GST_LOG ("using n_taps %d cutoff %f", n_taps, resampler->cutoff);
resampler->taps = g_realloc_n (resampler->taps, out_rate, sizeof (Tap));
resampler->coeff = g_realloc_n (resampler->coeff, out_rate, bps * n_taps);
resampler->cstride = GST_ROUND_UP_32 (bps * (n_taps + 16));
g_free (resampler->coeffmem);
resampler->coeffmem = g_malloc0 (out_rate * resampler->cstride + 31);
resampler->coeff = MEM_ALIGN (resampler->coeffmem, 32);
resampler->tmpcoeff =
g_realloc_n (resampler->tmpcoeff, n_taps, sizeof (gdouble));
@ -743,13 +746,22 @@ resampler_calculate_taps (GstAudioResampler * resampler)
t->next_phase = (j + in_rate) % out_rate;
}
non_interleaved =
(resampler->flags & GST_AUDIO_RESAMPLER_FLAG_NON_INTERLEAVED);
resampler->ostride = non_interleaved ? 1 : resampler->channels;
/* we resample each channel separately */
resampler->blocks = resampler->channels;
resampler->inc = 1;
switch (resampler->format) {
case GST_AUDIO_FORMAT_F64:
if (resampler->channels == 2 && n_taps >= 4) {
if (!non_interleaved && resampler->channels == 2 && n_taps >= 4) {
resampler->resample = resample_gdouble_2;
resampler->deinterleave = deinterleave_copy;
resampler->blocks = 1;
resampler->inc = resampler->channels;;
} else {
resampler->resample = resample_gdouble_1;
resampler->deinterleave = deinterleave_gdouble;
@ -767,10 +779,11 @@ resampler_calculate_taps (GstAudioResampler * resampler)
resampler->mirror = mirror_gint32;
break;
case GST_AUDIO_FORMAT_S16:
if (resampler->channels == 2 && n_taps >= 4) {
if (!non_interleaved && resampler->channels == 2 && n_taps >= 4) {
resampler->resample = resample_gint16_2;
resampler->deinterleave = deinterleave_copy;
resampler->blocks = 1;
resampler->inc = resampler->channels;;
} else {
resampler->resample = resample_gint16_1;
resampler->deinterleave = deinterleave_gint16;
@ -780,12 +793,6 @@ resampler_calculate_taps (GstAudioResampler * resampler)
default:
break;
}
if (resampler->flags & GST_AUDIO_RESAMPLER_FLAG_NON_INTERLEAVED) {
resampler->deinterleave = deinterleave_copy_n;
resampler->ostride = 1;
} else {
resampler->ostride = resampler->channels;
}
}
#define PRINT_TAPS(type,print) \
@ -1017,7 +1024,7 @@ gst_audio_resampler_free (GstAudioResampler * resampler)
g_return_if_fail (resampler != NULL);
g_free (resampler->taps);
g_free (resampler->coeff);
g_free (resampler->coeffmem);
g_free (resampler->tmpcoeff);
g_free (resampler->samples);
g_free (resampler->sbuf);
@ -1119,16 +1126,28 @@ static inline gpointer *
get_sample_bufs (GstAudioResampler * resampler, gsize need)
{
if (resampler->samples_len < need) {
guint c, channels = resampler->channels;
guint c, blocks = resampler->blocks;
gsize bytes, bpf;
gint8 *ptr;
GST_LOG ("realloc %d -> %d", (gint) resampler->samples_len, (gint) need);
bpf = resampler->bps * resampler->inc;
bytes = (need + 8) * bpf;
bytes = GST_ROUND_UP_32 (bytes);
/* FIXME, move history */
resampler->samples = g_realloc (resampler->samples, need * resampler->bpf);
resampler->samples_len = need;
resampler->samples =
g_realloc (resampler->samples, resampler->blocks * bytes + 31);
resampler->samples_len = bytes / bpf;
ptr = MEM_ALIGN (resampler->samples, 32);
/* set up new pointers */
for (c = 0; c < channels; c++)
resampler->sbuf[c] =
(gint8 *) resampler->samples +
(c * resampler->samples_len * resampler->bps);
for (c = 0; c < blocks; c++) {
resampler->sbuf[c] = ptr + (c * bytes);
}
}
return resampler->sbuf;
}