diff --git a/gst-libs/gst/audio/audio-resampler-x86.h b/gst-libs/gst/audio/audio-resampler-x86.h index 65a5194090..284e7c95f1 100644 --- a/gst-libs/gst/audio/audio-resampler-x86.h +++ b/gst-libs/gst/audio/audio-resampler-x86.h @@ -19,171 +19,118 @@ #ifdef HAVE_EMMINTRIN_H #include -#endif -#ifdef HAVE_EMMINTRIN_H static inline void -inner_product_gint16_1_sse (gint16 * o, const gint16 * a, const gint16 * b, gint len) +inner_product_gint16_1_sse2 (gint16 * o, const gint16 * a, const gint16 * b, gint len) { gint i = 0; - gint32 res = 0; - __m128i sum[2], ta, tb; - __m128i t1[2]; + __m128i sum, ta, tb; - sum[0] = _mm_setzero_si128 (); - sum[1] = _mm_setzero_si128 (); + sum = _mm_setzero_si128 (); - for (; i < len - 7; i += 8) { + for (; i < len; i += 8) { ta = _mm_loadu_si128 ((__m128i *) (a + i)); - tb = _mm_loadu_si128 ((__m128i *) (b + i)); + tb = _mm_load_si128 ((__m128i *) (b + i)); - t1[0] = _mm_mullo_epi16 (ta, tb); - t1[1] = _mm_mulhi_epi16 (ta, tb); - - sum[0] = _mm_add_epi32 (sum[0], _mm_unpacklo_epi16 (t1[0], t1[1])); - sum[1] = _mm_add_epi32 (sum[1], _mm_unpackhi_epi16 (t1[0], t1[1])); + sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, tb)); } - sum[0] = _mm_add_epi32 (sum[0], sum[1]); - sum[0] = - _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, + sum = + _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, 3))); - sum[0] = - _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1, + sum = + _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1, 1))); - res = _mm_cvtsi128_si32 (sum[0]); - for (; i < len; i++) - res += (gint32) a[i] * (gint32) b[i]; - - res = (res + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16; - *o = CLAMP (res, -(1L << 15), (1L << 15) - 1); + sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); + sum = _mm_srai_epi32 (sum, PRECISION_S16); + sum = _mm_packs_epi32 (sum, sum); + *o = _mm_extract_epi16 (sum, 0); } -#endif -#ifdef HAVE_EMMINTRIN_H static inline void inner_product_gfloat_1_sse (gfloat * o, const gfloat * a, const gfloat * b, gint len) { gint i = 0; - gfloat res; __m128 sum = _mm_setzero_ps (); - for (; i < len - 7; i += 8) { + for (; i < len; i += 8) { sum = _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 0), - _mm_loadu_ps (b + i + 0))); + _mm_load_ps (b + i + 0))); sum = _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 4), - _mm_loadu_ps (b + i + 4))); + _mm_load_ps (b + i + 4))); } sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum)); sum = _mm_add_ss (sum, _mm_shuffle_ps (sum, sum, 0x55)); - _mm_store_ss (&res, sum); - - for (; i < len; i++) - res += a[i] * b[i]; - - *o = res; + _mm_store_ss (o, sum); } -#endif -#ifdef HAVE_EMMINTRIN_H static inline void -inner_product_gdouble_1_sse (gdouble * o, const gdouble * a, const gdouble * b, +inner_product_gdouble_1_sse2 (gdouble * o, const gdouble * a, const gdouble * b, gint len) { gint i = 0; - gdouble res; __m128d sum = _mm_setzero_pd (); - for (; i < len - 7; i += 8) { + for (; i < len; i += 8) { sum = _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0), - _mm_loadu_pd (b + i + 0))); + _mm_load_pd (b + i + 0))); sum = _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2), - _mm_loadu_pd (b + i + 2))); + _mm_load_pd (b + i + 2))); sum = _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 4), - _mm_loadu_pd (b + i + 4))); + _mm_load_pd (b + i + 4))); sum = _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6), - _mm_loadu_pd (b + i + 6))); + _mm_load_pd (b + i + 6))); } sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum)); - _mm_store_sd (&res, sum); - - for (; i < len; i++) - res += a[i] * b[i]; - - *o = res; + _mm_store_sd (o, sum); } -#endif -#ifdef HAVE_EMMINTRIN_H static inline void -inner_product_gint16_2_sse (gint16 * o, const gint16 * a, const gint16 * b, gint len) +inner_product_gint16_2_sse2 (gint16 * o, const gint16 * a, const gint16 * b, gint len) { gint i = 0; - gint32 r[2]; - guint64 r64; - __m128i sum[2], ta, tb; - __m128i t1[2]; + __m128i sum, ta, tb, t1; - sum[0] = _mm_setzero_si128 (); - sum[1] = _mm_setzero_si128 (); + sum = _mm_setzero_si128 (); - for (; i < len - 7; i += 8) { - tb = _mm_loadu_si128 ((__m128i *) (b + i)); - - t1[1] = _mm_unpacklo_epi16 (tb, tb); + for (; i < len; i += 8) { + tb = _mm_load_si128 ((__m128i *) (b + i)); + t1 = _mm_unpacklo_epi16 (tb, tb); ta = _mm_loadu_si128 ((__m128i *) (a + 2 * i)); - t1[0] = _mm_mullo_epi16 (ta, t1[1]); - t1[1] = _mm_mulhi_epi16 (ta, t1[1]); - sum[0] = _mm_add_epi32 (sum[0], _mm_unpacklo_epi16 (t1[0], t1[1])); - sum[1] = _mm_add_epi32 (sum[1], _mm_unpackhi_epi16 (t1[0], t1[1])); - - t1[1] = _mm_unpackhi_epi16 (tb, tb); + sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, t1)); + t1 = _mm_unpackhi_epi16 (tb, tb); ta = _mm_loadu_si128 ((__m128i *) (a + 2 * i + 8)); - t1[0] = _mm_mullo_epi16 (ta, t1[1]); - t1[1] = _mm_mulhi_epi16 (ta, t1[1]); - sum[0] = _mm_add_epi32 (sum[0], _mm_unpacklo_epi16 (t1[0], t1[1])); - sum[1] = _mm_add_epi32 (sum[1], _mm_unpackhi_epi16 (t1[0], t1[1])); + sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, t1)); } - sum[0] = _mm_add_epi32 (sum[0], sum[1]); - sum[0] = - _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, + sum = + _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, 3))); - r64 = _mm_cvtsi128_si64 (sum[0]); - r[0] = r64 >> 32; - r[1] = r64 & 0xffffffff; - for (; i < len; i++) { - r[0] += (gint32) a[2 * i] * (gint32) b[i]; - r[1] += (gint32) a[2 * i + 1] * (gint32) b[i]; - } - r[0] = (r[0] + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16; - r[1] = (r[1] + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16; - o[0] = CLAMP (r[0], -(1L << 15), (1L << 15) - 1); - o[1] = CLAMP (r[1], -(1L << 15), (1L << 15) - 1); + sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); + sum = _mm_srai_epi32 (sum, PRECISION_S16); + sum = _mm_packs_epi32 (sum, sum); + *(gint32*)o = _mm_cvtsi128_si32 (sum); } -#endif -#ifdef HAVE_EMMINTRIN_H static inline void -inner_product_gdouble_2_sse (gdouble * o, const gdouble * a, const gdouble * b, +inner_product_gdouble_2_sse2 (gdouble * o, const gdouble * a, const gdouble * b, gint len) { gint i = 0; - gdouble r[2]; __m128d sum = _mm_setzero_pd (), t; - for (; i < len - 3; i += 4) { - t = _mm_loadu_pd (b + i); + for (; i < len; i += 4) { + t = _mm_load_pd (b + i); sum = _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i), _mm_unpacklo_pd (t, t))); @@ -191,7 +138,7 @@ inner_product_gdouble_2_sse (gdouble * o, const gdouble * a, const gdouble * b, _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 2), _mm_unpackhi_pd (t, t))); - t = _mm_loadu_pd (b + i + 2); + t = _mm_load_pd (b + i + 2); sum = _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 4), _mm_unpacklo_pd (t, t))); @@ -199,34 +146,29 @@ inner_product_gdouble_2_sse (gdouble * o, const gdouble * a, const gdouble * b, _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 6), _mm_unpackhi_pd (t, t))); } - _mm_store_pd (r, sum); - - for (; i < len; i++) { - r[0] += a[2 * i] * b[i]; - r[1] += a[2 * i + 1] * b[i]; - } - o[0] = r[0]; - o[1] = r[1]; + _mm_store_pd (o, sum); } -#endif -#ifdef HAVE_EMMINTRIN_H -MAKE_RESAMPLE_FUNC (gint16, 1, sse); +MAKE_RESAMPLE_FUNC (gint16, 1, sse2); MAKE_RESAMPLE_FUNC (gfloat, 1, sse); -MAKE_RESAMPLE_FUNC (gdouble, 1, sse); -MAKE_RESAMPLE_FUNC (gint16, 2, sse); -MAKE_RESAMPLE_FUNC (gdouble, 2, sse); +MAKE_RESAMPLE_FUNC (gdouble, 1, sse2); +MAKE_RESAMPLE_FUNC (gint16, 2, sse2); +MAKE_RESAMPLE_FUNC (gdouble, 2, sse2); #endif static void audio_resampler_check_x86 (const gchar *option) { - if (!strcmp (option, "sse2")) { - GST_DEBUG ("enable SSE2 optimisations"); - resample_gint16_1 = resample_gint16_1_sse; +#ifdef HAVE_EMMINTRIN_H + if (!strcmp (option, "sse")) { + GST_DEBUG ("enable SSE optimisations"); resample_gfloat_1 = resample_gfloat_1_sse; - resample_gdouble_1 = resample_gdouble_1_sse; - resample_gint16_2 = resample_gint16_2_sse; - resample_gdouble_2 = resample_gdouble_2_sse; + } else if (!strcmp (option, "sse2")) { + GST_DEBUG ("enable SSE2 optimisations"); + resample_gint16_1 = resample_gint16_1_sse2; + resample_gdouble_1 = resample_gdouble_1_sse2; + resample_gint16_2 = resample_gint16_2_sse2; + resample_gdouble_2 = resample_gdouble_2_sse2; } +#endif } diff --git a/gst-libs/gst/audio/audio-resampler.c b/gst-libs/gst/audio/audio-resampler.c index e07f8d7cae..26c44d0bac 100644 --- a/gst-libs/gst/audio/audio-resampler.c +++ b/gst-libs/gst/audio/audio-resampler.c @@ -48,6 +48,8 @@ typedef void (*DeinterleaveFunc) (GstAudioResampler * resampler, gpointer * sbuf, gpointer in[], gsize in_frames); typedef void (*MirrorFunc) (GstAudioResampler * resampler, gpointer * sbuf); +#define MEM_ALIGN(m,a) ((gint8 *)((guintptr)((gint8 *)(m) + ((a)-1)) & ~((a)-1))) + struct _GstAudioResampler { GstAudioResamplerMethod method; @@ -68,6 +70,8 @@ struct _GstAudioResampler guint n_taps; Tap *taps; gpointer coeff; + gpointer coeffmem; + gsize cstride; gpointer tmpcoeff; DeinterleaveFunc deinterleave; @@ -75,6 +79,7 @@ struct _GstAudioResampler ResampleFunc resample; guint blocks; + guint inc; gboolean filling; gint samp_inc; gint samp_frac; @@ -256,7 +261,7 @@ get_kaiser_tap (GstAudioResampler * resampler, gdouble x) #define CONVERT_TAPS(type, precision) \ G_STMT_START { \ - type *taps = t->taps = (type *) resampler->coeff + j * n_taps; \ + type *taps = t->taps = (type *) ((gint8*)resampler->coeff + j * resampler->cstride); \ gdouble multiplier = (1 << precision); \ gint i, j; \ gdouble offset, l_offset, h_offset; \ @@ -338,14 +343,16 @@ make_taps (GstAudioResampler * resampler, Tap * t, gint j) switch (resampler->format) { case GST_AUDIO_FORMAT_F64: { - gdouble *taps = t->taps = (gdouble *) resampler->coeff + j * n_taps; + gdouble *taps = t->taps = + (gdouble *) ((gint8 *) resampler->coeff + j * resampler->cstride); for (l = 0; l < n_taps; l++) taps[l] = tmpcoeff[l] / weight; break; } case GST_AUDIO_FORMAT_F32: { - gfloat *taps = t->taps = (gfloat *) resampler->coeff + j * n_taps; + gfloat *taps = t->taps = + (gfloat *) ((gint8 *) resampler->coeff + j * resampler->cstride); for (l = 0; l < n_taps; l++) taps[l] = tmpcoeff[l] / weight; break; @@ -593,28 +600,18 @@ static void deinterleave_copy (GstAudioResampler * resampler, gpointer sbuf[], gpointer in[], gsize in_frames) { - gsize samples_avail = resampler->samples_avail; - gint bpf = resampler->bpf; + guint c, blocks = resampler->blocks; + gsize bytes_avail, in_bytes, bpf; - if (in == NULL) - memset ((guint8 *) sbuf[0] + samples_avail * bpf, 0, in_frames * bpf); - else - memcpy ((guint8 *) sbuf[0] + samples_avail * bpf, in[0], in_frames * bpf); -} + bpf = resampler->bps * resampler->inc; + bytes_avail = resampler->samples_avail * bpf; + in_bytes = in_frames * bpf; -static void -deinterleave_copy_n (GstAudioResampler * resampler, gpointer sbuf[], - gpointer in[], gsize in_frames) -{ - guint c, channels = resampler->channels; - gsize samples_avail = resampler->samples_avail; - gint bps = resampler->bps; - - for (c = 0; c < channels; c++) { + for (c = 0; c < blocks; c++) { if (in == NULL) - memset ((guint8 *) sbuf[c] + samples_avail * bps, 0, in_frames * bps); + memset ((guint8 *) sbuf[c] + bytes_avail, 0, in_bytes); else - memcpy ((guint8 *) sbuf[c] + samples_avail * bps, in[c], in_frames * bps); + memcpy ((guint8 *) sbuf[c] + bytes_avail, in[c], in_bytes); } } @@ -623,10 +620,10 @@ deinterleave_copy_n (GstAudioResampler * resampler, gpointer sbuf[], static void \ mirror_ ##type (GstAudioResampler * resampler, gpointer sbuf[]) \ { \ - guint i, c, channels = resampler->channels; \ + guint i, c, blocks = resampler->blocks; \ gint si = resampler->n_taps / 2; \ gint n_taps = resampler->n_taps; \ - for (c = 0; c < channels; c++) { \ + for (c = 0; c < blocks; c++) { \ type *s = sbuf[c]; \ for (i = 0; i < si; i++) \ s[i] = -s[n_taps - i]; \ @@ -686,6 +683,7 @@ resampler_calculate_taps (GstAudioResampler * resampler) gint n_taps; gint out_rate; gint in_rate; + gboolean non_interleaved; switch (resampler->method) { case GST_AUDIO_RESAMPLER_METHOD_NEAREST: @@ -729,7 +727,12 @@ resampler_calculate_taps (GstAudioResampler * resampler) GST_LOG ("using n_taps %d cutoff %f", n_taps, resampler->cutoff); resampler->taps = g_realloc_n (resampler->taps, out_rate, sizeof (Tap)); - resampler->coeff = g_realloc_n (resampler->coeff, out_rate, bps * n_taps); + + resampler->cstride = GST_ROUND_UP_32 (bps * (n_taps + 16)); + g_free (resampler->coeffmem); + resampler->coeffmem = g_malloc0 (out_rate * resampler->cstride + 31); + resampler->coeff = MEM_ALIGN (resampler->coeffmem, 32); + resampler->tmpcoeff = g_realloc_n (resampler->tmpcoeff, n_taps, sizeof (gdouble)); @@ -743,13 +746,22 @@ resampler_calculate_taps (GstAudioResampler * resampler) t->next_phase = (j + in_rate) % out_rate; } + non_interleaved = + (resampler->flags & GST_AUDIO_RESAMPLER_FLAG_NON_INTERLEAVED); + + resampler->ostride = non_interleaved ? 1 : resampler->channels; + + /* we resample each channel separately */ resampler->blocks = resampler->channels; + resampler->inc = 1; + switch (resampler->format) { case GST_AUDIO_FORMAT_F64: - if (resampler->channels == 2 && n_taps >= 4) { + if (!non_interleaved && resampler->channels == 2 && n_taps >= 4) { resampler->resample = resample_gdouble_2; resampler->deinterleave = deinterleave_copy; resampler->blocks = 1; + resampler->inc = resampler->channels;; } else { resampler->resample = resample_gdouble_1; resampler->deinterleave = deinterleave_gdouble; @@ -767,10 +779,11 @@ resampler_calculate_taps (GstAudioResampler * resampler) resampler->mirror = mirror_gint32; break; case GST_AUDIO_FORMAT_S16: - if (resampler->channels == 2 && n_taps >= 4) { + if (!non_interleaved && resampler->channels == 2 && n_taps >= 4) { resampler->resample = resample_gint16_2; resampler->deinterleave = deinterleave_copy; resampler->blocks = 1; + resampler->inc = resampler->channels;; } else { resampler->resample = resample_gint16_1; resampler->deinterleave = deinterleave_gint16; @@ -780,12 +793,6 @@ resampler_calculate_taps (GstAudioResampler * resampler) default: break; } - if (resampler->flags & GST_AUDIO_RESAMPLER_FLAG_NON_INTERLEAVED) { - resampler->deinterleave = deinterleave_copy_n; - resampler->ostride = 1; - } else { - resampler->ostride = resampler->channels; - } } #define PRINT_TAPS(type,print) \ @@ -1017,7 +1024,7 @@ gst_audio_resampler_free (GstAudioResampler * resampler) g_return_if_fail (resampler != NULL); g_free (resampler->taps); - g_free (resampler->coeff); + g_free (resampler->coeffmem); g_free (resampler->tmpcoeff); g_free (resampler->samples); g_free (resampler->sbuf); @@ -1119,16 +1126,28 @@ static inline gpointer * get_sample_bufs (GstAudioResampler * resampler, gsize need) { if (resampler->samples_len < need) { - guint c, channels = resampler->channels; + guint c, blocks = resampler->blocks; + gsize bytes, bpf; + gint8 *ptr; + GST_LOG ("realloc %d -> %d", (gint) resampler->samples_len, (gint) need); + + bpf = resampler->bps * resampler->inc; + + bytes = (need + 8) * bpf; + bytes = GST_ROUND_UP_32 (bytes); + /* FIXME, move history */ - resampler->samples = g_realloc (resampler->samples, need * resampler->bpf); - resampler->samples_len = need; + resampler->samples = + g_realloc (resampler->samples, resampler->blocks * bytes + 31); + resampler->samples_len = bytes / bpf; + + ptr = MEM_ALIGN (resampler->samples, 32); + /* set up new pointers */ - for (c = 0; c < channels; c++) - resampler->sbuf[c] = - (gint8 *) resampler->samples + - (c * resampler->samples_len * resampler->bps); + for (c = 0; c < blocks; c++) { + resampler->sbuf[c] = ptr + (c * bytes); + } } return resampler->sbuf; }