From d969a7a9d8bd2da477e4de9f6a86e18c06174972 Mon Sep 17 00:00:00 2001 From: Wim Taymans Date: Fri, 19 Feb 2016 16:39:43 +0100 Subject: [PATCH] audio-resampler: reorder filter coefficients for more speed Reorder the filter coefficients to make it easier to use SIMD for interpolation. Fix orc flags a little. Add specialized nearest resampling function. --- gst-libs/gst/audio/audio-resampler-neon.h | 4 +- gst-libs/gst/audio/audio-resampler-x86.h | 450 ++++++++++++---------- gst-libs/gst/audio/audio-resampler.c | 258 ++++++++----- 3 files changed, 416 insertions(+), 296 deletions(-) diff --git a/gst-libs/gst/audio/audio-resampler-neon.h b/gst-libs/gst/audio/audio-resampler-neon.h index d194e0996a..bfdb4b998b 100644 --- a/gst-libs/gst/audio/audio-resampler-neon.h +++ b/gst-libs/gst/audio/audio-resampler-neon.h @@ -398,9 +398,9 @@ MAKE_RESAMPLE_FUNC (gfloat, linear, 1, neon); MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, neon); static void -audio_resampler_check_neon (const gchar *target_name, const gchar *option) +audio_resampler_check_neon (const gchar *option) { - if (!strcmp (target_name, "neon")) { + if (!strcmp (option, "neon")) { GST_DEBUG ("enable NEON optimisations"); resample_gint16_full_1 = resample_gint16_full_1_neon; resample_gint16_linear_1 = resample_gint16_linear_1_neon; diff --git a/gst-libs/gst/audio/audio-resampler-x86.h b/gst-libs/gst/audio/audio-resampler-x86.h index 1f05e3096f..daa493390d 100644 --- a/gst-libs/gst/audio/audio-resampler-x86.h +++ b/gst-libs/gst/audio/audio-resampler-x86.h @@ -1,5 +1,5 @@ /* GStreamer - * Copyright (C) <2015> Wim Taymans + * Copyright (C) <2016> Wim Taymans * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public @@ -22,7 +22,7 @@ static inline void inner_product_gfloat_full_1_sse (gfloat * o, const gfloat * a, - const gfloat * b, gint len, const gfloat * icoeff) + const gfloat * b, gint len, const gfloat * icoeff, gint bstride) { gint i = 0; __m128 sum = _mm_setzero_ps (); @@ -42,44 +42,63 @@ inner_product_gfloat_full_1_sse (gfloat * o, const gfloat * a, static inline void inner_product_gfloat_linear_1_sse (gfloat * o, const gfloat * a, - const gfloat * b, gint len, const gfloat * icoeff) + const gfloat * b, gint len, const gfloat * icoeff, gint bstride) { gint i = 0; - __m128 sum, t; + __m128 sum[2], t; __m128 f = _mm_loadu_ps(icoeff); + const gfloat *c[2] = {(gfloat*)((gint8*)b + 0*bstride), + (gfloat*)((gint8*)b + 1*bstride)}; - sum = _mm_setzero_ps (); - for (; i < len; i += 4) { - t = _mm_loadu_ps (a + i); - sum = _mm_add_ps (sum, _mm_mul_ps (_mm_unpacklo_ps (t, t), - _mm_load_ps (b + 2 * (i + 0)))); - sum = _mm_add_ps (sum, _mm_mul_ps (_mm_unpackhi_ps (t, t), - _mm_load_ps (b + 2 * (i + 2)))); + sum[0] = sum[1] = _mm_setzero_ps (); + + for (; i < len; i += 8) { + t = _mm_loadu_ps (a + i + 0); + sum[0] = _mm_add_ps (sum[0], _mm_mul_ps (t, _mm_load_ps (c[0] + i + 0))); + sum[1] = _mm_add_ps (sum[1], _mm_mul_ps (t, _mm_load_ps (c[1] + i + 0))); + t = _mm_loadu_ps (a + i + 4); + sum[0] = _mm_add_ps (sum[0], _mm_mul_ps (t, _mm_load_ps (c[0] + i + 4))); + sum[1] = _mm_add_ps (sum[1], _mm_mul_ps (t, _mm_load_ps (c[1] + i + 4))); } - sum = _mm_mul_ps (sum, f); - sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum)); - sum = _mm_add_ss (sum, _mm_shuffle_ps (sum, sum, 0x55)); - _mm_store_ss (o, sum); + sum[0] = _mm_mul_ps (sum[0], _mm_shuffle_ps (f, f, 0x00)); + sum[1] = _mm_mul_ps (sum[1], _mm_shuffle_ps (f, f, 0x55)); + sum[0] = _mm_add_ps (sum[0], sum[1]); + sum[0] = _mm_add_ps (sum[0], _mm_movehl_ps (sum[0], sum[0])); + sum[0] = _mm_add_ss (sum[0], _mm_shuffle_ps (sum[0], sum[0], 0x55)); + _mm_store_ss (o, sum[0]); } static inline void inner_product_gfloat_cubic_1_sse (gfloat * o, const gfloat * a, - const gfloat * b, gint len, const gfloat * icoeff) + const gfloat * b, gint len, const gfloat * icoeff, gint bstride) { gint i = 0; - __m128 sum = _mm_setzero_ps (); - __m128 f = _mm_loadu_ps(icoeff); + __m128 sum[4]; + __m128 t, f = _mm_loadu_ps(icoeff); + const gfloat *c[4] = {(gfloat*)((gint8*)b + 0*bstride), + (gfloat*)((gint8*)b + 1*bstride), + (gfloat*)((gint8*)b + 2*bstride), + (gfloat*)((gint8*)b + 3*bstride)}; - for (; i < len; i += 2) { - sum = _mm_add_ps (sum, _mm_mul_ps (_mm_load1_ps (a + i + 0), - _mm_load_ps (b + 4 * (i + 0)))); - sum = _mm_add_ps (sum, _mm_mul_ps (_mm_load1_ps (a + i + 1), - _mm_load_ps (b + 4 * (i + 1)))); + sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_ps (); + + for (; i < len; i += 4) { + t = _mm_loadu_ps (a + i); + sum[0] = _mm_add_ps (sum[0], _mm_mul_ps (t, _mm_load_ps (c[0] + i))); + sum[1] = _mm_add_ps (sum[1], _mm_mul_ps (t, _mm_load_ps (c[1] + i))); + sum[2] = _mm_add_ps (sum[2], _mm_mul_ps (t, _mm_load_ps (c[2] + i))); + sum[3] = _mm_add_ps (sum[3], _mm_mul_ps (t, _mm_load_ps (c[3] + i))); } - sum = _mm_mul_ps (sum, f); - sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum)); - sum = _mm_add_ss (sum, _mm_shuffle_ps (sum, sum, 0x55)); - _mm_store_ss (o, sum); + sum[0] = _mm_mul_ps (sum[0], _mm_shuffle_ps (f, f, 0x00)); + sum[1] = _mm_mul_ps (sum[1], _mm_shuffle_ps (f, f, 0x55)); + sum[2] = _mm_mul_ps (sum[2], _mm_shuffle_ps (f, f, 0xaa)); + sum[3] = _mm_mul_ps (sum[3], _mm_shuffle_ps (f, f, 0xff)); + sum[0] = _mm_add_ps (sum[0], sum[1]); + sum[2] = _mm_add_ps (sum[2], sum[3]); + sum[0] = _mm_add_ps (sum[0], sum[2]); + sum[0] = _mm_add_ps (sum[0], _mm_movehl_ps (sum[0], sum[0])); + sum[0] = _mm_add_ss (sum[0], _mm_shuffle_ps (sum[0], sum[0], 0x55)); + _mm_store_ss (o, sum[0]); } MAKE_RESAMPLE_FUNC (gfloat, full, 1, sse); @@ -92,25 +111,19 @@ MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, sse); static inline void inner_product_gint16_full_1_sse2 (gint16 * o, const gint16 * a, - const gint16 * b, gint len, const gint16 * icoeff) + const gint16 * b, gint len, const gint16 * icoeff, gint bstride) { gint i = 0; - __m128i sum, ta, tb; + __m128i sum, t; sum = _mm_setzero_si128 (); for (; i < len; i += 8) { - ta = _mm_loadu_si128 ((__m128i *) (a + i)); - tb = _mm_load_si128 ((__m128i *) (b + i)); - - sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, tb)); + t = _mm_loadu_si128 ((__m128i *) (a + i)); + sum = _mm_add_epi32 (sum, _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (b + i)))); } - sum = - _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, - 3))); - sum = - _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1, - 1))); + sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, 3))); + sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1, 1))); sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); sum = _mm_srai_epi32 (sum, PRECISION_S16); @@ -120,88 +133,85 @@ inner_product_gint16_full_1_sse2 (gint16 * o, const gint16 * a, static inline void inner_product_gint16_linear_1_sse2 (gint16 * o, const gint16 * a, - const gint16 * b, gint len, const gint16 * icoeff) + const gint16 * b, gint len, const gint16 * icoeff, gint bstride) { gint i = 0; - __m128i sum, t, ta, tb; - __m128i f = _mm_cvtsi64_si128 (*((long long*)icoeff)); + __m128i sum[2], t; + __m128i f = _mm_cvtsi64_si128 (*((gint64*)icoeff)); + const gint16 *c[2] = {(gint16*)((gint8*)b + 0*bstride), + (gint16*)((gint8*)b + 1*bstride)}; - sum = _mm_setzero_si128 (); - f = _mm_unpacklo_epi16 (f, sum); + sum[0] = sum[1] = _mm_setzero_si128 (); + f = _mm_unpacklo_epi16 (f, sum[0]); for (; i < len; i += 8) { t = _mm_loadu_si128 ((__m128i *) (a + i)); - - ta = _mm_unpacklo_epi32 (t, t); - tb = _mm_load_si128 ((__m128i *) (b + 2 * i + 0)); - tb = _mm_shufflelo_epi16 (tb, _MM_SHUFFLE (3,1,2,0)); - tb = _mm_shufflehi_epi16 (tb, _MM_SHUFFLE (3,1,2,0)); - - sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, tb)); - - ta = _mm_unpackhi_epi32 (t, t); - tb = _mm_load_si128 ((__m128i *) (b + 2 * i + 8)); - tb = _mm_shufflelo_epi16 (tb, _MM_SHUFFLE (3,1,2,0)); - tb = _mm_shufflehi_epi16 (tb, _MM_SHUFFLE (3,1,2,0)); - - sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, tb)); + sum[0] = _mm_add_epi32 (sum[0], _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (c[0] + i)))); + sum[1] = _mm_add_epi32 (sum[1], _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (c[1] + i)))); } - sum = _mm_srai_epi32 (sum, PRECISION_S16); - sum = _mm_madd_epi16 (sum, f); + sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16); + sum[1] = _mm_srai_epi32 (sum[1], PRECISION_S16); - sum = - _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, - 3))); - sum = - _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1, - 1))); + sum[0] = _mm_madd_epi16 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0))); + sum[1] = _mm_madd_epi16 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1))); + sum[0] = _mm_add_epi32 (sum[0], sum[1]); - sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); - sum = _mm_srai_epi32 (sum, PRECISION_S16); - sum = _mm_packs_epi32 (sum, sum); - *o = _mm_extract_epi16 (sum, 0); + sum[0] = _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, 3))); + sum[0] = _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1, 1))); + + sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); + sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16); + sum[0] = _mm_packs_epi32 (sum[0], sum[0]); + *o = _mm_extract_epi16 (sum[0], 0); } static inline void inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a, - const gint16 * b, gint len, const gint16 * icoeff) + const gint16 * b, gint len, const gint16 * icoeff, gint bstride) { gint i = 0; - __m128i sum, ta, tb; + __m128i sum[4], t; __m128i f = _mm_cvtsi64_si128 (*((long long*)icoeff)); + const gint16 *c[4] = {(gint16*)((gint8*)b + 0*bstride), + (gint16*)((gint8*)b + 1*bstride), + (gint16*)((gint8*)b + 2*bstride), + (gint16*)((gint8*)b + 3*bstride)}; - sum = _mm_setzero_si128 (); - f = _mm_unpacklo_epi16 (f, sum); + sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 (); + f = _mm_unpacklo_epi16 (f, sum[0]); - for (; i < len; i += 2) { - ta = _mm_cvtsi32_si128 (*(gint32*)(a + i)); - ta = _mm_unpacklo_epi32 (ta, ta); - ta = _mm_unpacklo_epi32 (ta, ta); - - tb = _mm_unpacklo_epi16 (_mm_cvtsi64_si128 (*(gint64*)(b + 4 * i + 0)), - _mm_cvtsi64_si128 (*(gint64*)(b + 4 * i + 4))); - - sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, tb)); + for (; i < len; i += 8) { + t = _mm_loadu_si128 ((__m128i *) (a + i)); + sum[0] = _mm_add_epi32 (sum[0], _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (c[0] + i)))); + sum[1] = _mm_add_epi32 (sum[1], _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (c[1] + i)))); + sum[2] = _mm_add_epi32 (sum[2], _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (c[2] + i)))); + sum[3] = _mm_add_epi32 (sum[3], _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (c[3] + i)))); } - sum = _mm_srai_epi32 (sum, PRECISION_S16); - sum = _mm_madd_epi16 (sum, f); + sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16); + sum[1] = _mm_srai_epi32 (sum[1], PRECISION_S16); + sum[2] = _mm_srai_epi32 (sum[2], PRECISION_S16); + sum[3] = _mm_srai_epi32 (sum[3], PRECISION_S16); - sum = - _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, - 3))); - sum = - _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1, - 1))); + sum[0] = _mm_madd_epi16 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0))); + sum[1] = _mm_madd_epi16 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1))); + sum[2] = _mm_madd_epi16 (sum[2], _mm_shuffle_epi32 (f, _MM_SHUFFLE (2, 2, 2, 2))); + sum[3] = _mm_madd_epi16 (sum[3], _mm_shuffle_epi32 (f, _MM_SHUFFLE (3, 3, 3, 3))); + sum[0] = _mm_add_epi32 (sum[0], sum[1]); + sum[2] = _mm_add_epi32 (sum[2], sum[3]); + sum[0] = _mm_add_epi32 (sum[0], sum[2]); - sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); - sum = _mm_srai_epi32 (sum, PRECISION_S16); - sum = _mm_packs_epi32 (sum, sum); - *o = _mm_extract_epi16 (sum, 0); + sum[0] = _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, 3))); + sum[0] = _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1, 1))); + + sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); + sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16); + sum[0] = _mm_packs_epi32 (sum[0], sum[0]); + *o = _mm_extract_epi16 (sum[0], 0); } static inline void inner_product_gdouble_full_1_sse2 (gdouble * o, const gdouble * a, - const gdouble * b, gint len, const gdouble * icoeff) + const gdouble * b, gint len, const gdouble * icoeff, gint bstride) { gint i = 0; __m128d sum = _mm_setzero_pd (); @@ -226,47 +236,62 @@ inner_product_gdouble_full_1_sse2 (gdouble * o, const gdouble * a, static inline void inner_product_gdouble_linear_1_sse2 (gdouble * o, const gdouble * a, - const gdouble * b, gint len, const gdouble * icoeff) + const gdouble * b, gint len, const gdouble * icoeff, gint bstride) { gint i = 0; - __m128d sum = _mm_setzero_pd (); + __m128d sum[2], t; __m128d f = _mm_loadu_pd (icoeff); + const gdouble *c[2] = {(gdouble*)((gint8*)b + 0*bstride), + (gdouble*)((gint8*)b + 1*bstride)}; + + sum[0] = sum[1] = _mm_setzero_pd (); for (; i < len; i += 4) { - sum = _mm_add_pd (sum, _mm_mul_pd (_mm_load1_pd (a + i + 0), _mm_load_pd (b + 2 * i + 0))); - sum = _mm_add_pd (sum, _mm_mul_pd (_mm_load1_pd (a + i + 1), _mm_load_pd (b + 2 * i + 2))); - sum = _mm_add_pd (sum, _mm_mul_pd (_mm_load1_pd (a + i + 2), _mm_load_pd (b + 2 * i + 4))); - sum = _mm_add_pd (sum, _mm_mul_pd (_mm_load1_pd (a + i + 3), _mm_load_pd (b + 2 * i + 6))); + t = _mm_loadu_pd (a + i + 0); + sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 0))); + sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 0))); + t = _mm_loadu_pd (a + i + 2); + sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 2))); + sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 2))); } - sum = _mm_mul_pd (sum, f); - sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum)); - _mm_store_sd (o, sum); + sum[0] = _mm_mul_pd (sum[0], _mm_shuffle_pd (f, f, _MM_SHUFFLE2 (0, 0))); + sum[1] = _mm_mul_pd (sum[1], _mm_shuffle_pd (f, f, _MM_SHUFFLE2 (1, 1))); + sum[0] = _mm_add_pd (sum[0], sum[1]); + sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0])); + _mm_store_sd (o, sum[0]); } static inline void inner_product_gdouble_cubic_1_sse2 (gdouble * o, const gdouble * a, - const gdouble * b, gint len, const gdouble * icoeff) + const gdouble * b, gint len, const gdouble * icoeff, gint bstride) { - gint i = 0; - __m128d sum1 = _mm_setzero_pd (), t; - __m128d sum2 = _mm_setzero_pd (); - __m128d f1 = _mm_loadu_pd (icoeff); - __m128d f2 = _mm_loadu_pd (icoeff+2); + gint i; + __m128d f[2], sum[4], t; + const gdouble *c[4] = {(gdouble*)((gint8*)b + 0*bstride), + (gdouble*)((gint8*)b + 1*bstride), + (gdouble*)((gint8*)b + 2*bstride), + (gdouble*)((gint8*)b + 3*bstride)}; - for (; i < len; i += 2) { - t = _mm_load1_pd (a + i + 0); - sum1 = _mm_add_pd (sum1, _mm_mul_pd (t, _mm_load_pd (b + 4 * i + 0))); - sum2 = _mm_add_pd (sum2, _mm_mul_pd (t, _mm_load_pd (b + 4 * i + 2))); + f[0] = _mm_loadu_pd (icoeff + 0); + f[1] = _mm_loadu_pd (icoeff + 2); + sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_pd (); - t = _mm_load1_pd (a + i + 1); - sum1 = _mm_add_pd (sum1, _mm_mul_pd (t, _mm_load_pd (b + 4 * i + 4))); - sum2 = _mm_add_pd (sum2, _mm_mul_pd (t, _mm_load_pd (b + 4 * i + 6))); + for (i = 0; i < len; i += 2) { + t = _mm_loadu_pd (a + i + 0); + sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i))); + sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i))); + sum[2] = _mm_add_pd (sum[2], _mm_mul_pd (t, _mm_load_pd (c[2] + i))); + sum[3] = _mm_add_pd (sum[3], _mm_mul_pd (t, _mm_load_pd (c[3] + i))); } - sum1 = _mm_mul_pd (sum1, f1); - sum2 = _mm_mul_pd (sum2, f2); - sum1 = _mm_add_pd (sum1, sum2); - sum1 = _mm_add_sd (sum1, _mm_unpackhi_pd (sum1, sum1)); - _mm_store_sd (o, sum1); + sum[0] = _mm_mul_pd (sum[0], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (0, 0))); + sum[1] = _mm_mul_pd (sum[1], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (1, 1))); + sum[2] = _mm_mul_pd (sum[2], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (0, 0))); + sum[3] = _mm_mul_pd (sum[3], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (1, 1))); + sum[0] = _mm_add_pd (sum[0], sum[1]); + sum[2] = _mm_add_pd (sum[2], sum[3]); + sum[0] = _mm_add_pd (sum[0], sum[2]); + sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0])); + _mm_store_sd (o, sum[0]); } MAKE_RESAMPLE_FUNC (gint16, full, 1, sse2); @@ -279,40 +304,51 @@ MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, sse2); static void interpolate_gdouble_linear_sse2 (gdouble * o, const gdouble * a, - gint len, const gdouble * icoeff) + gint len, const gdouble * icoeff, gint astride) { - gint i = 0; - __m128d f = _mm_loadu_pd (icoeff), t1, t2; + gint i; + __m128d f[2], t1, t2; + const gdouble *c[2] = {(gdouble*)((gint8*)a + 0*astride), + (gdouble*)((gint8*)a + 1*astride)}; - for (; i < len; i += 2) { - t1 = _mm_mul_pd (_mm_load_pd (a + 2*i + 0), f); - t1 = _mm_add_sd (t1, _mm_unpackhi_pd (t1, t1)); - t2 = _mm_mul_pd (_mm_load_pd (a + 2*i + 2), f); - t2 = _mm_add_sd (t2, _mm_unpackhi_pd (t2, t2)); + f[0] = _mm_load1_pd (icoeff+0); + f[1] = _mm_load1_pd (icoeff+1); - _mm_store_pd (o + i, _mm_unpacklo_pd (t1, t2)); + for (i = 0; i < len; i += 4) { + t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]); + t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]); + _mm_store_pd (o + i + 0, _mm_add_pd (t1, t2)); + + t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 2), f[0]); + t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 2), f[1]); + _mm_store_pd (o + i + 2, _mm_add_pd (t1, t2)); } } static void interpolate_gdouble_cubic_sse2 (gdouble * o, const gdouble * a, - gint len, const gdouble * icoeff) + gint len, const gdouble * icoeff, gint astride) { - gint i = 0; - __m128d t1, t2; - __m128d f1 = _mm_loadu_pd (icoeff); - __m128d f2 = _mm_loadu_pd (icoeff+2); + gint i; + __m128d f[4], t[4]; + const gdouble *c[4] = {(gdouble*)((gint8*)a + 0*astride), + (gdouble*)((gint8*)a + 1*astride), + (gdouble*)((gint8*)a + 2*astride), + (gdouble*)((gint8*)a + 3*astride)}; - for (; i < len; i += 2) { - t1 = _mm_add_pd (_mm_mul_pd (_mm_load_pd (a + 4*i + 0), f1), - _mm_mul_pd (_mm_load_pd (a + 4*i + 2), f2)); - t1 = _mm_add_sd (t1, _mm_unpackhi_pd (t1, t1)); + f[0] = _mm_load1_pd (icoeff+0); + f[1] = _mm_load1_pd (icoeff+1); + f[2] = _mm_load1_pd (icoeff+2); + f[3] = _mm_load1_pd (icoeff+3); - t2 = _mm_add_pd (_mm_mul_pd (_mm_load_pd (a + 4*i + 4), f1), - _mm_mul_pd (_mm_load_pd (a + 4*i + 6), f2)); - t2 = _mm_add_sd (t2, _mm_unpackhi_pd (t2, t2)); - - _mm_store_pd (o + i, _mm_unpacklo_pd (t1, t2)); + for (i = 0; i < len; i += 2) { + t[0] = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]); + t[1] = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]); + t[2] = _mm_mul_pd (_mm_load_pd (c[2] + i + 0), f[2]); + t[3] = _mm_mul_pd (_mm_load_pd (c[3] + i + 0), f[3]); + t[0] = _mm_add_pd (t[0], t[1]); + t[2] = _mm_add_pd (t[2], t[3]); + _mm_store_pd (o + i + 0, _mm_add_pd (t[0], t[2])); } } @@ -323,7 +359,7 @@ interpolate_gdouble_cubic_sse2 (gdouble * o, const gdouble * a, static inline void inner_product_gint32_full_1_sse41 (gint32 * o, const gint32 * a, - const gint32 * b, gint len, const gint32 * icoeff) + const gint32 * b, gint len, const gint32 * icoeff, gint bstride) { gint i = 0; __m128i sum, ta, tb; @@ -361,43 +397,39 @@ inner_product_gint32_full_1_sse41 (gint32 * o, const gint32 * a, static inline void inner_product_gint32_linear_1_sse41 (gint32 * o, const gint32 * a, - const gint32 * b, gint len, const gint32 * icoeff) + const gint32 * b, gint len, const gint32 * icoeff, gint bstride) { gint i = 0; gint64 res; - __m128i sum, t, ta, tb; + __m128i sum[2], ta, tb; __m128i f = _mm_loadu_si128 ((__m128i *)icoeff); + const gint32 *c[2] = {(gint32*)((gint8*)b + 0*bstride), + (gint32*)((gint8*)b + 1*bstride)}; - sum = _mm_setzero_si128 (); - f = _mm_unpacklo_epi32 (f, f); + sum[0] = sum[1] = _mm_setzero_si128 (); for (; i < len; i += 4) { - t = _mm_loadu_si128 ((__m128i *)(a + i)); + ta = _mm_loadu_si128 ((__m128i *)(a + i)); - ta = _mm_unpacklo_epi32 (t, t); - tb = _mm_load_si128 ((__m128i *)(b + 2*i + 0)); - - sum = - _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi64 (ta, ta), + tb = _mm_load_si128 ((__m128i *)(c[0] + i)); + sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta), _mm_unpacklo_epi32 (tb, tb))); - sum = - _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi64 (ta, ta), + sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta), _mm_unpackhi_epi32 (tb, tb))); - ta = _mm_unpackhi_epi32 (t, t); - tb = _mm_load_si128 ((__m128i *)(b + 2*i + 4)); - - sum = - _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi64 (ta, ta), + tb = _mm_load_si128 ((__m128i *)(c[1] + i)); + sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta), _mm_unpacklo_epi32 (tb, tb))); - sum = - _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi64 (ta, ta), + sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta), _mm_unpackhi_epi32 (tb, tb))); } - sum = _mm_srli_epi64 (sum, PRECISION_S32); - sum = _mm_mul_epi32 (sum, f); - sum = _mm_add_epi64 (sum, _mm_unpackhi_epi64 (sum, sum)); - res = _mm_cvtsi128_si64 (sum); + sum[0] = _mm_srli_epi64 (sum[0], PRECISION_S32); + sum[1] = _mm_srli_epi64 (sum[1], PRECISION_S32); + sum[0] = _mm_mul_epi32 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0))); + sum[1] = _mm_mul_epi32 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1))); + sum[0] = _mm_add_epi64 (sum[0], sum[1]); + sum[0] = _mm_add_epi64 (sum[0], _mm_unpackhi_epi64 (sum[0], sum[0])); + res = _mm_cvtsi128_si64 (sum[0]); res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32; *o = CLAMP (res, -(1L << 31), (1L << 31) - 1); @@ -405,44 +437,59 @@ inner_product_gint32_linear_1_sse41 (gint32 * o, const gint32 * a, static inline void inner_product_gint32_cubic_1_sse41 (gint32 * o, const gint32 * a, - const gint32 * b, gint len, const gint32 * icoeff) + const gint32 * b, gint len, const gint32 * icoeff, gint bstride) { gint i = 0; gint64 res; - __m128i sum1, sum2, t, ta, tb; - __m128i f = _mm_loadu_si128 ((__m128i *)icoeff), f1, f2; + __m128i sum[4], ta, tb; + __m128i f = _mm_loadu_si128 ((__m128i *)icoeff); + const gint32 *c[4] = {(gint32*)((gint8*)b + 0*bstride), + (gint32*)((gint8*)b + 1*bstride), + (gint32*)((gint8*)b + 2*bstride), + (gint32*)((gint8*)b + 3*bstride)}; - sum1 = sum2 = _mm_setzero_si128 (); - f1 = _mm_unpacklo_epi32 (f, f); - f2 = _mm_unpackhi_epi32 (f, f); + sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 (); - for (; i < len; i += 2) { - t = _mm_cvtsi64_si128 (*(gint64 *)(a + i)); - t = _mm_unpacklo_epi32 (t, t); + for (; i < len; i += 4) { + ta = _mm_loadu_si128 ((__m128i *)(a + i)); - ta = _mm_unpacklo_epi64 (t, t); - tb = _mm_load_si128 ((__m128i *)(b + 4*i + 0)); + tb = _mm_load_si128 ((__m128i *)(c[0] + i)); + sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta), + _mm_unpacklo_epi32 (tb, tb))); + sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta), + _mm_unpackhi_epi32 (tb, tb))); - sum1 = - _mm_add_epi64 (sum1, _mm_mul_epi32 (ta, _mm_unpacklo_epi32 (tb, tb))); - sum2 = - _mm_add_epi64 (sum2, _mm_mul_epi32 (ta, _mm_unpackhi_epi32 (tb, tb))); + tb = _mm_load_si128 ((__m128i *)(c[1] + i)); + sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta), + _mm_unpacklo_epi32 (tb, tb))); + sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta), + _mm_unpackhi_epi32 (tb, tb))); - ta = _mm_unpackhi_epi64 (t, t); - tb = _mm_load_si128 ((__m128i *)(b + 4*i + 4)); + tb = _mm_load_si128 ((__m128i *)(c[2] + i)); + sum[2] = _mm_add_epi64 (sum[2], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta), + _mm_unpacklo_epi32 (tb, tb))); + sum[2] = _mm_add_epi64 (sum[2], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta), + _mm_unpackhi_epi32 (tb, tb))); - sum1 = - _mm_add_epi64 (sum1, _mm_mul_epi32 (ta, _mm_unpacklo_epi32 (tb, tb))); - sum2 = - _mm_add_epi64 (sum2, _mm_mul_epi32 (ta, _mm_unpackhi_epi32 (tb, tb))); + tb = _mm_load_si128 ((__m128i *)(c[3] + i)); + sum[3] = _mm_add_epi64 (sum[3], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta), + _mm_unpacklo_epi32 (tb, tb))); + sum[3] = _mm_add_epi64 (sum[3], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta), + _mm_unpackhi_epi32 (tb, tb))); } - sum1 = _mm_srli_epi64 (sum1, PRECISION_S32); - sum2 = _mm_srli_epi64 (sum2, PRECISION_S32); - sum1 = _mm_mul_epi32 (sum1, f1); - sum2 = _mm_mul_epi32 (sum2, f2); - sum1 = _mm_add_epi64 (sum1, sum2); - sum1 = _mm_add_epi64 (sum1, _mm_unpackhi_epi64 (sum1, sum1)); - res = _mm_cvtsi128_si64 (sum1); + sum[0] = _mm_srli_epi64 (sum[0], PRECISION_S32); + sum[1] = _mm_srli_epi64 (sum[1], PRECISION_S32); + sum[2] = _mm_srli_epi64 (sum[2], PRECISION_S32); + sum[3] = _mm_srli_epi64 (sum[3], PRECISION_S32); + sum[0] = _mm_mul_epi32 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0))); + sum[1] = _mm_mul_epi32 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1))); + sum[2] = _mm_mul_epi32 (sum[2], _mm_shuffle_epi32 (f, _MM_SHUFFLE (2, 2, 2, 2))); + sum[3] = _mm_mul_epi32 (sum[3], _mm_shuffle_epi32 (f, _MM_SHUFFLE (3, 3, 3, 3))); + sum[0] = _mm_add_epi64 (sum[0], sum[1]); + sum[2] = _mm_add_epi64 (sum[2], sum[3]); + sum[0] = _mm_add_epi64 (sum[0], sum[2]); + sum[0] = _mm_add_epi64 (sum[0], _mm_unpackhi_epi64 (sum[0], sum[0])); + res = _mm_cvtsi128_si64 (sum[0]); res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32; *o = CLAMP (res, -(1L << 31), (1L << 31) - 1); @@ -454,9 +501,9 @@ MAKE_RESAMPLE_FUNC (gint32, cubic, 1, sse41); #endif static void -audio_resampler_check_x86 (const gchar *target_name, const gchar *option) +audio_resampler_check_x86 (const gchar *option) { - if (!strcmp (target_name, "sse")) { + if (!strcmp (option, "sse")) { #if defined (HAVE_XMMINTRIN_H) && defined(__SSE__) GST_DEBUG ("enable SSE optimisations"); resample_gfloat_full_1 = resample_gfloat_full_1_sse; @@ -465,8 +512,7 @@ audio_resampler_check_x86 (const gchar *target_name, const gchar *option) #else GST_DEBUG ("SSE optimisations not enabled"); #endif - } - if (!strcmp (option, "sse2")) { + } else if (!strcmp (option, "sse2")) { #if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__) GST_DEBUG ("enable SSE2 optimisations"); resample_gint16_full_1 = resample_gint16_full_1_sse2; diff --git a/gst-libs/gst/audio/audio-resampler.c b/gst-libs/gst/audio/audio-resampler.c index d6cfdade68..ef9c35e091 100644 --- a/gst-libs/gst/audio/audio-resampler.c +++ b/gst-libs/gst/audio/audio-resampler.c @@ -322,8 +322,6 @@ make_taps (GstAudioResampler * resampler, switch (resampler->method) { case GST_AUDIO_RESAMPLER_METHOD_NEAREST: - for (i = 0; i < n_taps; i++) - weight += tmp_taps[i] = get_nearest_tap (x + i, resampler->n_taps); break; case GST_AUDIO_RESAMPLER_METHOD_LINEAR: @@ -418,14 +416,12 @@ static inline void \ extract_taps_##type (GstAudioResampler * resampler, type *tmp_taps, \ gint n_taps, gint oversample, gint mult) \ { \ - gint i, j, k, o = oversample - 1; \ - for (i = 0; i < oversample; i++, o--) { \ + gint i, j, o = oversample + mult - 1; \ + for (i = 0; i < o; i++) { \ type *taps = (type *) ((gint8*)resampler->taps + \ - o * resampler->taps_stride); \ + i * resampler->taps_stride); \ for (j = 0; j < n_taps; j++) { \ - for (k = 0; k < mult; k++) { \ - *taps++ = tmp_taps[i + j*oversample + k]; \ - } \ + *taps++ = tmp_taps[i + j*oversample]; \ } \ } \ } @@ -435,28 +431,35 @@ MAKE_EXTRACT_TAPS_FUNC (gfloat); MAKE_EXTRACT_TAPS_FUNC (gdouble); typedef void (*InterpolateFunc) (gdouble * o, const gdouble * a, gint len, - const gdouble * icoeff); + const gdouble * icoeff, gint astride); static void interpolate_gdouble_linear_c (gdouble * o, const gdouble * a, gint len, - const gdouble * ic) + const gdouble * ic, gint astride) { gint i; + const gdouble *c[2] = { (gdouble *) ((gint8 *) a + 0 * astride), + (gdouble *) ((gint8 *) a + 1 * astride) + }; for (i = 0; i < len; i++) - o[i] = (a[2 * i + 0] - a[2 * i + 1]) * ic[0] + a[2 * i + 1]; + o[i] = (c[0][i] - c[1][i]) * ic[0] + c[1][i]; } static void interpolate_gdouble_cubic_c (gdouble * o, const gdouble * a, gint len, - const gdouble * ic) + const gdouble * ic, gint astride) { gint i; + const gdouble *c[4] = { (gdouble *) ((gint8 *) a + 0 * astride), + (gdouble *) ((gint8 *) a + 1 * astride), + (gdouble *) ((gint8 *) a + 2 * astride), + (gdouble *) ((gint8 *) a + 3 * astride) + }; for (i = 0; i < len; i++) - o[i] = - a[4 * i + 0] * ic[0] + a[4 * i + 1] * ic[1] + a[4 * i + 2] * ic[2] + - a[4 * i + 3] * ic[3]; + o[i] = c[0][i] * ic[0] + c[1][i] * ic[1] + + c[2][i] * ic[2] + c[3][i] * ic[3]; } static InterpolateFunc interpolate_funcs[] = { @@ -534,7 +537,7 @@ fill_taps (GstAudioResampler * resampler, gdouble ic[4], *taps; pos = phase * oversample; - offset = pos / n_phases; + offset = (oversample - 1) - pos / n_phases; frac = pos % n_phases; taps = (gdouble *) ((gint8 *) resampler->taps + offset * taps_stride); @@ -542,11 +545,11 @@ fill_taps (GstAudioResampler * resampler, switch (resampler->filter_interpolation) { case GST_AUDIO_RESAMPLER_FILTER_INTERPOLATION_LINEAR: make_coeff_gdouble_linear (frac, n_phases, ic); - interpolate_gdouble_linear (tmp_taps, taps, n_taps, ic); + interpolate_gdouble_linear (tmp_taps, taps, n_taps, ic, taps_stride); break; case GST_AUDIO_RESAMPLER_FILTER_INTERPOLATION_CUBIC: make_coeff_gdouble_cubic (frac, n_phases, ic); - interpolate_gdouble_cubic (tmp_taps, taps, n_taps, ic); + interpolate_gdouble_cubic (tmp_taps, taps, n_taps, ic, taps_stride); break; default: break; @@ -556,6 +559,25 @@ fill_taps (GstAudioResampler * resampler, return res; } +#define GET_TAPS_NEAREST_FUNC(type) \ +static inline gpointer \ +get_taps_##type##_nearest (GstAudioResampler * resampler, \ + gint *samp_index, gint *samp_phase, type icoeff[4]) \ +{ \ + gint out_rate = resampler->out_rate; \ + *samp_index += resampler->samp_inc; \ + *samp_phase += resampler->samp_frac; \ + if (*samp_phase >= out_rate) { \ + *samp_phase -= out_rate; \ + *samp_index += 1; \ + } \ + return NULL; \ +} +GET_TAPS_NEAREST_FUNC (gint16); +GET_TAPS_NEAREST_FUNC (gint32); +GET_TAPS_NEAREST_FUNC (gfloat); +GET_TAPS_NEAREST_FUNC (gdouble); + #define GET_TAPS_FULL_FUNC(type) \ static inline gpointer \ get_taps_##type##_full (GstAudioResampler * resampler, \ @@ -606,7 +628,7 @@ get_taps_##type##_##inter (GstAudioResampler * resampler, \ gint taps_stride = resampler->taps_stride; \ \ pos = *samp_phase * oversample; \ - offset = pos / out_rate; \ + offset = (oversample - 1) - pos / out_rate; \ frac = pos % out_rate; \ \ res = (gint8 *) resampler->taps + offset * taps_stride; \ @@ -631,10 +653,22 @@ GET_TAPS_INTERPOLATE_FUNC (gint32, cubic); GET_TAPS_INTERPOLATE_FUNC (gfloat, cubic); GET_TAPS_INTERPOLATE_FUNC (gdouble, cubic); +#define INNER_PRODUCT_NEAREST_FUNC(type) \ +static inline void \ +inner_product_##type##_nearest_1_c (type * o, const type * a, \ + const type * b, gint len, const type *ic, gint bstride) \ +{ \ + *o = *a; \ +} +INNER_PRODUCT_NEAREST_FUNC (gint16); +INNER_PRODUCT_NEAREST_FUNC (gint32); +INNER_PRODUCT_NEAREST_FUNC (gfloat); +INNER_PRODUCT_NEAREST_FUNC (gdouble); + #define INNER_PRODUCT_INT_FULL_FUNC(type,type2,prec,limit) \ static inline void \ inner_product_##type##_full_1_c (type * o, const type * a, \ - const type * b, gint len, const type *ic) \ + const type * b, gint len, const type *ic, gint bstride) \ { \ gint i; \ type2 res[4] = { 0, 0, 0, 0 }; \ @@ -656,16 +690,18 @@ INNER_PRODUCT_INT_FULL_FUNC (gint32, gint64, PRECISION_S32, (gint64) 1 << 31); #define INNER_PRODUCT_INT_LINEAR_FUNC(type,type2,prec,limit) \ static inline void \ inner_product_##type##_linear_1_c (type * o, const type * a, \ - const type * b, gint len, const type *ic) \ + const type * b, gint len, const type *ic, gint bstride) \ { \ gint i; \ type2 res[4] = { 0, 0, 0, 0 }; \ + const type *c[2] = {(type*)((gint8*)b + 0*bstride), \ + (type*)((gint8*)b + 1*bstride)}; \ \ for (i = 0; i < len; i += 2) { \ - res[0] += (type2) a[i + 0] * (type2) b[2 * i + 0]; \ - res[1] += (type2) a[i + 0] * (type2) b[2 * i + 1]; \ - res[2] += (type2) a[i + 1] * (type2) b[2 * i + 2]; \ - res[3] += (type2) a[i + 1] * (type2) b[2 * i + 3]; \ + res[0] += (type2) a[i + 0] * (type2) c[0][i + 0]; \ + res[1] += (type2) a[i + 0] * (type2) c[1][i + 0]; \ + res[2] += (type2) a[i + 1] * (type2) c[0][i + 1]; \ + res[3] += (type2) a[i + 1] * (type2) c[1][i + 1]; \ } \ res[0] = (res[0] + res[2]) >> (prec); \ res[1] = (res[1] + res[3]) >> (prec); \ @@ -681,16 +717,20 @@ INNER_PRODUCT_INT_LINEAR_FUNC (gint32, gint64, PRECISION_S32, (gint64) 1 << 31); #define INNER_PRODUCT_INT_CUBIC_FUNC(type,type2,prec,limit) \ static inline void \ inner_product_##type##_cubic_1_c (type * o, const type * a, \ - const type * b, gint len, const type *ic) \ + const type * b, gint len, const type *ic, gint bstride) \ { \ gint i; \ type2 res[4] = { 0, 0, 0, 0 }; \ + const type *c[4] = {(type*)((gint8*)b + 0*bstride), \ + (type*)((gint8*)b + 1*bstride), \ + (type*)((gint8*)b + 2*bstride), \ + (type*)((gint8*)b + 3*bstride)}; \ \ for (i = 0; i < len; i++) { \ - res[0] += (type2) a[i] * (type2) b[4 * i + 0]; \ - res[1] += (type2) a[i] * (type2) b[4 * i + 1]; \ - res[2] += (type2) a[i] * (type2) b[4 * i + 2]; \ - res[3] += (type2) a[i] * (type2) b[4 * i + 3]; \ + res[0] += (type2) a[i] * (type2) c[0][i]; \ + res[1] += (type2) a[i] * (type2) c[1][i]; \ + res[2] += (type2) a[i] * (type2) c[2][i]; \ + res[3] += (type2) a[i] * (type2) c[3][i]; \ } \ res[0] = (type2)(type)(res[0] >> (prec)) * (type2) ic[0] + \ (type2)(type)(res[1] >> (prec)) * (type2) ic[1] + \ @@ -706,7 +746,7 @@ INNER_PRODUCT_INT_CUBIC_FUNC (gint32, gint64, PRECISION_S32, (gint64) 1 << 31); #define INNER_PRODUCT_FLOAT_FULL_FUNC(type) \ static inline void \ inner_product_##type##_full_1_c (type * o, const type * a, \ - const type * b, gint len, const type *ic) \ + const type * b, gint len, const type *ic, gint bstride) \ { \ gint i; \ type res[4] = { 0.0, 0.0, 0.0, 0.0 }; \ @@ -726,16 +766,18 @@ INNER_PRODUCT_FLOAT_FULL_FUNC (gdouble); #define INNER_PRODUCT_FLOAT_LINEAR_FUNC(type) \ static inline void \ inner_product_##type##_linear_1_c (type * o, const type * a, \ - const type * b, gint len, const type *ic) \ + const type * b, gint len, const type *ic, gint bstride) \ { \ gint i; \ type res[4] = { 0.0, 0.0, 0.0, 0.0 }; \ + const type *c[2] = {(type*)((gint8*)b + 0*bstride), \ + (type*)((gint8*)b + 1*bstride)}; \ \ for (i = 0; i < len; i += 2) { \ - res[0] += a[i] * b[2 * i + 0]; \ - res[1] += a[i] * b[2 * i + 1]; \ - res[2] += a[i] * b[2 * i + 2]; \ - res[3] += a[i] * b[2 * i + 3]; \ + res[0] += a[i + 0] * c[0][i + 0]; \ + res[1] += a[i + 0] * c[1][i + 0]; \ + res[2] += a[i + 1] * c[0][i + 1]; \ + res[3] += a[i + 1] * c[1][i + 1]; \ } \ *o = (res[0] + res[2]) * ic[0] + \ (res[1] + res[3]) * ic[1]; \ @@ -746,16 +788,20 @@ INNER_PRODUCT_FLOAT_LINEAR_FUNC (gdouble); #define INNER_PRODUCT_FLOAT_CUBIC_FUNC(type) \ static inline void \ inner_product_##type##_cubic_1_c (type * o, const type * a, \ - const type * b, gint len, const type *ic) \ + const type * b, gint len, const type *ic, gint bstride) \ { \ gint i; \ type res[4] = { 0.0, 0.0, 0.0, 0.0 }; \ + const type *c[4] = {(type*)((gint8*)b + 0*bstride), \ + (type*)((gint8*)b + 1*bstride), \ + (type*)((gint8*)b + 2*bstride), \ + (type*)((gint8*)b + 3*bstride)}; \ \ for (i = 0; i < len; i++) { \ - res[0] += a[i] * b[4 * i + 0]; \ - res[1] += a[i] * b[4 * i + 1]; \ - res[2] += a[i] * b[4 * i + 2]; \ - res[3] += a[i] * b[4 * i + 3]; \ + res[0] += a[i] * c[0][i]; \ + res[1] += a[i] * c[1][i]; \ + res[2] += a[i] * c[2][i]; \ + res[3] += a[i] * c[3][i]; \ } \ *o = res[0] * ic[0] + res[1] * ic[1] + \ res[2] * ic[2] + res[3] * ic[3]; \ @@ -773,6 +819,7 @@ resample_ ##type## _ ##inter## _ ##channels## _ ##arch (GstAudioResampler * resa gint n_taps = resampler->n_taps; \ gint blocks = resampler->blocks; \ gint ostride = resampler->ostride; \ + gint taps_stride = resampler->taps_stride; \ gint samp_index = 0; \ gint samp_phase = 0; \ \ @@ -791,7 +838,7 @@ resample_ ##type## _ ##inter## _ ##channels## _ ##arch (GstAudioResampler * resa taps = get_taps_ ##type##_##inter \ (resampler, &samp_index, &samp_phase, icoeff); \ inner_product_ ##type##_##inter##_##channels##_##arch \ - (op, ipp, taps, n_taps, icoeff); \ + (op, ipp, taps, n_taps, icoeff, taps_stride); \ op += ostride; \ } \ if (in_len > samp_index) \ @@ -804,6 +851,11 @@ resample_ ##type## _ ##inter## _ ##channels## _ ##arch (GstAudioResampler * resa resampler->samp_phase = samp_phase; \ } +MAKE_RESAMPLE_FUNC (gint16, nearest, 1, c); +MAKE_RESAMPLE_FUNC (gint32, nearest, 1, c); +MAKE_RESAMPLE_FUNC (gfloat, nearest, 1, c); +MAKE_RESAMPLE_FUNC (gdouble, nearest, 1, c); + MAKE_RESAMPLE_FUNC (gint16, full, 1, c); MAKE_RESAMPLE_FUNC (gint32, full, 1, c); MAKE_RESAMPLE_FUNC (gfloat, full, 1, c); @@ -820,6 +872,11 @@ MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, c); MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, c); static ResampleFunc resample_funcs[] = { + resample_gint16_nearest_1_c, + resample_gint32_nearest_1_c, + resample_gfloat_nearest_1_c, + resample_gdouble_nearest_1_c, + resample_gint16_full_1_c, resample_gint32_full_1_c, resample_gfloat_full_1_c, @@ -836,20 +893,25 @@ static ResampleFunc resample_funcs[] = { resample_gdouble_cubic_1_c, }; -#define resample_gint16_full_1 resample_funcs[0] -#define resample_gint32_full_1 resample_funcs[1] -#define resample_gfloat_full_1 resample_funcs[2] -#define resample_gdouble_full_1 resample_funcs[3] +#define resample_gint16_nearest_1 resample_funcs[0] +#define resample_gint32_nearest_1 resample_funcs[1] +#define resample_gfloat_nearest_1 resample_funcs[2] +#define resample_gdouble_nearest_1 resample_funcs[3] -#define resample_gint16_linear_1 resample_funcs[4] -#define resample_gint32_linear_1 resample_funcs[5] -#define resample_gfloat_linear_1 resample_funcs[6] -#define resample_gdouble_linear_1 resample_funcs[7] +#define resample_gint16_full_1 resample_funcs[4] +#define resample_gint32_full_1 resample_funcs[5] +#define resample_gfloat_full_1 resample_funcs[6] +#define resample_gdouble_full_1 resample_funcs[7] -#define resample_gint16_cubic_1 resample_funcs[8] -#define resample_gint32_cubic_1 resample_funcs[9] -#define resample_gfloat_cubic_1 resample_funcs[10] -#define resample_gdouble_cubic_1 resample_funcs[11] +#define resample_gint16_linear_1 resample_funcs[8] +#define resample_gint32_linear_1 resample_funcs[9] +#define resample_gfloat_linear_1 resample_funcs[10] +#define resample_gdouble_linear_1 resample_funcs[11] + +#define resample_gint16_cubic_1 resample_funcs[12] +#define resample_gint32_cubic_1 resample_funcs[13] +#define resample_gfloat_cubic_1 resample_funcs[14] +#define resample_gdouble_cubic_1 resample_funcs[15] #if defined HAVE_ORC && !defined DISABLE_ORC # if defined (__ARM_NEON__) @@ -879,21 +941,25 @@ audio_resampler_init (void) gint i; if (target) { + const gchar *name; unsigned int flags = orc_target_get_default_flags (target); - const gchar *tname, *name; - tname = orc_target_get_name (target); - GST_DEBUG ("target %s, default flags %08x", tname, flags); - - for (i = 0; i < 32; ++i) { - if (flags & (1U << i)) { + for (i = -1; i < 32; ++i) { + if (i == -1) { + name = orc_target_get_name (target); + GST_DEBUG ("target %s, default flags %08x", name, flags); + } else if (flags & (1U << i)) { name = orc_target_get_flag_name (target, i); GST_DEBUG ("target flag %s", name); + } else + name = NULL; + + if (name) { #ifdef CHECK_X86 - audio_resampler_check_x86 (tname, name); + audio_resampler_check_x86 (name); #endif #ifdef CHECK_NEON - audio_resampler_check_neon (tname, name); + audio_resampler_check_neon (name); #endif } } @@ -978,19 +1044,18 @@ calculate_kaiser_params (GstAudioResampler * resampler) static void alloc_taps_mem (GstAudioResampler * resampler, gint bps, gint n_taps, - gint n_phases, gint n_mult) + gint n_phases, gint n_tmp) { if (resampler->alloc_taps >= n_taps && resampler->alloc_phases >= n_phases) return; - GST_DEBUG ("allocate n_taps %d n_phases %d n_mult %d", n_taps, n_phases, - n_mult); + GST_DEBUG ("allocate n_taps %d n_phases %d n_tmp %d", n_taps, n_phases, + n_tmp); resampler->tmp_taps = - g_realloc_n (resampler->tmp_taps, n_taps, sizeof (gdouble)); + g_realloc_n (resampler->tmp_taps, n_tmp, sizeof (gdouble)); - resampler->taps_stride = - GST_ROUND_UP_32 (bps * (n_mult * n_taps + TAPS_OVERREAD)); + resampler->taps_stride = GST_ROUND_UP_32 (bps * (n_taps + TAPS_OVERREAD)); g_free (resampler->taps_mem); resampler->taps_mem = @@ -1058,23 +1123,30 @@ setup_functions (GstAudioResampler * resampler) break; } deinterleave = deinterleave_funcs[index]; - - switch (resampler->filter_mode) { - default: - case GST_AUDIO_RESAMPLER_FILTER_MODE_FULL: - GST_DEBUG ("using full filter function"); + switch (resampler->method) { + case GST_AUDIO_RESAMPLER_METHOD_NEAREST: + GST_DEBUG ("using nearest filter function"); break; - case GST_AUDIO_RESAMPLER_FILTER_MODE_INTERPOLATED: - switch (resampler->filter_interpolation) { - case GST_AUDIO_RESAMPLER_FILTER_INTERPOLATION_LINEAR: - GST_DEBUG ("using linear interpolation filter function"); - index += 4; - break; - case GST_AUDIO_RESAMPLER_FILTER_INTERPOLATION_CUBIC: - GST_DEBUG ("using cubic interpolation filter function"); - index += 8; - break; + default: + index += 4; + switch (resampler->filter_mode) { default: + case GST_AUDIO_RESAMPLER_FILTER_MODE_FULL: + GST_DEBUG ("using full filter function"); + break; + case GST_AUDIO_RESAMPLER_FILTER_MODE_INTERPOLATED: + switch (resampler->filter_interpolation) { + case GST_AUDIO_RESAMPLER_FILTER_INTERPOLATION_LINEAR: + GST_DEBUG ("using linear interpolation filter function"); + index += 4; + break; + case GST_AUDIO_RESAMPLER_FILTER_INTERPOLATION_CUBIC: + GST_DEBUG ("using cubic interpolation filter function"); + index += 8; + break; + default: + break; + } break; } break; @@ -1194,7 +1266,8 @@ resampler_calculate_taps (GstAudioResampler * resampler) } } - if (resampler->filter_mode == GST_AUDIO_RESAMPLER_FILTER_MODE_FULL) { + if (resampler->filter_mode == GST_AUDIO_RESAMPLER_FILTER_MODE_FULL && + resampler->method != GST_AUDIO_RESAMPLER_METHOD_NEAREST) { GST_DEBUG ("setting up filter cache"); resampler->n_phases = out_rate; alloc_cache_mem (resampler, bps, n_taps, out_rate); @@ -1202,7 +1275,7 @@ resampler_calculate_taps (GstAudioResampler * resampler) if (resampler->filter_interpolation != GST_AUDIO_RESAMPLER_FILTER_INTERPOLATION_NONE) { - gint otaps, isize; + gint n_tmp, isize; gdouble x, weight, *tmp_taps; GstAudioFormat format; gpointer taps; @@ -1218,7 +1291,6 @@ resampler_calculate_taps (GstAudioResampler * resampler) isize = 4; break; } - otaps = oversample * n_taps + isize - 1; if (resampler->filter_mode == GST_AUDIO_RESAMPLER_FILTER_MODE_FULL) { format = GST_AUDIO_FORMAT_F64; @@ -1226,28 +1298,30 @@ resampler_calculate_taps (GstAudioResampler * resampler) } else format = resampler->format; - alloc_taps_mem (resampler, bps, otaps, oversample, isize); + n_tmp = oversample * n_taps + isize - 1; + + alloc_taps_mem (resampler, bps, n_taps, oversample + isize - 1, n_tmp); taps = tmp_taps = resampler->tmp_taps; x = 1.0 - n_taps / 2; - weight = make_taps (resampler, tmp_taps, x, otaps, oversample); + weight = make_taps (resampler, tmp_taps, x, n_tmp, oversample); switch (format) { case GST_AUDIO_FORMAT_S16: - convert_taps_gint16 (tmp_taps, taps, weight / oversample, otaps); + convert_taps_gint16 (tmp_taps, taps, weight / oversample, n_tmp); extract_taps_gint16 (resampler, taps, n_taps, oversample, isize); break; case GST_AUDIO_FORMAT_S32: - convert_taps_gint32 (tmp_taps, taps, weight / oversample, otaps); + convert_taps_gint32 (tmp_taps, taps, weight / oversample, n_tmp); extract_taps_gint32 (resampler, taps, n_taps, oversample, isize); break; case GST_AUDIO_FORMAT_F32: - convert_taps_gfloat (tmp_taps, taps, weight / oversample, otaps); + convert_taps_gfloat (tmp_taps, taps, weight / oversample, n_tmp); extract_taps_gfloat (resampler, taps, n_taps, oversample, isize); break; default: case GST_AUDIO_FORMAT_F64: - convert_taps_gdouble (tmp_taps, taps, weight / oversample, otaps); + convert_taps_gdouble (tmp_taps, taps, weight / oversample, n_tmp); extract_taps_gdouble (resampler, taps, n_taps, oversample, isize); break; }