From bdf194a09a5b5c30fa0afb6cd5d21d7f7a99e117 Mon Sep 17 00:00:00 2001 From: Wim Taymans Date: Thu, 11 Feb 2016 13:23:07 +0100 Subject: [PATCH] audio-resample: fix taps conversion We do taps conversion in place so make sure we don't overwrite the input with temporary data. Optimize some more gint16 functions. --- gst-libs/gst/audio/audio-resampler-x86.h | 37 ++++++++++-------------- gst-libs/gst/audio/audio-resampler.c | 4 ++- 2 files changed, 18 insertions(+), 23 deletions(-) diff --git a/gst-libs/gst/audio/audio-resampler-x86.h b/gst-libs/gst/audio/audio-resampler-x86.h index 56be8aa9ab..09c53318f8 100644 --- a/gst-libs/gst/audio/audio-resampler-x86.h +++ b/gst-libs/gst/audio/audio-resampler-x86.h @@ -152,7 +152,7 @@ inner_product_gint16_linear_1_sse2 (gint16 * o, const gint16 * a, const gint16 * b, gint len, const gint16 * icoeff, gint oversample) { gint i = 0; - __m128i sum, t, ta, tb, m1, m2; + __m128i sum, t, ta, tb; __m128i f = _mm_cvtsi64_si128 (*((long long*)icoeff)); sum = _mm_setzero_si128 (); @@ -161,23 +161,19 @@ inner_product_gint16_linear_1_sse2 (gint16 * o, const gint16 * a, for (; i < len; i += 8) { t = _mm_loadu_si128 ((__m128i *) (a + i)); - ta = _mm_unpacklo_epi16 (t, t); + ta = _mm_unpacklo_epi32 (t, t); tb = _mm_load_si128 ((__m128i *) (b + 2 * i + 0)); + tb = _mm_shufflelo_epi16 (tb, _MM_SHUFFLE (3,1,2,0)); + tb = _mm_shufflehi_epi16 (tb, _MM_SHUFFLE (3,1,2,0)); - m1 = _mm_mulhi_epi16 (ta, tb); - m2 = _mm_mullo_epi16 (ta, tb); + sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, tb)); - sum = _mm_add_epi32 (sum, _mm_unpacklo_epi16 (m2, m1)); - sum = _mm_add_epi32 (sum, _mm_unpackhi_epi16 (m2, m1)); - - ta = _mm_unpackhi_epi16 (t, t); + ta = _mm_unpackhi_epi32 (t, t); tb = _mm_load_si128 ((__m128i *) (b + 2 * i + 8)); + tb = _mm_shufflelo_epi16 (tb, _MM_SHUFFLE (3,1,2,0)); + tb = _mm_shufflehi_epi16 (tb, _MM_SHUFFLE (3,1,2,0)); - m1 = _mm_mulhi_epi16 (ta, tb); - m2 = _mm_mullo_epi16 (ta, tb); - - sum = _mm_add_epi32 (sum, _mm_unpacklo_epi16 (m2, m1)); - sum = _mm_add_epi32 (sum, _mm_unpackhi_epi16 (m2, m1)); + sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, tb)); } sum = _mm_srai_epi32 (sum, PRECISION_S16); sum = _mm_madd_epi16 (sum, f); @@ -200,7 +196,7 @@ inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a, const gint16 * b, gint len, const gint16 * icoeff, gint oversample) { gint i = 0; - __m128i sum, ta, tb, m1, m2; + __m128i sum, ta, tb; __m128i f = _mm_cvtsi64_si128 (*((long long*)icoeff)); sum = _mm_setzero_si128 (); @@ -208,16 +204,13 @@ inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a, for (; i < len; i += 2) { ta = _mm_cvtsi32_si128 (*(gint32*)(a + i)); - ta = _mm_unpacklo_epi16 (ta, ta); - ta = _mm_unpacklo_epi16 (ta, ta); + ta = _mm_unpacklo_epi32 (ta, ta); + ta = _mm_unpacklo_epi32 (ta, ta); - tb = _mm_load_si128 ((__m128i *) (b + 4 * i + 0)); + tb = _mm_unpacklo_epi16 (_mm_cvtsi64_si128 (*(gint64*)(b + 4 * i + 0)), + _mm_cvtsi64_si128 (*(gint64*)(b + 4 * i + 4))); - m1 = _mm_mulhi_epi16 (ta, tb); - m2 = _mm_mullo_epi16 (ta, tb); - - sum = _mm_add_epi32 (sum, _mm_unpacklo_epi16 (m2, m1)); - sum = _mm_add_epi32 (sum, _mm_unpackhi_epi16 (m2, m1)); + sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, tb)); } sum = _mm_srai_epi32 (sum, PRECISION_S16); sum = _mm_madd_epi16 (sum, f); diff --git a/gst-libs/gst/audio/audio-resampler.c b/gst-libs/gst/audio/audio-resampler.c index bd03846da6..3765885072 100644 --- a/gst-libs/gst/audio/audio-resampler.c +++ b/gst-libs/gst/audio/audio-resampler.c @@ -354,7 +354,7 @@ convert_taps_##type (gdouble *tmpcoeff, type *taps, \ for (i = 0; i < 32; i++) { \ gint64 sum = 0; \ for (j = 0; j < n_taps; j++) \ - sum += taps[j] = floor (offset + tmpcoeff[j] * multiplier / weight); \ + sum += floor (offset + tmpcoeff[j] * multiplier / weight); \ if (sum == (1 << precision)) { \ exact = TRUE; \ break; \ @@ -371,6 +371,8 @@ convert_taps_##type (gdouble *tmpcoeff, type *taps, \ offset -= (h_offset - l_offset) / 2; \ } \ } \ + for (j = 0; j < n_taps; j++) \ + taps[j] = floor (offset + tmpcoeff[j] * multiplier / weight); \ if (!exact) \ GST_WARNING ("can't find exact taps"); \ }