audio-resampler: optimizations

Improve int16 resampling by using pmaddwd. Use intrinsics to scale and pack int16 samples. Align the coefficients so that we can use aligned loads. Add padding to taps and samples so that we don't have to use partial loads for the remainder of the loops. Remove copy_n; we can reuse the plain copy function with some new parameters. Align and pad the sample array.
2025-06-04 22:48:54 +00:00 · 2016-01-13 17:44:39 +01:00 · 2016-01-13 17:44:39 +01:00 · 71871c5048
commit 71871c5048
parent f55a67ca7c
2 changed files with 118 additions and 157 deletions
--- a/gst-libs/gst/audio/audio-resampler-x86.h
+++ b/gst-libs/gst/audio/audio-resampler-x86.h
@ -19,171 +19,118 @@

 #ifdef HAVE_EMMINTRIN_H
 #include <emmintrin.h>
-#endif

-#ifdef HAVE_EMMINTRIN_H
 static inline void
-inner_product_gint16_1_sse (gint16 * o, const gint16 * a, const gint16 * b, gint len)
+inner_product_gint16_1_sse2 (gint16 * o, const gint16 * a, const gint16 * b, gint len)
 {
  gint i = 0;
-  gint32 res = 0;
-  __m128i sum[2], ta, tb;
-  __m128i t1[2];
+  __m128i sum, ta, tb;

-  sum[0] = _mm_setzero_si128 ();
-  sum[1] = _mm_setzero_si128 ();
+  sum = _mm_setzero_si128 ();

-  for (; i < len - 7; i += 8) {
+  for (; i < len; i += 8) {
    ta = _mm_loadu_si128 ((__m128i *) (a + i));
-    tb = _mm_loadu_si128 ((__m128i *) (b + i));
+    tb = _mm_load_si128 ((__m128i *) (b + i));

-    t1[0] = _mm_mullo_epi16 (ta, tb);
-    t1[1] = _mm_mulhi_epi16 (ta, tb);
-
-    sum[0] = _mm_add_epi32 (sum[0], _mm_unpacklo_epi16 (t1[0], t1[1]));
-    sum[1] = _mm_add_epi32 (sum[1], _mm_unpackhi_epi16 (t1[0], t1[1]));
+    sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, tb));
  }
-  sum[0] = _mm_add_epi32 (sum[0], sum[1]);
-  sum[0] =
-      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
+  sum =
+      _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2,
              3)));
-  sum[0] =
-      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1,
+  sum =
+      _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1,
              1)));
-  res = _mm_cvtsi128_si32 (sum[0]);

-  for (; i < len; i++)
-    res += (gint32) a[i] * (gint32) b[i];
-
-  res = (res + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16;
-  *o = CLAMP (res, -(1L << 15), (1L << 15) - 1);
+  sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
+  sum = _mm_srai_epi32 (sum, PRECISION_S16);
+  sum = _mm_packs_epi32 (sum, sum);
+  *o = _mm_extract_epi16 (sum, 0);
 }
-#endif

-#ifdef HAVE_EMMINTRIN_H
 static inline void
 inner_product_gfloat_1_sse (gfloat * o, const gfloat * a, const gfloat * b, gint len)
 {
  gint i = 0;
-  gfloat res;
  __m128 sum = _mm_setzero_ps ();

-  for (; i < len - 7; i += 8) {
+  for (; i < len; i += 8) {
    sum =
        _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 0),
-            _mm_loadu_ps (b + i + 0)));
+            _mm_load_ps (b + i + 0)));
    sum =
        _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 4),
-            _mm_loadu_ps (b + i + 4)));
+            _mm_load_ps (b + i + 4)));
  }
  sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum));
  sum = _mm_add_ss (sum, _mm_shuffle_ps (sum, sum, 0x55));
-  _mm_store_ss (&res, sum);
-
-  for (; i < len; i++)
-    res += a[i] * b[i];
-
-  *o = res;
+  _mm_store_ss (o, sum);
 }
-#endif

-#ifdef HAVE_EMMINTRIN_H
 static inline void
-inner_product_gdouble_1_sse (gdouble * o, const gdouble * a, const gdouble * b,
+inner_product_gdouble_1_sse2 (gdouble * o, const gdouble * a, const gdouble * b,
    gint len)
 {
  gint i = 0;
-  gdouble res;
  __m128d sum = _mm_setzero_pd ();

-  for (; i < len - 7; i += 8) {
+  for (; i < len; i += 8) {
    sum =
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0),
-            _mm_loadu_pd (b + i + 0)));
+            _mm_load_pd (b + i + 0)));
    sum =
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2),
-            _mm_loadu_pd (b + i + 2)));
+            _mm_load_pd (b + i + 2)));
    sum =
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 4),
-            _mm_loadu_pd (b + i + 4)));
+            _mm_load_pd (b + i + 4)));
    sum =
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6),
-            _mm_loadu_pd (b + i + 6)));
+            _mm_load_pd (b + i + 6)));
  }
  sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum));
-  _mm_store_sd (&res, sum);
-
-  for (; i < len; i++)
-    res += a[i] * b[i];
-
-  *o = res;
+  _mm_store_sd (o, sum);
 }
-#endif

-#ifdef HAVE_EMMINTRIN_H
 static inline void
-inner_product_gint16_2_sse (gint16 * o, const gint16 * a, const gint16 * b, gint len)
+inner_product_gint16_2_sse2 (gint16 * o, const gint16 * a, const gint16 * b, gint len)
 {
  gint i = 0;
-  gint32 r[2];
-  guint64 r64;
-  __m128i sum[2], ta, tb;
-  __m128i t1[2];
+  __m128i sum, ta, tb, t1;

-  sum[0] = _mm_setzero_si128 ();
-  sum[1] = _mm_setzero_si128 ();
+  sum = _mm_setzero_si128 ();

-  for (; i < len - 7; i += 8) {
-    tb = _mm_loadu_si128 ((__m128i *) (b + i));
-
-    t1[1] = _mm_unpacklo_epi16 (tb, tb);
+  for (; i < len; i += 8) {
+    tb = _mm_load_si128 ((__m128i *) (b + i));

+    t1 = _mm_unpacklo_epi16 (tb, tb);
    ta = _mm_loadu_si128 ((__m128i *) (a + 2 * i));
-    t1[0] = _mm_mullo_epi16 (ta, t1[1]);
-    t1[1] = _mm_mulhi_epi16 (ta, t1[1]);

-    sum[0] = _mm_add_epi32 (sum[0], _mm_unpacklo_epi16 (t1[0], t1[1]));
-    sum[1] = _mm_add_epi32 (sum[1], _mm_unpackhi_epi16 (t1[0], t1[1]));
-
-    t1[1] = _mm_unpackhi_epi16 (tb, tb);
+    sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, t1));

+    t1 = _mm_unpackhi_epi16 (tb, tb);
    ta = _mm_loadu_si128 ((__m128i *) (a + 2 * i + 8));
-    t1[0] = _mm_mullo_epi16 (ta, t1[1]);
-    t1[1] = _mm_mulhi_epi16 (ta, t1[1]);

-    sum[0] = _mm_add_epi32 (sum[0], _mm_unpacklo_epi16 (t1[0], t1[1]));
-    sum[1] = _mm_add_epi32 (sum[1], _mm_unpackhi_epi16 (t1[0], t1[1]));
+    sum = _mm_add_epi32 (sum, _mm_madd_epi16 (ta, t1));
  }
-  sum[0] = _mm_add_epi32 (sum[0], sum[1]);
-  sum[0] =
-      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
+  sum =
+      _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2,
              3)));
-  r64 = _mm_cvtsi128_si64 (sum[0]);
-  r[0] = r64 >> 32;
-  r[1] = r64 & 0xffffffff;

-  for (; i < len; i++) {
-    r[0] += (gint32) a[2 * i] * (gint32) b[i];
-    r[1] += (gint32) a[2 * i + 1] * (gint32) b[i];
-  }
-  r[0] = (r[0] + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16;
-  r[1] = (r[1] + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16;
-  o[0] = CLAMP (r[0], -(1L << 15), (1L << 15) - 1);
-  o[1] = CLAMP (r[1], -(1L << 15), (1L << 15) - 1);
+  sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
+  sum = _mm_srai_epi32 (sum, PRECISION_S16);
+  sum = _mm_packs_epi32 (sum, sum);
+  *(gint32*)o = _mm_cvtsi128_si32 (sum);
 }
-#endif

-#ifdef HAVE_EMMINTRIN_H
 static inline void
-inner_product_gdouble_2_sse (gdouble * o, const gdouble * a, const gdouble * b,
+inner_product_gdouble_2_sse2 (gdouble * o, const gdouble * a, const gdouble * b,
    gint len)
 {
  gint i = 0;
-  gdouble r[2];
  __m128d sum = _mm_setzero_pd (), t;

-  for (; i < len - 3; i += 4) {
-    t = _mm_loadu_pd (b + i);
+  for (; i < len; i += 4) {
+    t = _mm_load_pd (b + i);
    sum =
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i),
            _mm_unpacklo_pd (t, t)));
@ -191,7 +138,7 @@ inner_product_gdouble_2_sse (gdouble * o, const gdouble * a, const gdouble * b,
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 2),
            _mm_unpackhi_pd (t, t)));

-    t = _mm_loadu_pd (b + i + 2);
+    t = _mm_load_pd (b + i + 2);
    sum =
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 4),
            _mm_unpacklo_pd (t, t)));
@ -199,34 +146,29 @@ inner_product_gdouble_2_sse (gdouble * o, const gdouble * a, const gdouble * b,
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 6),
            _mm_unpackhi_pd (t, t)));
  }
-  _mm_store_pd (r, sum);
-
-  for (; i < len; i++) {
-    r[0] += a[2 * i] * b[i];
-    r[1] += a[2 * i + 1] * b[i];
-  }
-  o[0] = r[0];
-  o[1] = r[1];
+  _mm_store_pd (o, sum);
 }
-#endif

-#ifdef HAVE_EMMINTRIN_H
-MAKE_RESAMPLE_FUNC (gint16, 1, sse);
+MAKE_RESAMPLE_FUNC (gint16, 1, sse2);
 MAKE_RESAMPLE_FUNC (gfloat, 1, sse);
-MAKE_RESAMPLE_FUNC (gdouble, 1, sse);
-MAKE_RESAMPLE_FUNC (gint16, 2, sse);
-MAKE_RESAMPLE_FUNC (gdouble, 2, sse);
+MAKE_RESAMPLE_FUNC (gdouble, 1, sse2);
+MAKE_RESAMPLE_FUNC (gint16, 2, sse2);
+MAKE_RESAMPLE_FUNC (gdouble, 2, sse2);
 #endif

 static void
 audio_resampler_check_x86 (const gchar *option)
 {
-  if (!strcmp (option, "sse2")) {
-    GST_DEBUG ("enable SSE2 optimisations");
-    resample_gint16_1 = resample_gint16_1_sse;
+#ifdef HAVE_EMMINTRIN_H
+  if (!strcmp (option, "sse")) {
+    GST_DEBUG ("enable SSE optimisations");
    resample_gfloat_1 = resample_gfloat_1_sse;
-    resample_gdouble_1 = resample_gdouble_1_sse;
-    resample_gint16_2 = resample_gint16_2_sse;
-    resample_gdouble_2 = resample_gdouble_2_sse;
+  } else if (!strcmp (option, "sse2")) {
+    GST_DEBUG ("enable SSE2 optimisations");
+    resample_gint16_1 = resample_gint16_1_sse2;
+    resample_gdouble_1 = resample_gdouble_1_sse2;
+    resample_gint16_2 = resample_gint16_2_sse2;
+    resample_gdouble_2 = resample_gdouble_2_sse2;
  }
+#endif
 }
--- a/gst-libs/gst/audio/audio-resampler.c
+++ b/gst-libs/gst/audio/audio-resampler.c
@ -48,6 +48,8 @@ typedef void (*DeinterleaveFunc) (GstAudioResampler * resampler,
    gpointer * sbuf, gpointer in[], gsize in_frames);
 typedef void (*MirrorFunc) (GstAudioResampler * resampler, gpointer * sbuf);

+#define MEM_ALIGN(m,a) ((gint8 *)((guintptr)((gint8 *)(m) + ((a)-1)) & ~((a)-1)))
+
 struct _GstAudioResampler
 {
  GstAudioResamplerMethod method;
@ -68,6 +70,8 @@ struct _GstAudioResampler
  guint n_taps;
  Tap *taps;
  gpointer coeff;
+  gpointer coeffmem;
+  gsize cstride;
  gpointer tmpcoeff;

  DeinterleaveFunc deinterleave;
@ -75,6 +79,7 @@ struct _GstAudioResampler
  ResampleFunc resample;

  guint blocks;
+  guint inc;
  gboolean filling;
  gint samp_inc;
  gint samp_frac;
@ -256,7 +261,7 @@ get_kaiser_tap (GstAudioResampler * resampler, gdouble x)

 #define CONVERT_TAPS(type, precision)                                   \
 G_STMT_START {                                                          \
-  type *taps = t->taps = (type *) resampler->coeff + j * n_taps;        \
+  type *taps = t->taps = (type *) ((gint8*)resampler->coeff + j * resampler->cstride);        \
  gdouble multiplier = (1 << precision);                                \
  gint i, j;                                                            \
  gdouble offset, l_offset, h_offset;                                   \
@ -338,14 +343,16 @@ make_taps (GstAudioResampler * resampler, Tap * t, gint j)
  switch (resampler->format) {
    case GST_AUDIO_FORMAT_F64:
    {
-      gdouble *taps = t->taps = (gdouble *) resampler->coeff + j * n_taps;
+      gdouble *taps = t->taps =
+          (gdouble *) ((gint8 *) resampler->coeff + j * resampler->cstride);
      for (l = 0; l < n_taps; l++)
        taps[l] = tmpcoeff[l] / weight;
      break;
    }
    case GST_AUDIO_FORMAT_F32:
    {
-      gfloat *taps = t->taps = (gfloat *) resampler->coeff + j * n_taps;
+      gfloat *taps = t->taps =
+          (gfloat *) ((gint8 *) resampler->coeff + j * resampler->cstride);
      for (l = 0; l < n_taps; l++)
        taps[l] = tmpcoeff[l] / weight;
      break;
@ -593,28 +600,18 @@ static void
 deinterleave_copy (GstAudioResampler * resampler, gpointer sbuf[],
    gpointer in[], gsize in_frames)
 {
-  gsize samples_avail = resampler->samples_avail;
-  gint bpf = resampler->bpf;
+  guint c, blocks = resampler->blocks;
+  gsize bytes_avail, in_bytes, bpf;

-  if (in == NULL)
-    memset ((guint8 *) sbuf[0] + samples_avail * bpf, 0, in_frames * bpf);
-  else
-    memcpy ((guint8 *) sbuf[0] + samples_avail * bpf, in[0], in_frames * bpf);
-}
+  bpf = resampler->bps * resampler->inc;
+  bytes_avail = resampler->samples_avail * bpf;
+  in_bytes = in_frames * bpf;

-static void
-deinterleave_copy_n (GstAudioResampler * resampler, gpointer sbuf[],
-    gpointer in[], gsize in_frames)
-{
-  guint c, channels = resampler->channels;
-  gsize samples_avail = resampler->samples_avail;
-  gint bps = resampler->bps;
-
-  for (c = 0; c < channels; c++) {
+  for (c = 0; c < blocks; c++) {
    if (in == NULL)
-      memset ((guint8 *) sbuf[c] + samples_avail * bps, 0, in_frames * bps);
+      memset ((guint8 *) sbuf[c] + bytes_avail, 0, in_bytes);
    else
-      memcpy ((guint8 *) sbuf[c] + samples_avail * bps, in[c], in_frames * bps);
+      memcpy ((guint8 *) sbuf[c] + bytes_avail, in[c], in_bytes);
  }
 }

@ -623,10 +620,10 @@ deinterleave_copy_n (GstAudioResampler * resampler, gpointer sbuf[],
 static void                                                             \
 mirror_ ##type (GstAudioResampler * resampler, gpointer sbuf[])         \
 {                                                                       \
-  guint i, c, channels = resampler->channels;                           \
+  guint i, c, blocks = resampler->blocks;                               \
  gint si = resampler->n_taps / 2;                                      \
  gint n_taps = resampler->n_taps;                                      \
-  for (c = 0; c < channels; c++) {                                      \
+  for (c = 0; c < blocks; c++) {                                        \
    type *s = sbuf[c];                                                  \
    for (i = 0; i < si; i++)                                            \
      s[i] = -s[n_taps - i];                                            \
@ -686,6 +683,7 @@ resampler_calculate_taps (GstAudioResampler * resampler)
  gint n_taps;
  gint out_rate;
  gint in_rate;
+  gboolean non_interleaved;

  switch (resampler->method) {
    case GST_AUDIO_RESAMPLER_METHOD_NEAREST:
@ -729,7 +727,12 @@ resampler_calculate_taps (GstAudioResampler * resampler)
  GST_LOG ("using n_taps %d cutoff %f", n_taps, resampler->cutoff);

  resampler->taps = g_realloc_n (resampler->taps, out_rate, sizeof (Tap));
-  resampler->coeff = g_realloc_n (resampler->coeff, out_rate, bps * n_taps);
+
+  resampler->cstride = GST_ROUND_UP_32 (bps * (n_taps + 16));
+  g_free (resampler->coeffmem);
+  resampler->coeffmem = g_malloc0 (out_rate * resampler->cstride + 31);
+  resampler->coeff = MEM_ALIGN (resampler->coeffmem, 32);
+
  resampler->tmpcoeff =
      g_realloc_n (resampler->tmpcoeff, n_taps, sizeof (gdouble));

@ -743,13 +746,22 @@ resampler_calculate_taps (GstAudioResampler * resampler)
    t->next_phase = (j + in_rate) % out_rate;
  }

+  non_interleaved =
+      (resampler->flags & GST_AUDIO_RESAMPLER_FLAG_NON_INTERLEAVED);
+
+  resampler->ostride = non_interleaved ? 1 : resampler->channels;
+
+  /* we resample each channel separately */
  resampler->blocks = resampler->channels;
+  resampler->inc = 1;
+
  switch (resampler->format) {
    case GST_AUDIO_FORMAT_F64:
-      if (resampler->channels == 2 && n_taps >= 4) {
+      if (!non_interleaved && resampler->channels == 2 && n_taps >= 4) {
        resampler->resample = resample_gdouble_2;
        resampler->deinterleave = deinterleave_copy;
        resampler->blocks = 1;
+        resampler->inc = resampler->channels;;
      } else {
        resampler->resample = resample_gdouble_1;
        resampler->deinterleave = deinterleave_gdouble;
@ -767,10 +779,11 @@ resampler_calculate_taps (GstAudioResampler * resampler)
      resampler->mirror = mirror_gint32;
      break;
    case GST_AUDIO_FORMAT_S16:
-      if (resampler->channels == 2 && n_taps >= 4) {
+      if (!non_interleaved && resampler->channels == 2 && n_taps >= 4) {
        resampler->resample = resample_gint16_2;
        resampler->deinterleave = deinterleave_copy;
        resampler->blocks = 1;
+        resampler->inc = resampler->channels;;
      } else {
        resampler->resample = resample_gint16_1;
        resampler->deinterleave = deinterleave_gint16;
@ -780,12 +793,6 @@ resampler_calculate_taps (GstAudioResampler * resampler)
    default:
      break;
  }
-  if (resampler->flags & GST_AUDIO_RESAMPLER_FLAG_NON_INTERLEAVED) {
-    resampler->deinterleave = deinterleave_copy_n;
-    resampler->ostride = 1;
-  } else {
-    resampler->ostride = resampler->channels;
-  }
 }

 #define PRINT_TAPS(type,print)                  \
@ -1017,7 +1024,7 @@ gst_audio_resampler_free (GstAudioResampler * resampler)
  g_return_if_fail (resampler != NULL);

  g_free (resampler->taps);
-  g_free (resampler->coeff);
+  g_free (resampler->coeffmem);
  g_free (resampler->tmpcoeff);
  g_free (resampler->samples);
  g_free (resampler->sbuf);
@ -1119,16 +1126,28 @@ static inline gpointer *
 get_sample_bufs (GstAudioResampler * resampler, gsize need)
 {
  if (resampler->samples_len < need) {
-    guint c, channels = resampler->channels;
+    guint c, blocks = resampler->blocks;
+    gsize bytes, bpf;
+    gint8 *ptr;
+
    GST_LOG ("realloc %d -> %d", (gint) resampler->samples_len, (gint) need);
+
+    bpf = resampler->bps * resampler->inc;
+
+    bytes = (need + 8) * bpf;
+    bytes = GST_ROUND_UP_32 (bytes);
+
    /* FIXME, move history */
-    resampler->samples = g_realloc (resampler->samples, need * resampler->bpf);
-    resampler->samples_len = need;
+    resampler->samples =
+        g_realloc (resampler->samples, resampler->blocks * bytes + 31);
+    resampler->samples_len = bytes / bpf;
+
+    ptr = MEM_ALIGN (resampler->samples, 32);
+
    /* set up new pointers */
-    for (c = 0; c < channels; c++)
-      resampler->sbuf[c] =
-          (gint8 *) resampler->samples +
-          (c * resampler->samples_len * resampler->bps);
+    for (c = 0; c < blocks; c++) {
+      resampler->sbuf[c] = ptr + (c * bytes);
+    }
  }
  return resampler->sbuf;
 }