From 010b9547d3832a5c3d71f73726f149b3b023882d Mon Sep 17 00:00:00 2001
From: Petr Kulhavy <brain@jikos.cz>
Date: Wed, 19 Oct 2016 12:21:37 +0200
Subject: [PATCH] audio-converter: optimize endian conversion

Optimize LE<->BE conversion by adding a dedicated fast path instead of
using the generic converter. Implement transform_ip function in order to do the
endian swap in place.

This saves buffer allocation for the intermediate format, can be done in place
and also performs the conversion in one step instead of unpack-convert-pack.

For all bit widths the naive algorithm is implemented, which provides the best
performance when compiled with -O3. ORC was considered but eventually removed
as it requires a dedicated function for in-place conversion (due to the
"restrict" parameters).

A more complex algorithm for the 24-bit conversion with unrolled loop and
32-bit processing is implemented in the #if 0 section. It performs better if
compiled with -O2. With -O3 however the naive algorithm performs better.

https://bugzilla.gnome.org/show_bug.cgi?id=773073
---
 gst-libs/gst/audio/audio-converter.c | 234 +++++++++++++++++++++++++--
 gst-libs/gst/audio/audio-converter.h |   2 +
 gst/audioconvert/gstaudioconvert.c   |  15 ++
 3 files changed, 241 insertions(+), 10 deletions(-)

diff --git a/gst-libs/gst/audio/audio-converter.c b/gst-libs/gst/audio/audio-converter.c
index 61cd603929..2d6ad72476 100644
--- a/gst-libs/gst/audio/audio-converter.c
+++ b/gst-libs/gst/audio/audio-converter.c
@@ -80,6 +80,8 @@ typedef void (*AudioConvertFunc) (gpointer dst, const gpointer src, gint count);
 typedef gboolean (*AudioConvertSamplesFunc) (GstAudioConverter * convert,
     GstAudioConverterFlags flags, gpointer in[], gsize in_frames,
     gpointer out[], gsize out_frames);
+typedef void (*AudioConvertEndianFunc) (gpointer dst, const gpointer src,
+    gint count);
 
 /*                           int/int    int/float  float/int float/float
  *
@@ -113,6 +115,8 @@ struct _GstAudioConverter
   gpointer *out_data;
   gsize out_frames;
 
+  gboolean in_place;            /* the conversion can be done in place; returned by gst_audio_converter_supports_inplace() */
+
   /* unpack */
   gboolean in_default;
   gboolean unpack_ip;
@@ -137,6 +141,9 @@ struct _GstAudioConverter
   gboolean out_default;
   AudioChain *chain_end;        /* NULL for empty chain or points to the last element in the chain */
 
+  /* endian swap */
+  AudioConvertEndianFunc swap_endian;
+
   AudioConvertSamplesFunc convert;
 };
 
@@ -826,6 +833,12 @@ converter_passthrough (GstAudioConverter * convert,
   AudioChain *chain;
   gsize samples;
 
+  /* in-place passthrough -> do nothing */
+  if (in == out) {
+    g_assert (convert->in_place);
+    return TRUE;
+  }
+
   chain = convert->chain_end;
 
   samples = in_frames * chain->inc;
@@ -847,6 +860,152 @@ converter_passthrough (GstAudioConverter * convert,
   return TRUE;
 }
 
+/* perform LE<->BE conversion on a block of @count 16-bit samples
+ * dst may equal src for in-place conversion
+ */
+static void
+converter_swap_endian_16 (gpointer dst, const gpointer src, gint count)
+{
+  guint16 *out = dst;
+  const guint16 *in = src;
+  gint i;
+
+  for (i = 0; i < count; i++)
+    out[i] = GUINT16_SWAP_LE_BE (in[i]);
+}
+
+/* perform LE<->BE conversion on a block of @count 24-bit samples
+ * dst may equal src for in-place conversion
+ *
+ * naive algorithm, which performs better with -O3 and worse with -O2
+ * than the commented out optimized algorithm below
+ */
+static void
+converter_swap_endian_24 (gpointer dst, const gpointer src, gint count)
+{
+  guint8 *out = dst;
+  const guint8 *in = src;
+  gint i;
+
+  count *= 3;
+
+  for (i = 0; i < count; i += 3) {
+    guint8 x = in[i + 0];
+    out[i + 0] = in[i + 2];
+    out[i + 1] = in[i + 1];
+    out[i + 2] = x;
+  }
+}
+
+/* the below code performs better with -O2 but worse with -O3 */
+#if 0
+/* perform LE<->BE conversion on a block of @count 24-bit samples
+ * dst may equal src for in-place conversion
+ *
+ * assumes that dst and src are 32-bit aligned
+ */
+static void
+converter_swap_endian_24 (gpointer dst, const gpointer src, gint count)
+{
+  guint32 *out = dst;
+  const guint32 *in = src;
+  guint8 *out8;
+  const guint8 *in8;
+  gint i;
+
+  /* first convert 24-bit samples in multiples of 4 reading 3x 32-bits in one cycle
+   *
+   * input:               A1 B1 C1 A2 , B2 C2 A3 B3 , C3 A4 B4 C4
+   * 32-bit endian swap:  A2 C1 B1 A1 , B3 A3 C2 B2 , C4 B4 A4 C3
+   *                      <--  x  -->   <--  y  --> , <--  z  -->
+   *
+   * desired output:      C1 B1 A1 C2 , B2 A2 C3 B3 , A3 C4 B4 A4
+   */
+  for (i = 0; i < count / 4; i++, in += 3, out += 3) {
+    guint32 x, y, z;
+
+    x = GUINT32_SWAP_LE_BE (in[0]);
+    y = GUINT32_SWAP_LE_BE (in[1]);
+    z = GUINT32_SWAP_LE_BE (in[2]);
+
+#if G_BYTE_ORDER == G_BIG_ENDIAN
+    out[0] = (x << 8) + ((y >> 8) & 0xff);
+    out[1] = (in[1] & 0xff0000ff) + ((x >> 8) & 0xff0000) + ((z << 8) & 0xff00);
+    out[2] = (z >> 8) + ((y << 8) & 0xff000000);
+#else
+    out[0] = (x >> 8) + ((y << 8) & 0xff000000);
+    out[1] = (in[1] & 0xff0000ff) + ((x << 8) & 0xff00) + ((z >> 8) & 0xff0000);
+    out[2] = (z << 8) + ((y >> 8) & 0xff);
+#endif
+  }
+
+  /* convert the remainder less efficiently */
+  for (out8 = (guint8 *) out, in8 = (const guint8 *) in, i = 0; i < (count & 3);
+      i++) {
+    guint8 x = in8[i + 0];
+    out8[i + 0] = in8[i + 2];
+    out8[i + 1] = in8[i + 1];
+    out8[i + 2] = x;
+  }
+}
+#endif
+
+/* perform LE<->BE conversion on a block of @count 32-bit samples
+ * dst may equal src for in-place conversion
+ */
+static void
+converter_swap_endian_32 (gpointer dst, const gpointer src, gint count)
+{
+  guint32 *out = dst;
+  const guint32 *in = src;
+  gint i;
+
+  for (i = 0; i < count; i++)
+    out[i] = GUINT32_SWAP_LE_BE (in[i]);
+}
+
+/* perform LE<->BE conversion on a block of @count 64-bit samples
+ * dst may equal src for in-place conversion
+ */
+static void
+converter_swap_endian_64 (gpointer dst, const gpointer src, gint count)
+{
+  guint64 *out = dst;
+  const guint64 *in = src;
+  gint i;
+
+  for (i = 0; i < count; i++)
+    out[i] = GUINT64_SWAP_LE_BE (in[i]);
+}
+
+/* the worker function to perform endian-conversion only
+ * assuming finfo and foutinfo have the same depth
+ */
+static gboolean
+converter_endian (GstAudioConverter * convert,
+    GstAudioConverterFlags flags, gpointer in[], gsize in_frames,
+    gpointer out[], gsize out_frames)
+{
+  gint i;
+  AudioChain *chain;
+  gsize samples;
+
+  chain = convert->chain_end;
+  samples = in_frames * chain->inc;
+
+  GST_LOG ("convert endian: %" G_GSIZE_FORMAT " / %" G_GSIZE_FORMAT " samples",
+      in_frames, samples);
+
+  if (in) {
+    for (i = 0; i < chain->blocks; i++)
+      convert->swap_endian (out[i], in[i], samples);
+  } else {
+    for (i = 0; i < chain->blocks; i++)
+      gst_audio_format_fill_silence (convert->in.finfo, out[i], samples);
+  }
+  return TRUE;
+}
+
 static gboolean
 converter_generic (GstAudioConverter * convert,
     GstAudioConverterFlags flags, gpointer in[], gsize in_frames,
@@ -889,6 +1048,14 @@ converter_resample (GstAudioConverter * convert,
   return TRUE;
 }
 
+#define GST_AUDIO_FORMAT_IS_ENDIAN_CONVERSION(info1, info2) \
+		( \
+			!(((info1)->flags ^ (info2)->flags) & (~GST_AUDIO_FORMAT_FLAG_UNPACK)) && \
+			(info1)->endianness != (info2)->endianness && \
+			(info1)->width == (info2)->width && \
+			(info1)->depth == (info2)->depth \
+		)
+
 /**
  * gst_audio_converter_new: (skip)
  * @flags: extra #GstAudioConverterFlags
@@ -950,18 +1117,50 @@ gst_audio_converter_new (GstAudioConverterFlags flags, GstAudioInfo * in_info,
   convert->chain_end = chain_pack (convert, prev);
 
   convert->convert = converter_generic;
+  convert->in_place = FALSE;
 
   /* optimize */
-  if (out_info->finfo->format == in_info->finfo->format
-      && convert->mix_passthrough) {
-    if (convert->resampler == NULL) {
-      GST_INFO
-          ("same formats, no resampler and passthrough mixing -> passthrough");
-      convert->convert = converter_passthrough;
-    } else {
-      if (is_intermediate_format (in_info->finfo->format)) {
-        GST_INFO ("same formats, and passthrough mixing -> only resampling");
-        convert->convert = converter_resample;
+  if (convert->mix_passthrough) {
+    if (out_info->finfo->format == in_info->finfo->format) {
+      if (convert->resampler == NULL) {
+        GST_INFO
+            ("same formats, no resampler and passthrough mixing -> passthrough");
+        convert->convert = converter_passthrough;
+        convert->in_place = TRUE;
+      } else {
+        if (is_intermediate_format (in_info->finfo->format)) {
+          GST_INFO ("same formats, and passthrough mixing -> only resampling");
+          convert->convert = converter_resample;
+        }
+      }
+    } else if (GST_AUDIO_FORMAT_IS_ENDIAN_CONVERSION (out_info->finfo,
+            in_info->finfo)) {
+      if (convert->resampler == NULL) {
+        GST_INFO ("no resampler, passthrough mixing -> only endian conversion");
+        convert->convert = converter_endian;
+        convert->in_place = TRUE;
+
+        switch (GST_AUDIO_INFO_BPS (in_info)) {
+          case 2:
+            GST_DEBUG ("initializing 16-bit endian conversion");
+            convert->swap_endian = converter_swap_endian_16;
+            break;
+          case 3:
+            GST_DEBUG ("initializing 24-bit endian conversion");
+            convert->swap_endian = converter_swap_endian_24;
+            break;
+          case 4:
+            GST_DEBUG ("initializing 32-bit endian conversion");
+            convert->swap_endian = converter_swap_endian_32;
+            break;
+          case 8:
+            GST_DEBUG ("initializing 64-bit endian conversion");
+            convert->swap_endian = converter_swap_endian_64;
+            break;
+          default:
+            GST_ERROR ("unsupported sample width for endian conversion");
+            g_assert_not_reached ();
+        }
       }
     }
   }
@@ -1129,3 +1328,18 @@ gst_audio_converter_samples (GstAudioConverter * convert,
   }
   return convert->convert (convert, flags, in, in_frames, out, out_frames);
 }
+
+/**
+ * gst_audio_converter_supports_inplace
+ * @convert: a #GstAudioConverter
+ *
+ * Returns whether the audio converter can perform the conversion in-place.
+ * The return value would be typically input to gst_base_transform_set_in_place()
+ *
+ * Returns: %TRUE when the conversion can be done in place.
+ */
+gboolean
+gst_audio_converter_supports_inplace (GstAudioConverter * convert)
+{
+  return convert->in_place;
+}
diff --git a/gst-libs/gst/audio/audio-converter.h b/gst-libs/gst/audio/audio-converter.h
index 1f6ddc5d1f..672d30c33c 100644
--- a/gst-libs/gst/audio/audio-converter.h
+++ b/gst-libs/gst/audio/audio-converter.h
@@ -108,4 +108,6 @@ gboolean             gst_audio_converter_samples         (GstAudioConverter * co
                                                           gpointer in[], gsize in_frames,
                                                           gpointer out[], gsize out_frames);
 
+gboolean             gst_audio_converter_supports_inplace (GstAudioConverter *convert);
+
 #endif /* __GST_AUDIO_CONVERTER_H__ */
diff --git a/gst/audioconvert/gstaudioconvert.c b/gst/audioconvert/gstaudioconvert.c
index cdc7778c76..a1090ec11c 100644
--- a/gst/audioconvert/gstaudioconvert.c
+++ b/gst/audioconvert/gstaudioconvert.c
@@ -85,6 +85,8 @@ static gboolean gst_audio_convert_set_caps (GstBaseTransform * base,
     GstCaps * incaps, GstCaps * outcaps);
 static GstFlowReturn gst_audio_convert_transform (GstBaseTransform * base,
     GstBuffer * inbuf, GstBuffer * outbuf);
+static GstFlowReturn gst_audio_convert_transform_ip (GstBaseTransform * base,
+    GstBuffer * buf);
 static gboolean gst_audio_convert_transform_meta (GstBaseTransform * trans,
     GstBuffer * outbuf, GstMeta * meta, GstBuffer * inbuf);
 static GstFlowReturn gst_audio_convert_submit_input_buffer (GstBaseTransform *
@@ -176,6 +178,8 @@ gst_audio_convert_class_init (GstAudioConvertClass * klass)
       GST_DEBUG_FUNCPTR (gst_audio_convert_set_caps);
   basetransform_class->transform =
       GST_DEBUG_FUNCPTR (gst_audio_convert_transform);
+  basetransform_class->transform_ip =
+      GST_DEBUG_FUNCPTR (gst_audio_convert_transform_ip);
   basetransform_class->transform_meta =
       GST_DEBUG_FUNCPTR (gst_audio_convert_transform_meta);
   basetransform_class->submit_input_buffer =
@@ -646,6 +650,7 @@ gst_audio_convert_set_caps (GstBaseTransform * base, GstCaps * incaps,
   GstAudioConvert *this = GST_AUDIO_CONVERT (base);
   GstAudioInfo in_info;
   GstAudioInfo out_info;
+  gboolean in_place;
 
   GST_DEBUG_OBJECT (base, "incaps %" GST_PTR_FORMAT ", outcaps %"
       GST_PTR_FORMAT, incaps, outcaps);
@@ -667,6 +672,9 @@ gst_audio_convert_set_caps (GstBaseTransform * base, GstCaps * incaps,
           GST_AUDIO_CONVERTER_OPT_NOISE_SHAPING_METHOD,
           GST_TYPE_AUDIO_NOISE_SHAPING_METHOD, this->ns, NULL));
 
+  in_place = gst_audio_converter_supports_inplace (this->convert);
+  gst_base_transform_set_in_place (base, in_place);
+
   if (this->convert == NULL)
     goto no_converter;
 
@@ -693,6 +701,7 @@ no_converter:
   }
 }
 
+/* if called through gst_audio_convert_transform_ip() inbuf == outbuf */
 static GstFlowReturn
 gst_audio_convert_transform (GstBaseTransform * base, GstBuffer * inbuf,
     GstBuffer * outbuf)
@@ -775,6 +784,12 @@ convert_error:
   }
 }
 
+static GstFlowReturn
+gst_audio_convert_transform_ip (GstBaseTransform * base, GstBuffer * buf)
+{
+  return gst_audio_convert_transform (base, buf, buf);
+}
+
 static gboolean
 gst_audio_convert_transform_meta (GstBaseTransform * trans, GstBuffer * outbuf,
     GstMeta * meta, GstBuffer * inbuf)