audioresample: Separate out CFLAGS used for SSE* code

This makes sure that we only build files that need explicit SIMD support with the relevant CFLAGS. This allows the rest of the code to be built without them, and the SSE*-specific code is only called after runtime checks for CPU features. https://bugzilla.gnome.org/show_bug.cgi?id=729276
2025-04-16 04:54:12 +00:00 · 2016-09-28 17:37:38 +05:30 · 2016-09-28 17:37:38 +05:30 · 4b5f78337a
commit 4b5f78337a
parent f4cba79063
13 changed files with 1196 additions and 788 deletions
--- a/configure.ac
+++ b/configure.ac
@ -179,6 +179,30 @@ dnl check for GCC specific SSE headers
 dnl these are used by the speex resampler code
 AC_CHECK_HEADERS([xmmintrin.h emmintrin.h smmintrin.h])

+dnl also check which architecture we're on for building files with intrinsics
+dnl separately. HAVE_X86 is reset first so that a value inherited from the
+dnl environment cannot switch on the x86-only build rules by accident.
+HAVE_X86=0
+AC_CHECK_DECLS([__i386__], [HAVE_X86=1])
+AC_CHECK_DECLS([__x86_64__], [HAVE_X86=1])
+
+dnl check for -m* compiler flags too
+dnl (HAVE_SSE* end up as 1 or 0 depending on which AS_COMPILER_FLAG action
+dnl runs; presumably the first one runs when the compiler accepts the flag --
+dnl confirm against the AS_COMPILER_FLAG definition)
+SSE_CFLAGS="-msse"
+SSE2_CFLAGS="-msse2"
+SSE41_CFLAGS="-msse4.1"
+
+AS_COMPILER_FLAG([$SSE_CFLAGS], [HAVE_SSE=1], [HAVE_SSE=0])
+AS_COMPILER_FLAG([$SSE2_CFLAGS], [HAVE_SSE2=1], [HAVE_SSE2=0])
+AS_COMPILER_FLAG([$SSE41_CFLAGS], [HAVE_SSE41=1], [HAVE_SSE41=0])
+
+dnl Makefile conditional guarding the SSE* convenience libraries
+AM_CONDITIONAL(HAVE_X86, [test "x${HAVE_X86}" = "x1"])
+
+dnl These macros are always defined, with the value 0 or 1, so C code has to
+dnl test them with #if rather than #ifdef.
+AC_DEFINE_UNQUOTED(HAVE_SSE, [$HAVE_SSE], [SSE support is enabled])
+AC_DEFINE_UNQUOTED(HAVE_SSE2, [$HAVE_SSE2], [SSE2 support is enabled])
+AC_DEFINE_UNQUOTED(HAVE_SSE41, [$HAVE_SSE41], [SSE4.1 support is enabled])
+
+dnl make the per-file flags available to Makefile.am
+AC_SUBST(SSE_CFLAGS)
+AC_SUBST(SSE2_CFLAGS)
+AC_SUBST(SSE41_CFLAGS)
+
+
 dnl used in gst/tcp
 AC_CHECK_HEADERS([sys/socket.h],
  [HAVE_SYS_SOCKET_H="yes"], [HAVE_SYS_SOCKET_H="no"], [AC_INCLUDES_DEFAULT])
--- a/gst-libs/gst/audio/Makefile.am
+++ b/gst-libs/gst/audio/Makefile.am
@ -82,8 +82,12 @@ nodist_libgstaudio_@GST_API_VERSION@include_HEADERS = \
 	audio-enumtypes.h

 noinst_HEADERS = \
-	gstaudioutilsprivate.h \
-	audio-resampler-x86.h \
+	gstaudioutilsprivate.h 		\
+	audio-resampler-private.h 	\
+	audio-resampler-macros.h 	\
+	audio-resampler-x86.h 		\
+	audio-resampler-x86-sse.h	\
+	audio-resampler-x86-sse2.h	\
 	audio-resampler-neon.h

 libgstaudio_@GST_API_VERSION@_la_CFLAGS = $(GST_PLUGINS_BASE_CFLAGS) $(GST_BASE_CFLAGS) $(GST_CFLAGS) \
@ -93,6 +97,50 @@ libgstaudio_@GST_API_VERSION@_la_LIBADD = \
  $(GST_BASE_LIBS) $(GST_LIBS) $(LIBM) $(ORC_LIBS)
 libgstaudio_@GST_API_VERSION@_la_LDFLAGS = $(GST_LIB_LDFLAGS) $(GST_ALL_LDFLAGS) $(GST_LT_LDFLAGS)

+
+# Arch-specific bits
+#
+# Each SIMD flavour lives in its own private libtool library so that only its
+# single source file is compiled with the matching SSE*_CFLAGS; the library is
+# then added to the main library through LIBADD.
+
+noinst_LTLIBRARIES =
+
+if HAVE_X86
+# The private libs only get GST_LIB_LDFLAGS and GST_ALL_LDFLAGS: the full
+# GST_LT_LDFLAGS contains things like -version-info that cause a warning on
+# private libs.
+
+# SSE
+noinst_LTLIBRARIES += libaudio_resampler_sse.la
+libaudio_resampler_sse_la_SOURCES = audio-resampler-x86-sse.c
+libaudio_resampler_sse_la_CFLAGS = $(libgstaudio_@GST_API_VERSION@_la_CFLAGS) $(SSE_CFLAGS)
+libaudio_resampler_sse_la_LDFLAGS = $(GST_LIB_LDFLAGS) $(GST_ALL_LDFLAGS)
+libgstaudio_@GST_API_VERSION@_la_LIBADD += libaudio_resampler_sse.la
+
+# SSE2
+noinst_LTLIBRARIES += libaudio_resampler_sse2.la
+libaudio_resampler_sse2_la_SOURCES = audio-resampler-x86-sse2.c
+libaudio_resampler_sse2_la_CFLAGS = $(libgstaudio_@GST_API_VERSION@_la_CFLAGS) $(SSE2_CFLAGS)
+libaudio_resampler_sse2_la_LDFLAGS = $(GST_LIB_LDFLAGS) $(GST_ALL_LDFLAGS)
+libgstaudio_@GST_API_VERSION@_la_LIBADD += libaudio_resampler_sse2.la
+
+# SSE4.1
+noinst_LTLIBRARIES += libaudio_resampler_sse41.la
+libaudio_resampler_sse41_la_SOURCES = audio-resampler-x86-sse41.c
+libaudio_resampler_sse41_la_CFLAGS = $(libgstaudio_@GST_API_VERSION@_la_CFLAGS) $(SSE41_CFLAGS)
+libaudio_resampler_sse41_la_LDFLAGS = $(GST_LIB_LDFLAGS) $(GST_ALL_LDFLAGS)
+libgstaudio_@GST_API_VERSION@_la_LIBADD += libaudio_resampler_sse41.la
+
+endif
+
+
+# Introspection
+
 include $(top_srcdir)/common/gst-glib-gen.mak

 if HAVE_INTROSPECTION
--- a/gst-libs/gst/audio/audio-resampler-macros.h
+++ b/gst-libs/gst/audio/audio-resampler-macros.h
@ -0,0 +1,108 @@
+/* GStreamer
+ * Copyright (C) <2015> Wim Taymans <wim.taymans@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef __GST_AUDIO_RESAMPLER_MACROS_H__
+#define __GST_AUDIO_RESAMPLER_MACROS_H__
+
+#include <string.h>
+
+#include "audio-resampler-private.h"
+
+#define PRECISION_S16 15
+#define PRECISION_S32 31
+
+/* Expands to the prototype of get_taps_<type>_full(); no body is emitted.
+ * This is the function MAKE_RESAMPLE_FUNC calls for the `full` variants: it
+ * receives the current sample index and phase by pointer plus a 4-element
+ * icoeff scratch array, and returns the taps handed to the inner product. */
+#define DECL_GET_TAPS_FULL_FUNC(type)                           \
+gpointer                                                        \
+get_taps_##type##_full (GstAudioResampler * resampler,          \
+    gint *samp_index, gint *samp_phase, type icoeff[4])
+
+/* one prototype per supported sample type */
+DECL_GET_TAPS_FULL_FUNC (gint16);
+DECL_GET_TAPS_FULL_FUNC (gint32);
+DECL_GET_TAPS_FULL_FUNC (gfloat);
+DECL_GET_TAPS_FULL_FUNC (gdouble);
+
+
+/* Expands to the prototype of get_taps_<type>_<inter>(); no body is emitted.
+ * Same parameter list as the `full` variant; `inter` is `linear` or `cubic`
+ * in the declarations below.
+ *
+ * The last line of the definition deliberately carries no line continuation,
+ * so the macro ends here no matter what follows it. */
+#define DECL_GET_TAPS_INTERPOLATE_FUNC(type, inter)             \
+gpointer                                                        \
+get_taps_##type##_##inter (GstAudioResampler * resampler,       \
+    gint *samp_index, gint *samp_phase, type icoeff[4])
+
+DECL_GET_TAPS_INTERPOLATE_FUNC (gint16, linear);
+DECL_GET_TAPS_INTERPOLATE_FUNC (gint32, linear);
+DECL_GET_TAPS_INTERPOLATE_FUNC (gfloat, linear);
+DECL_GET_TAPS_INTERPOLATE_FUNC (gdouble, linear);
+
+DECL_GET_TAPS_INTERPOLATE_FUNC (gint16, cubic);
+DECL_GET_TAPS_INTERPOLATE_FUNC (gint32, cubic);
+DECL_GET_TAPS_INTERPOLATE_FUNC (gfloat, cubic);
+DECL_GET_TAPS_INTERPOLATE_FUNC (gdouble, cubic);
+
+
+/* Expands to the declarator of resample_<type>_<inter>_<channels>_<arch>()
+ * without a trailing semicolon or body, so it can be used both for
+ * prototypes (append `;`) and as the head of a definition (see
+ * MAKE_RESAMPLE_FUNC). The parameter list is the same as the ResampleFunc
+ * typedef in audio-resampler-private.h. */
+#define DECL_RESAMPLE_FUNC(type,inter,channels,arch)                    \
+void                                                                    \
+resample_ ##type## _ ##inter## _ ##channels## _ ##arch (GstAudioResampler * resampler,      \
+    gpointer in[], gsize in_len,  gpointer out[], gsize out_len,        \
+    gsize * consumed)
+
+/* Defines resample_<type>_<inter>_<channels>_<arch>().
+ *
+ * inner_product_<type>_<inter>_<channels>_<arch>() must be visible where the
+ * macro is expanded. For each of resampler->blocks input arrays the generated
+ * function produces out_len output samples: get_taps_<type>_<inter>() returns
+ * the taps for the current position (and advances samp_index/samp_phase
+ * through the pointers it is given), then the inner product writes one
+ * sample to the output. Afterwards the input that was not consumed is moved
+ * to the start of the array, *consumed receives how far samp_index advanced,
+ * and the position is stored back into the resampler with samp_index reset
+ * to 0.
+ *
+ * The output loop counter is a gsize like out_len, which avoids a
+ * signed/unsigned comparison and a counter overflow for very large out_len.
+ */
+#define MAKE_RESAMPLE_FUNC(type,inter,channels,arch)            \
+DECL_RESAMPLE_FUNC (type, inter, channels, arch)                \
+{                                                               \
+  gint c;                                                       \
+  gsize di;                                                     \
+  gint n_taps = resampler->n_taps;                              \
+  gint blocks = resampler->blocks;                              \
+  gint ostride = resampler->ostride;                            \
+  gint taps_stride = resampler->taps_stride;                    \
+  gint samp_index = 0;                                          \
+  gint samp_phase = 0;                                          \
+                                                                \
+  for (c = 0; c < blocks; c++) {                                \
+    type *ip = in[c];                                           \
+    type *op = ostride == 1 ? out[c] : (type *)out[0] + c;      \
+                                                                \
+    /* every block starts from the same stored position */      \
+    samp_index = resampler->samp_index;                         \
+    samp_phase = resampler->samp_phase;                         \
+                                                                \
+    for (di = 0; di < out_len; di++) {                          \
+      type *ipp, icoeff[4], *taps;                              \
+                                                                \
+      /* taken before get_taps advances samp_index */           \
+      ipp = &ip[samp_index * channels];                         \
+                                                                \
+      taps = get_taps_ ##type##_##inter                         \
+              (resampler, &samp_index, &samp_phase, icoeff);    \
+      inner_product_ ##type##_##inter##_##channels##_##arch     \
+              (op, ipp, taps, n_taps, icoeff, taps_stride);     \
+      op += ostride;                                            \
+    }                                                           \
+    /* move the input that was not consumed to the front */     \
+    if (in_len > samp_index)                                    \
+      memmove (ip, &ip[samp_index * channels],                  \
+          (in_len - samp_index) * sizeof(type) * channels);     \
+  }                                                             \
+  *consumed = samp_index - resampler->samp_index;               \
+                                                                \
+  resampler->samp_index = 0;                                    \
+  resampler->samp_phase = samp_phase;                           \
+}
+
+/* Same as DECL_RESAMPLE_FUNC, but the function gets internal linkage. */
+#define DECL_RESAMPLE_FUNC_STATIC(type,inter,channels,arch)     \
+static DECL_RESAMPLE_FUNC (type, inter, channels, arch)
+
+/* Same as MAKE_RESAMPLE_FUNC, but the function gets internal linkage; meant
+ * for implementations that are defined in a header and included directly. */
+#define MAKE_RESAMPLE_FUNC_STATIC(type,inter,channels,arch)     \
+static MAKE_RESAMPLE_FUNC (type, inter, channels, arch)
+
+#endif /* __GST_AUDIO_RESAMPLER_MACROS_H__ */
--- a/gst-libs/gst/audio/audio-resampler-neon.h
+++ b/gst-libs/gst/audio/audio-resampler-neon.h
@ -650,17 +650,17 @@ interpolate_gfloat_cubic_neon (gpointer op, const gpointer ap,
                    "q10", "q11", "q12", "q13", "q14", "q15", "memory");
 }

-MAKE_RESAMPLE_FUNC (gint16, full, 1, neon);
-MAKE_RESAMPLE_FUNC (gint16, linear, 1, neon);
-MAKE_RESAMPLE_FUNC (gint16, cubic, 1, neon);
+/* The NEON resample functions are defined in this header, so they are
+ * instantiated with the _STATIC variant, which prefixes each definition with
+ * `static`. */
+MAKE_RESAMPLE_FUNC_STATIC (gint16, full, 1, neon);
+MAKE_RESAMPLE_FUNC_STATIC (gint16, linear, 1, neon);
+MAKE_RESAMPLE_FUNC_STATIC (gint16, cubic, 1, neon);

-MAKE_RESAMPLE_FUNC (gint32, full, 1, neon);
-MAKE_RESAMPLE_FUNC (gint32, linear, 1, neon);
-MAKE_RESAMPLE_FUNC (gint32, cubic, 1, neon);
+MAKE_RESAMPLE_FUNC_STATIC (gint32, full, 1, neon);
+MAKE_RESAMPLE_FUNC_STATIC (gint32, linear, 1, neon);
+MAKE_RESAMPLE_FUNC_STATIC (gint32, cubic, 1, neon);

-MAKE_RESAMPLE_FUNC (gfloat, full, 1, neon);
-MAKE_RESAMPLE_FUNC (gfloat, linear, 1, neon);
-MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, neon);
+MAKE_RESAMPLE_FUNC_STATIC (gfloat, full, 1, neon);
+MAKE_RESAMPLE_FUNC_STATIC (gfloat, linear, 1, neon);
+MAKE_RESAMPLE_FUNC_STATIC (gfloat, cubic, 1, neon);

 static void
 audio_resampler_check_neon (const gchar *option)
--- a/gst-libs/gst/audio/audio-resampler-private.h
+++ b/gst-libs/gst/audio/audio-resampler-private.h
@ -0,0 +1,113 @@
+/* GStreamer
+ * Copyright (C) <2015> Wim Taymans <wim.taymans@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef __GST_AUDIO_RESAMPLER_PRIVATE_H__
+#define __GST_AUDIO_RESAMPLER_PRIVATE_H__
+
+#include "audio-resampler.h"
+
+/* Contains a collection of all things found in other resamplers:
+ * speex (filter construction, optimizations), ffmpeg (fixed phase filter, blackman filter),
+ * SRC (linear interpolation, fixed precomputed tables),...
+ *
+ *  Supports:
+ *   - S16, S32, F32 and F64 formats
+ *   - nearest, linear and cubic interpolation
+ *   - sinc based interpolation with Kaiser or Blackman-Nuttall windows
+ *   - fully configurable kaiser parameters
+ *   - dynamic linear or cubic interpolation of filter table, this can
+ *     use less memory but more CPU
+ *   - full filter table, generated from optionally linear or cubic
+ *     interpolation of filter table
+ *   - fixed filter table size with nearest neighbour phase, optionally
+ *     using precomputed tables
+ *   - dynamic samplerate changes
+ *   - x86 and neon optimizations
+ */
+/* Callback types stored in struct _GstAudioResampler. */
+
+/* Type of the convert_taps member: takes a gdouble taps buffer, an untyped
+ * taps pointer, a weight and the number of taps. */
+typedef void (*ConvertTapsFunc) (gdouble * tmp_taps, gpointer taps,
+    gdouble weight, gint n_taps);
+/* Type of the interpolate member; the interpolate_<type>_<inter>_<arch>()
+ * functions have this signature. */
+typedef void (*InterpolateFunc) (gpointer o, const gpointer a, gint len,
+    const gpointer icoeff, gint astride);
+/* Type of the resample member; same parameter list as the functions declared
+ * with DECL_RESAMPLE_FUNC. */
+typedef void (*ResampleFunc) (GstAudioResampler * resampler, gpointer in[],
+    gsize in_len, gpointer out[], gsize out_len, gsize * consumed);
+/* Type of the deinterleave member. */
+typedef void (*DeinterleaveFunc) (GstAudioResampler * resampler,
+    gpointer * sbuf, gpointer in[], gsize in_frames);
+
+/* Private instance data of GstAudioResampler. The definition lives in this
+ * header so that the arch-specific source files can access the fields read
+ * by MAKE_RESAMPLE_FUNC. */
+struct _GstAudioResampler
+{
+  GstAudioResamplerMethod method;
+  GstAudioResamplerFlags flags;
+  GstAudioFormat format;
+  GstStructure *options;
+  gint format_index;
+  gint channels;
+  gint in_rate;
+  gint out_rate;
+
+  gint bps;
+  /* distance between consecutive output samples; when it is 1 the resample
+   * functions write to one output array per block, otherwise all blocks
+   * share out[0] */
+  gint ostride;
+
+  GstAudioResamplerFilterMode filter_mode;
+  guint filter_threshold;
+  GstAudioResamplerFilterInterpolation filter_interpolation;
+
+  gdouble cutoff;
+  gdouble kaiser_beta;
+  /* for cubic */
+  gdouble b, c;
+
+  /* temp taps */
+  gpointer tmp_taps;
+
+  /* oversampled main filter table */
+  gint oversample;
+  /* length passed to the inner product functions */
+  gint n_taps;
+  gpointer taps;
+  gpointer taps_mem;
+  /* passed to the inner product functions as the byte offset between the
+   * tap rows they combine */
+  gsize taps_stride;
+  gint n_phases;
+  gint alloc_taps;
+  gint alloc_phases;
+
+  /* cached taps */
+  gpointer *cached_phases;
+  gpointer cached_taps;
+  gpointer cached_taps_mem;
+  gsize cached_taps_stride;
+
+  /* implementation callbacks; presumably chosen for the current format and
+   * CPU when the resampler is configured -- TODO confirm in
+   * audio-resampler.c */
+  ConvertTapsFunc convert_taps;
+  InterpolateFunc interpolate;
+  DeinterleaveFunc deinterleave;
+  ResampleFunc resample;
+
+  /* number of input/output arrays the resample functions iterate over */
+  gint blocks;
+  gint inc;
+  gint samp_inc;
+  gint samp_frac;
+  /* current position: read at the start of a resample call and written back
+   * at its end (samp_index is reset to 0 there) */
+  gint samp_index;
+  gint samp_phase;
+  gint skip;
+
+  gpointer samples;
+  gsize samples_len;
+  gsize samples_avail;
+  gpointer *sbuf;
+};
+
+#endif /* __GST_AUDIO_RESAMPLER_PRIVATE_H__ */
--- a/gst-libs/gst/audio/audio-resampler-x86-sse.c
+++ b/gst-libs/gst/audio/audio-resampler-x86-sse.c
@ -0,0 +1,168 @@
+/* GStreamer
+ * Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include "config.h"
+#endif
+
+#include "audio-resampler-x86-sse.h"
+
+#if defined (HAVE_XMMINTRIN_H) && defined(__SSE__)
+#include <xmmintrin.h>
+
+/* Dot product of a and b over len floats, stored in *o.
+ *
+ * Eight samples are processed per iteration, so len is effectively rounded
+ * up to a multiple of 8. a is read with unaligned loads, b with aligned
+ * loads. icoeff and bstride are not used; they are part of the parameter
+ * list because MAKE_RESAMPLE_FUNC passes them to every inner product. */
+static inline void
+inner_product_gfloat_full_1_sse (gfloat * o, const gfloat * a,
+    const gfloat * b, gint len, const gfloat * icoeff, gint bstride)
+{
+  __m128 acc = _mm_setzero_ps ();
+  gint pos;
+
+  for (pos = 0; pos < len; pos += 8) {
+    __m128 lo = _mm_mul_ps (_mm_loadu_ps (a + pos + 0),
+        _mm_load_ps (b + pos + 0));
+    __m128 hi = _mm_mul_ps (_mm_loadu_ps (a + pos + 4),
+        _mm_load_ps (b + pos + 4));
+
+    acc = _mm_add_ps (acc, lo);
+    acc = _mm_add_ps (acc, hi);
+  }
+  /* horizontal add: upper half onto lower half, then lane 1 onto lane 0 */
+  acc = _mm_add_ps (acc, _mm_movehl_ps (acc, acc));
+  acc = _mm_add_ss (acc, _mm_shuffle_ps (acc, acc, 0x55));
+  _mm_store_ss (o, acc);
+}
+
+/* Linearly interpolated dot product, stored in *o.
+ *
+ * Two tap rows are used: b and the row bstride bytes after it. With d0 and
+ * d1 the dot products of a with each row, the result is
+ * d1 + (d0 - d1) * icoeff[0]. Eight samples are processed per iteration; a
+ * is read unaligned, the tap rows with aligned loads. */
+static inline void
+inner_product_gfloat_linear_1_sse (gfloat * o, const gfloat * a,
+    const gfloat * b, gint len, const gfloat * icoeff, gint bstride)
+{
+  const gfloat *taps0 = (gfloat *) ((gint8 *) b + 0 * bstride);
+  const gfloat *taps1 = (gfloat *) ((gint8 *) b + 1 * bstride);
+  __m128 acc0 = _mm_setzero_ps ();
+  __m128 acc1 = _mm_setzero_ps ();
+  __m128 in, mixed;
+  gint pos;
+
+  for (pos = 0; pos < len; pos += 8) {
+    in = _mm_loadu_ps (a + pos + 0);
+    acc0 = _mm_add_ps (acc0, _mm_mul_ps (in, _mm_load_ps (taps0 + pos + 0)));
+    acc1 = _mm_add_ps (acc1, _mm_mul_ps (in, _mm_load_ps (taps1 + pos + 0)));
+
+    in = _mm_loadu_ps (a + pos + 4);
+    acc0 = _mm_add_ps (acc0, _mm_mul_ps (in, _mm_load_ps (taps0 + pos + 4)));
+    acc1 = _mm_add_ps (acc1, _mm_mul_ps (in, _mm_load_ps (taps1 + pos + 4)));
+  }
+  /* blend the two rows, still per lane */
+  mixed = _mm_mul_ps (_mm_sub_ps (acc0, acc1), _mm_load1_ps (icoeff));
+  mixed = _mm_add_ps (mixed, acc1);
+  /* horizontal add into lane 0 */
+  mixed = _mm_add_ps (mixed, _mm_movehl_ps (mixed, mixed));
+  mixed = _mm_add_ss (mixed, _mm_shuffle_ps (mixed, mixed, 0x55));
+  _mm_store_ss (o, mixed);
+}
+
+/* Cubically interpolated dot product, stored in *o.
+ *
+ * Four tap rows are used, starting at b and spaced bstride bytes apart. The
+ * dot product of a with row k is weighted with icoeff[k] and the four
+ * weighted values are summed. Four samples are processed per iteration; a
+ * and icoeff are read unaligned, the tap rows with aligned loads. */
+static inline void
+inner_product_gfloat_cubic_1_sse (gfloat * o, const gfloat * a,
+    const gfloat * b, gint len, const gfloat * icoeff, gint bstride)
+{
+  const __m128 coeffs = _mm_loadu_ps (icoeff);
+  const gfloat *taps[4];
+  __m128 acc[4], in, lo, hi;
+  gint pos, k;
+
+  for (k = 0; k < 4; k++) {
+    taps[k] = (gfloat *) ((gint8 *) b + k * bstride);
+    acc[k] = _mm_setzero_ps ();
+  }
+
+  for (pos = 0; pos < len; pos += 4) {
+    in = _mm_loadu_ps (a + pos);
+    for (k = 0; k < 4; k++)
+      acc[k] =
+          _mm_add_ps (acc[k], _mm_mul_ps (in, _mm_load_ps (taps[k] + pos)));
+  }
+  /* weight each row with its own coefficient, broadcast to all lanes */
+  acc[0] = _mm_mul_ps (acc[0], _mm_shuffle_ps (coeffs, coeffs, 0x00));
+  acc[1] = _mm_mul_ps (acc[1], _mm_shuffle_ps (coeffs, coeffs, 0x55));
+  acc[2] = _mm_mul_ps (acc[2], _mm_shuffle_ps (coeffs, coeffs, 0xaa));
+  acc[3] = _mm_mul_ps (acc[3], _mm_shuffle_ps (coeffs, coeffs, 0xff));
+
+  lo = _mm_add_ps (acc[0], acc[1]);
+  hi = _mm_add_ps (acc[2], acc[3]);
+  lo = _mm_add_ps (lo, hi);
+  /* horizontal add into lane 0 */
+  lo = _mm_add_ps (lo, _mm_movehl_ps (lo, lo));
+  lo = _mm_add_ss (lo, _mm_shuffle_ps (lo, lo, 0x55));
+  _mm_store_ss (o, lo);
+}
+
+/* Define resample_gfloat_{full,linear,cubic}_1_sse() with external linkage;
+ * each one uses the matching inner_product_gfloat_*_1_sse() above. The
+ * prototypes are in audio-resampler-x86-sse.h. */
+MAKE_RESAMPLE_FUNC (gfloat, full, 1, sse);
+MAKE_RESAMPLE_FUNC (gfloat, linear, 1, sse);
+MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, sse);
+
+/* Writes the weighted sum of two float rows to op:
+ * o[i] = r0[i] * ic[0] + r1[i] * ic[1], where r0 starts at ap and r1 starts
+ * astride bytes after it.
+ *
+ * Eight samples are processed per iteration. The rows and the output are
+ * accessed with aligned loads and stores. */
+void
+interpolate_gfloat_linear_sse (gpointer op, const gpointer ap,
+    gint len, const gpointer icp, gint astride)
+{
+  gfloat *dst = op;
+  gfloat *coeff = icp;
+  const gfloat *row0 = (gfloat *) ((gint8 *) ap + 0 * astride);
+  const gfloat *row1 = (gfloat *) ((gint8 *) ap + 1 * astride);
+  const __m128 w0 = _mm_load1_ps (coeff + 0);
+  const __m128 w1 = _mm_load1_ps (coeff + 1);
+  gint pos, half;
+
+  for (pos = 0; pos < len; pos += 8) {
+    /* two groups of four floats per iteration */
+    for (half = 0; half < 8; half += 4) {
+      __m128 p0 = _mm_mul_ps (_mm_load_ps (row0 + pos + half), w0);
+      __m128 p1 = _mm_mul_ps (_mm_load_ps (row1 + pos + half), w1);
+
+      _mm_store_ps (dst + pos + half, _mm_add_ps (p0, p1));
+    }
+  }
+}
+
+/* Writes the weighted sum of four float rows to op:
+ * o[i] = (r0[i] * ic[0] + r1[i] * ic[1]) + (r2[i] * ic[2] + r3[i] * ic[3]),
+ * where row k starts k * astride bytes after ap.
+ *
+ * Four samples are processed per iteration. The rows and the output are
+ * accessed with aligned loads and stores. */
+void
+interpolate_gfloat_cubic_sse (gpointer op, const gpointer ap,
+    gint len, const gpointer icp, gint astride)
+{
+  gfloat *dst = op;
+  gfloat *coeff = icp;
+  const gfloat *row[4];
+  __m128 weight[4], prod[4];
+  gint pos, k;
+
+  for (k = 0; k < 4; k++) {
+    row[k] = (gfloat *) ((gint8 *) ap + k * astride);
+    weight[k] = _mm_load1_ps (coeff + k);
+  }
+
+  for (pos = 0; pos < len; pos += 4) {
+    for (k = 0; k < 4; k++)
+      prod[k] = _mm_mul_ps (_mm_load_ps (row[k] + pos + 0), weight[k]);
+
+    prod[0] = _mm_add_ps (prod[0], prod[1]);
+    prod[2] = _mm_add_ps (prod[2], prod[3]);
+    _mm_store_ps (dst + pos + 0, _mm_add_ps (prod[0], prod[2]));
+  }
+}
+
+#endif
--- a/gst-libs/gst/audio/audio-resampler-x86-sse.h
+++ b/gst-libs/gst/audio/audio-resampler-x86-sse.h
@ -0,0 +1,35 @@
+/* GStreamer
+ * Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef AUDIO_RESAMPLER_X86_SSE_H
+#define AUDIO_RESAMPLER_X86_SSE_H
+
+#include "audio-resampler-macros.h"
+
+/* Prototypes of the functions defined in audio-resampler-x86-sse.c. The
+ * definitions there are only compiled when HAVE_XMMINTRIN_H and __SSE__ are
+ * both defined. */
+
+/* resample_gfloat_{full,linear,cubic}_1_sse() */
+DECL_RESAMPLE_FUNC (gfloat, full, 1, sse);
+DECL_RESAMPLE_FUNC (gfloat, linear, 1, sse);
+DECL_RESAMPLE_FUNC (gfloat, cubic, 1, sse);
+
+/* weighted sum of two float rows astride bytes apart, weights icp[0..1] */
+void interpolate_gfloat_linear_sse (gpointer op, const gpointer ap,
+    gint len, const gpointer icp, gint astride);
+
+/* weighted sum of four float rows astride bytes apart, weights icp[0..3] */
+void interpolate_gfloat_cubic_sse (gpointer op, const gpointer ap,
+    gint len, const gpointer icp, gint astride);
+
+#endif /* AUDIO_RESAMPLER_X86_SSE_H */
--- a/gst-libs/gst/audio/audio-resampler-x86-sse2.c
+++ b/gst-libs/gst/audio/audio-resampler-x86-sse2.c
@ -0,0 +1,399 @@
+/* GStreamer
+ * Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include "config.h"
+#endif
+
+#include "audio-resampler-x86-sse2.h"
+
+#if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__)
+#include <emmintrin.h>
+
+/* Fixed-point dot product of a and b over len 16-bit samples, stored in *o.
+ *
+ * Products are accumulated in 32-bit lanes, then the total is rounded,
+ * shifted right by PRECISION_S16 and saturated to 16 bits. Sixteen samples
+ * are processed per iteration; a is read unaligned, b with aligned loads.
+ * icoeff and bstride are not used. */
+static inline void
+inner_product_gint16_full_1_sse2 (gint16 * o, const gint16 * a,
+    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
+{
+  __m128i acc = _mm_setzero_si128 ();
+  gint pos;
+
+  for (pos = 0; pos < len; pos += 16) {
+    __m128i in_lo = _mm_loadu_si128 ((__m128i *) (a + pos));
+    __m128i prod_lo =
+        _mm_madd_epi16 (in_lo, _mm_load_si128 ((__m128i *) (b + pos + 0)));
+    __m128i in_hi = _mm_loadu_si128 ((__m128i *) (a + pos + 8));
+    __m128i prod_hi =
+        _mm_madd_epi16 (in_hi, _mm_load_si128 ((__m128i *) (b + pos + 8)));
+
+    acc = _mm_add_epi32 (acc, prod_lo);
+    acc = _mm_add_epi32 (acc, prod_hi);
+  }
+  /* horizontal add of the four 32-bit lanes into lane 0 */
+  acc = _mm_add_epi32 (acc, _mm_shuffle_epi32 (acc, _MM_SHUFFLE (2, 3, 2, 3)));
+  acc = _mm_add_epi32 (acc, _mm_shuffle_epi32 (acc, _MM_SHUFFLE (1, 1, 1, 1)));
+
+  /* round, scale back to 16 bits and saturate */
+  acc = _mm_add_epi32 (acc, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
+  acc = _mm_srai_epi32 (acc, PRECISION_S16);
+  acc = _mm_packs_epi32 (acc, acc);
+  *o = _mm_extract_epi16 (acc, 0);
+}
+
+/* Linearly interpolated fixed-point dot product, stored in *o.
+ *
+ * Two tap rows are used: b and the row bstride bytes after it. The 32-bit
+ * partial sums of each row are shifted right by PRECISION_S16, multiplied by
+ * icoeff[0] and icoeff[1] respectively and added; the total is then rounded,
+ * shifted by PRECISION_S16 again and saturated to 16 bits. Sixteen samples
+ * are processed per iteration; a is read unaligned, the tap rows with
+ * aligned loads.
+ *
+ * The four coefficients (8 bytes) are fetched with _mm_loadl_epi64, which
+ * has no alignment requirement and zeroes the upper half of the register.
+ * icoeff is a gint16 array, so reading it through a 64-bit integer pointer
+ * would be a misaligned, type-punned access. */
+static inline void
+inner_product_gint16_linear_1_sse2 (gint16 * o, const gint16 * a,
+    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
+{
+  gint i = 0;
+  __m128i sum[2], t;
+  __m128i f = _mm_loadl_epi64 ((const __m128i *) icoeff);
+  const gint16 *c[2] = { (gint16 *) ((gint8 *) b + 0 * bstride),
+    (gint16 *) ((gint8 *) b + 1 * bstride)
+  };
+
+  sum[0] = sum[1] = _mm_setzero_si128 ();
+  /* widen the coefficients: one per 32-bit lane, upper 16 bits zero */
+  f = _mm_unpacklo_epi16 (f, sum[0]);
+
+  for (; i < len; i += 16) {
+    t = _mm_loadu_si128 ((__m128i *) (a + i + 0));
+    sum[0] =
+        _mm_add_epi32 (sum[0], _mm_madd_epi16 (t,
+            _mm_load_si128 ((__m128i *) (c[0] + i + 0))));
+    sum[1] =
+        _mm_add_epi32 (sum[1], _mm_madd_epi16 (t,
+            _mm_load_si128 ((__m128i *) (c[1] + i + 0))));
+
+    t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
+    sum[0] =
+        _mm_add_epi32 (sum[0], _mm_madd_epi16 (t,
+            _mm_load_si128 ((__m128i *) (c[0] + i + 8))));
+    sum[1] =
+        _mm_add_epi32 (sum[1], _mm_madd_epi16 (t,
+            _mm_load_si128 ((__m128i *) (c[1] + i + 8))));
+  }
+  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
+  sum[1] = _mm_srai_epi32 (sum[1], PRECISION_S16);
+
+  /* the upper 16 bits of every coefficient lane are zero, so each madd
+   * multiplies the low 16 bits of a partial sum by one coefficient */
+  sum[0] =
+      _mm_madd_epi16 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
+  sum[1] =
+      _mm_madd_epi16 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
+  sum[0] = _mm_add_epi32 (sum[0], sum[1]);
+
+  /* horizontal add of the four 32-bit lanes into lane 0 */
+  sum[0] =
+      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
+              3)));
+  sum[0] =
+      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1,
+              1)));
+
+  /* round, scale back to 16 bits and saturate */
+  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
+  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
+  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
+  *o = _mm_extract_epi16 (sum[0], 0);
+}
+
+/* Cubically interpolated fixed-point dot product, stored in *o.
+ *
+ * Four tap rows are used, starting at b and spaced bstride bytes apart. The
+ * 32-bit dot product of a with row k is shifted right by PRECISION_S16 and
+ * multiplied by icoeff[k]; the four products are summed, rounded, shifted by
+ * PRECISION_S16 again and saturated to 16 bits. Eight samples are processed
+ * per iteration; a is read unaligned, the tap rows with aligned loads.
+ *
+ * The four coefficients (8 bytes) are fetched with _mm_loadl_epi64, which
+ * has no alignment requirement and zeroes the upper half of the register.
+ * icoeff is a gint16 array, so reading it through a 64-bit integer pointer
+ * would be a misaligned, type-punned access. */
+static inline void
+inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a,
+    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
+{
+  gint i = 0;
+  __m128i sum[4], t[4];
+  __m128i f = _mm_loadl_epi64 ((const __m128i *) icoeff);
+  const gint16 *c[4] = { (gint16 *) ((gint8 *) b + 0 * bstride),
+    (gint16 *) ((gint8 *) b + 1 * bstride),
+    (gint16 *) ((gint8 *) b + 2 * bstride),
+    (gint16 *) ((gint8 *) b + 3 * bstride)
+  };
+
+  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();
+  /* widen the coefficients: icoeff[k] in 32-bit lane k, upper 16 bits zero */
+  f = _mm_unpacklo_epi16 (f, sum[0]);
+
+  for (; i < len; i += 8) {
+    t[0] = _mm_loadu_si128 ((__m128i *) (a + i));
+    sum[0] =
+        _mm_add_epi32 (sum[0], _mm_madd_epi16 (t[0],
+            _mm_load_si128 ((__m128i *) (c[0] + i))));
+    sum[1] =
+        _mm_add_epi32 (sum[1], _mm_madd_epi16 (t[0],
+            _mm_load_si128 ((__m128i *) (c[1] + i))));
+    sum[2] =
+        _mm_add_epi32 (sum[2], _mm_madd_epi16 (t[0],
+            _mm_load_si128 ((__m128i *) (c[2] + i))));
+    sum[3] =
+        _mm_add_epi32 (sum[3], _mm_madd_epi16 (t[0],
+            _mm_load_si128 ((__m128i *) (c[3] + i))));
+  }
+  /* transpose the four accumulators and add them, so that lane k of sum[0]
+   * ends up holding the complete dot product for row k */
+  t[0] = _mm_unpacklo_epi32 (sum[0], sum[1]);
+  t[1] = _mm_unpacklo_epi32 (sum[2], sum[3]);
+  t[2] = _mm_unpackhi_epi32 (sum[0], sum[1]);
+  t[3] = _mm_unpackhi_epi32 (sum[2], sum[3]);
+
+  sum[0] =
+      _mm_add_epi32 (_mm_unpacklo_epi64 (t[0], t[1]), _mm_unpackhi_epi64 (t[0],
+          t[1]));
+  sum[2] =
+      _mm_add_epi32 (_mm_unpacklo_epi64 (t[2], t[3]), _mm_unpackhi_epi64 (t[2],
+          t[3]));
+  sum[0] = _mm_add_epi32 (sum[0], sum[2]);
+
+  /* scale each row's result and weight it with its coefficient */
+  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
+  sum[0] = _mm_madd_epi16 (sum[0], f);
+
+  /* horizontal add of the four 32-bit lanes into lane 0 */
+  sum[0] =
+      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
+              3)));
+  sum[0] =
+      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1,
+              1)));
+
+  /* round, scale back to 16 bits and saturate */
+  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
+  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
+  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
+  *o = _mm_extract_epi16 (sum[0], 0);
+}
+
+/* Dot product of a and b over len doubles, stored in *o.
+ *
+ * Eight samples (four pairs) are processed per iteration; a is read
+ * unaligned, b with aligned loads. icoeff and bstride are not used. */
+static inline void
+inner_product_gdouble_full_1_sse2 (gdouble * o, const gdouble * a,
+    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
+{
+  __m128d acc = _mm_setzero_pd ();
+  gint pos, k;
+
+  for (pos = 0; pos < len; pos += 8) {
+    /* four pairs of doubles, accumulated in order */
+    for (k = 0; k < 8; k += 2)
+      acc =
+          _mm_add_pd (acc, _mm_mul_pd (_mm_loadu_pd (a + pos + k),
+              _mm_load_pd (b + pos + k)));
+  }
+  /* add the upper lane onto the lower one */
+  acc = _mm_add_sd (acc, _mm_unpackhi_pd (acc, acc));
+  _mm_store_sd (o, acc);
+}
+
+/* Linearly interpolated dot product of doubles, stored in *o.
+ *
+ * Two tap rows are used: b and the row bstride bytes after it. With d0 and
+ * d1 the dot products of a with each row, the result is
+ * d1 + (d0 - d1) * icoeff[0]. Four samples are processed per iteration; a is
+ * read unaligned, the tap rows with aligned loads. */
+static inline void
+inner_product_gdouble_linear_1_sse2 (gdouble * o, const gdouble * a,
+    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
+{
+  const gdouble *taps0 = (gdouble *) ((gint8 *) b + 0 * bstride);
+  const gdouble *taps1 = (gdouble *) ((gint8 *) b + 1 * bstride);
+  __m128d acc0 = _mm_setzero_pd ();
+  __m128d acc1 = _mm_setzero_pd ();
+  __m128d in, mixed;
+  gint pos;
+
+  for (pos = 0; pos < len; pos += 4) {
+    in = _mm_loadu_pd (a + pos + 0);
+    acc0 = _mm_add_pd (acc0, _mm_mul_pd (in, _mm_load_pd (taps0 + pos + 0)));
+    acc1 = _mm_add_pd (acc1, _mm_mul_pd (in, _mm_load_pd (taps1 + pos + 0)));
+
+    in = _mm_loadu_pd (a + pos + 2);
+    acc0 = _mm_add_pd (acc0, _mm_mul_pd (in, _mm_load_pd (taps0 + pos + 2)));
+    acc1 = _mm_add_pd (acc1, _mm_mul_pd (in, _mm_load_pd (taps1 + pos + 2)));
+  }
+  /* blend the two rows, still per lane */
+  mixed = _mm_mul_pd (_mm_sub_pd (acc0, acc1), _mm_load1_pd (icoeff));
+  mixed = _mm_add_pd (mixed, acc1);
+  /* add the upper lane onto the lower one */
+  mixed = _mm_add_sd (mixed, _mm_unpackhi_pd (mixed, mixed));
+  _mm_store_sd (o, mixed);
+}
+
+/* Cubically interpolated dot product of doubles, stored in *o.
+ *
+ * Four tap rows are used, starting at b and spaced bstride bytes apart. The
+ * dot product of a with row k is weighted with icoeff[k] and the four
+ * weighted values are summed. Two samples are processed per iteration; a and
+ * icoeff are read unaligned, the tap rows with aligned loads. */
+static inline void
+inner_product_gdouble_cubic_1_sse2 (gdouble * o, const gdouble * a,
+    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
+{
+  const __m128d coeff01 = _mm_loadu_pd (icoeff + 0);
+  const __m128d coeff23 = _mm_loadu_pd (icoeff + 2);
+  const gdouble *taps[4];
+  __m128d acc[4], in, lo, hi;
+  gint pos, k;
+
+  for (k = 0; k < 4; k++) {
+    taps[k] = (gdouble *) ((gint8 *) b + k * bstride);
+    acc[k] = _mm_setzero_pd ();
+  }
+
+  for (pos = 0; pos < len; pos += 2) {
+    in = _mm_loadu_pd (a + pos + 0);
+    for (k = 0; k < 4; k++)
+      acc[k] =
+          _mm_add_pd (acc[k], _mm_mul_pd (in, _mm_load_pd (taps[k] + pos)));
+  }
+  /* weight each row with its own coefficient, broadcast to both lanes */
+  acc[0] =
+      _mm_mul_pd (acc[0], _mm_shuffle_pd (coeff01, coeff01,
+          _MM_SHUFFLE2 (0, 0)));
+  acc[1] =
+      _mm_mul_pd (acc[1], _mm_shuffle_pd (coeff01, coeff01,
+          _MM_SHUFFLE2 (1, 1)));
+  acc[2] =
+      _mm_mul_pd (acc[2], _mm_shuffle_pd (coeff23, coeff23,
+          _MM_SHUFFLE2 (0, 0)));
+  acc[3] =
+      _mm_mul_pd (acc[3], _mm_shuffle_pd (coeff23, coeff23,
+          _MM_SHUFFLE2 (1, 1)));
+
+  lo = _mm_add_pd (acc[0], acc[1]);
+  hi = _mm_add_pd (acc[2], acc[3]);
+  lo = _mm_add_pd (lo, hi);
+  /* add the upper lane onto the lower one */
+  lo = _mm_add_sd (lo, _mm_unpackhi_pd (lo, lo));
+  _mm_store_sd (o, lo);
+}
+
+/* Define resample_{gint16,gdouble}_{full,linear,cubic}_1_sse2() with
+ * external linkage; each one uses the matching inner_product_*_1_sse2()
+ * above. The prototypes are in audio-resampler-x86-sse2.h. */
+MAKE_RESAMPLE_FUNC (gint16, full, 1, sse2);
+MAKE_RESAMPLE_FUNC (gint16, linear, 1, sse2);
+MAKE_RESAMPLE_FUNC (gint16, cubic, 1, sse2);
+
+MAKE_RESAMPLE_FUNC (gdouble, full, 1, sse2);
+MAKE_RESAMPLE_FUNC (gdouble, linear, 1, sse2);
+MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, sse2);
+
+/* Writes the fixed-point weighted sum of two 16-bit rows to op:
+ * o[i] = saturate ((r0[i] * ic[0] + r1[i] * ic[1] + rounding) >>
+ * PRECISION_S16), where r0 starts at ap and r1 starts astride bytes after
+ * it.
+ *
+ * Eight samples are processed per iteration. The rows and the output are
+ * accessed with aligned loads and stores.
+ *
+ * Eight bytes of coefficients are fetched with _mm_loadl_epi64, which has no
+ * alignment requirement and zeroes the upper half of the register; only
+ * ic[0] and ic[1] survive the broadcast below. Reading the gint16 array
+ * through a 64-bit integer pointer would be a misaligned, type-punned
+ * access. */
+void
+interpolate_gint16_linear_sse2 (gpointer op, const gpointer ap,
+    gint len, const gpointer icp, gint astride)
+{
+  gint i = 0;
+  gint16 *o = op, *a = ap, *ic = icp;
+  __m128i ta, tb, t1, t2;
+  __m128i f = _mm_loadl_epi64 ((const __m128i *) ic);
+  const gint16 *c[2] = { (gint16 *) ((gint8 *) a + 0 * astride),
+    (gint16 *) ((gint8 *) a + 1 * astride)
+  };
+
+  /* broadcast the (ic[0], ic[1]) pair to all four 32-bit lanes */
+  f = _mm_unpacklo_epi32 (f, f);
+  f = _mm_unpacklo_epi64 (f, f);
+
+  for (; i < len; i += 8) {
+    ta = _mm_load_si128 ((__m128i *) (c[0] + i));
+    tb = _mm_load_si128 ((__m128i *) (c[1] + i));
+
+    /* interleave the rows so that each madd yields r0 * ic[0] + r1 * ic[1] */
+    t1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f);
+    t2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f);
+
+    t1 = _mm_add_epi32 (t1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
+    t2 = _mm_add_epi32 (t2, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
+
+    t1 = _mm_srai_epi32 (t1, PRECISION_S16);
+    t2 = _mm_srai_epi32 (t2, PRECISION_S16);
+
+    t1 = _mm_packs_epi32 (t1, t2);
+    _mm_store_si128 ((__m128i *) (o + i), t1);
+  }
+}
+
+/* Writes the fixed-point weighted sum of four 16-bit rows to op:
+ * o[i] = saturate ((r0[i] * ic[0] + r1[i] * ic[1] + r2[i] * ic[2] +
+ * r3[i] * ic[3] + rounding) >> PRECISION_S16), where row k starts
+ * k * astride bytes after ap.
+ *
+ * Eight samples are processed per iteration. The rows and the output are
+ * accessed with aligned loads and stores. */
+void
+interpolate_gint16_cubic_sse2 (gpointer op, const gpointer ap,
+    gint len, const gpointer icp, gint astride)
+{
+  gint16 *dst = op;
+  gint16 *coeff = icp;
+  const gint16 *row[4];
+  const __m128i rounding = _mm_set1_epi32 (1 << (PRECISION_S16 - 1));
+  __m128i w01, w23, x, y, lo, hi;
+  gint pos, k;
+
+  for (k = 0; k < 4; k++)
+    row[k] = (gint16 *) ((gint8 *) ap + k * astride);
+
+  /* coefficient pairs repeated in every 32-bit lane */
+  w01 = _mm_set_epi16 (coeff[1], coeff[0], coeff[1], coeff[0],
+      coeff[1], coeff[0], coeff[1], coeff[0]);
+  w23 = _mm_set_epi16 (coeff[3], coeff[2], coeff[3], coeff[2],
+      coeff[3], coeff[2], coeff[3], coeff[2]);
+
+  for (pos = 0; pos < len; pos += 8) {
+    /* rows 0 and 1, interleaved so that madd pairs them with w01 */
+    x = _mm_load_si128 ((__m128i *) (row[0] + pos));
+    y = _mm_load_si128 ((__m128i *) (row[1] + pos));
+    lo = _mm_madd_epi16 (_mm_unpacklo_epi16 (x, y), w01);
+    hi = _mm_madd_epi16 (_mm_unpackhi_epi16 (x, y), w01);
+
+    /* rows 2 and 3, added on top */
+    x = _mm_load_si128 ((__m128i *) (row[2] + pos));
+    y = _mm_load_si128 ((__m128i *) (row[3] + pos));
+    lo = _mm_add_epi32 (lo, _mm_madd_epi16 (_mm_unpacklo_epi16 (x, y), w23));
+    hi = _mm_add_epi32 (hi, _mm_madd_epi16 (_mm_unpackhi_epi16 (x, y), w23));
+
+    /* round, scale back to 16 bits and saturate */
+    lo = _mm_srai_epi32 (_mm_add_epi32 (lo, rounding), PRECISION_S16);
+    hi = _mm_srai_epi32 (_mm_add_epi32 (hi, rounding), PRECISION_S16);
+
+    _mm_store_si128 ((__m128i *) (dst + pos), _mm_packs_epi32 (lo, hi));
+  }
+}
+
+/* Writes the weighted sum of two double rows to op:
+ * o[i] = r0[i] * ic[0] + r1[i] * ic[1], where r0 starts at ap and r1 starts
+ * astride bytes after it.
+ *
+ * Four samples are processed per iteration. The rows and the output are
+ * accessed with aligned loads and stores. */
+void
+interpolate_gdouble_linear_sse2 (gpointer op, const gpointer ap,
+    gint len, const gpointer icp, gint astride)
+{
+  gdouble *dst = op;
+  gdouble *coeff = icp;
+  const gdouble *row0 = (gdouble *) ((gint8 *) ap + 0 * astride);
+  const gdouble *row1 = (gdouble *) ((gint8 *) ap + 1 * astride);
+  const __m128d w0 = _mm_load1_pd (coeff + 0);
+  const __m128d w1 = _mm_load1_pd (coeff + 1);
+  gint pos, half;
+
+  for (pos = 0; pos < len; pos += 4) {
+    /* two pairs of doubles per iteration */
+    for (half = 0; half < 4; half += 2) {
+      __m128d p0 = _mm_mul_pd (_mm_load_pd (row0 + pos + half), w0);
+      __m128d p1 = _mm_mul_pd (_mm_load_pd (row1 + pos + half), w1);
+
+      _mm_store_pd (dst + pos + half, _mm_add_pd (p0, p1));
+    }
+  }
+}
+
+/* Writes the weighted sum of four double rows to op:
+ * o[i] = (r0[i] * ic[0] + r1[i] * ic[1]) + (r2[i] * ic[2] + r3[i] * ic[3]),
+ * where row k starts k * astride bytes after ap.
+ *
+ * Two samples are processed per iteration. The rows and the output are
+ * accessed with aligned loads and stores. */
+void
+interpolate_gdouble_cubic_sse2 (gpointer op, const gpointer ap,
+    gint len, const gpointer icp, gint astride)
+{
+  gdouble *dst = op;
+  gdouble *coeff = icp;
+  const gdouble *row[4];
+  __m128d weight[4], prod[4];
+  gint pos, k;
+
+  for (k = 0; k < 4; k++) {
+    row[k] = (gdouble *) ((gint8 *) ap + k * astride);
+    weight[k] = _mm_load1_pd (coeff + k);
+  }
+
+  for (pos = 0; pos < len; pos += 2) {
+    for (k = 0; k < 4; k++)
+      prod[k] = _mm_mul_pd (_mm_load_pd (row[k] + pos + 0), weight[k]);
+
+    prod[0] = _mm_add_pd (prod[0], prod[1]);
+    prod[2] = _mm_add_pd (prod[2], prod[3]);
+    _mm_store_pd (dst + pos + 0, _mm_add_pd (prod[0], prod[2]));
+  }
+}
+
+#endif
--- a/gst-libs/gst/audio/audio-resampler-x86-sse2.h
+++ b/gst-libs/gst/audio/audio-resampler-x86-sse2.h
@ -0,0 +1,49 @@
+/* GStreamer
+ * Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef AUDIO_RESAMPLER_X86_SSE2_H
+#define AUDIO_RESAMPLER_X86_SSE2_H
+
+#include "audio-resampler-macros.h"
+
+/* SSE2 resample entry points for 16-bit integer samples.  Presumably these
+ * expand to prototypes for resample_gint16_{full,linear,cubic}_1_sse2 --
+ * confirm against DECL_RESAMPLE_FUNC in audio-resampler-macros.h. */
+DECL_RESAMPLE_FUNC (gint16, full, 1, sse2);
+DECL_RESAMPLE_FUNC (gint16, linear, 1, sse2);
+DECL_RESAMPLE_FUNC (gint16, cubic, 1, sse2);
+
+/* Same set of entry points for double precision samples. */
+DECL_RESAMPLE_FUNC (gdouble, full, 1, sse2);
+DECL_RESAMPLE_FUNC (gdouble, linear, 1, sse2);
+DECL_RESAMPLE_FUNC (gdouble, cubic, 1, sse2);
+
+/* Coefficient interpolation helpers.  They are no longer static, so that the
+ * SSE2 code can live in its own compilation unit.
+ *
+ * op:      output row
+ * ap:      first input row; further rows follow every astride bytes
+ * len:     number of samples to produce
+ * icp:     interpolation weights, one per input row
+ * astride: distance between input rows, in bytes
+ *
+ * The linear variants combine 2 rows, the cubic variants 4 rows. */
+void
+interpolate_gint16_linear_sse2 (gpointer op, const gpointer ap,
+    gint len, const gpointer icp, gint astride);
+
+void
+interpolate_gint16_cubic_sse2 (gpointer op, const gpointer ap,
+    gint len, const gpointer icp, gint astride);
+
+void
+interpolate_gdouble_linear_sse2 (gpointer op, const gpointer ap,
+    gint len, const gpointer icp, gint astride);
+
+void
+interpolate_gdouble_cubic_sse2 (gpointer op, const gpointer ap,
+    gint len, const gpointer icp, gint astride);
+
+#endif /* AUDIO_RESAMPLER_X86_SSE2_H */
--- a/gst-libs/gst/audio/audio-resampler-x86-sse41.c
+++ b/gst-libs/gst/audio/audio-resampler-x86-sse41.c
@ -0,0 +1,185 @@
+/* GStreamer
+ * Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include "config.h"
+#endif
+
+#include "audio-resampler-x86-sse41.h"
+
+#if 0
+#define __SSE4_1__
+#pragma GCC target("sse4.1")
+#endif
+
+#if defined (HAVE_SMMINTRIN_H) && defined (HAVE_EMMINTRIN_H) && defined(__SSE4_1__)
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+/* Fixed-point inner product of @len 32-bit samples with one filter row.
+ *
+ * Accumulates a[i] * b[i] as 64-bit products, rounds, shifts the total right
+ * by PRECISION_S32 and stores it, saturated to the gint32 range, in *o.
+ * @icoeff and @bstride are not used by this variant.
+ *
+ * @a is read with unaligned loads, @b with aligned loads (16-byte alignment
+ * required).  Eight samples are consumed per pass, so both arrays must be
+ * readable up to @len rounded up to a multiple of 8. */
+static inline void
+inner_product_gint32_full_1_sse41 (gint32 * o, const gint32 * a,
+    const gint32 * b, gint len, const gint32 * icoeff, gint bstride)
+{
+  gint i = 0;
+  __m128i sum, ta, tb;
+  gint64 res;
+
+  sum = _mm_setzero_si128 ();
+
+  for (; i < len; i += 8) {
+    ta = _mm_loadu_si128 ((__m128i *) (a + i));
+    tb = _mm_load_si128 ((__m128i *) (b + i));
+
+    /* unpack(x, x) duplicates each 32-bit value so that every 64-bit lane
+     * carries one sample in its low half, which is what _mm_mul_epi32
+     * multiplies (signed 32x32 -> 64) */
+    sum =
+        _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
+            _mm_unpacklo_epi32 (tb, tb)));
+    sum =
+        _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
+            _mm_unpackhi_epi32 (tb, tb)));
+
+    ta = _mm_loadu_si128 ((__m128i *) (a + i + 4));
+    tb = _mm_load_si128 ((__m128i *) (b + i + 4));
+
+    sum =
+        _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
+            _mm_unpacklo_epi32 (tb, tb)));
+    sum =
+        _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
+            _mm_unpackhi_epi32 (tb, tb)));
+  }
+  /* fold the two 64-bit partial sums into the low lane */
+  sum = _mm_add_epi64 (sum, _mm_unpackhi_epi64 (sum, sum));
+  /* _mm_cvtsi128_si64 only exists on 64-bit targets; storing the low lane
+   * gives the same value and also builds on 32-bit x86 */
+  _mm_storel_epi64 ((__m128i *) & res, sum);
+
+  res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
+  /* 1L << 31 overflows where long is 32 bits, use the fixed-width limits */
+  *o = CLAMP (res, G_MININT32, G_MAXINT32);
+}
+
+/* Fixed-point inner product of @len 32-bit samples against two filter rows,
+ * blended with two interpolation weights.
+ *
+ * The rows start at @b and @b + @bstride bytes.  The products a[i] * row[i]
+ * are accumulated separately per row in 64 bits, each accumulator is shifted
+ * right by PRECISION_S32 and multiplied by icoeff[0] resp. icoeff[1], and the
+ * combined total is rounded, shifted by PRECISION_S32 again and stored,
+ * saturated to the gint32 range, in *o.
+ *
+ * @a is read with unaligned loads, the rows with aligned loads (16-byte
+ * alignment required).  Four samples are consumed per pass, so the arrays
+ * must be readable up to @len rounded up to a multiple of 4.  @icoeff is
+ * fetched with a single 16-byte load, so four gint32 must be readable there
+ * even though only the first two are used. */
+static inline void
+inner_product_gint32_linear_1_sse41 (gint32 * o, const gint32 * a,
+    const gint32 * b, gint len, const gint32 * icoeff, gint bstride)
+{
+  gint i = 0;
+  gint64 res;
+  __m128i sum[2], ta, tb;
+  __m128i f = _mm_loadu_si128 ((__m128i *) icoeff);
+  const gint32 *c[2] = { (gint32 *) ((gint8 *) b + 0 * bstride),
+    (gint32 *) ((gint8 *) b + 1 * bstride)
+  };
+
+  sum[0] = sum[1] = _mm_setzero_si128 ();
+
+  for (; i < len; i += 4) {
+    ta = _mm_loadu_si128 ((__m128i *) (a + i));
+
+    tb = _mm_load_si128 ((__m128i *) (c[0] + i));
+    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
+            _mm_unpacklo_epi32 (tb, tb)));
+    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
+            _mm_unpackhi_epi32 (tb, tb)));
+
+    tb = _mm_load_si128 ((__m128i *) (c[1] + i));
+    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
+            _mm_unpacklo_epi32 (tb, tb)));
+    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
+            _mm_unpackhi_epi32 (tb, tb)));
+  }
+  /* _mm_mul_epi32 below only looks at the low 32 bits of each 64-bit lane,
+   * so the logical shift is sufficient here */
+  sum[0] = _mm_srli_epi64 (sum[0], PRECISION_S32);
+  sum[1] = _mm_srli_epi64 (sum[1], PRECISION_S32);
+  sum[0] =
+      _mm_mul_epi32 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
+  sum[1] =
+      _mm_mul_epi32 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
+  sum[0] = _mm_add_epi64 (sum[0], sum[1]);
+  /* fold the two 64-bit partial sums into the low lane */
+  sum[0] = _mm_add_epi64 (sum[0], _mm_unpackhi_epi64 (sum[0], sum[0]));
+  /* _mm_cvtsi128_si64 only exists on 64-bit targets; storing the low lane
+   * gives the same value and also builds on 32-bit x86 */
+  _mm_storel_epi64 ((__m128i *) & res, sum[0]);
+
+  res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
+  /* 1L << 31 overflows where long is 32 bits, use the fixed-width limits */
+  *o = CLAMP (res, G_MININT32, G_MAXINT32);
+}
+
+/* Fixed-point inner product of @len 32-bit samples against four filter rows,
+ * blended with four interpolation weights.
+ *
+ * Row k (k = 0..3) starts at @b + k * @bstride bytes.  The products
+ * a[i] * row_k[i] are accumulated separately per row in 64 bits, each
+ * accumulator is shifted right by PRECISION_S32 and multiplied by icoeff[k],
+ * and the combined total is rounded, shifted by PRECISION_S32 again and
+ * stored, saturated to the gint32 range, in *o.
+ *
+ * @a and @icoeff are read with unaligned loads, the rows with aligned loads
+ * (16-byte alignment required).  Four samples are consumed per pass, so the
+ * arrays must be readable up to @len rounded up to a multiple of 4. */
+static inline void
+inner_product_gint32_cubic_1_sse41 (gint32 * o, const gint32 * a,
+    const gint32 * b, gint len, const gint32 * icoeff, gint bstride)
+{
+  gint i = 0;
+  gint64 res;
+  __m128i sum[4], ta, tb;
+  __m128i f = _mm_loadu_si128 ((__m128i *) icoeff);
+  const gint32 *c[4] = { (gint32 *) ((gint8 *) b + 0 * bstride),
+    (gint32 *) ((gint8 *) b + 1 * bstride),
+    (gint32 *) ((gint8 *) b + 2 * bstride),
+    (gint32 *) ((gint8 *) b + 3 * bstride)
+  };
+
+  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();
+
+  for (; i < len; i += 4) {
+    ta = _mm_loadu_si128 ((__m128i *) (a + i));
+
+    tb = _mm_load_si128 ((__m128i *) (c[0] + i));
+    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
+            _mm_unpacklo_epi32 (tb, tb)));
+    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
+            _mm_unpackhi_epi32 (tb, tb)));
+
+    tb = _mm_load_si128 ((__m128i *) (c[1] + i));
+    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
+            _mm_unpacklo_epi32 (tb, tb)));
+    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
+            _mm_unpackhi_epi32 (tb, tb)));
+
+    tb = _mm_load_si128 ((__m128i *) (c[2] + i));
+    sum[2] = _mm_add_epi64 (sum[2], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
+            _mm_unpacklo_epi32 (tb, tb)));
+    sum[2] = _mm_add_epi64 (sum[2], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
+            _mm_unpackhi_epi32 (tb, tb)));
+
+    tb = _mm_load_si128 ((__m128i *) (c[3] + i));
+    sum[3] = _mm_add_epi64 (sum[3], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
+            _mm_unpacklo_epi32 (tb, tb)));
+    sum[3] = _mm_add_epi64 (sum[3], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
+            _mm_unpackhi_epi32 (tb, tb)));
+  }
+  /* _mm_mul_epi32 below only looks at the low 32 bits of each 64-bit lane,
+   * so the logical shift is sufficient here */
+  sum[0] = _mm_srli_epi64 (sum[0], PRECISION_S32);
+  sum[1] = _mm_srli_epi64 (sum[1], PRECISION_S32);
+  sum[2] = _mm_srli_epi64 (sum[2], PRECISION_S32);
+  sum[3] = _mm_srli_epi64 (sum[3], PRECISION_S32);
+  sum[0] =
+      _mm_mul_epi32 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
+  sum[1] =
+      _mm_mul_epi32 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
+  sum[2] =
+      _mm_mul_epi32 (sum[2], _mm_shuffle_epi32 (f, _MM_SHUFFLE (2, 2, 2, 2)));
+  sum[3] =
+      _mm_mul_epi32 (sum[3], _mm_shuffle_epi32 (f, _MM_SHUFFLE (3, 3, 3, 3)));
+  sum[0] = _mm_add_epi64 (sum[0], sum[1]);
+  sum[2] = _mm_add_epi64 (sum[2], sum[3]);
+  sum[0] = _mm_add_epi64 (sum[0], sum[2]);
+  /* fold the two 64-bit partial sums into the low lane */
+  sum[0] = _mm_add_epi64 (sum[0], _mm_unpackhi_epi64 (sum[0], sum[0]));
+  /* _mm_cvtsi128_si64 only exists on 64-bit targets; storing the low lane
+   * gives the same value and also builds on 32-bit x86 */
+  _mm_storel_epi64 ((__m128i *) & res, sum[0]);
+
+  res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
+  /* 1L << 31 overflows where long is 32 bits, use the fixed-width limits */
+  *o = CLAMP (res, G_MININT32, G_MAXINT32);
+}
+
+/* Instantiate the non-static SSE4.1 entry points for 32-bit integer samples.
+ * Presumably each expands to a resample_gint32_<variant>_1_sse41 function
+ * wrapping the matching inner product in this file -- confirm against
+ * MAKE_RESAMPLE_FUNC in audio-resampler-macros.h. */
+MAKE_RESAMPLE_FUNC (gint32, full, 1, sse41);
+MAKE_RESAMPLE_FUNC (gint32, linear, 1, sse41);
+MAKE_RESAMPLE_FUNC (gint32, cubic, 1, sse41);
+
+#endif
--- a/gst-libs/gst/audio/audio-resampler-x86-sse41.h
+++ b/gst-libs/gst/audio/audio-resampler-x86-sse41.h
@ -0,0 +1,29 @@
+/* GStreamer
+ * Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef AUDIO_RESAMPLER_X86_SSE41_H
+#define AUDIO_RESAMPLER_X86_SSE41_H
+
+#include "audio-resampler-macros.h"
+
+/* SSE4.1 resample entry points for 32-bit integer samples.  Presumably these
+ * expand to prototypes for resample_gint32_{full,linear,cubic}_1_sse41 --
+ * confirm against DECL_RESAMPLE_FUNC in audio-resampler-macros.h. */
+DECL_RESAMPLE_FUNC (gint32, full, 1, sse41);
+DECL_RESAMPLE_FUNC (gint32, linear, 1, sse41);
+DECL_RESAMPLE_FUNC (gint32, cubic, 1, sse41);
+
+#endif /* AUDIO_RESAMPLER_X86_SSE41_H */
--- a/gst-libs/gst/audio/audio-resampler-x86.h
+++ b/gst-libs/gst/audio/audio-resampler-x86.h
@ -17,631 +17,16 @@
 * Boston, MA 02110-1301, USA.
 */

-#if defined (HAVE_XMMINTRIN_H) && defined(__SSE__)
-#include <xmmintrin.h>
-
-static inline void
-inner_product_gfloat_full_1_sse (gfloat * o, const gfloat * a,
-    const gfloat * b, gint len, const gfloat * icoeff, gint bstride)
-{
-  gint i = 0;
-  __m128 sum = _mm_setzero_ps ();
-
-  for (; i < len; i += 8) {
-    sum =
-        _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 0),
-            _mm_load_ps (b + i + 0)));
-    sum =
-        _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 4),
-            _mm_load_ps (b + i + 4)));
-  }
-  sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum));
-  sum = _mm_add_ss (sum, _mm_shuffle_ps (sum, sum, 0x55));
-  _mm_store_ss (o, sum);
-}
-
-static inline void
-inner_product_gfloat_linear_1_sse (gfloat * o, const gfloat * a,
-    const gfloat * b, gint len, const gfloat * icoeff, gint bstride)
-{
-  gint i = 0;
-  __m128 sum[2], t;
-  const gfloat *c[2] = {(gfloat*)((gint8*)b + 0*bstride),
-                        (gfloat*)((gint8*)b + 1*bstride)};
-
-  sum[0] = sum[1] = _mm_setzero_ps ();
-
-  for (; i < len; i += 8) {
-    t = _mm_loadu_ps (a + i + 0);
-    sum[0] = _mm_add_ps (sum[0], _mm_mul_ps (t, _mm_load_ps (c[0] + i + 0)));
-    sum[1] = _mm_add_ps (sum[1], _mm_mul_ps (t, _mm_load_ps (c[1] + i + 0)));
-    t = _mm_loadu_ps (a + i + 4);
-    sum[0] = _mm_add_ps (sum[0], _mm_mul_ps (t, _mm_load_ps (c[0] + i + 4)));
-    sum[1] = _mm_add_ps (sum[1], _mm_mul_ps (t, _mm_load_ps (c[1] + i + 4)));
-  }
-  sum[0] = _mm_mul_ps (_mm_sub_ps (sum[0], sum[1]), _mm_load1_ps (icoeff));
-  sum[0] = _mm_add_ps (sum[0], sum[1]);
-  sum[0] = _mm_add_ps (sum[0], _mm_movehl_ps (sum[0], sum[0]));
-  sum[0] = _mm_add_ss (sum[0], _mm_shuffle_ps (sum[0], sum[0], 0x55));
-  _mm_store_ss (o, sum[0]);
-}
-
-static inline void
-inner_product_gfloat_cubic_1_sse (gfloat * o, const gfloat * a,
-    const gfloat * b, gint len, const gfloat * icoeff, gint bstride)
-{
-  gint i = 0;
-  __m128 sum[4];
-  __m128 t, f = _mm_loadu_ps(icoeff);
-  const gfloat *c[4] = {(gfloat*)((gint8*)b + 0*bstride),
-                        (gfloat*)((gint8*)b + 1*bstride),
-                        (gfloat*)((gint8*)b + 2*bstride),
-                        (gfloat*)((gint8*)b + 3*bstride)};
-
-  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_ps ();
-
-  for (; i < len; i += 4) {
-    t = _mm_loadu_ps (a + i);
-    sum[0] = _mm_add_ps (sum[0], _mm_mul_ps (t, _mm_load_ps (c[0] + i)));
-    sum[1] = _mm_add_ps (sum[1], _mm_mul_ps (t, _mm_load_ps (c[1] + i)));
-    sum[2] = _mm_add_ps (sum[2], _mm_mul_ps (t, _mm_load_ps (c[2] + i)));
-    sum[3] = _mm_add_ps (sum[3], _mm_mul_ps (t, _mm_load_ps (c[3] + i)));
-  }
-  sum[0] = _mm_mul_ps (sum[0], _mm_shuffle_ps (f, f, 0x00));
-  sum[1] = _mm_mul_ps (sum[1], _mm_shuffle_ps (f, f, 0x55));
-  sum[2] = _mm_mul_ps (sum[2], _mm_shuffle_ps (f, f, 0xaa));
-  sum[3] = _mm_mul_ps (sum[3], _mm_shuffle_ps (f, f, 0xff));
-  sum[0] = _mm_add_ps (sum[0], sum[1]);
-  sum[2] = _mm_add_ps (sum[2], sum[3]);
-  sum[0] = _mm_add_ps (sum[0], sum[2]);
-  sum[0] = _mm_add_ps (sum[0], _mm_movehl_ps (sum[0], sum[0]));
-  sum[0] = _mm_add_ss (sum[0], _mm_shuffle_ps (sum[0], sum[0], 0x55));
-  _mm_store_ss (o, sum[0]);
-}
-
-MAKE_RESAMPLE_FUNC (gfloat, full, 1, sse);
-MAKE_RESAMPLE_FUNC (gfloat, linear, 1, sse);
-MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, sse);
-
-static void
-interpolate_gfloat_linear_sse (gpointer op, const gpointer ap,
-    gint len, const gpointer icp, gint astride)
-{
-  gint i;
-  gfloat *o = op, *a = ap, *ic = icp;
-  __m128 f[2], t1, t2;
-  const gfloat *c[2] = {(gfloat*)((gint8*)a + 0*astride),
-                        (gfloat*)((gint8*)a + 1*astride)};
-
-  f[0] = _mm_load1_ps (ic+0);
-  f[1] = _mm_load1_ps (ic+1);
-
-  for (i = 0; i < len; i += 8) {
-    t1 = _mm_mul_ps (_mm_load_ps (c[0] + i + 0), f[0]);
-    t2 = _mm_mul_ps (_mm_load_ps (c[1] + i + 0), f[1]);
-    _mm_store_ps (o + i + 0, _mm_add_ps (t1, t2));
-
-    t1 = _mm_mul_ps (_mm_load_ps (c[0] + i + 4), f[0]);
-    t2 = _mm_mul_ps (_mm_load_ps (c[1] + i + 4), f[1]);
-    _mm_store_ps (o + i + 4, _mm_add_ps (t1, t2));
-  }
-}
-
-static void
-interpolate_gfloat_cubic_sse (gpointer op, const gpointer ap,
-    gint len, const gpointer icp, gint astride)
-{
-  gint i;
-  gfloat *o = op, *a = ap, *ic = icp;
-  __m128 f[4], t[4];
-  const gfloat *c[4] = {(gfloat*)((gint8*)a + 0*astride),
-                        (gfloat*)((gint8*)a + 1*astride),
-                        (gfloat*)((gint8*)a + 2*astride),
-                        (gfloat*)((gint8*)a + 3*astride)};
-
-  f[0] = _mm_load1_ps (ic+0);
-  f[1] = _mm_load1_ps (ic+1);
-  f[2] = _mm_load1_ps (ic+2);
-  f[3] = _mm_load1_ps (ic+3);
-
-  for (i = 0; i < len; i += 4) {
-    t[0] = _mm_mul_ps (_mm_load_ps (c[0] + i + 0), f[0]);
-    t[1] = _mm_mul_ps (_mm_load_ps (c[1] + i + 0), f[1]);
-    t[2] = _mm_mul_ps (_mm_load_ps (c[2] + i + 0), f[2]);
-    t[3] = _mm_mul_ps (_mm_load_ps (c[3] + i + 0), f[3]);
-    t[0] = _mm_add_ps (t[0], t[1]);
-    t[2] = _mm_add_ps (t[2], t[3]);
-    _mm_store_ps (o + i + 0, _mm_add_ps (t[0], t[2]));
-  }
-}
-
-#endif
-
-#if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__)
-#include <emmintrin.h>
-
-static inline void
-inner_product_gint16_full_1_sse2 (gint16 * o, const gint16 * a,
-    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
-{
-  gint i;
-  __m128i sum, t;
-
-  sum = _mm_setzero_si128 ();
-
-  for (i = 0; i < len; i += 16) {
-    t = _mm_loadu_si128 ((__m128i *) (a + i));
-    sum = _mm_add_epi32 (sum, _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (b + i + 0))));
-
-    t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
-    sum = _mm_add_epi32 (sum, _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (b + i + 8))));
-  }
-  sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, 3)));
-  sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1, 1)));
-
-  sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
-  sum = _mm_srai_epi32 (sum, PRECISION_S16);
-  sum = _mm_packs_epi32 (sum, sum);
-  *o = _mm_extract_epi16 (sum, 0);
-}
-
-static inline void
-inner_product_gint16_linear_1_sse2 (gint16 * o, const gint16 * a,
-    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
-{
-  gint i = 0;
-  __m128i sum[2], t;
-  __m128i f = _mm_set_epi64x (0, *((gint64*)icoeff));
-  const gint16 *c[2] = {(gint16*)((gint8*)b + 0*bstride),
-                        (gint16*)((gint8*)b + 1*bstride)};
-
-  sum[0] = sum[1] = _mm_setzero_si128 ();
-  f = _mm_unpacklo_epi16 (f, sum[0]);
-
-  for (; i < len; i += 16) {
-    t = _mm_loadu_si128 ((__m128i *) (a + i + 0));
-    sum[0] = _mm_add_epi32 (sum[0], _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (c[0] + i + 0))));
-    sum[1] = _mm_add_epi32 (sum[1], _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (c[1] + i + 0))));
-
-    t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
-    sum[0] = _mm_add_epi32 (sum[0], _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (c[0] + i + 8))));
-    sum[1] = _mm_add_epi32 (sum[1], _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (c[1] + i + 8))));
-  }
-  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
-  sum[1] = _mm_srai_epi32 (sum[1], PRECISION_S16);
-
-  sum[0] = _mm_madd_epi16 (sum[0], _mm_shuffle_epi32 (f,  _MM_SHUFFLE (0, 0, 0, 0)));
-  sum[1] = _mm_madd_epi16 (sum[1], _mm_shuffle_epi32 (f,  _MM_SHUFFLE (1, 1, 1, 1)));
-  sum[0] = _mm_add_epi32 (sum[0], sum[1]);
-
-  sum[0] = _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, 3)));
-  sum[0] = _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1, 1)));
-
-  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
-  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
-  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
-  *o = _mm_extract_epi16 (sum[0], 0);
-}
-
-static inline void
-inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a,
-    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
-{
-  gint i = 0;
-  __m128i sum[4], t[4];
-  __m128i f = _mm_set_epi64x (0, *((long long*)icoeff));
-  const gint16 *c[4] = {(gint16*)((gint8*)b + 0*bstride),
-                        (gint16*)((gint8*)b + 1*bstride),
-                        (gint16*)((gint8*)b + 2*bstride),
-                        (gint16*)((gint8*)b + 3*bstride)};
-
-  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();
-  f = _mm_unpacklo_epi16 (f, sum[0]);
-
-  for (; i < len; i += 8) {
-    t[0] = _mm_loadu_si128 ((__m128i *) (a + i));
-    sum[0] = _mm_add_epi32 (sum[0], _mm_madd_epi16 (t[0], _mm_load_si128 ((__m128i *) (c[0] + i))));
-    sum[1] = _mm_add_epi32 (sum[1], _mm_madd_epi16 (t[0], _mm_load_si128 ((__m128i *) (c[1] + i))));
-    sum[2] = _mm_add_epi32 (sum[2], _mm_madd_epi16 (t[0], _mm_load_si128 ((__m128i *) (c[2] + i))));
-    sum[3] = _mm_add_epi32 (sum[3], _mm_madd_epi16 (t[0], _mm_load_si128 ((__m128i *) (c[3] + i))));
-  }
-  t[0] = _mm_unpacklo_epi32 (sum[0], sum[1]);
-  t[1] = _mm_unpacklo_epi32 (sum[2], sum[3]);
-  t[2] = _mm_unpackhi_epi32 (sum[0], sum[1]);
-  t[3] = _mm_unpackhi_epi32 (sum[2], sum[3]);
-
-  sum[0] = _mm_add_epi32 (_mm_unpacklo_epi64(t[0], t[1]), _mm_unpackhi_epi64(t[0], t[1]));
-  sum[2] = _mm_add_epi32 (_mm_unpacklo_epi64(t[2], t[3]), _mm_unpackhi_epi64(t[2], t[3]));
-  sum[0] = _mm_add_epi32 (sum[0], sum[2]);
-
-  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
-  sum[0] = _mm_madd_epi16 (sum[0], f);
-
-  sum[0] = _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, 3)));
-  sum[0] = _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1, 1)));
-
-  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
-  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
-  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
-  *o = _mm_extract_epi16 (sum[0], 0);
-}
-
-static inline void
-inner_product_gdouble_full_1_sse2 (gdouble * o, const gdouble * a,
-    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
-{
-  gint i = 0;
-  __m128d sum = _mm_setzero_pd ();
-
-  for (; i < len; i += 8) {
-    sum =
-        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0),
-            _mm_load_pd (b + i + 0)));
-    sum =
-        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2),
-            _mm_load_pd (b + i + 2)));
-    sum =
-        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 4),
-            _mm_load_pd (b + i + 4)));
-    sum =
-        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6),
-            _mm_load_pd (b + i + 6)));
-  }
-  sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum));
-  _mm_store_sd (o, sum);
-}
-
-static inline void
-inner_product_gdouble_linear_1_sse2 (gdouble * o, const gdouble * a,
-    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
-{
-  gint i = 0;
-  __m128d sum[2], t;
-  const gdouble *c[2] = {(gdouble*)((gint8*)b + 0*bstride),
-                         (gdouble*)((gint8*)b + 1*bstride)};
-
-  sum[0] = sum[1] = _mm_setzero_pd ();
-
-  for (; i < len; i += 4) {
-    t = _mm_loadu_pd (a + i + 0);
-    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 0)));
-    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 0)));
-    t = _mm_loadu_pd (a + i + 2);
-    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 2)));
-    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 2)));
-  }
-  sum[0] = _mm_mul_pd (_mm_sub_pd (sum[0], sum[1]), _mm_load1_pd (icoeff));
-  sum[0] = _mm_add_pd (sum[0], sum[1]);
-  sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
-  _mm_store_sd (o, sum[0]);
-}
-
-static inline void
-inner_product_gdouble_cubic_1_sse2 (gdouble * o, const gdouble * a,
-    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
-{
-  gint i;
-  __m128d f[2], sum[4], t;
-  const gdouble *c[4] = {(gdouble*)((gint8*)b + 0*bstride),
-                         (gdouble*)((gint8*)b + 1*bstride),
-                         (gdouble*)((gint8*)b + 2*bstride),
-                         (gdouble*)((gint8*)b + 3*bstride)};
-
-  f[0] = _mm_loadu_pd (icoeff + 0);
-  f[1] = _mm_loadu_pd (icoeff + 2);
-  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_pd ();
-
-  for (i = 0; i < len; i += 2) {
-    t = _mm_loadu_pd (a + i + 0);
-    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i)));
-    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i)));
-    sum[2] = _mm_add_pd (sum[2], _mm_mul_pd (t, _mm_load_pd (c[2] + i)));
-    sum[3] = _mm_add_pd (sum[3], _mm_mul_pd (t, _mm_load_pd (c[3] + i)));
-  }
-  sum[0] = _mm_mul_pd (sum[0], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (0, 0)));
-  sum[1] = _mm_mul_pd (sum[1], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (1, 1)));
-  sum[2] = _mm_mul_pd (sum[2], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (0, 0)));
-  sum[3] = _mm_mul_pd (sum[3], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (1, 1)));
-  sum[0] = _mm_add_pd (sum[0], sum[1]);
-  sum[2] = _mm_add_pd (sum[2], sum[3]);
-  sum[0] = _mm_add_pd (sum[0], sum[2]);
-  sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
-  _mm_store_sd (o, sum[0]);
-}
-
-MAKE_RESAMPLE_FUNC (gint16, full, 1, sse2);
-MAKE_RESAMPLE_FUNC (gint16, linear, 1, sse2);
-MAKE_RESAMPLE_FUNC (gint16, cubic, 1, sse2);
-
-MAKE_RESAMPLE_FUNC (gdouble, full, 1, sse2);
-MAKE_RESAMPLE_FUNC (gdouble, linear, 1, sse2);
-MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, sse2);
-
-static inline void
-interpolate_gint16_linear_sse2 (gpointer op, const gpointer ap,
-    gint len, const gpointer icp, gint astride)
-{
-  gint i = 0;
-  gint16 *o = op, *a = ap, *ic = icp;
-  __m128i ta, tb, t1, t2;
-  __m128i f = _mm_set_epi64x (0, *((gint64*)ic));
-  const gint16 *c[2] = {(gint16*)((gint8*)a + 0*astride),
-                        (gint16*)((gint8*)a + 1*astride)};
-
-  f = _mm_unpacklo_epi32 (f, f);
-  f = _mm_unpacklo_epi64 (f, f);
-
-  for (; i < len; i += 8) {
-    ta = _mm_load_si128 ((__m128i *) (c[0] + i));
-    tb = _mm_load_si128 ((__m128i *) (c[1] + i));
-
-    t1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f);
-    t2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f);
-
-    t1 = _mm_add_epi32 (t1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
-    t2 = _mm_add_epi32 (t2, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
-
-    t1 = _mm_srai_epi32 (t1, PRECISION_S16);
-    t2 = _mm_srai_epi32 (t2, PRECISION_S16);
-
-    t1 = _mm_packs_epi32 (t1, t2);
-    _mm_store_si128 ((__m128i *) (o + i), t1);
-  }
-}
-
-static inline void
-interpolate_gint16_cubic_sse2 (gpointer op, const gpointer ap,
-    gint len, const gpointer icp, gint astride)
-{
-  gint i = 0;
-  gint16 *o = op, *a = ap, *ic = icp;
-  __m128i ta, tb, tl1, tl2, th1, th2;
-  __m128i f[2];
-  const gint16 *c[4] = {(gint16*)((gint8*)a + 0*astride),
-                        (gint16*)((gint8*)a + 1*astride),
-                        (gint16*)((gint8*)a + 2*astride),
-                        (gint16*)((gint8*)a + 3*astride)};
-
-  f[0] = _mm_set_epi16 (ic[1], ic[0], ic[1], ic[0], ic[1], ic[0], ic[1], ic[0]);
-  f[1] = _mm_set_epi16 (ic[3], ic[2], ic[3], ic[2], ic[3], ic[2], ic[3], ic[2]);
-
-  for (; i < len; i += 8) {
-    ta = _mm_load_si128 ((__m128i *) (c[0] + i));
-    tb = _mm_load_si128 ((__m128i *) (c[1] + i));
-
-    tl1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[0]);
-    th1 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[0]);
-
-    ta = _mm_load_si128 ((__m128i *) (c[2] + i));
-    tb = _mm_load_si128 ((__m128i *) (c[3] + i));
-
-    tl2 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[1]);
-    th2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[1]);
-
-    tl1 = _mm_add_epi32 (tl1, tl2);
-    th1 = _mm_add_epi32 (th1, th2);
-
-    tl1 = _mm_add_epi32 (tl1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
-    th1 = _mm_add_epi32 (th1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
-
-    tl1 = _mm_srai_epi32 (tl1, PRECISION_S16);
-    th1 = _mm_srai_epi32 (th1, PRECISION_S16);
-
-    tl1 = _mm_packs_epi32 (tl1, th1);
-    _mm_store_si128 ((__m128i *) (o + i), tl1);
-  }
-}
-
-static void
-interpolate_gdouble_linear_sse2 (gpointer op, const gpointer ap,
-    gint len, const gpointer icp, gint astride)
-{
-  gint i;
-  gdouble *o = op, *a = ap, *ic = icp;
-  __m128d f[2], t1, t2;
-  const gdouble *c[2] = {(gdouble*)((gint8*)a + 0*astride),
-                         (gdouble*)((gint8*)a + 1*astride)};
-
-  f[0] = _mm_load1_pd (ic+0);
-  f[1] = _mm_load1_pd (ic+1);
-
-  for (i = 0; i < len; i += 4) {
-    t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]);
-    t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]);
-    _mm_store_pd (o + i + 0, _mm_add_pd (t1, t2));
-
-    t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 2), f[0]);
-    t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 2), f[1]);
-    _mm_store_pd (o + i + 2, _mm_add_pd (t1, t2));
-  }
-}
-
-static void
-interpolate_gdouble_cubic_sse2 (gpointer op, const gpointer ap,
-    gint len, const gpointer icp, gint astride)
-{
-  gint i;
-  gdouble *o = op, *a = ap, *ic = icp;
-  __m128d f[4], t[4];
-  const gdouble *c[4] = {(gdouble*)((gint8*)a + 0*astride),
-                         (gdouble*)((gint8*)a + 1*astride),
-                         (gdouble*)((gint8*)a + 2*astride),
-                         (gdouble*)((gint8*)a + 3*astride)};
-
-  f[0] = _mm_load1_pd (ic+0);
-  f[1] = _mm_load1_pd (ic+1);
-  f[2] = _mm_load1_pd (ic+2);
-  f[3] = _mm_load1_pd (ic+3);
-
-  for (i = 0; i < len; i += 2) {
-    t[0] = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]);
-    t[1] = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]);
-    t[2] = _mm_mul_pd (_mm_load_pd (c[2] + i + 0), f[2]);
-    t[3] = _mm_mul_pd (_mm_load_pd (c[3] + i + 0), f[3]);
-    t[0] = _mm_add_pd (t[0], t[1]);
-    t[2] = _mm_add_pd (t[2], t[3]);
-    _mm_store_pd (o + i + 0, _mm_add_pd (t[0], t[2]));
-  }
-}
-
-#endif
-
-#if 0
-#define __SSE4_1__
-#pragma GCC target("sse4.1")
-#endif
-
-#if defined (HAVE_SMMINTRIN_H) && defined(__SSE4_1__)
-#include <smmintrin.h>
-
-static inline void
-inner_product_gint32_full_1_sse41 (gint32 * o, const gint32 * a,
-    const gint32 * b, gint len, const gint32 * icoeff, gint bstride)
-{
-  gint i = 0;
-  __m128i sum, ta, tb;
-  gint64 res;
-
-  sum = _mm_setzero_si128 ();
-
-  for (; i < len; i += 8) {
-    ta = _mm_loadu_si128 ((__m128i *) (a + i));
-    tb = _mm_load_si128 ((__m128i *) (b + i));
-
-    sum =
-        _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
-            _mm_unpacklo_epi32 (tb, tb)));
-    sum =
-        _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
-            _mm_unpackhi_epi32 (tb, tb)));
-
-    ta = _mm_loadu_si128 ((__m128i *) (a + i + 4));
-    tb = _mm_load_si128 ((__m128i *) (b + i + 4));
-
-    sum =
-        _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
-            _mm_unpacklo_epi32 (tb, tb)));
-    sum =
-        _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
-            _mm_unpackhi_epi32 (tb, tb)));
-  }
-  sum = _mm_add_epi64 (sum, _mm_unpackhi_epi64 (sum, sum));
-  res = _mm_cvtsi128_si64 (sum);
-
-  res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
-  *o = CLAMP (res, -(1L << 31), (1L << 31) - 1);
-}
-
-static inline void
-inner_product_gint32_linear_1_sse41 (gint32 * o, const gint32 * a,
-    const gint32 * b, gint len, const gint32 * icoeff, gint bstride)
-{
-  gint i = 0;
-  gint64 res;
-  __m128i sum[2], ta, tb;
-  __m128i f = _mm_loadu_si128 ((__m128i *)icoeff);
-  const gint32 *c[2] = {(gint32*)((gint8*)b + 0*bstride),
-                        (gint32*)((gint8*)b + 1*bstride)};
-
-  sum[0] = sum[1] = _mm_setzero_si128 ();
-
-  for (; i < len; i += 4) {
-    ta = _mm_loadu_si128 ((__m128i *)(a + i));
-
-    tb = _mm_load_si128 ((__m128i *)(c[0] + i));
-    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
-              _mm_unpacklo_epi32 (tb, tb)));
-    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
-              _mm_unpackhi_epi32 (tb, tb)));
-
-    tb = _mm_load_si128 ((__m128i *)(c[1] + i));
-    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
-              _mm_unpacklo_epi32 (tb, tb)));
-    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
-              _mm_unpackhi_epi32 (tb, tb)));
-  }
-  sum[0] = _mm_srli_epi64 (sum[0], PRECISION_S32);
-  sum[1] = _mm_srli_epi64 (sum[1], PRECISION_S32);
-  sum[0] = _mm_mul_epi32 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
-  sum[1] = _mm_mul_epi32 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
-  sum[0] = _mm_add_epi64 (sum[0], sum[1]);
-  sum[0] = _mm_add_epi64 (sum[0], _mm_unpackhi_epi64 (sum[0], sum[0]));
-  res = _mm_cvtsi128_si64 (sum[0]);
-
-  res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
-  *o = CLAMP (res, -(1L << 31), (1L << 31) - 1);
-}
-
-static inline void
-inner_product_gint32_cubic_1_sse41 (gint32 * o, const gint32 * a,
-    const gint32 * b, gint len, const gint32 * icoeff, gint bstride)
-{
-  gint i = 0;
-  gint64 res;
-  __m128i sum[4], ta, tb;
-  __m128i f = _mm_loadu_si128 ((__m128i *)icoeff);
-  const gint32 *c[4] = {(gint32*)((gint8*)b + 0*bstride),
-                        (gint32*)((gint8*)b + 1*bstride),
-                        (gint32*)((gint8*)b + 2*bstride),
-                        (gint32*)((gint8*)b + 3*bstride)};
-
-  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();
-
-  for (; i < len; i += 4) {
-    ta = _mm_loadu_si128 ((__m128i *)(a + i));
-
-    tb = _mm_load_si128 ((__m128i *)(c[0] + i));
-    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
-              _mm_unpacklo_epi32 (tb, tb)));
-    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
-              _mm_unpackhi_epi32 (tb, tb)));
-
-    tb = _mm_load_si128 ((__m128i *)(c[1] + i));
-    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
-              _mm_unpacklo_epi32 (tb, tb)));
-    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
-              _mm_unpackhi_epi32 (tb, tb)));
-
-    tb = _mm_load_si128 ((__m128i *)(c[2] + i));
-    sum[2] = _mm_add_epi64 (sum[2], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
-              _mm_unpacklo_epi32 (tb, tb)));
-    sum[2] = _mm_add_epi64 (sum[2], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
-              _mm_unpackhi_epi32 (tb, tb)));
-
-    tb = _mm_load_si128 ((__m128i *)(c[3] + i));
-    sum[3] = _mm_add_epi64 (sum[3], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
-              _mm_unpacklo_epi32 (tb, tb)));
-    sum[3] = _mm_add_epi64 (sum[3], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
-              _mm_unpackhi_epi32 (tb, tb)));
-  }
-  sum[0] = _mm_srli_epi64 (sum[0], PRECISION_S32);
-  sum[1] = _mm_srli_epi64 (sum[1], PRECISION_S32);
-  sum[2] = _mm_srli_epi64 (sum[2], PRECISION_S32);
-  sum[3] = _mm_srli_epi64 (sum[3], PRECISION_S32);
-  sum[0] = _mm_mul_epi32 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
-  sum[1] = _mm_mul_epi32 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
-  sum[2] = _mm_mul_epi32 (sum[2], _mm_shuffle_epi32 (f, _MM_SHUFFLE (2, 2, 2, 2)));
-  sum[3] = _mm_mul_epi32 (sum[3], _mm_shuffle_epi32 (f, _MM_SHUFFLE (3, 3, 3, 3)));
-  sum[0] = _mm_add_epi64 (sum[0], sum[1]);
-  sum[2] = _mm_add_epi64 (sum[2], sum[3]);
-  sum[0] = _mm_add_epi64 (sum[0], sum[2]);
-  sum[0] = _mm_add_epi64 (sum[0], _mm_unpackhi_epi64 (sum[0], sum[0]));
-  res = _mm_cvtsi128_si64 (sum[0]);
-
-  res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
-  *o = CLAMP (res, -(1L << 31), (1L << 31) - 1);
-}
-
-MAKE_RESAMPLE_FUNC (gint32, full, 1, sse41);
-MAKE_RESAMPLE_FUNC (gint32, linear, 1, sse41);
-MAKE_RESAMPLE_FUNC (gint32, cubic, 1, sse41);
-#endif
+#include "audio-resampler-macros.h"
+#include "audio-resampler-x86-sse.h"
+#include "audio-resampler-x86-sse2.h"
+#include "audio-resampler-x86-sse41.h"

 static void
 audio_resampler_check_x86 (const gchar *option)
 {
  if (!strcmp (option, "sse")) {
-#if defined (HAVE_XMMINTRIN_H) && defined(__SSE__)
+#if defined (HAVE_XMMINTRIN_H) && HAVE_SSE
    GST_DEBUG ("enable SSE optimisations");
    resample_gfloat_full_1 = resample_gfloat_full_1_sse;
    resample_gfloat_linear_1 = resample_gfloat_linear_1_sse;
@ -653,7 +38,7 @@ audio_resampler_check_x86 (const gchar *option)
    GST_DEBUG ("SSE optimisations not enabled");
 #endif
  } else if (!strcmp (option, "sse2")) {
-#if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__)
+#if defined (HAVE_EMMINTRIN_H) && HAVE_SSE2
    GST_DEBUG ("enable SSE2 optimisations");
    resample_gint16_full_1 = resample_gint16_full_1_sse2;
    resample_gint16_linear_1 = resample_gint16_linear_1_sse2;
@ -672,7 +57,7 @@ audio_resampler_check_x86 (const gchar *option)
    GST_DEBUG ("SSE2 optimisations not enabled");
 #endif
  } else if (!strcmp (option, "sse41")) {
-#if defined (HAVE_SMMINTRIN_H) && defined(__SSE4_1__)
+#if defined (HAVE_SMMINTRIN_H) && defined (HAVE_EMMINTRIN_H) && HAVE_SSE41
    GST_DEBUG ("enable SSE41 optimisations");
    resample_gint32_full_1 = resample_gint32_full_1_sse41;
    resample_gint32_linear_1 = resample_gint32_linear_1_sse41;
--- a/gst-libs/gst/audio/audio-resampler.c
+++ b/gst-libs/gst/audio/audio-resampler.c
@ -30,99 +30,13 @@
 #endif

 #include "audio-resampler.h"
-
-/* Contains a collection of all things found in other resamplers:
- * speex (filter construction, optimizations), ffmpeg (fixed phase filter, blackman filter),
- * SRC (linear interpolation, fixed precomputed tables),...
- *
- *  Supports:
- *   - S16, S32, F32 and F64 formats
- *   - nearest, linear and cubic interpolation
- *   - sinc based interpolation with kaiser or blackman-nutall windows
- *   - fully configurable kaiser parameters
- *   - dynamic linear or cubic interpolation of filter table, this can
- *     use less memory but more CPU
- *   - full filter table, generated from optionally linear or cubic
- *     interpolation of filter table
- *   - fixed filter table size with nearest neighbour phase, optionally
- *     using a precomputed tables
- *   - dynamic samplerate changes
- *   - x86 and neon optimizations
- */
-typedef void (*ConvertTapsFunc) (gdouble * tmp_taps, gpointer taps,
-    gdouble weight, gint n_taps);
-typedef void (*InterpolateFunc) (gpointer o, const gpointer a, gint len,
-    const gpointer icoeff, gint astride);
-typedef void (*ResampleFunc) (GstAudioResampler * resampler, gpointer in[],
-    gsize in_len, gpointer out[], gsize out_len, gsize * consumed);
-typedef void (*DeinterleaveFunc) (GstAudioResampler * resampler,
-    gpointer * sbuf, gpointer in[], gsize in_frames);
+#include "audio-resampler-private.h"
+#include "audio-resampler-macros.h"

 #define MEM_ALIGN(m,a) ((gint8 *)((guintptr)((gint8 *)(m) + ((a)-1)) & ~((a)-1)))
 #define ALIGN 16
 #define TAPS_OVERREAD 16

-struct _GstAudioResampler
-{
-  GstAudioResamplerMethod method;
-  GstAudioResamplerFlags flags;
-  GstAudioFormat format;
-  GstStructure *options;
-  gint format_index;
-  gint channels;
-  gint in_rate;
-  gint out_rate;
-
-  gint bps;
-  gint ostride;
-
-  GstAudioResamplerFilterMode filter_mode;
-  guint filter_threshold;
-  GstAudioResamplerFilterInterpolation filter_interpolation;
-
-  gdouble cutoff;
-  gdouble kaiser_beta;
-  /* for cubic */
-  gdouble b, c;
-
-  /* temp taps */
-  gpointer tmp_taps;
-
-  /* oversampled main filter table */
-  gint oversample;
-  gint n_taps;
-  gpointer taps;
-  gpointer taps_mem;
-  gsize taps_stride;
-  gint n_phases;
-  gint alloc_taps;
-  gint alloc_phases;
-
-  /* cached taps */
-  gpointer *cached_phases;
-  gpointer cached_taps;
-  gpointer cached_taps_mem;
-  gsize cached_taps_stride;
-
-  ConvertTapsFunc convert_taps;
-  InterpolateFunc interpolate;
-  DeinterleaveFunc deinterleave;
-  ResampleFunc resample;
-
-  gint blocks;
-  gint inc;
-  gint samp_inc;
-  gint samp_frac;
-  gint samp_index;
-  gint samp_phase;
-  gint skip;
-
-  gpointer samples;
-  gsize samples_len;
-  gsize samples_avail;
-  gpointer *sbuf;
-};
-
 GST_DEBUG_CATEGORY_STATIC (audio_resampler_debug);
 #define GST_CAT_DEFAULT audio_resampler_debug

@ -303,9 +217,6 @@ get_kaiser_tap (gdouble x, gint n_taps, gdouble Fc, gdouble beta)
  return s * bessel (beta * sqrt (MAX (1 - w * w, 0)));
 }

-#define PRECISION_S16 15
-#define PRECISION_S32 31
-
 #define MAKE_CONVERT_TAPS_INT_FUNC(type, precision)                     \
 static void                                                             \
 convert_taps_##type##_c (gdouble *tmp_taps, gpointer taps,              \
@ -593,9 +504,7 @@ GET_TAPS_NEAREST_FUNC (gdouble);
 #define get_taps_gdouble_nearest get_taps_gdouble_nearest

 #define GET_TAPS_FULL_FUNC(type)                                                \
-static inline gpointer                                                          \
-get_taps_##type##_full (GstAudioResampler * resampler,                          \
-    gint *samp_index, gint *samp_phase, type icoeff[4])                         \
+DECL_GET_TAPS_FULL_FUNC(type)                                                   \
 {                                                                               \
  gpointer res;                                                                 \
  gint out_rate = resampler->out_rate;                                          \
@ -659,9 +568,7 @@ GET_TAPS_FULL_FUNC (gfloat);
 GET_TAPS_FULL_FUNC (gdouble);

 #define GET_TAPS_INTERPOLATE_FUNC(type,inter)                   \
-static inline gpointer                                          \
-get_taps_##type##_##inter (GstAudioResampler * resampler,       \
-    gint *samp_index, gint *samp_phase, type icoeff[4])         \
+DECL_GET_TAPS_INTERPOLATE_FUNC (type, inter)                    \
 {                                                               \
  gpointer res;                                                 \
  gint out_rate = resampler->out_rate;                          \
@ -852,67 +759,25 @@ inner_product_##type##_cubic_1_c (type * o, const type * a,     \
 INNER_PRODUCT_FLOAT_CUBIC_FUNC (gfloat);
 INNER_PRODUCT_FLOAT_CUBIC_FUNC (gdouble);

-#define MAKE_RESAMPLE_FUNC(type,inter,channels,arch)                            \
-static void                                                                     \
-resample_ ##type## _ ##inter## _ ##channels## _ ##arch (GstAudioResampler * resampler,      \
-    gpointer in[], gsize in_len,  gpointer out[], gsize out_len,                \
-    gsize * consumed)                                                           \
-{                                                                               \
-  gint c, di = 0;                                                               \
-  gint n_taps = resampler->n_taps;                                              \
-  gint blocks = resampler->blocks;                                              \
-  gint ostride = resampler->ostride;                                            \
-  gint taps_stride = resampler->taps_stride;                                    \
-  gint samp_index = 0;                                                          \
-  gint samp_phase = 0;                                                          \
-                                                                                \
-  for (c = 0; c < blocks; c++) {                                                \
-    type *ip = in[c];                                                           \
-    type *op = ostride == 1 ? out[c] : (type *)out[0] + c;                      \
-                                                                                \
-    samp_index = resampler->samp_index;                                         \
-    samp_phase = resampler->samp_phase;                                         \
-                                                                                \
-    for (di = 0; di < out_len; di++) {                                          \
-      type *ipp, icoeff[4], *taps;                                              \
-                                                                                \
-      ipp = &ip[samp_index * channels];                                         \
-                                                                                \
-      taps = get_taps_ ##type##_##inter                                         \
-              (resampler, &samp_index, &samp_phase, icoeff);                    \
-      inner_product_ ##type##_##inter##_##channels##_##arch                     \
-              (op, ipp, taps, n_taps, icoeff, taps_stride);                     \
-      op += ostride;                                                            \
-    }                                                                           \
-    if (in_len > samp_index)                                                    \
-      memmove (ip, &ip[samp_index * channels],                                  \
-          (in_len - samp_index) * sizeof(type) * channels);                     \
-  }                                                                             \
-  *consumed = samp_index - resampler->samp_index;                               \
-                                                                                \
-  resampler->samp_index = 0;                                                    \
-  resampler->samp_phase = samp_phase;                                           \
-}
+MAKE_RESAMPLE_FUNC_STATIC (gint16, nearest, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gint32, nearest, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gfloat, nearest, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gdouble, nearest, 1, c);

-MAKE_RESAMPLE_FUNC (gint16, nearest, 1, c);
-MAKE_RESAMPLE_FUNC (gint32, nearest, 1, c);
-MAKE_RESAMPLE_FUNC (gfloat, nearest, 1, c);
-MAKE_RESAMPLE_FUNC (gdouble, nearest, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gint16, full, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gint32, full, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gfloat, full, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gdouble, full, 1, c);

-MAKE_RESAMPLE_FUNC (gint16, full, 1, c);
-MAKE_RESAMPLE_FUNC (gint32, full, 1, c);
-MAKE_RESAMPLE_FUNC (gfloat, full, 1, c);
-MAKE_RESAMPLE_FUNC (gdouble, full, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gint16, linear, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gint32, linear, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gfloat, linear, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gdouble, linear, 1, c);

-MAKE_RESAMPLE_FUNC (gint16, linear, 1, c);
-MAKE_RESAMPLE_FUNC (gint32, linear, 1, c);
-MAKE_RESAMPLE_FUNC (gfloat, linear, 1, c);
-MAKE_RESAMPLE_FUNC (gdouble, linear, 1, c);
-
-MAKE_RESAMPLE_FUNC (gint16, cubic, 1, c);
-MAKE_RESAMPLE_FUNC (gint32, cubic, 1, c);
-MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, c);
-MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gint16, cubic, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gint32, cubic, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gfloat, cubic, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gdouble, cubic, 1, c);

 static ResampleFunc resample_funcs[] = {
  resample_gint16_nearest_1_c,