diff --git a/configure.ac b/configure.ac
index c2a1091e5e..0c13ee8569 100644
--- a/configure.ac
+++ b/configure.ac
@@ -179,6 +179,30 @@ dnl check for GCC specific SSE headers
 dnl these are used by the speex resampler code
 AC_CHECK_HEADERS([xmmintrin.h emmintrin.h smmintrin.h])
 
+dnl Detect x86 targets: the resampler sources that use SSE intrinsics are
+dnl built separately, each with its own -m* flag
+AC_CHECK_DECLS([__i386__], [HAVE_X86=1])
+AC_CHECK_DECLS([__x86_64__], [HAVE_X86=1])
+
+dnl check for the -m* flags; a rejected flag is emptied so it is never used
+SSE_CFLAGS="-msse"
+SSE2_CFLAGS="-msse2"
+SSE41_CFLAGS="-msse4.1"
+
+AS_COMPILER_FLAG([$SSE_CFLAGS], [HAVE_SSE=1], [HAVE_SSE=0; SSE_CFLAGS=""])
+AS_COMPILER_FLAG([$SSE2_CFLAGS], [HAVE_SSE2=1], [HAVE_SSE2=0; SSE2_CFLAGS=""])
+AS_COMPILER_FLAG([$SSE41_CFLAGS], [HAVE_SSE41=1], [HAVE_SSE41=0; SSE41_CFLAGS=""])
+
+AM_CONDITIONAL(HAVE_X86, [test "x${HAVE_X86}" = "x1"])
+
+AC_DEFINE_UNQUOTED(HAVE_SSE, [$HAVE_SSE], [SSE support is enabled])
+AC_DEFINE_UNQUOTED(HAVE_SSE2, [$HAVE_SSE2], [SSE2 support is enabled])
+AC_DEFINE_UNQUOTED(HAVE_SSE41, [$HAVE_SSE41], [SSE4.1 support is enabled])
+
+AC_SUBST(SSE_CFLAGS)
+AC_SUBST(SSE2_CFLAGS)
+AC_SUBST(SSE41_CFLAGS)
+
 dnl used in gst/tcp
 AC_CHECK_HEADERS([sys/socket.h],
   [HAVE_SYS_SOCKET_H="yes"], [HAVE_SYS_SOCKET_H="no"], [AC_INCLUDES_DEFAULT])
diff --git a/gst-libs/gst/audio/Makefile.am b/gst-libs/gst/audio/Makefile.am
index 1f3ec51ae0..bce52bea10 100644
--- a/gst-libs/gst/audio/Makefile.am
+++ b/gst-libs/gst/audio/Makefile.am
@@ -82,8 +82,13 @@ nodist_libgstaudio_@GST_API_VERSION@include_HEADERS = \
 	audio-enumtypes.h
 
 noinst_HEADERS = \
-	gstaudioutilsprivate.h \
-	audio-resampler-x86.h \
+	gstaudioutilsprivate.h 		\
+	audio-resampler-private.h 	\
+	audio-resampler-macros.h 	\
+	audio-resampler-x86.h 		\
+	audio-resampler-x86-sse.h	\
+	audio-resampler-x86-sse2.h	\
+	audio-resampler-x86-sse41.h	\
 	audio-resampler-neon.h
 
 libgstaudio_@GST_API_VERSION@_la_CFLAGS = $(GST_PLUGINS_BASE_CFLAGS) $(GST_BASE_CFLAGS) $(GST_CFLAGS) \
@@ -93,6 +97,50 @@ libgstaudio_@GST_API_VERSION@_la_LIBADD = \
   $(GST_BASE_LIBS) $(GST_LIBS) $(LIBM) $(ORC_LIBS)
 libgstaudio_@GST_API_VERSION@_la_LDFLAGS = $(GST_LIB_LDFLAGS) $(GST_ALL_LDFLAGS) $(GST_LT_LDFLAGS)
 
+
+# Arch-specific bits
+
+noinst_LTLIBRARIES =
+
+if HAVE_X86
+# Link these private helper libs without GST_LT_LDFLAGS: it contains options
+# such as -version-info, which cause a warning on non-installed libraries
+
+noinst_LTLIBRARIES += libaudio_resampler_sse.la
+libaudio_resampler_sse_la_SOURCES = audio-resampler-x86-sse.c
+libaudio_resampler_sse_la_CFLAGS = \
+	$(libgstaudio_@GST_API_VERSION@_la_CFLAGS) \
+	$(SSE_CFLAGS)
+libaudio_resampler_sse_la_LDFLAGS = \
+	$(GST_LIB_LDFLAGS) \
+	$(GST_ALL_LDFLAGS)
+libgstaudio_@GST_API_VERSION@_la_LIBADD += libaudio_resampler_sse.la
+
+noinst_LTLIBRARIES += libaudio_resampler_sse2.la
+libaudio_resampler_sse2_la_SOURCES = audio-resampler-x86-sse2.c
+libaudio_resampler_sse2_la_CFLAGS = \
+	$(libgstaudio_@GST_API_VERSION@_la_CFLAGS) \
+	$(SSE2_CFLAGS)
+libaudio_resampler_sse2_la_LDFLAGS = \
+	$(GST_LIB_LDFLAGS) \
+	$(GST_ALL_LDFLAGS)
+libgstaudio_@GST_API_VERSION@_la_LIBADD += libaudio_resampler_sse2.la
+
+noinst_LTLIBRARIES += libaudio_resampler_sse41.la
+libaudio_resampler_sse41_la_SOURCES = audio-resampler-x86-sse41.c
+libaudio_resampler_sse41_la_CFLAGS = \
+	$(libgstaudio_@GST_API_VERSION@_la_CFLAGS) \
+	$(SSE41_CFLAGS)
+libaudio_resampler_sse41_la_LDFLAGS = \
+	$(GST_LIB_LDFLAGS) \
+	$(GST_ALL_LDFLAGS)
+libgstaudio_@GST_API_VERSION@_la_LIBADD += libaudio_resampler_sse41.la
+
+endif
+
+
+# Introspection
+
 include $(top_srcdir)/common/gst-glib-gen.mak
 
 if HAVE_INTROSPECTION
diff --git a/gst-libs/gst/audio/audio-resampler-macros.h b/gst-libs/gst/audio/audio-resampler-macros.h
new file mode 100644
index 0000000000..fd6652cd80
--- /dev/null
+++ b/gst-libs/gst/audio/audio-resampler-macros.h
@@ -0,0 +1,108 @@
+/* GStreamer
+ * Copyright (C) <2015> Wim Taymans <wim.taymans@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef __GST_AUDIO_RESAMPLER_MACROS_H__
+#define __GST_AUDIO_RESAMPLER_MACROS_H__
+
+#include <string.h>
+
+#include "audio-resampler-private.h"
+
+#define PRECISION_S16 15
+#define PRECISION_S32 31
+
+#define DECL_GET_TAPS_FULL_FUNC(type)                           \
+gpointer                                                        \
+get_taps_##type##_full (GstAudioResampler * resampler,          \
+    gint *samp_index, gint *samp_phase, type icoeff[4])
+
+DECL_GET_TAPS_FULL_FUNC (gint16);
+DECL_GET_TAPS_FULL_FUNC (gint32);
+DECL_GET_TAPS_FULL_FUNC (gfloat);
+DECL_GET_TAPS_FULL_FUNC (gdouble);
+
+/* Prototype of get_taps_<type>_<inter>(); same arguments as the _full variant */
+#define DECL_GET_TAPS_INTERPOLATE_FUNC(type, inter)             \
+gpointer                                                        \
+get_taps_##type##_##inter (GstAudioResampler * resampler,       \
+    gint *samp_index, gint *samp_phase, type icoeff[4])
+
+DECL_GET_TAPS_INTERPOLATE_FUNC (gint16, linear);
+DECL_GET_TAPS_INTERPOLATE_FUNC (gint32, linear);
+DECL_GET_TAPS_INTERPOLATE_FUNC (gfloat, linear);
+DECL_GET_TAPS_INTERPOLATE_FUNC (gdouble, linear);
+
+DECL_GET_TAPS_INTERPOLATE_FUNC (gint16, cubic);
+DECL_GET_TAPS_INTERPOLATE_FUNC (gint32, cubic);
+DECL_GET_TAPS_INTERPOLATE_FUNC (gfloat, cubic);
+DECL_GET_TAPS_INTERPOLATE_FUNC (gdouble, cubic);
+
+/* Prototype of resample_<type>_<inter>_<channels>_<arch>(), shaped like ResampleFunc */
+#define DECL_RESAMPLE_FUNC(type,inter,channels,arch)                    \
+void                                                                    \
+resample_ ##type## _ ##inter## _ ##channels## _ ##arch (GstAudioResampler * resampler,      \
+    gpointer in[], gsize in_len,  gpointer out[], gsize out_len,        \
+    gsize * consumed)
+
+#define MAKE_RESAMPLE_FUNC(type,inter,channels,arch)            \
+DECL_RESAMPLE_FUNC (type, inter, channels, arch)                \
+{                                                               \
+  gint c, di = 0;                                               \
+  gint n_taps = resampler->n_taps;                              \
+  gint blocks = resampler->blocks;                              \
+  gint ostride = resampler->ostride;                            \
+  gint taps_stride = resampler->taps_stride;                    \
+  gint samp_index = 0;                                          \
+  gint samp_phase = 0;                                          \
+  /* block c reads in[c]; out[0] is shared when ostride != 1 */ \
+  for (c = 0; c < blocks; c++) {                                \
+    type *ip = in[c];                                           \
+    type *op = ostride == 1 ? out[c] : (type *)out[0] + c;      \
+    /* every block restarts from the stored index and phase */  \
+    samp_index = resampler->samp_index;                         \
+    samp_phase = resampler->samp_phase;                         \
+    /* out_len outputs per block (gint di vs gsize out_len) */  \
+    for (di = 0; di < out_len; di++) {                          \
+      type *ipp, icoeff[4], *taps;                              \
+      /* input window that starts at the current index */       \
+      ipp = &ip[samp_index * channels];                         \
+      /* index and phase only change inside get_taps */         \
+      taps = get_taps_ ##type##_##inter                         \
+              (resampler, &samp_index, &samp_phase, icoeff);    \
+      inner_product_ ##type##_##inter##_##channels##_##arch     \
+              (op, ipp, taps, n_taps, icoeff, taps_stride);     \
+      op += ostride;                                            \
+    }                                                           \
+    if (in_len > samp_index)                                    \
+      memmove (ip, &ip[samp_index * channels],                  \
+          (in_len - samp_index) * sizeof(type) * channels);     \
+  }                                                             \
+  *consumed = samp_index - resampler->samp_index;               \
+  /* stored index restarts at 0; the phase carries over */      \
+  resampler->samp_index = 0;                                    \
+  resampler->samp_phase = samp_phase;                           \
+}
+
+#define DECL_RESAMPLE_FUNC_STATIC(type,inter,channels,arch)     \
+static DECL_RESAMPLE_FUNC (type, inter, channels, arch)
+
+#define MAKE_RESAMPLE_FUNC_STATIC(type,inter,channels,arch)     \
+static MAKE_RESAMPLE_FUNC (type, inter, channels, arch)
+
+#endif /* __GST_AUDIO_RESAMPLER_MACROS_H__ */
diff --git a/gst-libs/gst/audio/audio-resampler-neon.h b/gst-libs/gst/audio/audio-resampler-neon.h
index 5520b070bb..5863e18fe0 100644
--- a/gst-libs/gst/audio/audio-resampler-neon.h
+++ b/gst-libs/gst/audio/audio-resampler-neon.h
@@ -650,17 +650,17 @@ interpolate_gfloat_cubic_neon (gpointer op, const gpointer ap,
                     "q10", "q11", "q12", "q13", "q14", "q15", "memory");
 }
 
-MAKE_RESAMPLE_FUNC (gint16, full, 1, neon);
-MAKE_RESAMPLE_FUNC (gint16, linear, 1, neon);
-MAKE_RESAMPLE_FUNC (gint16, cubic, 1, neon);
+MAKE_RESAMPLE_FUNC_STATIC (gint16, full, 1, neon);
+MAKE_RESAMPLE_FUNC_STATIC (gint16, linear, 1, neon);
+MAKE_RESAMPLE_FUNC_STATIC (gint16, cubic, 1, neon);
 
-MAKE_RESAMPLE_FUNC (gint32, full, 1, neon);
-MAKE_RESAMPLE_FUNC (gint32, linear, 1, neon);
-MAKE_RESAMPLE_FUNC (gint32, cubic, 1, neon);
+MAKE_RESAMPLE_FUNC_STATIC (gint32, full, 1, neon);
+MAKE_RESAMPLE_FUNC_STATIC (gint32, linear, 1, neon);
+MAKE_RESAMPLE_FUNC_STATIC (gint32, cubic, 1, neon);
 
-MAKE_RESAMPLE_FUNC (gfloat, full, 1, neon);
-MAKE_RESAMPLE_FUNC (gfloat, linear, 1, neon);
-MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, neon);
+MAKE_RESAMPLE_FUNC_STATIC (gfloat, full, 1, neon);
+MAKE_RESAMPLE_FUNC_STATIC (gfloat, linear, 1, neon);
+MAKE_RESAMPLE_FUNC_STATIC (gfloat, cubic, 1, neon);
 
 static void
 audio_resampler_check_neon (const gchar *option)
diff --git a/gst-libs/gst/audio/audio-resampler-private.h b/gst-libs/gst/audio/audio-resampler-private.h
new file mode 100644
index 0000000000..c8d1a7ece6
--- /dev/null
+++ b/gst-libs/gst/audio/audio-resampler-private.h
@@ -0,0 +1,113 @@
+/* GStreamer
+ * Copyright (C) <2015> Wim Taymans <wim.taymans@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef __GST_AUDIO_RESAMPLER_PRIVATE_H__
+#define __GST_AUDIO_RESAMPLER_PRIVATE_H__
+
+#include "audio-resampler.h"
+
+/* Contains a collection of all things found in other resamplers:
+ * speex (filter construction, optimizations), ffmpeg (fixed phase filter, blackman filter),
+ * SRC (linear interpolation, fixed precomputed tables),...
+ *
+ *  Supports:
+ *   - S16, S32, F32 and F64 formats
+ *   - nearest, linear and cubic interpolation
+ *   - sinc based interpolation with kaiser or blackman-nuttall windows
+ *   - fully configurable kaiser parameters
+ *   - dynamic linear or cubic interpolation of filter table, this can
+ *     use less memory but more CPU
+ *   - full filter table, generated from optionally linear or cubic
+ *     interpolation of filter table
+ *   - fixed filter table size with nearest neighbour phase, optionally
+ *     using precomputed tables
+ *   - dynamic samplerate changes
+ *   - x86 and neon optimizations
+ */
+typedef void (*ConvertTapsFunc) (gdouble * tmp_taps, gpointer taps,
+    gdouble weight, gint n_taps);
+typedef void (*InterpolateFunc) (gpointer o, const gpointer a, gint len,
+    const gpointer icoeff, gint astride);
+typedef void (*ResampleFunc) (GstAudioResampler * resampler, gpointer in[],
+    gsize in_len, gpointer out[], gsize out_len, gsize * consumed);
+typedef void (*DeinterleaveFunc) (GstAudioResampler * resampler,
+    gpointer * sbuf, gpointer in[], gsize in_frames);
+
+struct _GstAudioResampler
+{
+  GstAudioResamplerMethod method;
+  GstAudioResamplerFlags flags;
+  GstAudioFormat format;
+  GstStructure *options;
+  gint format_index;
+  gint channels;
+  gint in_rate;
+  gint out_rate;
+  /* ostride: step of the output pointer per sample in the resample loops */
+  gint bps;
+  gint ostride;
+
+  GstAudioResamplerFilterMode filter_mode;
+  guint filter_threshold;
+  GstAudioResamplerFilterInterpolation filter_interpolation;
+
+  gdouble cutoff;
+  gdouble kaiser_beta;
+  /* for cubic */
+  gdouble b, c;
+
+  /* temp taps */
+  gpointer tmp_taps;
+
+  /* oversampled main filter table */
+  gint oversample;
+  gint n_taps;
+  gpointer taps;
+  gpointer taps_mem;
+  gsize taps_stride;
+  gint n_phases;
+  gint alloc_taps;
+  gint alloc_phases;
+
+  /* cached taps */
+  gpointer *cached_phases;
+  gpointer cached_taps;
+  gpointer cached_taps_mem;
+  gsize cached_taps_stride;
+  /* function pointers; their signatures are the typedefs declared above */
+  ConvertTapsFunc convert_taps;
+  InterpolateFunc interpolate;
+  DeinterleaveFunc deinterleave;
+  ResampleFunc resample;
+  /* blocks, samp_index and samp_phase are used by the resample loops */
+  gint blocks;
+  gint inc;
+  gint samp_inc;
+  gint samp_frac;
+  gint samp_index;
+  gint samp_phase;
+  gint skip;
+
+  gpointer samples;
+  gsize samples_len;
+  gsize samples_avail;
+  gpointer *sbuf;
+};
+
+#endif /* __GST_AUDIO_RESAMPLER_PRIVATE_H__ */
diff --git a/gst-libs/gst/audio/audio-resampler-x86-sse.c b/gst-libs/gst/audio/audio-resampler-x86-sse.c
new file mode 100644
index 0000000000..d100c59882
--- /dev/null
+++ b/gst-libs/gst/audio/audio-resampler-x86-sse.c
@@ -0,0 +1,168 @@
+/* GStreamer
+ * Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include "config.h"
+#endif
+
+#include "audio-resampler-x86-sse.h"
+
+#if defined (HAVE_XMMINTRIN_H) && defined(__SSE__)
+#include <xmmintrin.h>
+
+static inline void
+inner_product_gfloat_full_1_sse (gfloat * o, const gfloat * a,
+    const gfloat * b, gint len, const gfloat * icoeff, gint bstride)
+{
+  gint i = 0;
+  __m128 sum = _mm_setzero_ps ();
+  /* *o = a . b; always consumes whole groups of 8 floats, b via aligned loads */
+  for (; i < len; i += 8) {
+    sum =
+        _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 0),
+            _mm_load_ps (b + i + 0)));
+    sum =
+        _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 4),
+            _mm_load_ps (b + i + 4)));
+  }
+  sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum));
+  sum = _mm_add_ss (sum, _mm_shuffle_ps (sum, sum, 0x55));
+  _mm_store_ss (o, sum);
+}
+
+static inline void
+inner_product_gfloat_linear_1_sse (gfloat * o, const gfloat * a,
+    const gfloat * b, gint len, const gfloat * icoeff, gint bstride)
+{
+  gint i = 0;
+  __m128 sum[2], t;
+  const gfloat *c[2] = { (gfloat *) ((gint8 *) b + 0 * bstride),
+    (gfloat *) ((gint8 *) b + 1 * bstride)
+  };
+  /* *o = s1 + icoeff[0] * (s0 - s1), where s_k = a . c[k] */
+  sum[0] = sum[1] = _mm_setzero_ps ();
+  /* 8 floats per pass; the c[k] rows (bstride bytes apart) use aligned loads */
+  for (; i < len; i += 8) {
+    t = _mm_loadu_ps (a + i + 0);
+    sum[0] = _mm_add_ps (sum[0], _mm_mul_ps (t, _mm_load_ps (c[0] + i + 0)));
+    sum[1] = _mm_add_ps (sum[1], _mm_mul_ps (t, _mm_load_ps (c[1] + i + 0)));
+    t = _mm_loadu_ps (a + i + 4);
+    sum[0] = _mm_add_ps (sum[0], _mm_mul_ps (t, _mm_load_ps (c[0] + i + 4)));
+    sum[1] = _mm_add_ps (sum[1], _mm_mul_ps (t, _mm_load_ps (c[1] + i + 4)));
+  }
+  sum[0] = _mm_mul_ps (_mm_sub_ps (sum[0], sum[1]), _mm_load1_ps (icoeff));
+  sum[0] = _mm_add_ps (sum[0], sum[1]);
+  sum[0] = _mm_add_ps (sum[0], _mm_movehl_ps (sum[0], sum[0]));
+  sum[0] = _mm_add_ss (sum[0], _mm_shuffle_ps (sum[0], sum[0], 0x55));
+  _mm_store_ss (o, sum[0]);
+}
+
+static inline void
+inner_product_gfloat_cubic_1_sse (gfloat * o, const gfloat * a,
+    const gfloat * b, gint len, const gfloat * icoeff, gint bstride)
+{
+  gint i = 0;
+  __m128 sum[4];
+  __m128 t, f = _mm_loadu_ps (icoeff);
+  const gfloat *c[4] = { (gfloat *) ((gint8 *) b + 0 * bstride),
+    (gfloat *) ((gint8 *) b + 1 * bstride),
+    (gfloat *) ((gint8 *) b + 2 * bstride),
+    (gfloat *) ((gint8 *) b + 3 * bstride)
+  };
+  /* *o = sum over k of icoeff[k] * (a . c[k]); f holds icoeff[0..3] */
+  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_ps ();
+  /* 4 floats per pass; the c[k] rows (bstride bytes apart) use aligned loads */
+  for (; i < len; i += 4) {
+    t = _mm_loadu_ps (a + i);
+    sum[0] = _mm_add_ps (sum[0], _mm_mul_ps (t, _mm_load_ps (c[0] + i)));
+    sum[1] = _mm_add_ps (sum[1], _mm_mul_ps (t, _mm_load_ps (c[1] + i)));
+    sum[2] = _mm_add_ps (sum[2], _mm_mul_ps (t, _mm_load_ps (c[2] + i)));
+    sum[3] = _mm_add_ps (sum[3], _mm_mul_ps (t, _mm_load_ps (c[3] + i)));
+  }
+  sum[0] = _mm_mul_ps (sum[0], _mm_shuffle_ps (f, f, 0x00));
+  sum[1] = _mm_mul_ps (sum[1], _mm_shuffle_ps (f, f, 0x55));
+  sum[2] = _mm_mul_ps (sum[2], _mm_shuffle_ps (f, f, 0xaa));
+  sum[3] = _mm_mul_ps (sum[3], _mm_shuffle_ps (f, f, 0xff));
+  sum[0] = _mm_add_ps (sum[0], sum[1]);
+  sum[2] = _mm_add_ps (sum[2], sum[3]);
+  sum[0] = _mm_add_ps (sum[0], sum[2]);
+  sum[0] = _mm_add_ps (sum[0], _mm_movehl_ps (sum[0], sum[0]));
+  sum[0] = _mm_add_ss (sum[0], _mm_shuffle_ps (sum[0], sum[0], 0x55));
+  _mm_store_ss (o, sum[0]);
+}
+
+MAKE_RESAMPLE_FUNC (gfloat, full, 1, sse);
+MAKE_RESAMPLE_FUNC (gfloat, linear, 1, sse);
+MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, sse);
+
+void
+interpolate_gfloat_linear_sse (gpointer op, const gpointer ap,
+    gint len, const gpointer icp, gint astride)
+{
+  gint i;
+  gfloat *o = op, *a = ap, *ic = icp;
+  __m128 f[2], t1, t2;
+  const gfloat *c[2] = { (gfloat *) ((gint8 *) a + 0 * astride),
+    (gfloat *) ((gint8 *) a + 1 * astride)
+  };
+  /* broadcast ic[0] and ic[1]; c[0]/c[1] are two rows astride bytes apart */
+  f[0] = _mm_load1_ps (ic + 0);
+  f[1] = _mm_load1_ps (ic + 1);
+  /* o[i] = c[0][i] * ic[0] + c[1][i] * ic[1]; aligned loads and stores */
+  for (i = 0; i < len; i += 8) {
+    t1 = _mm_mul_ps (_mm_load_ps (c[0] + i + 0), f[0]);
+    t2 = _mm_mul_ps (_mm_load_ps (c[1] + i + 0), f[1]);
+    _mm_store_ps (o + i + 0, _mm_add_ps (t1, t2));
+
+    t1 = _mm_mul_ps (_mm_load_ps (c[0] + i + 4), f[0]);
+    t2 = _mm_mul_ps (_mm_load_ps (c[1] + i + 4), f[1]);
+    _mm_store_ps (o + i + 4, _mm_add_ps (t1, t2));
+  }
+}
+
+void
+interpolate_gfloat_cubic_sse (gpointer op, const gpointer ap,
+    gint len, const gpointer icp, gint astride)
+{
+  gint i;
+  gfloat *o = op, *a = ap, *ic = icp;
+  __m128 f[4], t[4];
+  const gfloat *c[4] = { (gfloat *) ((gint8 *) a + 0 * astride),
+    (gfloat *) ((gint8 *) a + 1 * astride),
+    (gfloat *) ((gint8 *) a + 2 * astride),
+    (gfloat *) ((gint8 *) a + 3 * astride)
+  };
+  /* broadcast ic[0..3]; c[0..3] are four rows astride bytes apart */
+  f[0] = _mm_load1_ps (ic + 0);
+  f[1] = _mm_load1_ps (ic + 1);
+  f[2] = _mm_load1_ps (ic + 2);
+  f[3] = _mm_load1_ps (ic + 3);
+  /* o[i] = sum over k of c[k][i] * ic[k]; aligned loads and stores */
+  for (i = 0; i < len; i += 4) {
+    t[0] = _mm_mul_ps (_mm_load_ps (c[0] + i + 0), f[0]);
+    t[1] = _mm_mul_ps (_mm_load_ps (c[1] + i + 0), f[1]);
+    t[2] = _mm_mul_ps (_mm_load_ps (c[2] + i + 0), f[2]);
+    t[3] = _mm_mul_ps (_mm_load_ps (c[3] + i + 0), f[3]);
+    t[0] = _mm_add_ps (t[0], t[1]);
+    t[2] = _mm_add_ps (t[2], t[3]);
+    _mm_store_ps (o + i + 0, _mm_add_ps (t[0], t[2]));
+  }
+}
+
+#endif
diff --git a/gst-libs/gst/audio/audio-resampler-x86-sse.h b/gst-libs/gst/audio/audio-resampler-x86-sse.h
new file mode 100644
index 0000000000..1d3e9a4db9
--- /dev/null
+++ b/gst-libs/gst/audio/audio-resampler-x86-sse.h
@@ -0,0 +1,35 @@
+/* GStreamer
+ * Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef AUDIO_RESAMPLER_X86_SSE_H
+#define AUDIO_RESAMPLER_X86_SSE_H
+
+#include "audio-resampler-macros.h"
+
+DECL_RESAMPLE_FUNC (gfloat, full, 1, sse);
+DECL_RESAMPLE_FUNC (gfloat, linear, 1, sse);
+DECL_RESAMPLE_FUNC (gfloat, cubic, 1, sse);
+
+void interpolate_gfloat_linear_sse (gpointer op, const gpointer ap,
+    gint len, const gpointer icp, gint astride);
+
+void interpolate_gfloat_cubic_sse (gpointer op, const gpointer ap,
+    gint len, const gpointer icp, gint astride);
+
+#endif /* AUDIO_RESAMPLER_X86_SSE_H */
diff --git a/gst-libs/gst/audio/audio-resampler-x86-sse2.c b/gst-libs/gst/audio/audio-resampler-x86-sse2.c
new file mode 100644
index 0000000000..a89fb41337
--- /dev/null
+++ b/gst-libs/gst/audio/audio-resampler-x86-sse2.c
@@ -0,0 +1,399 @@
+/* GStreamer
+ * Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include "config.h"
+#endif
+
+#include "audio-resampler-x86-sse2.h"
+
+#if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__)
+#include <emmintrin.h>
+
+static inline void
+inner_product_gint16_full_1_sse2 (gint16 * o, const gint16 * a,
+    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
+{
+  gint i;
+  __m128i sum, t;
+  /* 32-bit lanes accumulate the products of the 16-bit samples and taps */
+  sum = _mm_setzero_si128 ();
+  /* 16 samples per pass; b is read with aligned loads, a with unaligned ones */
+  for (i = 0; i < len; i += 16) {
+    t = _mm_loadu_si128 ((__m128i *) (a + i));
+    sum =
+        _mm_add_epi32 (sum, _mm_madd_epi16 (t,
+            _mm_load_si128 ((__m128i *) (b + i + 0))));
+
+    t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
+    sum =
+        _mm_add_epi32 (sum, _mm_madd_epi16 (t,
+            _mm_load_si128 ((__m128i *) (b + i + 8))));
+  }
+  sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, 3)));
+  sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1, 1)));
+  /* add half an LSB, shift right by PRECISION_S16, saturate to 16 bits */
+  sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
+  sum = _mm_srai_epi32 (sum, PRECISION_S16);
+  sum = _mm_packs_epi32 (sum, sum);
+  *o = _mm_extract_epi16 (sum, 0);
+}
+
+static inline void
+inner_product_gint16_linear_1_sse2 (gint16 * o, const gint16 * a,
+    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
+{
+  gint i = 0;
+  __m128i sum[2], t;
+  __m128i f = _mm_set_epi64x (0, *((gint64 *) icoeff));
+  const gint16 *c[2] = { (gint16 *) ((gint8 *) b + 0 * bstride),
+    (gint16 *) ((gint8 *) b + 1 * bstride)
+  };
+  /* zero the sums; f becomes icoeff[0..3], one per 32-bit lane (upper half 0) */
+  sum[0] = sum[1] = _mm_setzero_si128 ();
+  f = _mm_unpacklo_epi16 (f, sum[0]);
+  /* sum[k] accumulates a . c[k], 16 samples per pass; c[k] via aligned loads */
+  for (; i < len; i += 16) {
+    t = _mm_loadu_si128 ((__m128i *) (a + i + 0));
+    sum[0] =
+        _mm_add_epi32 (sum[0], _mm_madd_epi16 (t,
+            _mm_load_si128 ((__m128i *) (c[0] + i + 0))));
+    sum[1] =
+        _mm_add_epi32 (sum[1], _mm_madd_epi16 (t,
+            _mm_load_si128 ((__m128i *) (c[1] + i + 0))));
+
+    t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
+    sum[0] =
+        _mm_add_epi32 (sum[0], _mm_madd_epi16 (t,
+            _mm_load_si128 ((__m128i *) (c[0] + i + 8))));
+    sum[1] =
+        _mm_add_epi32 (sum[1], _mm_madd_epi16 (t,
+            _mm_load_si128 ((__m128i *) (c[1] + i + 8))));
+  }
+  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
+  sum[1] = _mm_srai_epi32 (sum[1], PRECISION_S16);
+  /* multiply the low 16 bits of each lane by icoeff[0] resp. icoeff[1] */
+  sum[0] =
+      _mm_madd_epi16 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
+  sum[1] =
+      _mm_madd_epi16 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
+  sum[0] = _mm_add_epi32 (sum[0], sum[1]);
+
+  sum[0] =
+      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
+              3)));
+  sum[0] =
+      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1,
+              1)));
+  /* add half an LSB, shift right by PRECISION_S16, saturate to 16 bits */
+  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
+  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
+  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
+  *o = _mm_extract_epi16 (sum[0], 0);
+}
+
+static inline void
+inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a,
+    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
+{
+  gint i = 0;
+  __m128i sum[4], t[4];
+  __m128i f = _mm_set_epi64x (0, *((long long *) icoeff));
+  const gint16 *c[4] = { (gint16 *) ((gint8 *) b + 0 * bstride),
+    (gint16 *) ((gint8 *) b + 1 * bstride),
+    (gint16 *) ((gint8 *) b + 2 * bstride),
+    (gint16 *) ((gint8 *) b + 3 * bstride)
+  };
+  /* zero the sums; f becomes icoeff[0..3], one per 32-bit lane (upper half 0) */
+  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();
+  f = _mm_unpacklo_epi16 (f, sum[0]);
+  /* sum[k] accumulates a . c[k], 8 samples per pass; c[k] via aligned loads */
+  for (; i < len; i += 8) {
+    t[0] = _mm_loadu_si128 ((__m128i *) (a + i));
+    sum[0] =
+        _mm_add_epi32 (sum[0], _mm_madd_epi16 (t[0],
+            _mm_load_si128 ((__m128i *) (c[0] + i))));
+    sum[1] =
+        _mm_add_epi32 (sum[1], _mm_madd_epi16 (t[0],
+            _mm_load_si128 ((__m128i *) (c[1] + i))));
+    sum[2] =
+        _mm_add_epi32 (sum[2], _mm_madd_epi16 (t[0],
+            _mm_load_si128 ((__m128i *) (c[2] + i))));
+    sum[3] =
+        _mm_add_epi32 (sum[3], _mm_madd_epi16 (t[0],
+            _mm_load_si128 ((__m128i *) (c[3] + i))));
+  }
+  t[0] = _mm_unpacklo_epi32 (sum[0], sum[1]);
+  t[1] = _mm_unpacklo_epi32 (sum[2], sum[3]);
+  t[2] = _mm_unpackhi_epi32 (sum[0], sum[1]);
+  t[3] = _mm_unpackhi_epi32 (sum[2], sum[3]);
+  /* transpose and add: afterwards lane k of sum[0] is the total for c[k] */
+  sum[0] =
+      _mm_add_epi32 (_mm_unpacklo_epi64 (t[0], t[1]), _mm_unpackhi_epi64 (t[0],
+          t[1]));
+  sum[2] =
+      _mm_add_epi32 (_mm_unpacklo_epi64 (t[2], t[3]), _mm_unpackhi_epi64 (t[2],
+          t[3]));
+  sum[0] = _mm_add_epi32 (sum[0], sum[2]);
+  /* shift down, then multiply the low 16 bits of lane k by icoeff[k] */
+  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
+  sum[0] = _mm_madd_epi16 (sum[0], f);
+
+  sum[0] =
+      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
+              3)));
+  sum[0] =
+      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1,
+              1)));
+  /* add half an LSB, shift right by PRECISION_S16, saturate to 16 bits */
+  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
+  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
+  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
+  *o = _mm_extract_epi16 (sum[0], 0);
+}
+
+static inline void
+inner_product_gdouble_full_1_sse2 (gdouble * o, const gdouble * a,
+    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
+{
+  gint i = 0;
+  __m128d sum = _mm_setzero_pd ();
+  /* *o = a . b; always consumes whole groups of 8 doubles, b via aligned loads */
+  for (; i < len; i += 8) {
+    sum =
+        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0),
+            _mm_load_pd (b + i + 0)));
+    sum =
+        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2),
+            _mm_load_pd (b + i + 2)));
+    sum =
+        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 4),
+            _mm_load_pd (b + i + 4)));
+    sum =
+        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6),
+            _mm_load_pd (b + i + 6)));
+  }
+  sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum));
+  _mm_store_sd (o, sum);
+}
+
+static inline void
+inner_product_gdouble_linear_1_sse2 (gdouble * o, const gdouble * a,
+    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
+{
+  gint i = 0;
+  __m128d sum[2], t;
+  const gdouble *c[2] = { (gdouble *) ((gint8 *) b + 0 * bstride),
+    (gdouble *) ((gint8 *) b + 1 * bstride)
+  };
+  /* *o = s1 + icoeff[0] * (s0 - s1), where s_k = a . c[k] */
+  sum[0] = sum[1] = _mm_setzero_pd ();
+  /* 4 doubles per pass; the c[k] rows (bstride bytes apart) use aligned loads */
+  for (; i < len; i += 4) {
+    t = _mm_loadu_pd (a + i + 0);
+    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 0)));
+    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 0)));
+    t = _mm_loadu_pd (a + i + 2);
+    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 2)));
+    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 2)));
+  }
+  sum[0] = _mm_mul_pd (_mm_sub_pd (sum[0], sum[1]), _mm_load1_pd (icoeff));
+  sum[0] = _mm_add_pd (sum[0], sum[1]);
+  sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
+  _mm_store_sd (o, sum[0]);
+}
+
+static inline void
+inner_product_gdouble_cubic_1_sse2 (gdouble * o, const gdouble * a,
+    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
+{
+  gint i;
+  __m128d f[2], sum[4], t;
+  const gdouble *c[4] = { (gdouble *) ((gint8 *) b + 0 * bstride),
+    (gdouble *) ((gint8 *) b + 1 * bstride),
+    (gdouble *) ((gint8 *) b + 2 * bstride),
+    (gdouble *) ((gint8 *) b + 3 * bstride)
+  };
+  /* f[0] = icoeff[0..1], f[1] = icoeff[2..3]; sum[k] accumulates a . c[k] */
+  f[0] = _mm_loadu_pd (icoeff + 0);
+  f[1] = _mm_loadu_pd (icoeff + 2);
+  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_pd ();
+  /* 2 doubles per pass; *o = sum over k of icoeff[k] * (a . c[k]) */
+  for (i = 0; i < len; i += 2) {
+    t = _mm_loadu_pd (a + i + 0);
+    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i)));
+    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i)));
+    sum[2] = _mm_add_pd (sum[2], _mm_mul_pd (t, _mm_load_pd (c[2] + i)));
+    sum[3] = _mm_add_pd (sum[3], _mm_mul_pd (t, _mm_load_pd (c[3] + i)));
+  }
+  sum[0] =
+      _mm_mul_pd (sum[0], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (0, 0)));
+  sum[1] =
+      _mm_mul_pd (sum[1], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (1, 1)));
+  sum[2] =
+      _mm_mul_pd (sum[2], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (0, 0)));
+  sum[3] =
+      _mm_mul_pd (sum[3], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (1, 1)));
+  sum[0] = _mm_add_pd (sum[0], sum[1]);
+  sum[2] = _mm_add_pd (sum[2], sum[3]);
+  sum[0] = _mm_add_pd (sum[0], sum[2]);
+  sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
+  _mm_store_sd (o, sum[0]);
+}
+
+MAKE_RESAMPLE_FUNC (gint16, full, 1, sse2);
+MAKE_RESAMPLE_FUNC (gint16, linear, 1, sse2);
+MAKE_RESAMPLE_FUNC (gint16, cubic, 1, sse2);
+
+MAKE_RESAMPLE_FUNC (gdouble, full, 1, sse2);
+MAKE_RESAMPLE_FUNC (gdouble, linear, 1, sse2);
+MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, sse2);
+
+void
+interpolate_gint16_linear_sse2 (gpointer op, const gpointer ap,
+    gint len, const gpointer icp, gint astride)
+{
+  gint i = 0;
+  gint16 *o = op, *a = ap, *ic = icp;
+  __m128i ta, tb, t1, t2;
+  __m128i f = _mm_set_epi64x (0, *((gint64 *) ic));
+  const gint16 *c[2] = { (gint16 *) ((gint8 *) a + 0 * astride),
+    (gint16 *) ((gint8 *) a + 1 * astride)
+  };
+  /* replicate the (ic[0], ic[1]) pair into every 32-bit lane of f */
+  f = _mm_unpacklo_epi32 (f, f);
+  f = _mm_unpacklo_epi64 (f, f);
+  /* 8 samples per pass; rows c[0]/c[1] and the output use aligned accesses */
+  for (; i < len; i += 8) {
+    ta = _mm_load_si128 ((__m128i *) (c[0] + i));
+    tb = _mm_load_si128 ((__m128i *) (c[1] + i));
+    /* interleave the rows so madd yields c[0][i] * ic[0] + c[1][i] * ic[1] */
+    t1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f);
+    t2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f);
+    /* add half an LSB, shift right by PRECISION_S16, then saturate to 16 bits */
+    t1 = _mm_add_epi32 (t1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
+    t2 = _mm_add_epi32 (t2, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
+
+    t1 = _mm_srai_epi32 (t1, PRECISION_S16);
+    t2 = _mm_srai_epi32 (t2, PRECISION_S16);
+
+    t1 = _mm_packs_epi32 (t1, t2);
+    _mm_store_si128 ((__m128i *) (o + i), t1);
+  }
+}
+/* Blend four gint16 rows (astride bytes apart) with ic[0..3] into op. */
+void
+interpolate_gint16_cubic_sse2 (gpointer op, const gpointer ap,
+    gint len, const gpointer icp, gint astride)
+{
+  gint i = 0;
+  gint16 *o = op, *a = ap, *ic = icp;
+  __m128i ta, tb, tl1, tl2, th1, th2;
+  __m128i f[2];
+  const gint16 *c[4] = { (gint16 *) ((gint8 *) a + 0 * astride),
+    (gint16 *) ((gint8 *) a + 1 * astride),
+    (gint16 *) ((gint8 *) a + 2 * astride),
+    (gint16 *) ((gint8 *) a + 3 * astride)
+  };
+  /* (ic[0], ic[1]) resp. (ic[2], ic[3]) repeated in every 32-bit lane */
+  f[0] = _mm_set_epi16 (ic[1], ic[0], ic[1], ic[0], ic[1], ic[0], ic[1], ic[0]);
+  f[1] = _mm_set_epi16 (ic[3], ic[2], ic[3], ic[2], ic[3], ic[2], ic[3], ic[2]);
+  /* 8 samples per pass; loads and the store need 16-byte aligned addresses */
+  for (; i < len; i += 8) {
+    ta = _mm_load_si128 ((__m128i *) (c[0] + i));
+    tb = _mm_load_si128 ((__m128i *) (c[1] + i));
+    /* interleave rows 0/1 so madd yields row0 * ic[0] + row1 * ic[1] */
+    tl1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[0]);
+    th1 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[0]);
+
+    ta = _mm_load_si128 ((__m128i *) (c[2] + i));
+    tb = _mm_load_si128 ((__m128i *) (c[3] + i));
+
+    tl2 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[1]);
+    th2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[1]);
+
+    tl1 = _mm_add_epi32 (tl1, tl2);
+    th1 = _mm_add_epi32 (th1, th2);
+    /* round to nearest, then drop the PRECISION_S16 fraction bits */
+    tl1 = _mm_add_epi32 (tl1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
+    th1 = _mm_add_epi32 (th1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
+
+    tl1 = _mm_srai_epi32 (tl1, PRECISION_S16);
+    th1 = _mm_srai_epi32 (th1, PRECISION_S16);
+    /* saturate the 32-bit results back to gint16 and store all 8 */
+    tl1 = _mm_packs_epi32 (tl1, th1);
+    _mm_store_si128 ((__m128i *) (o + i), tl1);
+  }
+}
+/* Blend two gdouble rows (astride bytes apart) with ic[0], ic[1] into op. */
+void
+interpolate_gdouble_linear_sse2 (gpointer op, const gpointer ap,
+    gint len, const gpointer icp, gint astride)
+{
+  gint i;
+  gdouble *o = op, *a = ap, *ic = icp;
+  __m128d f[2], t1, t2;
+  const gdouble *c[2] = { (gdouble *) ((gint8 *) a + 0 * astride),
+    (gdouble *) ((gint8 *) a + 1 * astride)
+  };
+  /* broadcast each weight to both lanes */
+  f[0] = _mm_load1_pd (ic + 0);
+  f[1] = _mm_load1_pd (ic + 1);
+  /* 4 samples per pass, done as two 2-wide halves with aligned access */
+  for (i = 0; i < len; i += 4) {
+    t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]);
+    t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]);
+    _mm_store_pd (o + i + 0, _mm_add_pd (t1, t2));
+
+    t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 2), f[0]);
+    t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 2), f[1]);
+    _mm_store_pd (o + i + 2, _mm_add_pd (t1, t2));
+  }
+}
+/* Blend four gdouble rows (astride bytes apart) with ic[0..3] into op. */
+void
+interpolate_gdouble_cubic_sse2 (gpointer op, const gpointer ap,
+    gint len, const gpointer icp, gint astride)
+{
+  gint i;
+  gdouble *o = op, *a = ap, *ic = icp;
+  __m128d f[4], t[4];
+  const gdouble *c[4] = { (gdouble *) ((gint8 *) a + 0 * astride),
+    (gdouble *) ((gint8 *) a + 1 * astride),
+    (gdouble *) ((gint8 *) a + 2 * astride),
+    (gdouble *) ((gint8 *) a + 3 * astride)
+  };
+  /* broadcast each of the four weights to both lanes */
+  f[0] = _mm_load1_pd (ic + 0);
+  f[1] = _mm_load1_pd (ic + 1);
+  f[2] = _mm_load1_pd (ic + 2);
+  f[3] = _mm_load1_pd (ic + 3);
+  /* 2 samples per pass; row loads and the store use aligned access */
+  for (i = 0; i < len; i += 2) {
+    t[0] = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]);
+    t[1] = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]);
+    t[2] = _mm_mul_pd (_mm_load_pd (c[2] + i + 0), f[2]);
+    t[3] = _mm_mul_pd (_mm_load_pd (c[3] + i + 0), f[3]);
+    t[0] = _mm_add_pd (t[0], t[1]);
+    t[2] = _mm_add_pd (t[2], t[3]);
+    _mm_store_pd (o + i + 0, _mm_add_pd (t[0], t[2]));
+  }
+}
+
+#endif
diff --git a/gst-libs/gst/audio/audio-resampler-x86-sse2.h b/gst-libs/gst/audio/audio-resampler-x86-sse2.h
new file mode 100644
index 0000000000..3bbf5cded5
--- /dev/null
+++ b/gst-libs/gst/audio/audio-resampler-x86-sse2.h
@@ -0,0 +1,49 @@
+/* GStreamer
+ * Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef AUDIO_RESAMPLER_X86_SSE2_H
+#define AUDIO_RESAMPLER_X86_SSE2_H
+
+#include "audio-resampler-macros.h"
+/* NOTE(review): DECL_RESAMPLE_FUNC presumably comes from the macros header */
+DECL_RESAMPLE_FUNC (gint16, full, 1, sse2);
+DECL_RESAMPLE_FUNC (gint16, linear, 1, sse2);
+DECL_RESAMPLE_FUNC (gint16, cubic, 1, sse2);
+
+DECL_RESAMPLE_FUNC (gdouble, full, 1, sse2);
+DECL_RESAMPLE_FUNC (gdouble, linear, 1, sse2);
+DECL_RESAMPLE_FUNC (gdouble, cubic, 1, sse2);
+/* Row blenders: mix 2 (linear) or 4 (cubic) rows, astride bytes apart, into op */
+void
+interpolate_gint16_linear_sse2 (gpointer op, const gpointer ap,
+    gint len, const gpointer icp, gint astride);
+
+void
+interpolate_gint16_cubic_sse2 (gpointer op, const gpointer ap,
+    gint len, const gpointer icp, gint astride);
+
+void
+interpolate_gdouble_linear_sse2 (gpointer op, const gpointer ap,
+    gint len, const gpointer icp, gint astride);
+
+void
+interpolate_gdouble_cubic_sse2 (gpointer op, const gpointer ap,
+    gint len, const gpointer icp, gint astride);
+
+#endif /* AUDIO_RESAMPLER_X86_SSE2_H */
diff --git a/gst-libs/gst/audio/audio-resampler-x86-sse41.c b/gst-libs/gst/audio/audio-resampler-x86-sse41.c
new file mode 100644
index 0000000000..cf3d8184aa
--- /dev/null
+++ b/gst-libs/gst/audio/audio-resampler-x86-sse41.c
@@ -0,0 +1,185 @@
+/* GStreamer
+ * Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include "config.h"
+#endif
+
+#include "audio-resampler-x86-sse41.h"
+
+#if 0
+#define __SSE4_1__
+#pragma GCC target("sse4.1")
+#endif
+
+#if defined (HAVE_SMMINTRIN_H) && defined (HAVE_EMMINTRIN_H) && defined(__SSE4_1__)
+#include <emmintrin.h>
+#include <smmintrin.h>
+
+/* Fixed-point dot product of the `len` gint32 samples at `a` with the
+ * coefficients at `b`, 8 taps per pass (`b` needs 16-byte alignment).  The
+ * 64-bit sum is rounded, shifted down by PRECISION_S32 and saturated into
+ * *o.  `icoeff` and `bstride` are not used by this kernel. */
+static inline void
+inner_product_gint32_full_1_sse41 (gint32 * o, const gint32 * a,
+    const gint32 * b, gint len, const gint32 * icoeff, gint bstride)
+{
+  gint i = 0;
+  __m128i sum, ta, tb;
+  gint64 res;
+
+  sum = _mm_setzero_si128 ();
+
+  for (; i < len; i += 8) {
+    ta = _mm_loadu_si128 ((__m128i *) (a + i));
+    tb = _mm_load_si128 ((__m128i *) (b + i));
+    /* duplicated lanes let _mm_mul_epi32 form every 64-bit product */
+    sum = _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
+            _mm_unpacklo_epi32 (tb, tb)));
+    sum = _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
+            _mm_unpackhi_epi32 (tb, tb)));
+
+    ta = _mm_loadu_si128 ((__m128i *) (a + i + 4));
+    tb = _mm_load_si128 ((__m128i *) (b + i + 4));
+
+    sum = _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
+            _mm_unpacklo_epi32 (tb, tb)));
+    sum = _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
+            _mm_unpackhi_epi32 (tb, tb)));
+  }
+  sum = _mm_add_epi64 (sum, _mm_unpackhi_epi64 (sum, sum));
+  res = _mm_cvtsi128_si64 (sum);
+  /* fixed-width bounds: `1L << 31` overflows where long is only 32 bits */
+  res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
+  *o = CLAMP (res, G_MININT32, G_MAXINT32);
+}
+
+/* Linear-interpolated fixed-point dot product: `a` is multiplied with the
+ * rows at `b` and `b + bstride` bytes, the two sums are scaled down and
+ * blended with icoeff[0]/icoeff[1], then rounded and saturated into *o. */
+static inline void
+inner_product_gint32_linear_1_sse41 (gint32 * o, const gint32 * a,
+    const gint32 * b, gint len, const gint32 * icoeff, gint bstride)
+{
+  gint i = 0;
+  gint64 res;
+  __m128i sum[2], ta, tb;
+  __m128i f = _mm_loadu_si128 ((__m128i *) icoeff);     /* lanes 0-1 used */
+  const gint32 *c[2] = { (gint32 *) ((gint8 *) b + 0 * bstride),
+    (gint32 *) ((gint8 *) b + 1 * bstride)
+  };
+
+  sum[0] = sum[1] = _mm_setzero_si128 ();
+  for (; i < len; i += 4) {
+    ta = _mm_loadu_si128 ((__m128i *) (a + i));
+    tb = _mm_load_si128 ((__m128i *) (c[0] + i));
+    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
+            _mm_unpacklo_epi32 (tb, tb)));
+    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
+            _mm_unpackhi_epi32 (tb, tb)));
+
+    tb = _mm_load_si128 ((__m128i *) (c[1] + i));
+    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
+            _mm_unpacklo_epi32 (tb, tb)));
+    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
+            _mm_unpackhi_epi32 (tb, tb)));
+  }
+  sum[0] = _mm_srli_epi64 (sum[0], PRECISION_S32);
+  sum[1] = _mm_srli_epi64 (sum[1], PRECISION_S32);
+  sum[0] =
+      _mm_mul_epi32 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
+  sum[1] =
+      _mm_mul_epi32 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
+  sum[0] = _mm_add_epi64 (sum[0], sum[1]);
+  sum[0] = _mm_add_epi64 (sum[0], _mm_unpackhi_epi64 (sum[0], sum[0]));
+  res = _mm_cvtsi128_si64 (sum[0]);
+  res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
+  *o = CLAMP (res, G_MININT32, G_MAXINT32);     /* not 1L << 31: long may be 32 bit */
+}
+
+/* Cubic-interpolated fixed-point dot product: `a` is multiplied with four
+ * rows of `b` spaced bstride bytes apart, each sum is scaled down and
+ * weighted by icoeff[0..3], then the total is rounded and saturated into *o. */
+static inline void
+inner_product_gint32_cubic_1_sse41 (gint32 * o, const gint32 * a,
+    const gint32 * b, gint len, const gint32 * icoeff, gint bstride)
+{
+  gint i = 0;
+  gint64 res;
+  __m128i sum[4], ta, tb;
+  __m128i f = _mm_loadu_si128 ((__m128i *) icoeff);
+  const gint32 *c[4] = { (gint32 *) ((gint8 *) b + 0 * bstride),
+    (gint32 *) ((gint8 *) b + 1 * bstride),
+    (gint32 *) ((gint8 *) b + 2 * bstride),
+    (gint32 *) ((gint8 *) b + 3 * bstride)
+  };
+
+  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();
+  for (; i < len; i += 4) {
+    ta = _mm_loadu_si128 ((__m128i *) (a + i));
+    tb = _mm_load_si128 ((__m128i *) (c[0] + i));
+    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
+            _mm_unpacklo_epi32 (tb, tb)));
+    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
+            _mm_unpackhi_epi32 (tb, tb)));
+
+    tb = _mm_load_si128 ((__m128i *) (c[1] + i));
+    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
+            _mm_unpacklo_epi32 (tb, tb)));
+    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
+            _mm_unpackhi_epi32 (tb, tb)));
+
+    tb = _mm_load_si128 ((__m128i *) (c[2] + i));
+    sum[2] = _mm_add_epi64 (sum[2], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
+            _mm_unpacklo_epi32 (tb, tb)));
+    sum[2] = _mm_add_epi64 (sum[2], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
+            _mm_unpackhi_epi32 (tb, tb)));
+
+    tb = _mm_load_si128 ((__m128i *) (c[3] + i));
+    sum[3] = _mm_add_epi64 (sum[3], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
+            _mm_unpacklo_epi32 (tb, tb)));
+    sum[3] = _mm_add_epi64 (sum[3], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
+            _mm_unpackhi_epi32 (tb, tb)));
+  }
+  sum[0] = _mm_srli_epi64 (sum[0], PRECISION_S32);
+  sum[1] = _mm_srli_epi64 (sum[1], PRECISION_S32);
+  sum[2] = _mm_srli_epi64 (sum[2], PRECISION_S32);
+  sum[3] = _mm_srli_epi64 (sum[3], PRECISION_S32);
+  sum[0] =
+      _mm_mul_epi32 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
+  sum[1] =
+      _mm_mul_epi32 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
+  sum[2] =
+      _mm_mul_epi32 (sum[2], _mm_shuffle_epi32 (f, _MM_SHUFFLE (2, 2, 2, 2)));
+  sum[3] =
+      _mm_mul_epi32 (sum[3], _mm_shuffle_epi32 (f, _MM_SHUFFLE (3, 3, 3, 3)));
+  sum[0] = _mm_add_epi64 (sum[0], sum[1]);
+  sum[2] = _mm_add_epi64 (sum[2], sum[3]);
+  sum[0] = _mm_add_epi64 (sum[0], sum[2]);
+  sum[0] = _mm_add_epi64 (sum[0], _mm_unpackhi_epi64 (sum[0], sum[0]));
+  res = _mm_cvtsi128_si64 (sum[0]);
+  res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
+  *o = CLAMP (res, G_MININT32, G_MAXINT32);     /* not 1L << 31: long may be 32 bit */
+}
+/* NOTE(review): macro defined elsewhere; presumably wraps the kernels above */
+MAKE_RESAMPLE_FUNC (gint32, full, 1, sse41);
+MAKE_RESAMPLE_FUNC (gint32, linear, 1, sse41);
+MAKE_RESAMPLE_FUNC (gint32, cubic, 1, sse41);
+
+#endif
diff --git a/gst-libs/gst/audio/audio-resampler-x86-sse41.h b/gst-libs/gst/audio/audio-resampler-x86-sse41.h
new file mode 100644
index 0000000000..d8706b0dca
--- /dev/null
+++ b/gst-libs/gst/audio/audio-resampler-x86-sse41.h
@@ -0,0 +1,29 @@
+/* GStreamer
+ * Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifndef AUDIO_RESAMPLER_X86_SSE41_H
+#define AUDIO_RESAMPLER_X86_SSE41_H
+
+#include "audio-resampler-macros.h"
+/* Same tuples as the MAKE_RESAMPLE_FUNC lines in audio-resampler-x86-sse41.c */
+DECL_RESAMPLE_FUNC (gint32, full, 1, sse41);
+DECL_RESAMPLE_FUNC (gint32, linear, 1, sse41);
+DECL_RESAMPLE_FUNC (gint32, cubic, 1, sse41);
+
+#endif /* AUDIO_RESAMPLER_X86_SSE41_H */
diff --git a/gst-libs/gst/audio/audio-resampler-x86.h b/gst-libs/gst/audio/audio-resampler-x86.h
index c1b73d099f..8e2bed3f5c 100644
--- a/gst-libs/gst/audio/audio-resampler-x86.h
+++ b/gst-libs/gst/audio/audio-resampler-x86.h
@@ -17,631 +17,16 @@
  * Boston, MA 02110-1301, USA.
  */
 
-#if defined (HAVE_XMMINTRIN_H) && defined(__SSE__)
-#include <xmmintrin.h>
-
-static inline void
-inner_product_gfloat_full_1_sse (gfloat * o, const gfloat * a,
-    const gfloat * b, gint len, const gfloat * icoeff, gint bstride)
-{
-  gint i = 0;
-  __m128 sum = _mm_setzero_ps ();
-
-  for (; i < len; i += 8) {
-    sum =
-        _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 0),
-            _mm_load_ps (b + i + 0)));
-    sum =
-        _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 4),
-            _mm_load_ps (b + i + 4)));
-  }
-  sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum));
-  sum = _mm_add_ss (sum, _mm_shuffle_ps (sum, sum, 0x55));
-  _mm_store_ss (o, sum);
-}
-
-static inline void
-inner_product_gfloat_linear_1_sse (gfloat * o, const gfloat * a,
-    const gfloat * b, gint len, const gfloat * icoeff, gint bstride)
-{
-  gint i = 0;
-  __m128 sum[2], t;
-  const gfloat *c[2] = {(gfloat*)((gint8*)b + 0*bstride),
-                        (gfloat*)((gint8*)b + 1*bstride)};
-
-  sum[0] = sum[1] = _mm_setzero_ps ();
-
-  for (; i < len; i += 8) {
-    t = _mm_loadu_ps (a + i + 0);
-    sum[0] = _mm_add_ps (sum[0], _mm_mul_ps (t, _mm_load_ps (c[0] + i + 0)));
-    sum[1] = _mm_add_ps (sum[1], _mm_mul_ps (t, _mm_load_ps (c[1] + i + 0)));
-    t = _mm_loadu_ps (a + i + 4);
-    sum[0] = _mm_add_ps (sum[0], _mm_mul_ps (t, _mm_load_ps (c[0] + i + 4)));
-    sum[1] = _mm_add_ps (sum[1], _mm_mul_ps (t, _mm_load_ps (c[1] + i + 4)));
-  }
-  sum[0] = _mm_mul_ps (_mm_sub_ps (sum[0], sum[1]), _mm_load1_ps (icoeff));
-  sum[0] = _mm_add_ps (sum[0], sum[1]);
-  sum[0] = _mm_add_ps (sum[0], _mm_movehl_ps (sum[0], sum[0]));
-  sum[0] = _mm_add_ss (sum[0], _mm_shuffle_ps (sum[0], sum[0], 0x55));
-  _mm_store_ss (o, sum[0]);
-}
-
-static inline void
-inner_product_gfloat_cubic_1_sse (gfloat * o, const gfloat * a,
-    const gfloat * b, gint len, const gfloat * icoeff, gint bstride)
-{
-  gint i = 0;
-  __m128 sum[4];
-  __m128 t, f = _mm_loadu_ps(icoeff);
-  const gfloat *c[4] = {(gfloat*)((gint8*)b + 0*bstride),
-                        (gfloat*)((gint8*)b + 1*bstride),
-                        (gfloat*)((gint8*)b + 2*bstride),
-                        (gfloat*)((gint8*)b + 3*bstride)};
-
-  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_ps ();
-
-  for (; i < len; i += 4) {
-    t = _mm_loadu_ps (a + i);
-    sum[0] = _mm_add_ps (sum[0], _mm_mul_ps (t, _mm_load_ps (c[0] + i)));
-    sum[1] = _mm_add_ps (sum[1], _mm_mul_ps (t, _mm_load_ps (c[1] + i)));
-    sum[2] = _mm_add_ps (sum[2], _mm_mul_ps (t, _mm_load_ps (c[2] + i)));
-    sum[3] = _mm_add_ps (sum[3], _mm_mul_ps (t, _mm_load_ps (c[3] + i)));
-  }
-  sum[0] = _mm_mul_ps (sum[0], _mm_shuffle_ps (f, f, 0x00));
-  sum[1] = _mm_mul_ps (sum[1], _mm_shuffle_ps (f, f, 0x55));
-  sum[2] = _mm_mul_ps (sum[2], _mm_shuffle_ps (f, f, 0xaa));
-  sum[3] = _mm_mul_ps (sum[3], _mm_shuffle_ps (f, f, 0xff));
-  sum[0] = _mm_add_ps (sum[0], sum[1]);
-  sum[2] = _mm_add_ps (sum[2], sum[3]);
-  sum[0] = _mm_add_ps (sum[0], sum[2]);
-  sum[0] = _mm_add_ps (sum[0], _mm_movehl_ps (sum[0], sum[0]));
-  sum[0] = _mm_add_ss (sum[0], _mm_shuffle_ps (sum[0], sum[0], 0x55));
-  _mm_store_ss (o, sum[0]);
-}
-
-MAKE_RESAMPLE_FUNC (gfloat, full, 1, sse);
-MAKE_RESAMPLE_FUNC (gfloat, linear, 1, sse);
-MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, sse);
-
-static void
-interpolate_gfloat_linear_sse (gpointer op, const gpointer ap,
-    gint len, const gpointer icp, gint astride)
-{
-  gint i;
-  gfloat *o = op, *a = ap, *ic = icp;
-  __m128 f[2], t1, t2;
-  const gfloat *c[2] = {(gfloat*)((gint8*)a + 0*astride),
-                        (gfloat*)((gint8*)a + 1*astride)};
-
-  f[0] = _mm_load1_ps (ic+0);
-  f[1] = _mm_load1_ps (ic+1);
-
-  for (i = 0; i < len; i += 8) {
-    t1 = _mm_mul_ps (_mm_load_ps (c[0] + i + 0), f[0]);
-    t2 = _mm_mul_ps (_mm_load_ps (c[1] + i + 0), f[1]);
-    _mm_store_ps (o + i + 0, _mm_add_ps (t1, t2));
-
-    t1 = _mm_mul_ps (_mm_load_ps (c[0] + i + 4), f[0]);
-    t2 = _mm_mul_ps (_mm_load_ps (c[1] + i + 4), f[1]);
-    _mm_store_ps (o + i + 4, _mm_add_ps (t1, t2));
-  }
-}
-
-static void
-interpolate_gfloat_cubic_sse (gpointer op, const gpointer ap,
-    gint len, const gpointer icp, gint astride)
-{
-  gint i;
-  gfloat *o = op, *a = ap, *ic = icp;
-  __m128 f[4], t[4];
-  const gfloat *c[4] = {(gfloat*)((gint8*)a + 0*astride),
-                        (gfloat*)((gint8*)a + 1*astride),
-                        (gfloat*)((gint8*)a + 2*astride),
-                        (gfloat*)((gint8*)a + 3*astride)};
-
-  f[0] = _mm_load1_ps (ic+0);
-  f[1] = _mm_load1_ps (ic+1);
-  f[2] = _mm_load1_ps (ic+2);
-  f[3] = _mm_load1_ps (ic+3);
-
-  for (i = 0; i < len; i += 4) {
-    t[0] = _mm_mul_ps (_mm_load_ps (c[0] + i + 0), f[0]);
-    t[1] = _mm_mul_ps (_mm_load_ps (c[1] + i + 0), f[1]);
-    t[2] = _mm_mul_ps (_mm_load_ps (c[2] + i + 0), f[2]);
-    t[3] = _mm_mul_ps (_mm_load_ps (c[3] + i + 0), f[3]);
-    t[0] = _mm_add_ps (t[0], t[1]);
-    t[2] = _mm_add_ps (t[2], t[3]);
-    _mm_store_ps (o + i + 0, _mm_add_ps (t[0], t[2]));
-  }
-}
-
-#endif
-
-#if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__)
-#include <emmintrin.h>
-
-static inline void
-inner_product_gint16_full_1_sse2 (gint16 * o, const gint16 * a,
-    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
-{
-  gint i;
-  __m128i sum, t;
-
-  sum = _mm_setzero_si128 ();
-
-  for (i = 0; i < len; i += 16) {
-    t = _mm_loadu_si128 ((__m128i *) (a + i));
-    sum = _mm_add_epi32 (sum, _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (b + i + 0))));
-
-    t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
-    sum = _mm_add_epi32 (sum, _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (b + i + 8))));
-  }
-  sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, 3)));
-  sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1, 1)));
-
-  sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
-  sum = _mm_srai_epi32 (sum, PRECISION_S16);
-  sum = _mm_packs_epi32 (sum, sum);
-  *o = _mm_extract_epi16 (sum, 0);
-}
-
-static inline void
-inner_product_gint16_linear_1_sse2 (gint16 * o, const gint16 * a,
-    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
-{
-  gint i = 0;
-  __m128i sum[2], t;
-  __m128i f = _mm_set_epi64x (0, *((gint64*)icoeff));
-  const gint16 *c[2] = {(gint16*)((gint8*)b + 0*bstride),
-                        (gint16*)((gint8*)b + 1*bstride)};
-
-  sum[0] = sum[1] = _mm_setzero_si128 ();
-  f = _mm_unpacklo_epi16 (f, sum[0]);
-
-  for (; i < len; i += 16) {
-    t = _mm_loadu_si128 ((__m128i *) (a + i + 0));
-    sum[0] = _mm_add_epi32 (sum[0], _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (c[0] + i + 0))));
-    sum[1] = _mm_add_epi32 (sum[1], _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (c[1] + i + 0))));
-
-    t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
-    sum[0] = _mm_add_epi32 (sum[0], _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (c[0] + i + 8))));
-    sum[1] = _mm_add_epi32 (sum[1], _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (c[1] + i + 8))));
-  }
-  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
-  sum[1] = _mm_srai_epi32 (sum[1], PRECISION_S16);
-
-  sum[0] = _mm_madd_epi16 (sum[0], _mm_shuffle_epi32 (f,  _MM_SHUFFLE (0, 0, 0, 0)));
-  sum[1] = _mm_madd_epi16 (sum[1], _mm_shuffle_epi32 (f,  _MM_SHUFFLE (1, 1, 1, 1)));
-  sum[0] = _mm_add_epi32 (sum[0], sum[1]);
-
-  sum[0] = _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, 3)));
-  sum[0] = _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1, 1)));
-
-  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
-  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
-  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
-  *o = _mm_extract_epi16 (sum[0], 0);
-}
-
-static inline void
-inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a,
-    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
-{
-  gint i = 0;
-  __m128i sum[4], t[4];
-  __m128i f = _mm_set_epi64x (0, *((long long*)icoeff));
-  const gint16 *c[4] = {(gint16*)((gint8*)b + 0*bstride),
-                        (gint16*)((gint8*)b + 1*bstride),
-                        (gint16*)((gint8*)b + 2*bstride),
-                        (gint16*)((gint8*)b + 3*bstride)};
-
-  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();
-  f = _mm_unpacklo_epi16 (f, sum[0]);
-
-  for (; i < len; i += 8) {
-    t[0] = _mm_loadu_si128 ((__m128i *) (a + i));
-    sum[0] = _mm_add_epi32 (sum[0], _mm_madd_epi16 (t[0], _mm_load_si128 ((__m128i *) (c[0] + i))));
-    sum[1] = _mm_add_epi32 (sum[1], _mm_madd_epi16 (t[0], _mm_load_si128 ((__m128i *) (c[1] + i))));
-    sum[2] = _mm_add_epi32 (sum[2], _mm_madd_epi16 (t[0], _mm_load_si128 ((__m128i *) (c[2] + i))));
-    sum[3] = _mm_add_epi32 (sum[3], _mm_madd_epi16 (t[0], _mm_load_si128 ((__m128i *) (c[3] + i))));
-  }
-  t[0] = _mm_unpacklo_epi32 (sum[0], sum[1]);
-  t[1] = _mm_unpacklo_epi32 (sum[2], sum[3]);
-  t[2] = _mm_unpackhi_epi32 (sum[0], sum[1]);
-  t[3] = _mm_unpackhi_epi32 (sum[2], sum[3]);
-
-  sum[0] = _mm_add_epi32 (_mm_unpacklo_epi64(t[0], t[1]), _mm_unpackhi_epi64(t[0], t[1]));
-  sum[2] = _mm_add_epi32 (_mm_unpacklo_epi64(t[2], t[3]), _mm_unpackhi_epi64(t[2], t[3]));
-  sum[0] = _mm_add_epi32 (sum[0], sum[2]);
-
-  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
-  sum[0] = _mm_madd_epi16 (sum[0], f);
-
-  sum[0] = _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, 3)));
-  sum[0] = _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1, 1)));
-
-  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
-  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
-  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
-  *o = _mm_extract_epi16 (sum[0], 0);
-}
-
-static inline void
-inner_product_gdouble_full_1_sse2 (gdouble * o, const gdouble * a,
-    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
-{
-  gint i = 0;
-  __m128d sum = _mm_setzero_pd ();
-
-  for (; i < len; i += 8) {
-    sum =
-        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0),
-            _mm_load_pd (b + i + 0)));
-    sum =
-        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2),
-            _mm_load_pd (b + i + 2)));
-    sum =
-        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 4),
-            _mm_load_pd (b + i + 4)));
-    sum =
-        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6),
-            _mm_load_pd (b + i + 6)));
-  }
-  sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum));
-  _mm_store_sd (o, sum);
-}
-
-static inline void
-inner_product_gdouble_linear_1_sse2 (gdouble * o, const gdouble * a,
-    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
-{
-  gint i = 0;
-  __m128d sum[2], t;
-  const gdouble *c[2] = {(gdouble*)((gint8*)b + 0*bstride),
-                         (gdouble*)((gint8*)b + 1*bstride)};
-
-  sum[0] = sum[1] = _mm_setzero_pd ();
-
-  for (; i < len; i += 4) {
-    t = _mm_loadu_pd (a + i + 0);
-    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 0)));
-    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 0)));
-    t = _mm_loadu_pd (a + i + 2);
-    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 2)));
-    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 2)));
-  }
-  sum[0] = _mm_mul_pd (_mm_sub_pd (sum[0], sum[1]), _mm_load1_pd (icoeff));
-  sum[0] = _mm_add_pd (sum[0], sum[1]);
-  sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
-  _mm_store_sd (o, sum[0]);
-}
-
-static inline void
-inner_product_gdouble_cubic_1_sse2 (gdouble * o, const gdouble * a,
-    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
-{
-  gint i;
-  __m128d f[2], sum[4], t;
-  const gdouble *c[4] = {(gdouble*)((gint8*)b + 0*bstride),
-                         (gdouble*)((gint8*)b + 1*bstride),
-                         (gdouble*)((gint8*)b + 2*bstride),
-                         (gdouble*)((gint8*)b + 3*bstride)};
-
-  f[0] = _mm_loadu_pd (icoeff + 0);
-  f[1] = _mm_loadu_pd (icoeff + 2);
-  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_pd ();
-
-  for (i = 0; i < len; i += 2) {
-    t = _mm_loadu_pd (a + i + 0);
-    sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i)));
-    sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i)));
-    sum[2] = _mm_add_pd (sum[2], _mm_mul_pd (t, _mm_load_pd (c[2] + i)));
-    sum[3] = _mm_add_pd (sum[3], _mm_mul_pd (t, _mm_load_pd (c[3] + i)));
-  }
-  sum[0] = _mm_mul_pd (sum[0], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (0, 0)));
-  sum[1] = _mm_mul_pd (sum[1], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (1, 1)));
-  sum[2] = _mm_mul_pd (sum[2], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (0, 0)));
-  sum[3] = _mm_mul_pd (sum[3], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (1, 1)));
-  sum[0] = _mm_add_pd (sum[0], sum[1]);
-  sum[2] = _mm_add_pd (sum[2], sum[3]);
-  sum[0] = _mm_add_pd (sum[0], sum[2]);
-  sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
-  _mm_store_sd (o, sum[0]);
-}
-
-MAKE_RESAMPLE_FUNC (gint16, full, 1, sse2);
-MAKE_RESAMPLE_FUNC (gint16, linear, 1, sse2);
-MAKE_RESAMPLE_FUNC (gint16, cubic, 1, sse2);
-
-MAKE_RESAMPLE_FUNC (gdouble, full, 1, sse2);
-MAKE_RESAMPLE_FUNC (gdouble, linear, 1, sse2);
-MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, sse2);
-
-static inline void
-interpolate_gint16_linear_sse2 (gpointer op, const gpointer ap,
-    gint len, const gpointer icp, gint astride)
-{
-  gint i = 0;
-  gint16 *o = op, *a = ap, *ic = icp;
-  __m128i ta, tb, t1, t2;
-  __m128i f = _mm_set_epi64x (0, *((gint64*)ic));
-  const gint16 *c[2] = {(gint16*)((gint8*)a + 0*astride),
-                        (gint16*)((gint8*)a + 1*astride)};
-
-  f = _mm_unpacklo_epi32 (f, f);
-  f = _mm_unpacklo_epi64 (f, f);
-
-  for (; i < len; i += 8) {
-    ta = _mm_load_si128 ((__m128i *) (c[0] + i));
-    tb = _mm_load_si128 ((__m128i *) (c[1] + i));
-
-    t1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f);
-    t2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f);
-
-    t1 = _mm_add_epi32 (t1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
-    t2 = _mm_add_epi32 (t2, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
-
-    t1 = _mm_srai_epi32 (t1, PRECISION_S16);
-    t2 = _mm_srai_epi32 (t2, PRECISION_S16);
-
-    t1 = _mm_packs_epi32 (t1, t2);
-    _mm_store_si128 ((__m128i *) (o + i), t1);
-  }
-}
-
-static inline void
-interpolate_gint16_cubic_sse2 (gpointer op, const gpointer ap,
-    gint len, const gpointer icp, gint astride)
-{
-  gint i = 0;
-  gint16 *o = op, *a = ap, *ic = icp;
-  __m128i ta, tb, tl1, tl2, th1, th2;
-  __m128i f[2];
-  const gint16 *c[4] = {(gint16*)((gint8*)a + 0*astride),
-                        (gint16*)((gint8*)a + 1*astride),
-                        (gint16*)((gint8*)a + 2*astride),
-                        (gint16*)((gint8*)a + 3*astride)};
-
-  f[0] = _mm_set_epi16 (ic[1], ic[0], ic[1], ic[0], ic[1], ic[0], ic[1], ic[0]);
-  f[1] = _mm_set_epi16 (ic[3], ic[2], ic[3], ic[2], ic[3], ic[2], ic[3], ic[2]);
-
-  for (; i < len; i += 8) {
-    ta = _mm_load_si128 ((__m128i *) (c[0] + i));
-    tb = _mm_load_si128 ((__m128i *) (c[1] + i));
-
-    tl1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[0]);
-    th1 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[0]);
-
-    ta = _mm_load_si128 ((__m128i *) (c[2] + i));
-    tb = _mm_load_si128 ((__m128i *) (c[3] + i));
-
-    tl2 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[1]);
-    th2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[1]);
-
-    tl1 = _mm_add_epi32 (tl1, tl2);
-    th1 = _mm_add_epi32 (th1, th2);
-
-    tl1 = _mm_add_epi32 (tl1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
-    th1 = _mm_add_epi32 (th1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
-
-    tl1 = _mm_srai_epi32 (tl1, PRECISION_S16);
-    th1 = _mm_srai_epi32 (th1, PRECISION_S16);
-
-    tl1 = _mm_packs_epi32 (tl1, th1);
-    _mm_store_si128 ((__m128i *) (o + i), tl1);
-  }
-}
-
-static void
-interpolate_gdouble_linear_sse2 (gpointer op, const gpointer ap,
-    gint len, const gpointer icp, gint astride)
-{
-  gint i;
-  gdouble *o = op, *a = ap, *ic = icp;
-  __m128d f[2], t1, t2;
-  const gdouble *c[2] = {(gdouble*)((gint8*)a + 0*astride),
-                         (gdouble*)((gint8*)a + 1*astride)};
-
-  f[0] = _mm_load1_pd (ic+0);
-  f[1] = _mm_load1_pd (ic+1);
-
-  for (i = 0; i < len; i += 4) {
-    t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]);
-    t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]);
-    _mm_store_pd (o + i + 0, _mm_add_pd (t1, t2));
-
-    t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 2), f[0]);
-    t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 2), f[1]);
-    _mm_store_pd (o + i + 2, _mm_add_pd (t1, t2));
-  }
-}
-
-static void
-interpolate_gdouble_cubic_sse2 (gpointer op, const gpointer ap,
-    gint len, const gpointer icp, gint astride)
-{
-  gint i;
-  gdouble *o = op, *a = ap, *ic = icp;
-  __m128d f[4], t[4];
-  const gdouble *c[4] = {(gdouble*)((gint8*)a + 0*astride),
-                         (gdouble*)((gint8*)a + 1*astride),
-                         (gdouble*)((gint8*)a + 2*astride),
-                         (gdouble*)((gint8*)a + 3*astride)};
-
-  f[0] = _mm_load1_pd (ic+0);
-  f[1] = _mm_load1_pd (ic+1);
-  f[2] = _mm_load1_pd (ic+2);
-  f[3] = _mm_load1_pd (ic+3);
-
-  for (i = 0; i < len; i += 2) {
-    t[0] = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]);
-    t[1] = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]);
-    t[2] = _mm_mul_pd (_mm_load_pd (c[2] + i + 0), f[2]);
-    t[3] = _mm_mul_pd (_mm_load_pd (c[3] + i + 0), f[3]);
-    t[0] = _mm_add_pd (t[0], t[1]);
-    t[2] = _mm_add_pd (t[2], t[3]);
-    _mm_store_pd (o + i + 0, _mm_add_pd (t[0], t[2]));
-  }
-}
-
-#endif
-
-#if 0
-#define __SSE4_1__
-#pragma GCC target("sse4.1")
-#endif
-
-#if defined (HAVE_SMMINTRIN_H) && defined(__SSE4_1__)
-#include <smmintrin.h>
-
-static inline void
-inner_product_gint32_full_1_sse41 (gint32 * o, const gint32 * a,
-    const gint32 * b, gint len, const gint32 * icoeff, gint bstride)
-{
-  gint i = 0;
-  __m128i sum, ta, tb;
-  gint64 res;
-
-  sum = _mm_setzero_si128 ();
-
-  for (; i < len; i += 8) {
-    ta = _mm_loadu_si128 ((__m128i *) (a + i));
-    tb = _mm_load_si128 ((__m128i *) (b + i));
-
-    sum =
-        _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
-            _mm_unpacklo_epi32 (tb, tb)));
-    sum =
-        _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
-            _mm_unpackhi_epi32 (tb, tb)));
-
-    ta = _mm_loadu_si128 ((__m128i *) (a + i + 4));
-    tb = _mm_load_si128 ((__m128i *) (b + i + 4));
-
-    sum =
-        _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
-            _mm_unpacklo_epi32 (tb, tb)));
-    sum =
-        _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
-            _mm_unpackhi_epi32 (tb, tb)));
-  }
-  sum = _mm_add_epi64 (sum, _mm_unpackhi_epi64 (sum, sum));
-  res = _mm_cvtsi128_si64 (sum);
-
-  res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
-  *o = CLAMP (res, -(1L << 31), (1L << 31) - 1);
-}
-
-static inline void
-inner_product_gint32_linear_1_sse41 (gint32 * o, const gint32 * a,
-    const gint32 * b, gint len, const gint32 * icoeff, gint bstride)
-{
-  gint i = 0;
-  gint64 res;
-  __m128i sum[2], ta, tb;
-  __m128i f = _mm_loadu_si128 ((__m128i *)icoeff);
-  const gint32 *c[2] = {(gint32*)((gint8*)b + 0*bstride),
-                        (gint32*)((gint8*)b + 1*bstride)};
-
-  sum[0] = sum[1] = _mm_setzero_si128 ();
-
-  for (; i < len; i += 4) {
-    ta = _mm_loadu_si128 ((__m128i *)(a + i));
-
-    tb = _mm_load_si128 ((__m128i *)(c[0] + i));
-    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
-              _mm_unpacklo_epi32 (tb, tb)));
-    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
-              _mm_unpackhi_epi32 (tb, tb)));
-
-    tb = _mm_load_si128 ((__m128i *)(c[1] + i));
-    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
-              _mm_unpacklo_epi32 (tb, tb)));
-    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
-              _mm_unpackhi_epi32 (tb, tb)));
-  }
-  sum[0] = _mm_srli_epi64 (sum[0], PRECISION_S32);
-  sum[1] = _mm_srli_epi64 (sum[1], PRECISION_S32);
-  sum[0] = _mm_mul_epi32 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
-  sum[1] = _mm_mul_epi32 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
-  sum[0] = _mm_add_epi64 (sum[0], sum[1]);
-  sum[0] = _mm_add_epi64 (sum[0], _mm_unpackhi_epi64 (sum[0], sum[0]));
-  res = _mm_cvtsi128_si64 (sum[0]);
-
-  res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
-  *o = CLAMP (res, -(1L << 31), (1L << 31) - 1);
-}
-
-static inline void
-inner_product_gint32_cubic_1_sse41 (gint32 * o, const gint32 * a,
-    const gint32 * b, gint len, const gint32 * icoeff, gint bstride)
-{
-  gint i = 0;
-  gint64 res;
-  __m128i sum[4], ta, tb;
-  __m128i f = _mm_loadu_si128 ((__m128i *)icoeff);
-  const gint32 *c[4] = {(gint32*)((gint8*)b + 0*bstride),
-                        (gint32*)((gint8*)b + 1*bstride),
-                        (gint32*)((gint8*)b + 2*bstride),
-                        (gint32*)((gint8*)b + 3*bstride)};
-
-  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();
-
-  for (; i < len; i += 4) {
-    ta = _mm_loadu_si128 ((__m128i *)(a + i));
-
-    tb = _mm_load_si128 ((__m128i *)(c[0] + i));
-    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
-              _mm_unpacklo_epi32 (tb, tb)));
-    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
-              _mm_unpackhi_epi32 (tb, tb)));
-
-    tb = _mm_load_si128 ((__m128i *)(c[1] + i));
-    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
-              _mm_unpacklo_epi32 (tb, tb)));
-    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
-              _mm_unpackhi_epi32 (tb, tb)));
-
-    tb = _mm_load_si128 ((__m128i *)(c[2] + i));
-    sum[2] = _mm_add_epi64 (sum[2], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
-              _mm_unpacklo_epi32 (tb, tb)));
-    sum[2] = _mm_add_epi64 (sum[2], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
-              _mm_unpackhi_epi32 (tb, tb)));
-
-    tb = _mm_load_si128 ((__m128i *)(c[3] + i));
-    sum[3] = _mm_add_epi64 (sum[3], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
-              _mm_unpacklo_epi32 (tb, tb)));
-    sum[3] = _mm_add_epi64 (sum[3], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
-              _mm_unpackhi_epi32 (tb, tb)));
-  }
-  sum[0] = _mm_srli_epi64 (sum[0], PRECISION_S32);
-  sum[1] = _mm_srli_epi64 (sum[1], PRECISION_S32);
-  sum[2] = _mm_srli_epi64 (sum[2], PRECISION_S32);
-  sum[3] = _mm_srli_epi64 (sum[3], PRECISION_S32);
-  sum[0] = _mm_mul_epi32 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
-  sum[1] = _mm_mul_epi32 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
-  sum[2] = _mm_mul_epi32 (sum[2], _mm_shuffle_epi32 (f, _MM_SHUFFLE (2, 2, 2, 2)));
-  sum[3] = _mm_mul_epi32 (sum[3], _mm_shuffle_epi32 (f, _MM_SHUFFLE (3, 3, 3, 3)));
-  sum[0] = _mm_add_epi64 (sum[0], sum[1]);
-  sum[2] = _mm_add_epi64 (sum[2], sum[3]);
-  sum[0] = _mm_add_epi64 (sum[0], sum[2]);
-  sum[0] = _mm_add_epi64 (sum[0], _mm_unpackhi_epi64 (sum[0], sum[0]));
-  res = _mm_cvtsi128_si64 (sum[0]);
-
-  res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
-  *o = CLAMP (res, -(1L << 31), (1L << 31) - 1);
-}
-
-MAKE_RESAMPLE_FUNC (gint32, full, 1, sse41);
-MAKE_RESAMPLE_FUNC (gint32, linear, 1, sse41);
-MAKE_RESAMPLE_FUNC (gint32, cubic, 1, sse41);
-#endif
+#include "audio-resampler-macros.h"
+#include "audio-resampler-x86-sse.h"
+#include "audio-resampler-x86-sse2.h"
+#include "audio-resampler-x86-sse41.h"
 
 static void
 audio_resampler_check_x86 (const gchar *option)
 {
   if (!strcmp (option, "sse")) {
-#if defined (HAVE_XMMINTRIN_H) && defined(__SSE__)
+#if defined (HAVE_XMMINTRIN_H) && HAVE_SSE
     GST_DEBUG ("enable SSE optimisations");
     resample_gfloat_full_1 = resample_gfloat_full_1_sse;
     resample_gfloat_linear_1 = resample_gfloat_linear_1_sse;
@@ -653,7 +38,7 @@ audio_resampler_check_x86 (const gchar *option)
     GST_DEBUG ("SSE optimisations not enabled");
 #endif
   } else if (!strcmp (option, "sse2")) {
-#if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__)
+#if defined (HAVE_EMMINTRIN_H) && HAVE_SSE2
     GST_DEBUG ("enable SSE2 optimisations");
     resample_gint16_full_1 = resample_gint16_full_1_sse2;
     resample_gint16_linear_1 = resample_gint16_linear_1_sse2;
@@ -672,7 +57,7 @@ audio_resampler_check_x86 (const gchar *option)
     GST_DEBUG ("SSE2 optimisations not enabled");
 #endif
   } else if (!strcmp (option, "sse41")) {
-#if defined (HAVE_SMMINTRIN_H) && defined(__SSE4_1__)
+#if defined (HAVE_SMMINTRIN_H) && defined (HAVE_EMMINTRIN_H) && HAVE_SSE41
     GST_DEBUG ("enable SSE41 optimisations");
     resample_gint32_full_1 = resample_gint32_full_1_sse41;
     resample_gint32_linear_1 = resample_gint32_linear_1_sse41;
diff --git a/gst-libs/gst/audio/audio-resampler.c b/gst-libs/gst/audio/audio-resampler.c
index 6c14721d03..8cb562ca8c 100644
--- a/gst-libs/gst/audio/audio-resampler.c
+++ b/gst-libs/gst/audio/audio-resampler.c
@@ -30,99 +30,13 @@
 #endif
 
 #include "audio-resampler.h"
-
-/* Contains a collection of all things found in other resamplers:
- * speex (filter construction, optimizations), ffmpeg (fixed phase filter, blackman filter),
- * SRC (linear interpolation, fixed precomputed tables),...
- *
- *  Supports:
- *   - S16, S32, F32 and F64 formats
- *   - nearest, linear and cubic interpolation
- *   - sinc based interpolation with kaiser or blackman-nutall windows
- *   - fully configurable kaiser parameters
- *   - dynamic linear or cubic interpolation of filter table, this can
- *     use less memory but more CPU
- *   - full filter table, generated from optionally linear or cubic
- *     interpolation of filter table
- *   - fixed filter table size with nearest neighbour phase, optionally
- *     using a precomputed tables
- *   - dynamic samplerate changes
- *   - x86 and neon optimizations
- */
-typedef void (*ConvertTapsFunc) (gdouble * tmp_taps, gpointer taps,
-    gdouble weight, gint n_taps);
-typedef void (*InterpolateFunc) (gpointer o, const gpointer a, gint len,
-    const gpointer icoeff, gint astride);
-typedef void (*ResampleFunc) (GstAudioResampler * resampler, gpointer in[],
-    gsize in_len, gpointer out[], gsize out_len, gsize * consumed);
-typedef void (*DeinterleaveFunc) (GstAudioResampler * resampler,
-    gpointer * sbuf, gpointer in[], gsize in_frames);
+#include "audio-resampler-private.h"
+#include "audio-resampler-macros.h"
 
 #define MEM_ALIGN(m,a) ((gint8 *)((guintptr)((gint8 *)(m) + ((a)-1)) & ~((a)-1)))
 #define ALIGN 16
 #define TAPS_OVERREAD 16
 
-struct _GstAudioResampler
-{
-  GstAudioResamplerMethod method;
-  GstAudioResamplerFlags flags;
-  GstAudioFormat format;
-  GstStructure *options;
-  gint format_index;
-  gint channels;
-  gint in_rate;
-  gint out_rate;
-
-  gint bps;
-  gint ostride;
-
-  GstAudioResamplerFilterMode filter_mode;
-  guint filter_threshold;
-  GstAudioResamplerFilterInterpolation filter_interpolation;
-
-  gdouble cutoff;
-  gdouble kaiser_beta;
-  /* for cubic */
-  gdouble b, c;
-
-  /* temp taps */
-  gpointer tmp_taps;
-
-  /* oversampled main filter table */
-  gint oversample;
-  gint n_taps;
-  gpointer taps;
-  gpointer taps_mem;
-  gsize taps_stride;
-  gint n_phases;
-  gint alloc_taps;
-  gint alloc_phases;
-
-  /* cached taps */
-  gpointer *cached_phases;
-  gpointer cached_taps;
-  gpointer cached_taps_mem;
-  gsize cached_taps_stride;
-
-  ConvertTapsFunc convert_taps;
-  InterpolateFunc interpolate;
-  DeinterleaveFunc deinterleave;
-  ResampleFunc resample;
-
-  gint blocks;
-  gint inc;
-  gint samp_inc;
-  gint samp_frac;
-  gint samp_index;
-  gint samp_phase;
-  gint skip;
-
-  gpointer samples;
-  gsize samples_len;
-  gsize samples_avail;
-  gpointer *sbuf;
-};
-
 GST_DEBUG_CATEGORY_STATIC (audio_resampler_debug);
 #define GST_CAT_DEFAULT audio_resampler_debug
 
@@ -303,9 +217,6 @@ get_kaiser_tap (gdouble x, gint n_taps, gdouble Fc, gdouble beta)
   return s * bessel (beta * sqrt (MAX (1 - w * w, 0)));
 }
 
-#define PRECISION_S16 15
-#define PRECISION_S32 31
-
 #define MAKE_CONVERT_TAPS_INT_FUNC(type, precision)                     \
 static void                                                             \
 convert_taps_##type##_c (gdouble *tmp_taps, gpointer taps,              \
@@ -593,9 +504,7 @@ GET_TAPS_NEAREST_FUNC (gdouble);
 #define get_taps_gdouble_nearest get_taps_gdouble_nearest
 
 #define GET_TAPS_FULL_FUNC(type)                                                \
-static inline gpointer                                                          \
-get_taps_##type##_full (GstAudioResampler * resampler,                          \
-    gint *samp_index, gint *samp_phase, type icoeff[4])                         \
+DECL_GET_TAPS_FULL_FUNC(type)                                                   \
 {                                                                               \
   gpointer res;                                                                 \
   gint out_rate = resampler->out_rate;                                          \
@@ -659,9 +568,7 @@ GET_TAPS_FULL_FUNC (gfloat);
 GET_TAPS_FULL_FUNC (gdouble);
 
 #define GET_TAPS_INTERPOLATE_FUNC(type,inter)                   \
-static inline gpointer                                          \
-get_taps_##type##_##inter (GstAudioResampler * resampler,       \
-    gint *samp_index, gint *samp_phase, type icoeff[4])         \
+DECL_GET_TAPS_INTERPOLATE_FUNC (type, inter)                    \
 {                                                               \
   gpointer res;                                                 \
   gint out_rate = resampler->out_rate;                          \
@@ -852,67 +759,25 @@ inner_product_##type##_cubic_1_c (type * o, const type * a,     \
 INNER_PRODUCT_FLOAT_CUBIC_FUNC (gfloat);
 INNER_PRODUCT_FLOAT_CUBIC_FUNC (gdouble);
 
-#define MAKE_RESAMPLE_FUNC(type,inter,channels,arch)                            \
-static void                                                                     \
-resample_ ##type## _ ##inter## _ ##channels## _ ##arch (GstAudioResampler * resampler,      \
-    gpointer in[], gsize in_len,  gpointer out[], gsize out_len,                \
-    gsize * consumed)                                                           \
-{                                                                               \
-  gint c, di = 0;                                                               \
-  gint n_taps = resampler->n_taps;                                              \
-  gint blocks = resampler->blocks;                                              \
-  gint ostride = resampler->ostride;                                            \
-  gint taps_stride = resampler->taps_stride;                                    \
-  gint samp_index = 0;                                                          \
-  gint samp_phase = 0;                                                          \
-                                                                                \
-  for (c = 0; c < blocks; c++) {                                                \
-    type *ip = in[c];                                                           \
-    type *op = ostride == 1 ? out[c] : (type *)out[0] + c;                      \
-                                                                                \
-    samp_index = resampler->samp_index;                                         \
-    samp_phase = resampler->samp_phase;                                         \
-                                                                                \
-    for (di = 0; di < out_len; di++) {                                          \
-      type *ipp, icoeff[4], *taps;                                              \
-                                                                                \
-      ipp = &ip[samp_index * channels];                                         \
-                                                                                \
-      taps = get_taps_ ##type##_##inter                                         \
-              (resampler, &samp_index, &samp_phase, icoeff);                    \
-      inner_product_ ##type##_##inter##_##channels##_##arch                     \
-              (op, ipp, taps, n_taps, icoeff, taps_stride);                     \
-      op += ostride;                                                            \
-    }                                                                           \
-    if (in_len > samp_index)                                                    \
-      memmove (ip, &ip[samp_index * channels],                                  \
-          (in_len - samp_index) * sizeof(type) * channels);                     \
-  }                                                                             \
-  *consumed = samp_index - resampler->samp_index;                               \
-                                                                                \
-  resampler->samp_index = 0;                                                    \
-  resampler->samp_phase = samp_phase;                                           \
-}
+MAKE_RESAMPLE_FUNC_STATIC (gint16, nearest, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gint32, nearest, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gfloat, nearest, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gdouble, nearest, 1, c);
 
-MAKE_RESAMPLE_FUNC (gint16, nearest, 1, c);
-MAKE_RESAMPLE_FUNC (gint32, nearest, 1, c);
-MAKE_RESAMPLE_FUNC (gfloat, nearest, 1, c);
-MAKE_RESAMPLE_FUNC (gdouble, nearest, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gint16, full, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gint32, full, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gfloat, full, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gdouble, full, 1, c);
 
-MAKE_RESAMPLE_FUNC (gint16, full, 1, c);
-MAKE_RESAMPLE_FUNC (gint32, full, 1, c);
-MAKE_RESAMPLE_FUNC (gfloat, full, 1, c);
-MAKE_RESAMPLE_FUNC (gdouble, full, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gint16, linear, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gint32, linear, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gfloat, linear, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gdouble, linear, 1, c);
 
-MAKE_RESAMPLE_FUNC (gint16, linear, 1, c);
-MAKE_RESAMPLE_FUNC (gint32, linear, 1, c);
-MAKE_RESAMPLE_FUNC (gfloat, linear, 1, c);
-MAKE_RESAMPLE_FUNC (gdouble, linear, 1, c);
-
-MAKE_RESAMPLE_FUNC (gint16, cubic, 1, c);
-MAKE_RESAMPLE_FUNC (gint32, cubic, 1, c);
-MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, c);
-MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gint16, cubic, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gint32, cubic, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gfloat, cubic, 1, c);
+MAKE_RESAMPLE_FUNC_STATIC (gdouble, cubic, 1, c);
 
 static ResampleFunc resample_funcs[] = {
   resample_gint16_nearest_1_c,