diff --git a/configure.ac b/configure.ac index c2a1091e5e..0c13ee8569 100644 --- a/configure.ac +++ b/configure.ac @@ -179,6 +179,30 @@ dnl check for GCC specific SSE headers dnl these are used by the speex resampler code AC_CHECK_HEADERS([xmmintrin.h emmintrin.h smmintrin.h]) +dnl also check which architecture we're on for building files with intrinsics +dnl separately +AC_CHECK_DECLS([__i386__], [HAVE_X86=1]) +AC_CHECK_DECLS([__x86_64__], [HAVE_X86=1]) + +dnl check for -m* compiler flags too +SSE_CFLAGS="-msse" +SSE2_CFLAGS="-msse2" +SSE41_CFLAGS="-msse4.1" + +AS_COMPILER_FLAG([$SSE_CFLAGS], [HAVE_SSE=1], [HAVE_SSE=0]) +AS_COMPILER_FLAG([$SSE2_CFLAGS], [HAVE_SSE2=1], [HAVE_SSE2=0]) +AS_COMPILER_FLAG([$SSE41_CFLAGS], [HAVE_SSE41=1], [HAVE_SSE41=0]) + +AM_CONDITIONAL(HAVE_X86, [test "x${HAVE_X86}" = "x1"]) + +AC_DEFINE_UNQUOTED(HAVE_SSE, [$HAVE_SSE], [SSE support is enabled]) +AC_DEFINE_UNQUOTED(HAVE_SSE2, [$HAVE_SSE2], [SSE2 support is enabled]) +AC_DEFINE_UNQUOTED(HAVE_SSE41, [$HAVE_SSE41], [SSE4.1 support is enabled]) + +AC_SUBST(SSE_CFLAGS) +AC_SUBST(SSE2_CFLAGS) +AC_SUBST(SSE41_CFLAGS) + dnl used in gst/tcp AC_CHECK_HEADERS([sys/socket.h], [HAVE_SYS_SOCKET_H="yes"], [HAVE_SYS_SOCKET_H="no"], [AC_INCLUDES_DEFAULT]) diff --git a/gst-libs/gst/audio/Makefile.am b/gst-libs/gst/audio/Makefile.am index 1f3ec51ae0..bce52bea10 100644 --- a/gst-libs/gst/audio/Makefile.am +++ b/gst-libs/gst/audio/Makefile.am @@ -82,8 +82,12 @@ nodist_libgstaudio_@GST_API_VERSION@include_HEADERS = \ audio-enumtypes.h noinst_HEADERS = \ - gstaudioutilsprivate.h \ - audio-resampler-x86.h \ + gstaudioutilsprivate.h \ + audio-resampler-private.h \ + audio-resampler-macros.h \ + audio-resampler-x86.h \ + audio-resampler-x86-sse.h \ + audio-resampler-x86-sse2.h \ audio-resampler-neon.h libgstaudio_@GST_API_VERSION@_la_CFLAGS = $(GST_PLUGINS_BASE_CFLAGS) $(GST_BASE_CFLAGS) $(GST_CFLAGS) \ @@ -93,6 +97,50 @@ libgstaudio_@GST_API_VERSION@_la_LIBADD = \ $(GST_BASE_LIBS) $(GST_LIBS) $(LIBM) 
$(ORC_LIBS) libgstaudio_@GST_API_VERSION@_la_LDFLAGS = $(GST_LIB_LDFLAGS) $(GST_ALL_LDFLAGS) $(GST_LT_LDFLAGS) + +# Arch-specific bits + +noinst_LTLIBRARIES = + +if HAVE_X86 +# Don't use full GST_LT_LDFLAGS in LDFLAGS because we get things like +# -version-info that cause a warning on private libs + +noinst_LTLIBRARIES += libaudio_resampler_sse.la +libaudio_resampler_sse_la_SOURCES = audio-resampler-x86-sse.c +libaudio_resampler_sse_la_CFLAGS = \ + $(libgstaudio_@GST_API_VERSION@_la_CFLAGS) \ + $(SSE_CFLAGS) +libaudio_resampler_sse_la_LDFLAGS = \ + $(GST_LIB_LDFLAGS) \ + $(GST_ALL_LDFLAGS) +libgstaudio_@GST_API_VERSION@_la_LIBADD += libaudio_resampler_sse.la + +noinst_LTLIBRARIES += libaudio_resampler_sse2.la +libaudio_resampler_sse2_la_SOURCES = audio-resampler-x86-sse2.c +libaudio_resampler_sse2_la_CFLAGS = \ + $(libgstaudio_@GST_API_VERSION@_la_CFLAGS) \ + $(SSE2_CFLAGS) +libaudio_resampler_sse2_la_LDFLAGS = \ + $(GST_LIB_LDFLAGS) \ + $(GST_ALL_LDFLAGS) +libgstaudio_@GST_API_VERSION@_la_LIBADD += libaudio_resampler_sse2.la + +noinst_LTLIBRARIES += libaudio_resampler_sse41.la +libaudio_resampler_sse41_la_SOURCES = audio-resampler-x86-sse41.c +libaudio_resampler_sse41_la_CFLAGS = \ + $(libgstaudio_@GST_API_VERSION@_la_CFLAGS) \ + $(SSE41_CFLAGS) +libaudio_resampler_sse41_la_LDFLAGS = \ + $(GST_LIB_LDFLAGS) \ + $(GST_ALL_LDFLAGS) +libgstaudio_@GST_API_VERSION@_la_LIBADD += libaudio_resampler_sse41.la + +endif + + +# Introspection + include $(top_srcdir)/common/gst-glib-gen.mak if HAVE_INTROSPECTION diff --git a/gst-libs/gst/audio/audio-resampler-macros.h b/gst-libs/gst/audio/audio-resampler-macros.h new file mode 100644 index 0000000000..fd6652cd80 --- /dev/null +++ b/gst-libs/gst/audio/audio-resampler-macros.h @@ -0,0 +1,108 @@ +/* GStreamer + * Copyright (C) <2015> Wim Taymans + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software 
Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. + */ + +#ifndef __GST_AUDIO_RESAMPLER_MACROS_H__ +#define __GST_AUDIO_RESAMPLER_MACROS_H__ + +#include <string.h> + +#include "audio-resampler-private.h" + +#define PRECISION_S16 15 +#define PRECISION_S32 31 +/* Declares (does not define) get_taps_<type>_full. */ +#define DECL_GET_TAPS_FULL_FUNC(type) \ +gpointer \ +get_taps_##type##_full (GstAudioResampler * resampler, \ + gint *samp_index, gint *samp_phase, type icoeff[4]) + +DECL_GET_TAPS_FULL_FUNC (gint16); +DECL_GET_TAPS_FULL_FUNC (gint32); +DECL_GET_TAPS_FULL_FUNC (gfloat); +DECL_GET_TAPS_FULL_FUNC (gdouble); + +/* Declares (does not define) get_taps_<type>_<inter>. */ +#define DECL_GET_TAPS_INTERPOLATE_FUNC(type, inter) \ +gpointer \ +get_taps_##type##_##inter (GstAudioResampler * resampler, \ + gint *samp_index, gint *samp_phase, type icoeff[4]) \ + +DECL_GET_TAPS_INTERPOLATE_FUNC (gint16, linear); +DECL_GET_TAPS_INTERPOLATE_FUNC (gint32, linear); +DECL_GET_TAPS_INTERPOLATE_FUNC (gfloat, linear); +DECL_GET_TAPS_INTERPOLATE_FUNC (gdouble, linear); + +DECL_GET_TAPS_INTERPOLATE_FUNC (gint16, cubic); +DECL_GET_TAPS_INTERPOLATE_FUNC (gint32, cubic); +DECL_GET_TAPS_INTERPOLATE_FUNC (gfloat, cubic); +DECL_GET_TAPS_INTERPOLATE_FUNC (gdouble, cubic); + +/* Prototype of resample_<type>_<inter>_<channels>_<arch>; same parameter list as ResampleFunc. */ +#define DECL_RESAMPLE_FUNC(type,inter,channels,arch) \ +void \ +resample_ ##type## _ ##inter## _ ##channels## _ ##arch (GstAudioResampler * resampler, \ + gpointer in[], gsize in_len, gpointer out[], gsize out_len, \ + gsize * consumed) +/* Defines that resample function on top of inner_product_<type>_<inter>_<channels>_<arch>, which must be visible where the macro is expanded. Uses memmove, hence string.h above. */ +#define MAKE_RESAMPLE_FUNC(type,inter,channels,arch) \ +DECL_RESAMPLE_FUNC 
(type, inter, channels, arch) \ +{ \ + gint c, di = 0; \ + gint n_taps = resampler->n_taps; \ + gint blocks = resampler->blocks; \ + gint ostride = resampler->ostride; \ + gint taps_stride = resampler->taps_stride; \ + gint samp_index = 0; \ + gint samp_phase = 0; \ + /* every block restarts from the position stored in the resampler */ \ + for (c = 0; c < blocks; c++) { \ + type *ip = in[c]; \ + type *op = ostride == 1 ? out[c] : (type *)out[0] + c; \ + /* ostride == 1: one output array per block, otherwise samples are interleaved in out[0] */ \ + samp_index = resampler->samp_index; \ + samp_phase = resampler->samp_phase; \ + \ + for (di = 0; di < out_len; di++) { \ + type *ipp, icoeff[4], *taps; \ + \ + ipp = &ip[samp_index * channels]; \ + /* get_taps_* receives samp_index/samp_phase by pointer; presumably it advances them */ \ + taps = get_taps_ ##type##_##inter \ + (resampler, &samp_index, &samp_phase, icoeff); \ + inner_product_ ##type##_##inter##_##channels##_##arch \ + (op, ipp, taps, n_taps, icoeff, taps_stride); \ + op += ostride; \ + } \ + if (in_len > samp_index) /* keep the unconsumed input at the front of the block */ \ + memmove (ip, &ip[samp_index * channels], \ + (in_len - samp_index) * sizeof(type) * channels); \ + } \ + *consumed = samp_index - resampler->samp_index; \ + \ + resampler->samp_index = 0; \ + resampler->samp_phase = samp_phase; \ +} +/* File-local (static) variants of the two macros above. */ +#define DECL_RESAMPLE_FUNC_STATIC(type,inter,channels,arch) \ +static DECL_RESAMPLE_FUNC (type, inter, channels, arch) + +#define MAKE_RESAMPLE_FUNC_STATIC(type,inter,channels,arch) \ +static MAKE_RESAMPLE_FUNC (type, inter, channels, arch) + +#endif /* __GST_AUDIO_RESAMPLER_MACROS_H__ */ diff --git a/gst-libs/gst/audio/audio-resampler-neon.h b/gst-libs/gst/audio/audio-resampler-neon.h index 5520b070bb..5863e18fe0 100644 --- a/gst-libs/gst/audio/audio-resampler-neon.h +++ b/gst-libs/gst/audio/audio-resampler-neon.h @@ -650,17 +650,17 @@ interpolate_gfloat_cubic_neon (gpointer op, const gpointer ap, "q10", "q11", "q12", "q13", "q14", "q15", "memory"); } -MAKE_RESAMPLE_FUNC (gint16, full, 1, neon); -MAKE_RESAMPLE_FUNC (gint16, linear, 1, neon); -MAKE_RESAMPLE_FUNC (gint16, cubic, 1, neon); +MAKE_RESAMPLE_FUNC_STATIC (gint16, full, 1, neon); +MAKE_RESAMPLE_FUNC_STATIC (gint16, linear, 1, neon); 
+MAKE_RESAMPLE_FUNC_STATIC (gint16, cubic, 1, neon); -MAKE_RESAMPLE_FUNC (gint32, full, 1, neon); -MAKE_RESAMPLE_FUNC (gint32, linear, 1, neon); -MAKE_RESAMPLE_FUNC (gint32, cubic, 1, neon); +MAKE_RESAMPLE_FUNC_STATIC (gint32, full, 1, neon); +MAKE_RESAMPLE_FUNC_STATIC (gint32, linear, 1, neon); +MAKE_RESAMPLE_FUNC_STATIC (gint32, cubic, 1, neon); -MAKE_RESAMPLE_FUNC (gfloat, full, 1, neon); -MAKE_RESAMPLE_FUNC (gfloat, linear, 1, neon); -MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, neon); +MAKE_RESAMPLE_FUNC_STATIC (gfloat, full, 1, neon); +MAKE_RESAMPLE_FUNC_STATIC (gfloat, linear, 1, neon); +MAKE_RESAMPLE_FUNC_STATIC (gfloat, cubic, 1, neon); static void audio_resampler_check_neon (const gchar *option) diff --git a/gst-libs/gst/audio/audio-resampler-private.h b/gst-libs/gst/audio/audio-resampler-private.h new file mode 100644 index 0000000000..c8d1a7ece6 --- /dev/null +++ b/gst-libs/gst/audio/audio-resampler-private.h @@ -0,0 +1,113 @@ +/* GStreamer + * Copyright (C) <2015> Wim Taymans + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. 
+ */ + +#ifndef __GST_AUDIO_RESAMPLER_PRIVATE_H__ +#define __GST_AUDIO_RESAMPLER_PRIVATE_H__ + +#include "audio-resampler.h" + +/* Contains a collection of all things found in other resamplers: + * speex (filter construction, optimizations), ffmpeg (fixed phase filter, blackman filter), + * SRC (linear interpolation, fixed precomputed tables),... + * + * Supports: + * - S16, S32, F32 and F64 formats + * - nearest, linear and cubic interpolation + * - sinc based interpolation with Kaiser or Blackman-Nuttall windows + * - fully configurable Kaiser parameters + * - dynamic linear or cubic interpolation of filter table, this can + * use less memory but more CPU + * - full filter table, generated from optionally linear or cubic + * interpolation of filter table + * - fixed filter table size with nearest neighbour phase, optionally + * using precomputed tables + * - dynamic samplerate changes + * - x86 and neon optimizations + */ +typedef void (*ConvertTapsFunc) (gdouble * tmp_taps, gpointer taps, + gdouble weight, gint n_taps); +typedef void (*InterpolateFunc) (gpointer o, const gpointer a, gint len, + const gpointer icoeff, gint astride); /* signature of the interpolate_<type>_<inter>_<arch> kernels */ +typedef void (*ResampleFunc) (GstAudioResampler * resampler, gpointer in[], + gsize in_len, gpointer out[], gsize out_len, gsize * consumed); /* signature produced by DECL_RESAMPLE_FUNC */ +typedef void (*DeinterleaveFunc) (GstAudioResampler * resampler, + gpointer * sbuf, gpointer in[], gsize in_frames); +/* Resampler state; the MAKE_RESAMPLE_FUNC bodies read and update it directly. */ +struct _GstAudioResampler +{ + GstAudioResamplerMethod method; + GstAudioResamplerFlags flags; + GstAudioFormat format; + GstStructure *options; + gint format_index; + gint channels; + gint in_rate; + gint out_rate; + + gint bps; + gint ostride; /* output pointer step per sample; 1 selects out[c], otherwise output is interleaved in out[0] */ + + GstAudioResamplerFilterMode filter_mode; + guint filter_threshold; + GstAudioResamplerFilterInterpolation filter_interpolation; + + gdouble cutoff; + gdouble kaiser_beta; + /* for cubic */ + gdouble b, c; + + /* temp taps */ + gpointer tmp_taps; + + /* oversampled main filter table */ + gint oversample; + gint n_taps; + gpointer 
taps; + gpointer taps_mem; + gsize taps_stride; /* handed to inner_product_* as the byte distance between tap rows */ + gint n_phases; + gint alloc_taps; + gint alloc_phases; + + /* cached taps */ + gpointer *cached_phases; + gpointer cached_taps; + gpointer cached_taps_mem; + gsize cached_taps_stride; + + ConvertTapsFunc convert_taps; + InterpolateFunc interpolate; + DeinterleaveFunc deinterleave; + ResampleFunc resample; + + gint blocks; /* number of in[] entries a resample function walks */ + gint inc; + gint samp_inc; + gint samp_frac; + gint samp_index; /* input position at the start of a resample call; reset to 0 when it returns */ + gint samp_phase; /* phase carried over from one resample call to the next */ + gint skip; + + gpointer samples; + gsize samples_len; + gsize samples_avail; + gpointer *sbuf; +}; + +#endif /* __GST_AUDIO_RESAMPLER_PRIVATE_H__ */ diff --git a/gst-libs/gst/audio/audio-resampler-x86-sse.c b/gst-libs/gst/audio/audio-resampler-x86-sse.c new file mode 100644 index 0000000000..d100c59882 --- /dev/null +++ b/gst-libs/gst/audio/audio-resampler-x86-sse.c @@ -0,0 +1,168 @@ +/* GStreamer + * Copyright (C) <2016> Wim Taymans + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. 
+ */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "audio-resampler-x86-sse.h" + +#if defined (HAVE_XMMINTRIN_H) && defined(__SSE__) +#include <xmmintrin.h> +/* o[0] = sum of a[i] * b[i]. Consumes 8 floats per iteration; a is read unaligned, b with aligned loads. icoeff and bstride are unused. */ +static inline void +inner_product_gfloat_full_1_sse (gfloat * o, const gfloat * a, + const gfloat * b, gint len, const gfloat * icoeff, gint bstride) +{ + gint i = 0; + __m128 sum = _mm_setzero_ps (); + + for (; i < len; i += 8) { + sum = + _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 0), + _mm_load_ps (b + i + 0))); + sum = + _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 4), + _mm_load_ps (b + i + 4))); + } + sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum)); + sum = _mm_add_ss (sum, _mm_shuffle_ps (sum, sum, 0x55)); + _mm_store_ss (o, sum); +} +/* Accumulates a against two tap rows bstride bytes apart (s0, s1) and stores (s0 - s1) * icoeff[0] + s1. 8 floats per iteration. */ +static inline void +inner_product_gfloat_linear_1_sse (gfloat * o, const gfloat * a, + const gfloat * b, gint len, const gfloat * icoeff, gint bstride) +{ + gint i = 0; + __m128 sum[2], t; + const gfloat *c[2] = { (gfloat *) ((gint8 *) b + 0 * bstride), + (gfloat *) ((gint8 *) b + 1 * bstride) + }; + + sum[0] = sum[1] = _mm_setzero_ps (); + + for (; i < len; i += 8) { + t = _mm_loadu_ps (a + i + 0); + sum[0] = _mm_add_ps (sum[0], _mm_mul_ps (t, _mm_load_ps (c[0] + i + 0))); + sum[1] = _mm_add_ps (sum[1], _mm_mul_ps (t, _mm_load_ps (c[1] + i + 0))); + t = _mm_loadu_ps (a + i + 4); + sum[0] = _mm_add_ps (sum[0], _mm_mul_ps (t, _mm_load_ps (c[0] + i + 4))); + sum[1] = _mm_add_ps (sum[1], _mm_mul_ps (t, _mm_load_ps (c[1] + i + 4))); + } + sum[0] = _mm_mul_ps (_mm_sub_ps (sum[0], sum[1]), _mm_load1_ps (icoeff)); + sum[0] = _mm_add_ps (sum[0], sum[1]); + sum[0] = _mm_add_ps (sum[0], _mm_movehl_ps (sum[0], sum[0])); + sum[0] = _mm_add_ss (sum[0], _mm_shuffle_ps (sum[0], sum[0], 0x55)); + _mm_store_ss (o, sum[0]); +} +/* Accumulates a against four tap rows bstride bytes apart, weights row k by icoeff[k] and stores the total. 4 floats per iteration. */ +static inline void +inner_product_gfloat_cubic_1_sse (gfloat * o, const gfloat * a, + const gfloat * b, gint len, const gfloat * icoeff, gint bstride) +{ + gint i = 0; + __m128 sum[4]; + __m128 t, f = _mm_loadu_ps (icoeff); + const gfloat 
*c[4] = { (gfloat *) ((gint8 *) b + 0 * bstride), + (gfloat *) ((gint8 *) b + 1 * bstride), + (gfloat *) ((gint8 *) b + 2 * bstride), + (gfloat *) ((gint8 *) b + 3 * bstride) + }; + + sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_ps (); + + for (; i < len; i += 4) { + t = _mm_loadu_ps (a + i); + sum[0] = _mm_add_ps (sum[0], _mm_mul_ps (t, _mm_load_ps (c[0] + i))); + sum[1] = _mm_add_ps (sum[1], _mm_mul_ps (t, _mm_load_ps (c[1] + i))); + sum[2] = _mm_add_ps (sum[2], _mm_mul_ps (t, _mm_load_ps (c[2] + i))); + sum[3] = _mm_add_ps (sum[3], _mm_mul_ps (t, _mm_load_ps (c[3] + i))); + } + sum[0] = _mm_mul_ps (sum[0], _mm_shuffle_ps (f, f, 0x00)); + sum[1] = _mm_mul_ps (sum[1], _mm_shuffle_ps (f, f, 0x55)); + sum[2] = _mm_mul_ps (sum[2], _mm_shuffle_ps (f, f, 0xaa)); + sum[3] = _mm_mul_ps (sum[3], _mm_shuffle_ps (f, f, 0xff)); + sum[0] = _mm_add_ps (sum[0], sum[1]); + sum[2] = _mm_add_ps (sum[2], sum[3]); + sum[0] = _mm_add_ps (sum[0], sum[2]); + sum[0] = _mm_add_ps (sum[0], _mm_movehl_ps (sum[0], sum[0])); + sum[0] = _mm_add_ss (sum[0], _mm_shuffle_ps (sum[0], sum[0], 0x55)); + _mm_store_ss (o, sum[0]); +} + +MAKE_RESAMPLE_FUNC (gfloat, full, 1, sse); +MAKE_RESAMPLE_FUNC (gfloat, linear, 1, sse); +MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, sse); +/* o[i] = c0[i] * ic[0] + c1[i] * ic[1], with the two rows astride bytes apart. Aligned loads and stores, 8 floats per iteration. */ +void +interpolate_gfloat_linear_sse (gpointer op, const gpointer ap, + gint len, const gpointer icp, gint astride) +{ + gint i; + gfloat *o = op, *a = ap, *ic = icp; + __m128 f[2], t1, t2; + const gfloat *c[2] = { (gfloat *) ((gint8 *) a + 0 * astride), + (gfloat *) ((gint8 *) a + 1 * astride) + }; + + f[0] = _mm_load1_ps (ic + 0); + f[1] = _mm_load1_ps (ic + 1); + + for (i = 0; i < len; i += 8) { + t1 = _mm_mul_ps (_mm_load_ps (c[0] + i + 0), f[0]); + t2 = _mm_mul_ps (_mm_load_ps (c[1] + i + 0), f[1]); + _mm_store_ps (o + i + 0, _mm_add_ps (t1, t2)); + + t1 = _mm_mul_ps (_mm_load_ps (c[0] + i + 4), f[0]); + t2 = _mm_mul_ps (_mm_load_ps (c[1] + i + 4), f[1]); + _mm_store_ps (o + i + 4, _mm_add_ps (t1, t2)); + } +} +/* o[i] = sum over k = 0..3 of ck[i] * ic[k], with the four rows astride bytes apart. Aligned loads and stores, 4 floats per iteration. */ +void 
+interpolate_gfloat_cubic_sse (gpointer op, const gpointer ap, + gint len, const gpointer icp, gint astride) +{ + gint i; + gfloat *o = op, *a = ap, *ic = icp; + __m128 f[4], t[4]; + const gfloat *c[4] = { (gfloat *) ((gint8 *) a + 0 * astride), + (gfloat *) ((gint8 *) a + 1 * astride), + (gfloat *) ((gint8 *) a + 2 * astride), + (gfloat *) ((gint8 *) a + 3 * astride) + }; + + f[0] = _mm_load1_ps (ic + 0); + f[1] = _mm_load1_ps (ic + 1); + f[2] = _mm_load1_ps (ic + 2); + f[3] = _mm_load1_ps (ic + 3); + + for (i = 0; i < len; i += 4) { + t[0] = _mm_mul_ps (_mm_load_ps (c[0] + i + 0), f[0]); + t[1] = _mm_mul_ps (_mm_load_ps (c[1] + i + 0), f[1]); + t[2] = _mm_mul_ps (_mm_load_ps (c[2] + i + 0), f[2]); + t[3] = _mm_mul_ps (_mm_load_ps (c[3] + i + 0), f[3]); + t[0] = _mm_add_ps (t[0], t[1]); + t[2] = _mm_add_ps (t[2], t[3]); + _mm_store_ps (o + i + 0, _mm_add_ps (t[0], t[2])); + } +} + +#endif diff --git a/gst-libs/gst/audio/audio-resampler-x86-sse.h b/gst-libs/gst/audio/audio-resampler-x86-sse.h new file mode 100644 index 0000000000..1d3e9a4db9 --- /dev/null +++ b/gst-libs/gst/audio/audio-resampler-x86-sse.h @@ -0,0 +1,35 @@ +/* GStreamer + * Copyright (C) <2016> Wim Taymans + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. 
+ */ + +#ifndef AUDIO_RESAMPLER_X86_SSE_H +#define AUDIO_RESAMPLER_X86_SSE_H + +#include "audio-resampler-macros.h" +/* Prototypes of resample_gfloat_{full,linear,cubic}_1_sse; the bodies come from MAKE_RESAMPLE_FUNC in audio-resampler-x86-sse.c. */ +DECL_RESAMPLE_FUNC (gfloat, full, 1, sse); +DECL_RESAMPLE_FUNC (gfloat, linear, 1, sse); +DECL_RESAMPLE_FUNC (gfloat, cubic, 1, sse); +/* Tap-table interpolation kernels; the parameter list matches InterpolateFunc. */ +void interpolate_gfloat_linear_sse (gpointer op, const gpointer ap, + gint len, const gpointer icp, gint astride); + +void interpolate_gfloat_cubic_sse (gpointer op, const gpointer ap, + gint len, const gpointer icp, gint astride); + +#endif /* AUDIO_RESAMPLER_X86_SSE_H */ diff --git a/gst-libs/gst/audio/audio-resampler-x86-sse2.c b/gst-libs/gst/audio/audio-resampler-x86-sse2.c new file mode 100644 index 0000000000..a89fb41337 --- /dev/null +++ b/gst-libs/gst/audio/audio-resampler-x86-sse2.c @@ -0,0 +1,399 @@ +/* GStreamer + * Copyright (C) <2016> Wim Taymans + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. 
+ */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "audio-resampler-x86-sse2.h" + +#if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__) +#include <emmintrin.h> +/* o[0] = (sum of a[i] * b[i] + rounding) >> PRECISION_S16, saturated to 16 bits. 16 samples per iteration; a is read unaligned, b with aligned loads. icoeff and bstride are unused. */ +static inline void +inner_product_gint16_full_1_sse2 (gint16 * o, const gint16 * a, + const gint16 * b, gint len, const gint16 * icoeff, gint bstride) +{ + gint i; + __m128i sum, t; + + sum = _mm_setzero_si128 (); + + for (i = 0; i < len; i += 16) { + t = _mm_loadu_si128 ((__m128i *) (a + i)); + sum = + _mm_add_epi32 (sum, _mm_madd_epi16 (t, + _mm_load_si128 ((__m128i *) (b + i + 0)))); + + t = _mm_loadu_si128 ((__m128i *) (a + i + 8)); + sum = + _mm_add_epi32 (sum, _mm_madd_epi16 (t, + _mm_load_si128 ((__m128i *) (b + i + 8)))); + } + sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, 3))); + sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1, 1))); + + sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); + sum = _mm_srai_epi32 (sum, PRECISION_S16); + sum = _mm_packs_epi32 (sum, sum); + *o = _mm_extract_epi16 (sum, 0); +} +/* Two tap rows bstride bytes apart: the partial sums are shifted down by PRECISION_S16, weighted by icoeff[0] and icoeff[1], added, then rounded and shifted again. 16 samples per iteration. */ +static inline void +inner_product_gint16_linear_1_sse2 (gint16 * o, const gint16 * a, + const gint16 * b, gint len, const gint16 * icoeff, gint bstride) +{ + gint i = 0; + __m128i sum[2], t; + __m128i f = _mm_loadl_epi64 ((const __m128i *) icoeff); /* low 64 bits = icoeff[0..3], upper half zero; avoids reading gint16 data through a gint64 lvalue */ + const gint16 *c[2] = { (gint16 *) ((gint8 *) b + 0 * bstride), + (gint16 *) ((gint8 *) b + 1 * bstride) + }; + + sum[0] = sum[1] = _mm_setzero_si128 (); + f = _mm_unpacklo_epi16 (f, sum[0]); + + for (; i < len; i += 16) { + t = _mm_loadu_si128 ((__m128i *) (a + i + 0)); + sum[0] = + _mm_add_epi32 (sum[0], _mm_madd_epi16 (t, + _mm_load_si128 ((__m128i *) (c[0] + i + 0)))); + sum[1] = + _mm_add_epi32 (sum[1], _mm_madd_epi16 (t, + _mm_load_si128 ((__m128i *) (c[1] + i + 0)))); + + t = _mm_loadu_si128 ((__m128i *) (a + i + 8)); + sum[0] = + _mm_add_epi32 (sum[0], _mm_madd_epi16 (t, + _mm_load_si128 ((__m128i *) (c[0] + i + 8)))); + sum[1] = + _mm_add_epi32 (sum[1], _mm_madd_epi16 
(t, + _mm_load_si128 ((__m128i *) (c[1] + i + 8)))); + } + sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16); + sum[1] = _mm_srai_epi32 (sum[1], PRECISION_S16); + + sum[0] = + _mm_madd_epi16 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0))); + sum[1] = + _mm_madd_epi16 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1))); + sum[0] = _mm_add_epi32 (sum[0], sum[1]); + + sum[0] = + _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, + 3))); + sum[0] = + _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1, + 1))); + + sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); + sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16); + sum[0] = _mm_packs_epi32 (sum[0], sum[0]); + *o = _mm_extract_epi16 (sum[0], 0); +} +/* Four tap rows bstride bytes apart: the per-row totals are gathered into one vector, shifted down by PRECISION_S16, weighted by icoeff[0..3], then rounded and shifted again. 8 samples per iteration. */ +static inline void +inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a, + const gint16 * b, gint len, const gint16 * icoeff, gint bstride) +{ + gint i = 0; + __m128i sum[4], t[4]; + __m128i f = _mm_loadl_epi64 ((const __m128i *) icoeff); /* low 64 bits = icoeff[0..3], upper half zero; avoids reading gint16 data through a long long lvalue */ + const gint16 *c[4] = { (gint16 *) ((gint8 *) b + 0 * bstride), + (gint16 *) ((gint8 *) b + 1 * bstride), + (gint16 *) ((gint8 *) b + 2 * bstride), + (gint16 *) ((gint8 *) b + 3 * bstride) + }; + + sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 (); + f = _mm_unpacklo_epi16 (f, sum[0]); + + for (; i < len; i += 8) { + t[0] = _mm_loadu_si128 ((__m128i *) (a + i)); + sum[0] = + _mm_add_epi32 (sum[0], _mm_madd_epi16 (t[0], + _mm_load_si128 ((__m128i *) (c[0] + i)))); + sum[1] = + _mm_add_epi32 (sum[1], _mm_madd_epi16 (t[0], + _mm_load_si128 ((__m128i *) (c[1] + i)))); + sum[2] = + _mm_add_epi32 (sum[2], _mm_madd_epi16 (t[0], + _mm_load_si128 ((__m128i *) (c[2] + i)))); + sum[3] = + _mm_add_epi32 (sum[3], _mm_madd_epi16 (t[0], + _mm_load_si128 ((__m128i *) (c[3] + i)))); + } + t[0] = _mm_unpacklo_epi32 (sum[0], sum[1]); + t[1] = _mm_unpacklo_epi32 (sum[2], sum[3]); + t[2] = _mm_unpackhi_epi32 (sum[0], sum[1]); + t[3] = _mm_unpackhi_epi32 
(sum[2], sum[3]); + + sum[0] = + _mm_add_epi32 (_mm_unpacklo_epi64 (t[0], t[1]), _mm_unpackhi_epi64 (t[0], + t[1])); + sum[2] = + _mm_add_epi32 (_mm_unpacklo_epi64 (t[2], t[3]), _mm_unpackhi_epi64 (t[2], + t[3])); + sum[0] = _mm_add_epi32 (sum[0], sum[2]); + + sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16); + sum[0] = _mm_madd_epi16 (sum[0], f); + + sum[0] = + _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, + 3))); + sum[0] = + _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1, + 1))); + + sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); + sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16); + sum[0] = _mm_packs_epi32 (sum[0], sum[0]); + *o = _mm_extract_epi16 (sum[0], 0); +} +/* o[0] = sum of a[i] * b[i]. 8 doubles per iteration; a is read unaligned, b with aligned loads. icoeff and bstride are unused. */ +static inline void +inner_product_gdouble_full_1_sse2 (gdouble * o, const gdouble * a, + const gdouble * b, gint len, const gdouble * icoeff, gint bstride) +{ + gint i = 0; + __m128d sum = _mm_setzero_pd (); + + for (; i < len; i += 8) { + sum = + _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0), + _mm_load_pd (b + i + 0))); + sum = + _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2), + _mm_load_pd (b + i + 2))); + sum = + _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 4), + _mm_load_pd (b + i + 4))); + sum = + _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6), + _mm_load_pd (b + i + 6))); + } + sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum)); + _mm_store_sd (o, sum); +} +/* Accumulates a against two tap rows bstride bytes apart (s0, s1) and stores (s0 - s1) * icoeff[0] + s1. 4 doubles per iteration. */ +static inline void +inner_product_gdouble_linear_1_sse2 (gdouble * o, const gdouble * a, + const gdouble * b, gint len, const gdouble * icoeff, gint bstride) +{ + gint i = 0; + __m128d sum[2], t; + const gdouble *c[2] = { (gdouble *) ((gint8 *) b + 0 * bstride), + (gdouble *) ((gint8 *) b + 1 * bstride) + }; + + sum[0] = sum[1] = _mm_setzero_pd (); + + for (; i < len; i += 4) { + t = _mm_loadu_pd (a + i + 0); + sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 0))); + sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, 
_mm_load_pd (c[1] + i + 0))); + t = _mm_loadu_pd (a + i + 2); + sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 2))); + sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 2))); + } + sum[0] = _mm_mul_pd (_mm_sub_pd (sum[0], sum[1]), _mm_load1_pd (icoeff)); + sum[0] = _mm_add_pd (sum[0], sum[1]); + sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0])); + _mm_store_sd (o, sum[0]); +} +/* Accumulates a against four tap rows bstride bytes apart, weights row k by icoeff[k] and stores the total. 2 doubles per iteration. */ +static inline void +inner_product_gdouble_cubic_1_sse2 (gdouble * o, const gdouble * a, + const gdouble * b, gint len, const gdouble * icoeff, gint bstride) +{ + gint i; + __m128d f[2], sum[4], t; + const gdouble *c[4] = { (gdouble *) ((gint8 *) b + 0 * bstride), + (gdouble *) ((gint8 *) b + 1 * bstride), + (gdouble *) ((gint8 *) b + 2 * bstride), + (gdouble *) ((gint8 *) b + 3 * bstride) + }; + + f[0] = _mm_loadu_pd (icoeff + 0); + f[1] = _mm_loadu_pd (icoeff + 2); + sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_pd (); + + for (i = 0; i < len; i += 2) { + t = _mm_loadu_pd (a + i + 0); + sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i))); + sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i))); + sum[2] = _mm_add_pd (sum[2], _mm_mul_pd (t, _mm_load_pd (c[2] + i))); + sum[3] = _mm_add_pd (sum[3], _mm_mul_pd (t, _mm_load_pd (c[3] + i))); + } + sum[0] = + _mm_mul_pd (sum[0], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (0, 0))); + sum[1] = + _mm_mul_pd (sum[1], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (1, 1))); + sum[2] = + _mm_mul_pd (sum[2], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (0, 0))); + sum[3] = + _mm_mul_pd (sum[3], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (1, 1))); + sum[0] = _mm_add_pd (sum[0], sum[1]); + sum[2] = _mm_add_pd (sum[2], sum[3]); + sum[0] = _mm_add_pd (sum[0], sum[2]); + sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0])); + _mm_store_sd (o, sum[0]); +} + +MAKE_RESAMPLE_FUNC (gint16, full, 1, sse2); +MAKE_RESAMPLE_FUNC (gint16, linear, 1, sse2); +MAKE_RESAMPLE_FUNC (gint16, 
cubic, 1, sse2); + +MAKE_RESAMPLE_FUNC (gdouble, full, 1, sse2); +MAKE_RESAMPLE_FUNC (gdouble, linear, 1, sse2); +MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, sse2); +/* o[i] = (c0[i] * ic[0] + c1[i] * ic[1] + rounding) >> PRECISION_S16, saturated to 16 bits; rows are astride bytes apart. Aligned loads and stores, 8 samples per iteration. */ +void +interpolate_gint16_linear_sse2 (gpointer op, const gpointer ap, + gint len, const gpointer icp, gint astride) +{ + gint i = 0; + gint16 *o = op, *a = ap, *ic = icp; + __m128i ta, tb, t1, t2; + __m128i f = _mm_loadl_epi64 ((const __m128i *) ic); /* same 8 bytes as before, loaded without a gint64 lvalue */ + const gint16 *c[2] = { (gint16 *) ((gint8 *) a + 0 * astride), + (gint16 *) ((gint8 *) a + 1 * astride) + }; + + f = _mm_unpacklo_epi32 (f, f); + f = _mm_unpacklo_epi64 (f, f); + + for (; i < len; i += 8) { + ta = _mm_load_si128 ((__m128i *) (c[0] + i)); + tb = _mm_load_si128 ((__m128i *) (c[1] + i)); + + t1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f); + t2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f); + + t1 = _mm_add_epi32 (t1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); + t2 = _mm_add_epi32 (t2, _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); + + t1 = _mm_srai_epi32 (t1, PRECISION_S16); + t2 = _mm_srai_epi32 (t2, PRECISION_S16); + + t1 = _mm_packs_epi32 (t1, t2); + _mm_store_si128 ((__m128i *) (o + i), t1); + } +} +/* o[i] = (sum over k = 0..3 of ck[i] * ic[k] + rounding) >> PRECISION_S16, saturated to 16 bits; rows are astride bytes apart. Aligned loads and stores, 8 samples per iteration. */ +void +interpolate_gint16_cubic_sse2 (gpointer op, const gpointer ap, + gint len, const gpointer icp, gint astride) +{ + gint i = 0; + gint16 *o = op, *a = ap, *ic = icp; + __m128i ta, tb, tl1, tl2, th1, th2; + __m128i f[2]; + const gint16 *c[4] = { (gint16 *) ((gint8 *) a + 0 * astride), + (gint16 *) ((gint8 *) a + 1 * astride), + (gint16 *) ((gint8 *) a + 2 * astride), + (gint16 *) ((gint8 *) a + 3 * astride) + }; + + f[0] = _mm_set_epi16 (ic[1], ic[0], ic[1], ic[0], ic[1], ic[0], ic[1], ic[0]); + f[1] = _mm_set_epi16 (ic[3], ic[2], ic[3], ic[2], ic[3], ic[2], ic[3], ic[2]); + + for (; i < len; i += 8) { + ta = _mm_load_si128 ((__m128i *) (c[0] + i)); + tb = _mm_load_si128 ((__m128i *) (c[1] + i)); + + tl1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[0]); + th1 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[0]); + + ta = 
_mm_load_si128 ((__m128i *) (c[2] + i)); + tb = _mm_load_si128 ((__m128i *) (c[3] + i)); + + tl2 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[1]); + th2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[1]); + + tl1 = _mm_add_epi32 (tl1, tl2); + th1 = _mm_add_epi32 (th1, th2); + + tl1 = _mm_add_epi32 (tl1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); + th1 = _mm_add_epi32 (th1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); + + tl1 = _mm_srai_epi32 (tl1, PRECISION_S16); + th1 = _mm_srai_epi32 (th1, PRECISION_S16); + + tl1 = _mm_packs_epi32 (tl1, th1); + _mm_store_si128 ((__m128i *) (o + i), tl1); + } +} +/* o[i] = c0[i] * ic[0] + c1[i] * ic[1], with the two rows astride bytes apart. Aligned loads and stores, 4 doubles per iteration. */ +void +interpolate_gdouble_linear_sse2 (gpointer op, const gpointer ap, + gint len, const gpointer icp, gint astride) +{ + gint i; + gdouble *o = op, *a = ap, *ic = icp; + __m128d f[2], t1, t2; + const gdouble *c[2] = { (gdouble *) ((gint8 *) a + 0 * astride), + (gdouble *) ((gint8 *) a + 1 * astride) + }; + + f[0] = _mm_load1_pd (ic + 0); + f[1] = _mm_load1_pd (ic + 1); + + for (i = 0; i < len; i += 4) { + t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]); + t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]); + _mm_store_pd (o + i + 0, _mm_add_pd (t1, t2)); + + t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 2), f[0]); + t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 2), f[1]); + _mm_store_pd (o + i + 2, _mm_add_pd (t1, t2)); + } +} +/* o[i] = sum over k = 0..3 of ck[i] * ic[k], with the four rows astride bytes apart. Aligned loads and stores, 2 doubles per iteration. */ +void +interpolate_gdouble_cubic_sse2 (gpointer op, const gpointer ap, + gint len, const gpointer icp, gint astride) +{ + gint i; + gdouble *o = op, *a = ap, *ic = icp; + __m128d f[4], t[4]; + const gdouble *c[4] = { (gdouble *) ((gint8 *) a + 0 * astride), + (gdouble *) ((gint8 *) a + 1 * astride), + (gdouble *) ((gint8 *) a + 2 * astride), + (gdouble *) ((gint8 *) a + 3 * astride) + }; + + f[0] = _mm_load1_pd (ic + 0); + f[1] = _mm_load1_pd (ic + 1); + f[2] = _mm_load1_pd (ic + 2); + f[3] = _mm_load1_pd (ic + 3); + + for (i = 0; i < len; i += 2) { + t[0] = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]); + t[1] = _mm_mul_pd (_mm_load_pd (c[1] + 
i + 0), f[1]); + t[2] = _mm_mul_pd (_mm_load_pd (c[2] + i + 0), f[2]); + t[3] = _mm_mul_pd (_mm_load_pd (c[3] + i + 0), f[3]); + t[0] = _mm_add_pd (t[0], t[1]); + t[2] = _mm_add_pd (t[2], t[3]); + _mm_store_pd (o + i + 0, _mm_add_pd (t[0], t[2])); + } +} + +#endif diff --git a/gst-libs/gst/audio/audio-resampler-x86-sse2.h b/gst-libs/gst/audio/audio-resampler-x86-sse2.h new file mode 100644 index 0000000000..3bbf5cded5 --- /dev/null +++ b/gst-libs/gst/audio/audio-resampler-x86-sse2.h @@ -0,0 +1,49 @@ +/* GStreamer + * Copyright (C) <2016> Wim Taymans + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. 
+ */ + +#ifndef AUDIO_RESAMPLER_X86_SSE2_H +#define AUDIO_RESAMPLER_X86_SSE2_H + +#include "audio-resampler-macros.h" + +DECL_RESAMPLE_FUNC (gint16, full, 1, sse2); +DECL_RESAMPLE_FUNC (gint16, linear, 1, sse2); +DECL_RESAMPLE_FUNC (gint16, cubic, 1, sse2); + +DECL_RESAMPLE_FUNC (gdouble, full, 1, sse2); +DECL_RESAMPLE_FUNC (gdouble, linear, 1, sse2); +DECL_RESAMPLE_FUNC (gdouble, cubic, 1, sse2); + +void +interpolate_gint16_linear_sse2 (gpointer op, const gpointer ap, + gint len, const gpointer icp, gint astride); + +void +interpolate_gint16_cubic_sse2 (gpointer op, const gpointer ap, + gint len, const gpointer icp, gint astride); + +void +interpolate_gdouble_linear_sse2 (gpointer op, const gpointer ap, + gint len, const gpointer icp, gint astride); + +void +interpolate_gdouble_cubic_sse2 (gpointer op, const gpointer ap, + gint len, const gpointer icp, gint astride); + +#endif /* AUDIO_RESAMPLER_X86_SSE2_H */ diff --git a/gst-libs/gst/audio/audio-resampler-x86-sse41.c b/gst-libs/gst/audio/audio-resampler-x86-sse41.c new file mode 100644 index 0000000000..cf3d8184aa --- /dev/null +++ b/gst-libs/gst/audio/audio-resampler-x86-sse41.c @@ -0,0 +1,185 @@ +/* GStreamer + * Copyright (C) <2016> Wim Taymans + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. 
+ */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "audio-resampler-x86-sse41.h" + +#if 0 +#define __SSE4_1__ +#pragma GCC target("sse4.1") +#endif + +#if defined (HAVE_SMMINTRIN_H) && defined (HAVE_EMMINTRIN_H) && defined(__SSE4_1__) +#include +#include + +static inline void +inner_product_gint32_full_1_sse41 (gint32 * o, const gint32 * a, + const gint32 * b, gint len, const gint32 * icoeff, gint bstride) +{ + gint i = 0; + __m128i sum, ta, tb; + gint64 res; + + sum = _mm_setzero_si128 (); + + for (; i < len; i += 8) { + ta = _mm_loadu_si128 ((__m128i *) (a + i)); + tb = _mm_load_si128 ((__m128i *) (b + i)); + + sum = + _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta), + _mm_unpacklo_epi32 (tb, tb))); + sum = + _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta), + _mm_unpackhi_epi32 (tb, tb))); + + ta = _mm_loadu_si128 ((__m128i *) (a + i + 4)); + tb = _mm_load_si128 ((__m128i *) (b + i + 4)); + + sum = + _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta), + _mm_unpacklo_epi32 (tb, tb))); + sum = + _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta), + _mm_unpackhi_epi32 (tb, tb))); + } + sum = _mm_add_epi64 (sum, _mm_unpackhi_epi64 (sum, sum)); + res = _mm_cvtsi128_si64 (sum); + + res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32; + *o = CLAMP (res, -(1L << 31), (1L << 31) - 1); +} + +static inline void +inner_product_gint32_linear_1_sse41 (gint32 * o, const gint32 * a, + const gint32 * b, gint len, const gint32 * icoeff, gint bstride) +{ + gint i = 0; + gint64 res; + __m128i sum[2], ta, tb; + __m128i f = _mm_loadu_si128 ((__m128i *) icoeff); + const gint32 *c[2] = { (gint32 *) ((gint8 *) b + 0 * bstride), + (gint32 *) ((gint8 *) b + 1 * bstride) + }; + + sum[0] = sum[1] = _mm_setzero_si128 (); + + for (; i < len; i += 4) { + ta = _mm_loadu_si128 ((__m128i *) (a + i)); + + tb = _mm_load_si128 ((__m128i *) (c[0] + i)); + sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpacklo_epi32 
(ta, ta), + _mm_unpacklo_epi32 (tb, tb))); + sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta), + _mm_unpackhi_epi32 (tb, tb))); + + tb = _mm_load_si128 ((__m128i *) (c[1] + i)); + sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta), + _mm_unpacklo_epi32 (tb, tb))); + sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta), + _mm_unpackhi_epi32 (tb, tb))); + } + sum[0] = _mm_srli_epi64 (sum[0], PRECISION_S32); + sum[1] = _mm_srli_epi64 (sum[1], PRECISION_S32); + sum[0] = + _mm_mul_epi32 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0))); + sum[1] = + _mm_mul_epi32 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1))); + sum[0] = _mm_add_epi64 (sum[0], sum[1]); + sum[0] = _mm_add_epi64 (sum[0], _mm_unpackhi_epi64 (sum[0], sum[0])); + res = _mm_cvtsi128_si64 (sum[0]); + + res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32; + *o = CLAMP (res, -(1L << 31), (1L << 31) - 1); +} + +static inline void +inner_product_gint32_cubic_1_sse41 (gint32 * o, const gint32 * a, + const gint32 * b, gint len, const gint32 * icoeff, gint bstride) +{ + gint i = 0; + gint64 res; + __m128i sum[4], ta, tb; + __m128i f = _mm_loadu_si128 ((__m128i *) icoeff); + const gint32 *c[4] = { (gint32 *) ((gint8 *) b + 0 * bstride), + (gint32 *) ((gint8 *) b + 1 * bstride), + (gint32 *) ((gint8 *) b + 2 * bstride), + (gint32 *) ((gint8 *) b + 3 * bstride) + }; + + sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 (); + + for (; i < len; i += 4) { + ta = _mm_loadu_si128 ((__m128i *) (a + i)); + + tb = _mm_load_si128 ((__m128i *) (c[0] + i)); + sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta), + _mm_unpacklo_epi32 (tb, tb))); + sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta), + _mm_unpackhi_epi32 (tb, tb))); + + tb = _mm_load_si128 ((__m128i *) (c[1] + i)); + sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta), + _mm_unpacklo_epi32 (tb, 
tb))); + sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta), + _mm_unpackhi_epi32 (tb, tb))); + + tb = _mm_load_si128 ((__m128i *) (c[2] + i)); + sum[2] = _mm_add_epi64 (sum[2], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta), + _mm_unpacklo_epi32 (tb, tb))); + sum[2] = _mm_add_epi64 (sum[2], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta), + _mm_unpackhi_epi32 (tb, tb))); + + tb = _mm_load_si128 ((__m128i *) (c[3] + i)); + sum[3] = _mm_add_epi64 (sum[3], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta), + _mm_unpacklo_epi32 (tb, tb))); + sum[3] = _mm_add_epi64 (sum[3], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta), + _mm_unpackhi_epi32 (tb, tb))); + } + sum[0] = _mm_srli_epi64 (sum[0], PRECISION_S32); + sum[1] = _mm_srli_epi64 (sum[1], PRECISION_S32); + sum[2] = _mm_srli_epi64 (sum[2], PRECISION_S32); + sum[3] = _mm_srli_epi64 (sum[3], PRECISION_S32); + sum[0] = + _mm_mul_epi32 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0))); + sum[1] = + _mm_mul_epi32 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1))); + sum[2] = + _mm_mul_epi32 (sum[2], _mm_shuffle_epi32 (f, _MM_SHUFFLE (2, 2, 2, 2))); + sum[3] = + _mm_mul_epi32 (sum[3], _mm_shuffle_epi32 (f, _MM_SHUFFLE (3, 3, 3, 3))); + sum[0] = _mm_add_epi64 (sum[0], sum[1]); + sum[2] = _mm_add_epi64 (sum[2], sum[3]); + sum[0] = _mm_add_epi64 (sum[0], sum[2]); + sum[0] = _mm_add_epi64 (sum[0], _mm_unpackhi_epi64 (sum[0], sum[0])); + res = _mm_cvtsi128_si64 (sum[0]); + + res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32; + *o = CLAMP (res, -(1L << 31), (1L << 31) - 1); +} + +MAKE_RESAMPLE_FUNC (gint32, full, 1, sse41); +MAKE_RESAMPLE_FUNC (gint32, linear, 1, sse41); +MAKE_RESAMPLE_FUNC (gint32, cubic, 1, sse41); + +#endif diff --git a/gst-libs/gst/audio/audio-resampler-x86-sse41.h b/gst-libs/gst/audio/audio-resampler-x86-sse41.h new file mode 100644 index 0000000000..d8706b0dca --- /dev/null +++ b/gst-libs/gst/audio/audio-resampler-x86-sse41.h @@ -0,0 +1,29 @@ +/* GStreamer + * Copyright (C) 
<2016> Wim Taymans + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. + */ + +#ifndef AUDIO_RESAMPLER_X86_SSE41_H +#define AUDIO_RESAMPLER_X86_SSE41_H + +#include "audio-resampler-macros.h" + +DECL_RESAMPLE_FUNC (gint32, full, 1, sse41); +DECL_RESAMPLE_FUNC (gint32, linear, 1, sse41); +DECL_RESAMPLE_FUNC (gint32, cubic, 1, sse41); + +#endif /* AUDIO_RESAMPLER_X86_SSE41_H */ diff --git a/gst-libs/gst/audio/audio-resampler-x86.h b/gst-libs/gst/audio/audio-resampler-x86.h index c1b73d099f..8e2bed3f5c 100644 --- a/gst-libs/gst/audio/audio-resampler-x86.h +++ b/gst-libs/gst/audio/audio-resampler-x86.h @@ -17,631 +17,16 @@ * Boston, MA 02110-1301, USA. 
*/ -#if defined (HAVE_XMMINTRIN_H) && defined(__SSE__) -#include - -static inline void -inner_product_gfloat_full_1_sse (gfloat * o, const gfloat * a, - const gfloat * b, gint len, const gfloat * icoeff, gint bstride) -{ - gint i = 0; - __m128 sum = _mm_setzero_ps (); - - for (; i < len; i += 8) { - sum = - _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 0), - _mm_load_ps (b + i + 0))); - sum = - _mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 4), - _mm_load_ps (b + i + 4))); - } - sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum)); - sum = _mm_add_ss (sum, _mm_shuffle_ps (sum, sum, 0x55)); - _mm_store_ss (o, sum); -} - -static inline void -inner_product_gfloat_linear_1_sse (gfloat * o, const gfloat * a, - const gfloat * b, gint len, const gfloat * icoeff, gint bstride) -{ - gint i = 0; - __m128 sum[2], t; - const gfloat *c[2] = {(gfloat*)((gint8*)b + 0*bstride), - (gfloat*)((gint8*)b + 1*bstride)}; - - sum[0] = sum[1] = _mm_setzero_ps (); - - for (; i < len; i += 8) { - t = _mm_loadu_ps (a + i + 0); - sum[0] = _mm_add_ps (sum[0], _mm_mul_ps (t, _mm_load_ps (c[0] + i + 0))); - sum[1] = _mm_add_ps (sum[1], _mm_mul_ps (t, _mm_load_ps (c[1] + i + 0))); - t = _mm_loadu_ps (a + i + 4); - sum[0] = _mm_add_ps (sum[0], _mm_mul_ps (t, _mm_load_ps (c[0] + i + 4))); - sum[1] = _mm_add_ps (sum[1], _mm_mul_ps (t, _mm_load_ps (c[1] + i + 4))); - } - sum[0] = _mm_mul_ps (_mm_sub_ps (sum[0], sum[1]), _mm_load1_ps (icoeff)); - sum[0] = _mm_add_ps (sum[0], sum[1]); - sum[0] = _mm_add_ps (sum[0], _mm_movehl_ps (sum[0], sum[0])); - sum[0] = _mm_add_ss (sum[0], _mm_shuffle_ps (sum[0], sum[0], 0x55)); - _mm_store_ss (o, sum[0]); -} - -static inline void -inner_product_gfloat_cubic_1_sse (gfloat * o, const gfloat * a, - const gfloat * b, gint len, const gfloat * icoeff, gint bstride) -{ - gint i = 0; - __m128 sum[4]; - __m128 t, f = _mm_loadu_ps(icoeff); - const gfloat *c[4] = {(gfloat*)((gint8*)b + 0*bstride), - (gfloat*)((gint8*)b + 1*bstride), - (gfloat*)((gint8*)b + 2*bstride), - 
(gfloat*)((gint8*)b + 3*bstride)}; - - sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_ps (); - - for (; i < len; i += 4) { - t = _mm_loadu_ps (a + i); - sum[0] = _mm_add_ps (sum[0], _mm_mul_ps (t, _mm_load_ps (c[0] + i))); - sum[1] = _mm_add_ps (sum[1], _mm_mul_ps (t, _mm_load_ps (c[1] + i))); - sum[2] = _mm_add_ps (sum[2], _mm_mul_ps (t, _mm_load_ps (c[2] + i))); - sum[3] = _mm_add_ps (sum[3], _mm_mul_ps (t, _mm_load_ps (c[3] + i))); - } - sum[0] = _mm_mul_ps (sum[0], _mm_shuffle_ps (f, f, 0x00)); - sum[1] = _mm_mul_ps (sum[1], _mm_shuffle_ps (f, f, 0x55)); - sum[2] = _mm_mul_ps (sum[2], _mm_shuffle_ps (f, f, 0xaa)); - sum[3] = _mm_mul_ps (sum[3], _mm_shuffle_ps (f, f, 0xff)); - sum[0] = _mm_add_ps (sum[0], sum[1]); - sum[2] = _mm_add_ps (sum[2], sum[3]); - sum[0] = _mm_add_ps (sum[0], sum[2]); - sum[0] = _mm_add_ps (sum[0], _mm_movehl_ps (sum[0], sum[0])); - sum[0] = _mm_add_ss (sum[0], _mm_shuffle_ps (sum[0], sum[0], 0x55)); - _mm_store_ss (o, sum[0]); -} - -MAKE_RESAMPLE_FUNC (gfloat, full, 1, sse); -MAKE_RESAMPLE_FUNC (gfloat, linear, 1, sse); -MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, sse); - -static void -interpolate_gfloat_linear_sse (gpointer op, const gpointer ap, - gint len, const gpointer icp, gint astride) -{ - gint i; - gfloat *o = op, *a = ap, *ic = icp; - __m128 f[2], t1, t2; - const gfloat *c[2] = {(gfloat*)((gint8*)a + 0*astride), - (gfloat*)((gint8*)a + 1*astride)}; - - f[0] = _mm_load1_ps (ic+0); - f[1] = _mm_load1_ps (ic+1); - - for (i = 0; i < len; i += 8) { - t1 = _mm_mul_ps (_mm_load_ps (c[0] + i + 0), f[0]); - t2 = _mm_mul_ps (_mm_load_ps (c[1] + i + 0), f[1]); - _mm_store_ps (o + i + 0, _mm_add_ps (t1, t2)); - - t1 = _mm_mul_ps (_mm_load_ps (c[0] + i + 4), f[0]); - t2 = _mm_mul_ps (_mm_load_ps (c[1] + i + 4), f[1]); - _mm_store_ps (o + i + 4, _mm_add_ps (t1, t2)); - } -} - -static void -interpolate_gfloat_cubic_sse (gpointer op, const gpointer ap, - gint len, const gpointer icp, gint astride) -{ - gint i; - gfloat *o = op, *a = ap, *ic = icp; 
- __m128 f[4], t[4]; - const gfloat *c[4] = {(gfloat*)((gint8*)a + 0*astride), - (gfloat*)((gint8*)a + 1*astride), - (gfloat*)((gint8*)a + 2*astride), - (gfloat*)((gint8*)a + 3*astride)}; - - f[0] = _mm_load1_ps (ic+0); - f[1] = _mm_load1_ps (ic+1); - f[2] = _mm_load1_ps (ic+2); - f[3] = _mm_load1_ps (ic+3); - - for (i = 0; i < len; i += 4) { - t[0] = _mm_mul_ps (_mm_load_ps (c[0] + i + 0), f[0]); - t[1] = _mm_mul_ps (_mm_load_ps (c[1] + i + 0), f[1]); - t[2] = _mm_mul_ps (_mm_load_ps (c[2] + i + 0), f[2]); - t[3] = _mm_mul_ps (_mm_load_ps (c[3] + i + 0), f[3]); - t[0] = _mm_add_ps (t[0], t[1]); - t[2] = _mm_add_ps (t[2], t[3]); - _mm_store_ps (o + i + 0, _mm_add_ps (t[0], t[2])); - } -} - -#endif - -#if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__) -#include - -static inline void -inner_product_gint16_full_1_sse2 (gint16 * o, const gint16 * a, - const gint16 * b, gint len, const gint16 * icoeff, gint bstride) -{ - gint i; - __m128i sum, t; - - sum = _mm_setzero_si128 (); - - for (i = 0; i < len; i += 16) { - t = _mm_loadu_si128 ((__m128i *) (a + i)); - sum = _mm_add_epi32 (sum, _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (b + i + 0)))); - - t = _mm_loadu_si128 ((__m128i *) (a + i + 8)); - sum = _mm_add_epi32 (sum, _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (b + i + 8)))); - } - sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, 3))); - sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1, 1))); - - sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); - sum = _mm_srai_epi32 (sum, PRECISION_S16); - sum = _mm_packs_epi32 (sum, sum); - *o = _mm_extract_epi16 (sum, 0); -} - -static inline void -inner_product_gint16_linear_1_sse2 (gint16 * o, const gint16 * a, - const gint16 * b, gint len, const gint16 * icoeff, gint bstride) -{ - gint i = 0; - __m128i sum[2], t; - __m128i f = _mm_set_epi64x (0, *((gint64*)icoeff)); - const gint16 *c[2] = {(gint16*)((gint8*)b + 0*bstride), - (gint16*)((gint8*)b + 
1*bstride)}; - - sum[0] = sum[1] = _mm_setzero_si128 (); - f = _mm_unpacklo_epi16 (f, sum[0]); - - for (; i < len; i += 16) { - t = _mm_loadu_si128 ((__m128i *) (a + i + 0)); - sum[0] = _mm_add_epi32 (sum[0], _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (c[0] + i + 0)))); - sum[1] = _mm_add_epi32 (sum[1], _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (c[1] + i + 0)))); - - t = _mm_loadu_si128 ((__m128i *) (a + i + 8)); - sum[0] = _mm_add_epi32 (sum[0], _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (c[0] + i + 8)))); - sum[1] = _mm_add_epi32 (sum[1], _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (c[1] + i + 8)))); - } - sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16); - sum[1] = _mm_srai_epi32 (sum[1], PRECISION_S16); - - sum[0] = _mm_madd_epi16 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0))); - sum[1] = _mm_madd_epi16 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1))); - sum[0] = _mm_add_epi32 (sum[0], sum[1]); - - sum[0] = _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, 3))); - sum[0] = _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1, 1))); - - sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); - sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16); - sum[0] = _mm_packs_epi32 (sum[0], sum[0]); - *o = _mm_extract_epi16 (sum[0], 0); -} - -static inline void -inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a, - const gint16 * b, gint len, const gint16 * icoeff, gint bstride) -{ - gint i = 0; - __m128i sum[4], t[4]; - __m128i f = _mm_set_epi64x (0, *((long long*)icoeff)); - const gint16 *c[4] = {(gint16*)((gint8*)b + 0*bstride), - (gint16*)((gint8*)b + 1*bstride), - (gint16*)((gint8*)b + 2*bstride), - (gint16*)((gint8*)b + 3*bstride)}; - - sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 (); - f = _mm_unpacklo_epi16 (f, sum[0]); - - for (; i < len; i += 8) { - t[0] = _mm_loadu_si128 ((__m128i *) (a + i)); - sum[0] = _mm_add_epi32 (sum[0], _mm_madd_epi16 (t[0], 
_mm_load_si128 ((__m128i *) (c[0] + i)))); - sum[1] = _mm_add_epi32 (sum[1], _mm_madd_epi16 (t[0], _mm_load_si128 ((__m128i *) (c[1] + i)))); - sum[2] = _mm_add_epi32 (sum[2], _mm_madd_epi16 (t[0], _mm_load_si128 ((__m128i *) (c[2] + i)))); - sum[3] = _mm_add_epi32 (sum[3], _mm_madd_epi16 (t[0], _mm_load_si128 ((__m128i *) (c[3] + i)))); - } - t[0] = _mm_unpacklo_epi32 (sum[0], sum[1]); - t[1] = _mm_unpacklo_epi32 (sum[2], sum[3]); - t[2] = _mm_unpackhi_epi32 (sum[0], sum[1]); - t[3] = _mm_unpackhi_epi32 (sum[2], sum[3]); - - sum[0] = _mm_add_epi32 (_mm_unpacklo_epi64(t[0], t[1]), _mm_unpackhi_epi64(t[0], t[1])); - sum[2] = _mm_add_epi32 (_mm_unpacklo_epi64(t[2], t[3]), _mm_unpackhi_epi64(t[2], t[3])); - sum[0] = _mm_add_epi32 (sum[0], sum[2]); - - sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16); - sum[0] = _mm_madd_epi16 (sum[0], f); - - sum[0] = _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, 3))); - sum[0] = _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1, 1))); - - sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); - sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16); - sum[0] = _mm_packs_epi32 (sum[0], sum[0]); - *o = _mm_extract_epi16 (sum[0], 0); -} - -static inline void -inner_product_gdouble_full_1_sse2 (gdouble * o, const gdouble * a, - const gdouble * b, gint len, const gdouble * icoeff, gint bstride) -{ - gint i = 0; - __m128d sum = _mm_setzero_pd (); - - for (; i < len; i += 8) { - sum = - _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0), - _mm_load_pd (b + i + 0))); - sum = - _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2), - _mm_load_pd (b + i + 2))); - sum = - _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 4), - _mm_load_pd (b + i + 4))); - sum = - _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6), - _mm_load_pd (b + i + 6))); - } - sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum)); - _mm_store_sd (o, sum); -} - -static inline void 
-inner_product_gdouble_linear_1_sse2 (gdouble * o, const gdouble * a, - const gdouble * b, gint len, const gdouble * icoeff, gint bstride) -{ - gint i = 0; - __m128d sum[2], t; - const gdouble *c[2] = {(gdouble*)((gint8*)b + 0*bstride), - (gdouble*)((gint8*)b + 1*bstride)}; - - sum[0] = sum[1] = _mm_setzero_pd (); - - for (; i < len; i += 4) { - t = _mm_loadu_pd (a + i + 0); - sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 0))); - sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 0))); - t = _mm_loadu_pd (a + i + 2); - sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 2))); - sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 2))); - } - sum[0] = _mm_mul_pd (_mm_sub_pd (sum[0], sum[1]), _mm_load1_pd (icoeff)); - sum[0] = _mm_add_pd (sum[0], sum[1]); - sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0])); - _mm_store_sd (o, sum[0]); -} - -static inline void -inner_product_gdouble_cubic_1_sse2 (gdouble * o, const gdouble * a, - const gdouble * b, gint len, const gdouble * icoeff, gint bstride) -{ - gint i; - __m128d f[2], sum[4], t; - const gdouble *c[4] = {(gdouble*)((gint8*)b + 0*bstride), - (gdouble*)((gint8*)b + 1*bstride), - (gdouble*)((gint8*)b + 2*bstride), - (gdouble*)((gint8*)b + 3*bstride)}; - - f[0] = _mm_loadu_pd (icoeff + 0); - f[1] = _mm_loadu_pd (icoeff + 2); - sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_pd (); - - for (i = 0; i < len; i += 2) { - t = _mm_loadu_pd (a + i + 0); - sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i))); - sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i))); - sum[2] = _mm_add_pd (sum[2], _mm_mul_pd (t, _mm_load_pd (c[2] + i))); - sum[3] = _mm_add_pd (sum[3], _mm_mul_pd (t, _mm_load_pd (c[3] + i))); - } - sum[0] = _mm_mul_pd (sum[0], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (0, 0))); - sum[1] = _mm_mul_pd (sum[1], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (1, 1))); - sum[2] = _mm_mul_pd (sum[2], 
_mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (0, 0))); - sum[3] = _mm_mul_pd (sum[3], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (1, 1))); - sum[0] = _mm_add_pd (sum[0], sum[1]); - sum[2] = _mm_add_pd (sum[2], sum[3]); - sum[0] = _mm_add_pd (sum[0], sum[2]); - sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0])); - _mm_store_sd (o, sum[0]); -} - -MAKE_RESAMPLE_FUNC (gint16, full, 1, sse2); -MAKE_RESAMPLE_FUNC (gint16, linear, 1, sse2); -MAKE_RESAMPLE_FUNC (gint16, cubic, 1, sse2); - -MAKE_RESAMPLE_FUNC (gdouble, full, 1, sse2); -MAKE_RESAMPLE_FUNC (gdouble, linear, 1, sse2); -MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, sse2); - -static inline void -interpolate_gint16_linear_sse2 (gpointer op, const gpointer ap, - gint len, const gpointer icp, gint astride) -{ - gint i = 0; - gint16 *o = op, *a = ap, *ic = icp; - __m128i ta, tb, t1, t2; - __m128i f = _mm_set_epi64x (0, *((gint64*)ic)); - const gint16 *c[2] = {(gint16*)((gint8*)a + 0*astride), - (gint16*)((gint8*)a + 1*astride)}; - - f = _mm_unpacklo_epi32 (f, f); - f = _mm_unpacklo_epi64 (f, f); - - for (; i < len; i += 8) { - ta = _mm_load_si128 ((__m128i *) (c[0] + i)); - tb = _mm_load_si128 ((__m128i *) (c[1] + i)); - - t1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f); - t2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f); - - t1 = _mm_add_epi32 (t1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); - t2 = _mm_add_epi32 (t2, _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); - - t1 = _mm_srai_epi32 (t1, PRECISION_S16); - t2 = _mm_srai_epi32 (t2, PRECISION_S16); - - t1 = _mm_packs_epi32 (t1, t2); - _mm_store_si128 ((__m128i *) (o + i), t1); - } -} - -static inline void -interpolate_gint16_cubic_sse2 (gpointer op, const gpointer ap, - gint len, const gpointer icp, gint astride) -{ - gint i = 0; - gint16 *o = op, *a = ap, *ic = icp; - __m128i ta, tb, tl1, tl2, th1, th2; - __m128i f[2]; - const gint16 *c[4] = {(gint16*)((gint8*)a + 0*astride), - (gint16*)((gint8*)a + 1*astride), - (gint16*)((gint8*)a + 2*astride), - 
(gint16*)((gint8*)a + 3*astride)}; - - f[0] = _mm_set_epi16 (ic[1], ic[0], ic[1], ic[0], ic[1], ic[0], ic[1], ic[0]); - f[1] = _mm_set_epi16 (ic[3], ic[2], ic[3], ic[2], ic[3], ic[2], ic[3], ic[2]); - - for (; i < len; i += 8) { - ta = _mm_load_si128 ((__m128i *) (c[0] + i)); - tb = _mm_load_si128 ((__m128i *) (c[1] + i)); - - tl1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[0]); - th1 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[0]); - - ta = _mm_load_si128 ((__m128i *) (c[2] + i)); - tb = _mm_load_si128 ((__m128i *) (c[3] + i)); - - tl2 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[1]); - th2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[1]); - - tl1 = _mm_add_epi32 (tl1, tl2); - th1 = _mm_add_epi32 (th1, th2); - - tl1 = _mm_add_epi32 (tl1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); - th1 = _mm_add_epi32 (th1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); - - tl1 = _mm_srai_epi32 (tl1, PRECISION_S16); - th1 = _mm_srai_epi32 (th1, PRECISION_S16); - - tl1 = _mm_packs_epi32 (tl1, th1); - _mm_store_si128 ((__m128i *) (o + i), tl1); - } -} - -static void -interpolate_gdouble_linear_sse2 (gpointer op, const gpointer ap, - gint len, const gpointer icp, gint astride) -{ - gint i; - gdouble *o = op, *a = ap, *ic = icp; - __m128d f[2], t1, t2; - const gdouble *c[2] = {(gdouble*)((gint8*)a + 0*astride), - (gdouble*)((gint8*)a + 1*astride)}; - - f[0] = _mm_load1_pd (ic+0); - f[1] = _mm_load1_pd (ic+1); - - for (i = 0; i < len; i += 4) { - t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]); - t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]); - _mm_store_pd (o + i + 0, _mm_add_pd (t1, t2)); - - t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 2), f[0]); - t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 2), f[1]); - _mm_store_pd (o + i + 2, _mm_add_pd (t1, t2)); - } -} - -static void -interpolate_gdouble_cubic_sse2 (gpointer op, const gpointer ap, - gint len, const gpointer icp, gint astride) -{ - gint i; - gdouble *o = op, *a = ap, *ic = icp; - __m128d f[4], t[4]; - const 
gdouble *c[4] = {(gdouble*)((gint8*)a + 0*astride), - (gdouble*)((gint8*)a + 1*astride), - (gdouble*)((gint8*)a + 2*astride), - (gdouble*)((gint8*)a + 3*astride)}; - - f[0] = _mm_load1_pd (ic+0); - f[1] = _mm_load1_pd (ic+1); - f[2] = _mm_load1_pd (ic+2); - f[3] = _mm_load1_pd (ic+3); - - for (i = 0; i < len; i += 2) { - t[0] = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]); - t[1] = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]); - t[2] = _mm_mul_pd (_mm_load_pd (c[2] + i + 0), f[2]); - t[3] = _mm_mul_pd (_mm_load_pd (c[3] + i + 0), f[3]); - t[0] = _mm_add_pd (t[0], t[1]); - t[2] = _mm_add_pd (t[2], t[3]); - _mm_store_pd (o + i + 0, _mm_add_pd (t[0], t[2])); - } -} - -#endif - -#if 0 -#define __SSE4_1__ -#pragma GCC target("sse4.1") -#endif - -#if defined (HAVE_SMMINTRIN_H) && defined(__SSE4_1__) -#include - -static inline void -inner_product_gint32_full_1_sse41 (gint32 * o, const gint32 * a, - const gint32 * b, gint len, const gint32 * icoeff, gint bstride) -{ - gint i = 0; - __m128i sum, ta, tb; - gint64 res; - - sum = _mm_setzero_si128 (); - - for (; i < len; i += 8) { - ta = _mm_loadu_si128 ((__m128i *) (a + i)); - tb = _mm_load_si128 ((__m128i *) (b + i)); - - sum = - _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta), - _mm_unpacklo_epi32 (tb, tb))); - sum = - _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta), - _mm_unpackhi_epi32 (tb, tb))); - - ta = _mm_loadu_si128 ((__m128i *) (a + i + 4)); - tb = _mm_load_si128 ((__m128i *) (b + i + 4)); - - sum = - _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta), - _mm_unpacklo_epi32 (tb, tb))); - sum = - _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta), - _mm_unpackhi_epi32 (tb, tb))); - } - sum = _mm_add_epi64 (sum, _mm_unpackhi_epi64 (sum, sum)); - res = _mm_cvtsi128_si64 (sum); - - res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32; - *o = CLAMP (res, -(1L << 31), (1L << 31) - 1); -} - -static inline void -inner_product_gint32_linear_1_sse41 (gint32 * o, 
const gint32 * a, - const gint32 * b, gint len, const gint32 * icoeff, gint bstride) -{ - gint i = 0; - gint64 res; - __m128i sum[2], ta, tb; - __m128i f = _mm_loadu_si128 ((__m128i *)icoeff); - const gint32 *c[2] = {(gint32*)((gint8*)b + 0*bstride), - (gint32*)((gint8*)b + 1*bstride)}; - - sum[0] = sum[1] = _mm_setzero_si128 (); - - for (; i < len; i += 4) { - ta = _mm_loadu_si128 ((__m128i *)(a + i)); - - tb = _mm_load_si128 ((__m128i *)(c[0] + i)); - sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta), - _mm_unpacklo_epi32 (tb, tb))); - sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta), - _mm_unpackhi_epi32 (tb, tb))); - - tb = _mm_load_si128 ((__m128i *)(c[1] + i)); - sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta), - _mm_unpacklo_epi32 (tb, tb))); - sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta), - _mm_unpackhi_epi32 (tb, tb))); - } - sum[0] = _mm_srli_epi64 (sum[0], PRECISION_S32); - sum[1] = _mm_srli_epi64 (sum[1], PRECISION_S32); - sum[0] = _mm_mul_epi32 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0))); - sum[1] = _mm_mul_epi32 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1))); - sum[0] = _mm_add_epi64 (sum[0], sum[1]); - sum[0] = _mm_add_epi64 (sum[0], _mm_unpackhi_epi64 (sum[0], sum[0])); - res = _mm_cvtsi128_si64 (sum[0]); - - res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32; - *o = CLAMP (res, -(1L << 31), (1L << 31) - 1); -} - -static inline void -inner_product_gint32_cubic_1_sse41 (gint32 * o, const gint32 * a, - const gint32 * b, gint len, const gint32 * icoeff, gint bstride) -{ - gint i = 0; - gint64 res; - __m128i sum[4], ta, tb; - __m128i f = _mm_loadu_si128 ((__m128i *)icoeff); - const gint32 *c[4] = {(gint32*)((gint8*)b + 0*bstride), - (gint32*)((gint8*)b + 1*bstride), - (gint32*)((gint8*)b + 2*bstride), - (gint32*)((gint8*)b + 3*bstride)}; - - sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 (); - - for (; i < 
len; i += 4) { - ta = _mm_loadu_si128 ((__m128i *)(a + i)); - - tb = _mm_load_si128 ((__m128i *)(c[0] + i)); - sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta), - _mm_unpacklo_epi32 (tb, tb))); - sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta), - _mm_unpackhi_epi32 (tb, tb))); - - tb = _mm_load_si128 ((__m128i *)(c[1] + i)); - sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta), - _mm_unpacklo_epi32 (tb, tb))); - sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta), - _mm_unpackhi_epi32 (tb, tb))); - - tb = _mm_load_si128 ((__m128i *)(c[2] + i)); - sum[2] = _mm_add_epi64 (sum[2], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta), - _mm_unpacklo_epi32 (tb, tb))); - sum[2] = _mm_add_epi64 (sum[2], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta), - _mm_unpackhi_epi32 (tb, tb))); - - tb = _mm_load_si128 ((__m128i *)(c[3] + i)); - sum[3] = _mm_add_epi64 (sum[3], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta), - _mm_unpacklo_epi32 (tb, tb))); - sum[3] = _mm_add_epi64 (sum[3], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta), - _mm_unpackhi_epi32 (tb, tb))); - } - sum[0] = _mm_srli_epi64 (sum[0], PRECISION_S32); - sum[1] = _mm_srli_epi64 (sum[1], PRECISION_S32); - sum[2] = _mm_srli_epi64 (sum[2], PRECISION_S32); - sum[3] = _mm_srli_epi64 (sum[3], PRECISION_S32); - sum[0] = _mm_mul_epi32 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0))); - sum[1] = _mm_mul_epi32 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1))); - sum[2] = _mm_mul_epi32 (sum[2], _mm_shuffle_epi32 (f, _MM_SHUFFLE (2, 2, 2, 2))); - sum[3] = _mm_mul_epi32 (sum[3], _mm_shuffle_epi32 (f, _MM_SHUFFLE (3, 3, 3, 3))); - sum[0] = _mm_add_epi64 (sum[0], sum[1]); - sum[2] = _mm_add_epi64 (sum[2], sum[3]); - sum[0] = _mm_add_epi64 (sum[0], sum[2]); - sum[0] = _mm_add_epi64 (sum[0], _mm_unpackhi_epi64 (sum[0], sum[0])); - res = _mm_cvtsi128_si64 (sum[0]); - - res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32; - *o = 
CLAMP (res, -(1L << 31), (1L << 31) - 1); -} - -MAKE_RESAMPLE_FUNC (gint32, full, 1, sse41); -MAKE_RESAMPLE_FUNC (gint32, linear, 1, sse41); -MAKE_RESAMPLE_FUNC (gint32, cubic, 1, sse41); -#endif +#include "audio-resampler-macros.h" +#include "audio-resampler-x86-sse.h" +#include "audio-resampler-x86-sse2.h" +#include "audio-resampler-x86-sse41.h" static void audio_resampler_check_x86 (const gchar *option) { if (!strcmp (option, "sse")) { -#if defined (HAVE_XMMINTRIN_H) && defined(__SSE__) +#if defined (HAVE_XMMINTRIN_H) && HAVE_SSE GST_DEBUG ("enable SSE optimisations"); resample_gfloat_full_1 = resample_gfloat_full_1_sse; resample_gfloat_linear_1 = resample_gfloat_linear_1_sse; @@ -653,7 +38,7 @@ audio_resampler_check_x86 (const gchar *option) GST_DEBUG ("SSE optimisations not enabled"); #endif } else if (!strcmp (option, "sse2")) { -#if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__) +#if defined (HAVE_EMMINTRIN_H) && HAVE_SSE2 GST_DEBUG ("enable SSE2 optimisations"); resample_gint16_full_1 = resample_gint16_full_1_sse2; resample_gint16_linear_1 = resample_gint16_linear_1_sse2; @@ -672,7 +57,7 @@ audio_resampler_check_x86 (const gchar *option) GST_DEBUG ("SSE2 optimisations not enabled"); #endif } else if (!strcmp (option, "sse41")) { -#if defined (HAVE_SMMINTRIN_H) && defined(__SSE4_1__) +#if defined (HAVE_SMMINTRIN_H) && defined (HAVE_EMMINTRIN_H) && HAVE_SSE41 GST_DEBUG ("enable SSE41 optimisations"); resample_gint32_full_1 = resample_gint32_full_1_sse41; resample_gint32_linear_1 = resample_gint32_linear_1_sse41; diff --git a/gst-libs/gst/audio/audio-resampler.c b/gst-libs/gst/audio/audio-resampler.c index 6c14721d03..8cb562ca8c 100644 --- a/gst-libs/gst/audio/audio-resampler.c +++ b/gst-libs/gst/audio/audio-resampler.c @@ -30,99 +30,13 @@ #endif #include "audio-resampler.h" - -/* Contains a collection of all things found in other resamplers: - * speex (filter construction, optimizations), ffmpeg (fixed phase filter, blackman filter), - * SRC (linear 
interpolation, fixed precomputed tables),... - * - * Supports: - * - S16, S32, F32 and F64 formats - * - nearest, linear and cubic interpolation - * - sinc based interpolation with kaiser or blackman-nutall windows - * - fully configurable kaiser parameters - * - dynamic linear or cubic interpolation of filter table, this can - * use less memory but more CPU - * - full filter table, generated from optionally linear or cubic - * interpolation of filter table - * - fixed filter table size with nearest neighbour phase, optionally - * using a precomputed tables - * - dynamic samplerate changes - * - x86 and neon optimizations - */ -typedef void (*ConvertTapsFunc) (gdouble * tmp_taps, gpointer taps, - gdouble weight, gint n_taps); -typedef void (*InterpolateFunc) (gpointer o, const gpointer a, gint len, - const gpointer icoeff, gint astride); -typedef void (*ResampleFunc) (GstAudioResampler * resampler, gpointer in[], - gsize in_len, gpointer out[], gsize out_len, gsize * consumed); -typedef void (*DeinterleaveFunc) (GstAudioResampler * resampler, - gpointer * sbuf, gpointer in[], gsize in_frames); +#include "audio-resampler-private.h" +#include "audio-resampler-macros.h" #define MEM_ALIGN(m,a) ((gint8 *)((guintptr)((gint8 *)(m) + ((a)-1)) & ~((a)-1))) #define ALIGN 16 #define TAPS_OVERREAD 16 -struct _GstAudioResampler -{ - GstAudioResamplerMethod method; - GstAudioResamplerFlags flags; - GstAudioFormat format; - GstStructure *options; - gint format_index; - gint channels; - gint in_rate; - gint out_rate; - - gint bps; - gint ostride; - - GstAudioResamplerFilterMode filter_mode; - guint filter_threshold; - GstAudioResamplerFilterInterpolation filter_interpolation; - - gdouble cutoff; - gdouble kaiser_beta; - /* for cubic */ - gdouble b, c; - - /* temp taps */ - gpointer tmp_taps; - - /* oversampled main filter table */ - gint oversample; - gint n_taps; - gpointer taps; - gpointer taps_mem; - gsize taps_stride; - gint n_phases; - gint alloc_taps; - gint alloc_phases; - 
- /* cached taps */ - gpointer *cached_phases; - gpointer cached_taps; - gpointer cached_taps_mem; - gsize cached_taps_stride; - - ConvertTapsFunc convert_taps; - InterpolateFunc interpolate; - DeinterleaveFunc deinterleave; - ResampleFunc resample; - - gint blocks; - gint inc; - gint samp_inc; - gint samp_frac; - gint samp_index; - gint samp_phase; - gint skip; - - gpointer samples; - gsize samples_len; - gsize samples_avail; - gpointer *sbuf; -}; - GST_DEBUG_CATEGORY_STATIC (audio_resampler_debug); #define GST_CAT_DEFAULT audio_resampler_debug @@ -303,9 +217,6 @@ get_kaiser_tap (gdouble x, gint n_taps, gdouble Fc, gdouble beta) return s * bessel (beta * sqrt (MAX (1 - w * w, 0))); } -#define PRECISION_S16 15 -#define PRECISION_S32 31 - #define MAKE_CONVERT_TAPS_INT_FUNC(type, precision) \ static void \ convert_taps_##type##_c (gdouble *tmp_taps, gpointer taps, \ @@ -593,9 +504,7 @@ GET_TAPS_NEAREST_FUNC (gdouble); #define get_taps_gdouble_nearest get_taps_gdouble_nearest #define GET_TAPS_FULL_FUNC(type) \ -static inline gpointer \ -get_taps_##type##_full (GstAudioResampler * resampler, \ - gint *samp_index, gint *samp_phase, type icoeff[4]) \ +DECL_GET_TAPS_FULL_FUNC(type) \ { \ gpointer res; \ gint out_rate = resampler->out_rate; \ @@ -659,9 +568,7 @@ GET_TAPS_FULL_FUNC (gfloat); GET_TAPS_FULL_FUNC (gdouble); #define GET_TAPS_INTERPOLATE_FUNC(type,inter) \ -static inline gpointer \ -get_taps_##type##_##inter (GstAudioResampler * resampler, \ - gint *samp_index, gint *samp_phase, type icoeff[4]) \ +DECL_GET_TAPS_INTERPOLATE_FUNC (type, inter) \ { \ gpointer res; \ gint out_rate = resampler->out_rate; \ @@ -852,67 +759,25 @@ inner_product_##type##_cubic_1_c (type * o, const type * a, \ INNER_PRODUCT_FLOAT_CUBIC_FUNC (gfloat); INNER_PRODUCT_FLOAT_CUBIC_FUNC (gdouble); -#define MAKE_RESAMPLE_FUNC(type,inter,channels,arch) \ -static void \ -resample_ ##type## _ ##inter## _ ##channels## _ ##arch (GstAudioResampler * resampler, \ - gpointer in[], gsize in_len, gpointer 
out[], gsize out_len, \ - gsize * consumed) \ -{ \ - gint c, di = 0; \ - gint n_taps = resampler->n_taps; \ - gint blocks = resampler->blocks; \ - gint ostride = resampler->ostride; \ - gint taps_stride = resampler->taps_stride; \ - gint samp_index = 0; \ - gint samp_phase = 0; \ - \ - for (c = 0; c < blocks; c++) { \ - type *ip = in[c]; \ - type *op = ostride == 1 ? out[c] : (type *)out[0] + c; \ - \ - samp_index = resampler->samp_index; \ - samp_phase = resampler->samp_phase; \ - \ - for (di = 0; di < out_len; di++) { \ - type *ipp, icoeff[4], *taps; \ - \ - ipp = &ip[samp_index * channels]; \ - \ - taps = get_taps_ ##type##_##inter \ - (resampler, &samp_index, &samp_phase, icoeff); \ - inner_product_ ##type##_##inter##_##channels##_##arch \ - (op, ipp, taps, n_taps, icoeff, taps_stride); \ - op += ostride; \ - } \ - if (in_len > samp_index) \ - memmove (ip, &ip[samp_index * channels], \ - (in_len - samp_index) * sizeof(type) * channels); \ - } \ - *consumed = samp_index - resampler->samp_index; \ - \ - resampler->samp_index = 0; \ - resampler->samp_phase = samp_phase; \ -} +MAKE_RESAMPLE_FUNC_STATIC (gint16, nearest, 1, c); +MAKE_RESAMPLE_FUNC_STATIC (gint32, nearest, 1, c); +MAKE_RESAMPLE_FUNC_STATIC (gfloat, nearest, 1, c); +MAKE_RESAMPLE_FUNC_STATIC (gdouble, nearest, 1, c); -MAKE_RESAMPLE_FUNC (gint16, nearest, 1, c); -MAKE_RESAMPLE_FUNC (gint32, nearest, 1, c); -MAKE_RESAMPLE_FUNC (gfloat, nearest, 1, c); -MAKE_RESAMPLE_FUNC (gdouble, nearest, 1, c); +MAKE_RESAMPLE_FUNC_STATIC (gint16, full, 1, c); +MAKE_RESAMPLE_FUNC_STATIC (gint32, full, 1, c); +MAKE_RESAMPLE_FUNC_STATIC (gfloat, full, 1, c); +MAKE_RESAMPLE_FUNC_STATIC (gdouble, full, 1, c); -MAKE_RESAMPLE_FUNC (gint16, full, 1, c); -MAKE_RESAMPLE_FUNC (gint32, full, 1, c); -MAKE_RESAMPLE_FUNC (gfloat, full, 1, c); -MAKE_RESAMPLE_FUNC (gdouble, full, 1, c); +MAKE_RESAMPLE_FUNC_STATIC (gint16, linear, 1, c); +MAKE_RESAMPLE_FUNC_STATIC (gint32, linear, 1, c); +MAKE_RESAMPLE_FUNC_STATIC (gfloat, linear, 1, 
c); +MAKE_RESAMPLE_FUNC_STATIC (gdouble, linear, 1, c); -MAKE_RESAMPLE_FUNC (gint16, linear, 1, c); -MAKE_RESAMPLE_FUNC (gint32, linear, 1, c); -MAKE_RESAMPLE_FUNC (gfloat, linear, 1, c); -MAKE_RESAMPLE_FUNC (gdouble, linear, 1, c); - -MAKE_RESAMPLE_FUNC (gint16, cubic, 1, c); -MAKE_RESAMPLE_FUNC (gint32, cubic, 1, c); -MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, c); -MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, c); +MAKE_RESAMPLE_FUNC_STATIC (gint16, cubic, 1, c); +MAKE_RESAMPLE_FUNC_STATIC (gint32, cubic, 1, c); +MAKE_RESAMPLE_FUNC_STATIC (gfloat, cubic, 1, c); +MAKE_RESAMPLE_FUNC_STATIC (gdouble, cubic, 1, c); static ResampleFunc resample_funcs[] = { resample_gint16_nearest_1_c,