audioresample: Separate out CFLAGS used for SSE* code

This makes sure that we only build files that need explicit SIMD support
with the relevant CFLAGS. This allows the rest of the code to be built
without, and specific SSE* code is only called after runtime checks for
CPU features.

https://bugzilla.gnome.org/show_bug.cgi?id=729276
This commit is contained in:
Arun Raghavan 2016-09-28 17:37:38 +05:30 committed by Arun Raghavan
parent f4cba79063
commit 4b5f78337a
13 changed files with 1196 additions and 788 deletions

View file

@ -179,6 +179,30 @@ dnl check for GCC specific SSE headers
dnl these are used by the speex resampler code dnl these are used by the speex resampler code
AC_CHECK_HEADERS([xmmintrin.h emmintrin.h smmintrin.h]) AC_CHECK_HEADERS([xmmintrin.h emmintrin.h smmintrin.h])
dnl also check which architecture we're on for building files with intrinsics
dnl separately
dnl HAVE_X86 is a shell variable only; it is left unset on other architectures
dnl and is consumed by the AM_CONDITIONAL below.
AC_CHECK_DECLS([__i386__], [HAVE_X86=1])
AC_CHECK_DECLS([__x86_64__], [HAVE_X86=1])
dnl check for -m* compiler flags too
SSE_CFLAGS="-msse"
SSE2_CFLAGS="-msse2"
SSE41_CFLAGS="-msse4.1"
dnl Each probe sets its HAVE_* shell variable to 1 or 0.
dnl NOTE(review): assumes AS_COMPILER_FLAG always runs exactly one of its two
dnl actions -- confirm against the macro definition in common/m4.
AS_COMPILER_FLAG([$SSE_CFLAGS], [HAVE_SSE=1], [HAVE_SSE=0])
AS_COMPILER_FLAG([$SSE2_CFLAGS], [HAVE_SSE2=1], [HAVE_SSE2=0])
AS_COMPILER_FLAG([$SSE41_CFLAGS], [HAVE_SSE41=1], [HAVE_SSE41=0])
AM_CONDITIONAL(HAVE_X86, [test "x${HAVE_X86}" = "x1"])
dnl These are defined to the probe result (0 or 1) rather than left undefined
dnl on failure, so C code must test them with #if, not #ifdef.
AC_DEFINE_UNQUOTED(HAVE_SSE, [$HAVE_SSE], [SSE support is enabled])
AC_DEFINE_UNQUOTED(HAVE_SSE2, [$HAVE_SSE2], [SSE2 support is enabled])
AC_DEFINE_UNQUOTED(HAVE_SSE41, [$HAVE_SSE41], [SSE4.1 support is enabled])
dnl The flag strings are substituted unconditionally, i.e. also when the
dnl corresponding probe above reported the flag as unsupported.
AC_SUBST(SSE_CFLAGS)
AC_SUBST(SSE2_CFLAGS)
AC_SUBST(SSE41_CFLAGS)
dnl used in gst/tcp dnl used in gst/tcp
AC_CHECK_HEADERS([sys/socket.h], AC_CHECK_HEADERS([sys/socket.h],
[HAVE_SYS_SOCKET_H="yes"], [HAVE_SYS_SOCKET_H="no"], [AC_INCLUDES_DEFAULT]) [HAVE_SYS_SOCKET_H="yes"], [HAVE_SYS_SOCKET_H="no"], [AC_INCLUDES_DEFAULT])

View file

@ -82,8 +82,12 @@ nodist_libgstaudio_@GST_API_VERSION@include_HEADERS = \
audio-enumtypes.h audio-enumtypes.h
# Private (not installed) headers. Every header included by a source file in
# this directory has to be listed here so that it ends up in the dist tarball.
noinst_HEADERS = \ noinst_HEADERS = \
gstaudioutilsprivate.h \ gstaudioutilsprivate.h \
audio-resampler-x86.h \ audio-resampler-private.h \
audio-resampler-macros.h \
audio-resampler-x86.h \
audio-resampler-x86-sse.h \
audio-resampler-x86-sse2.h \
audio-resampler-x86-sse41.h \
audio-resampler-neon.h audio-resampler-neon.h
libgstaudio_@GST_API_VERSION@_la_CFLAGS = $(GST_PLUGINS_BASE_CFLAGS) $(GST_BASE_CFLAGS) $(GST_CFLAGS) \ libgstaudio_@GST_API_VERSION@_la_CFLAGS = $(GST_PLUGINS_BASE_CFLAGS) $(GST_BASE_CFLAGS) $(GST_CFLAGS) \
@ -93,6 +97,50 @@ libgstaudio_@GST_API_VERSION@_la_LIBADD = \
$(GST_BASE_LIBS) $(GST_LIBS) $(LIBM) $(ORC_LIBS) $(GST_BASE_LIBS) $(GST_LIBS) $(LIBM) $(ORC_LIBS)
libgstaudio_@GST_API_VERSION@_la_LDFLAGS = $(GST_LIB_LDFLAGS) $(GST_ALL_LDFLAGS) $(GST_LT_LDFLAGS) libgstaudio_@GST_API_VERSION@_la_LDFLAGS = $(GST_LIB_LDFLAGS) $(GST_ALL_LDFLAGS) $(GST_LT_LDFLAGS)
# Architecture-specific pieces. Each SIMD flavour is built as its own private
# libtool convenience library so that only its single source file is compiled
# with the matching SSE*_CFLAGS; the result is then folded into libgstaudio
# through LIBADD.
noinst_LTLIBRARIES =

if HAVE_X86
# The private libs are linked with GST_LIB_LDFLAGS and GST_ALL_LDFLAGS only:
# the full GST_LT_LDFLAGS carries things like -version-info, which causes a
# warning on private libs.
noinst_LTLIBRARIES += libaudio_resampler_sse.la
libaudio_resampler_sse_la_SOURCES = audio-resampler-x86-sse.c
libaudio_resampler_sse_la_CFLAGS = $(libgstaudio_@GST_API_VERSION@_la_CFLAGS) $(SSE_CFLAGS)
libaudio_resampler_sse_la_LDFLAGS = $(GST_LIB_LDFLAGS) $(GST_ALL_LDFLAGS)
libgstaudio_@GST_API_VERSION@_la_LIBADD += libaudio_resampler_sse.la

noinst_LTLIBRARIES += libaudio_resampler_sse2.la
libaudio_resampler_sse2_la_SOURCES = audio-resampler-x86-sse2.c
libaudio_resampler_sse2_la_CFLAGS = $(libgstaudio_@GST_API_VERSION@_la_CFLAGS) $(SSE2_CFLAGS)
libaudio_resampler_sse2_la_LDFLAGS = $(GST_LIB_LDFLAGS) $(GST_ALL_LDFLAGS)
libgstaudio_@GST_API_VERSION@_la_LIBADD += libaudio_resampler_sse2.la

noinst_LTLIBRARIES += libaudio_resampler_sse41.la
libaudio_resampler_sse41_la_SOURCES = audio-resampler-x86-sse41.c
libaudio_resampler_sse41_la_CFLAGS = $(libgstaudio_@GST_API_VERSION@_la_CFLAGS) $(SSE41_CFLAGS)
libaudio_resampler_sse41_la_LDFLAGS = $(GST_LIB_LDFLAGS) $(GST_ALL_LDFLAGS)
libgstaudio_@GST_API_VERSION@_la_LIBADD += libaudio_resampler_sse41.la
endif
# Introspection
include $(top_srcdir)/common/gst-glib-gen.mak include $(top_srcdir)/common/gst-glib-gen.mak
if HAVE_INTROSPECTION if HAVE_INTROSPECTION

View file

@ -0,0 +1,108 @@
/* GStreamer
* Copyright (C) <2015> Wim Taymans <wim.taymans@gmail.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301, USA.
*/
#ifndef __GST_AUDIO_RESAMPLER_MACROS_H__
#define __GST_AUDIO_RESAMPLER_MACROS_H__
#include <string.h>
#include "audio-resampler-private.h"
/* Fixed-point scale of gint16 filter products: the SSE2 gint16 code adds
 * 1 << (PRECISION_S16 - 1) for rounding and then shifts right by
 * PRECISION_S16. */
#define PRECISION_S16 15
/* NOTE(review): presumably the gint32 counterpart of PRECISION_S16; no use of
 * it is visible in this part of the change -- confirm against the C fallback. */
#define PRECISION_S32 31
/* Declares the prototype of get_taps_<type>_full(). Only the declaration is
 * produced here; the definitions live outside this header.
 * MAKE_RESAMPLE_FUNC calls the function once per output sample, handing it
 * pointers to its running samp_index/samp_phase and a 4-element icoeff
 * scratch array, and passes the returned pointer on as the taps argument of
 * the inner product. */
#define DECL_GET_TAPS_FULL_FUNC(type) \
gpointer \
get_taps_##type##_full (GstAudioResampler * resampler, \
gint *samp_index, gint *samp_phase, type icoeff[4])
/* One prototype per supported sample type. */
DECL_GET_TAPS_FULL_FUNC (gint16);
DECL_GET_TAPS_FULL_FUNC (gint32);
DECL_GET_TAPS_FULL_FUNC (gfloat);
DECL_GET_TAPS_FULL_FUNC (gdouble);
/* Declares the prototype of get_taps_<type>_<inter>(), the tap lookup used by
 * MAKE_RESAMPLE_FUNC for the interpolating filter modes (inter is linear or
 * cubic). Same signature as the _full variant: it receives pointers to the
 * caller's running samp_index/samp_phase plus a 4-element icoeff array and
 * returns the taps pointer that is handed to the inner product.
 *
 * The last line of the macro must NOT end in a backslash: a trailing
 * continuation would splice the first invocation below into the macro body
 * and break every expansion. */
#define DECL_GET_TAPS_INTERPOLATE_FUNC(type, inter) \
gpointer \
get_taps_##type##_##inter (GstAudioResampler * resampler, \
    gint *samp_index, gint *samp_phase, type icoeff[4])

DECL_GET_TAPS_INTERPOLATE_FUNC (gint16, linear);
DECL_GET_TAPS_INTERPOLATE_FUNC (gint32, linear);
DECL_GET_TAPS_INTERPOLATE_FUNC (gfloat, linear);
DECL_GET_TAPS_INTERPOLATE_FUNC (gdouble, linear);

DECL_GET_TAPS_INTERPOLATE_FUNC (gint16, cubic);
DECL_GET_TAPS_INTERPOLATE_FUNC (gint32, cubic);
DECL_GET_TAPS_INTERPOLATE_FUNC (gfloat, cubic);
DECL_GET_TAPS_INTERPOLATE_FUNC (gdouble, cubic);
/* Expands to the prototype (without trailing semicolon or body) of
 * resample_<type>_<inter>_<channels>_<arch>(). The parameter list is the same
 * as that of the ResampleFunc typedef in audio-resampler-private.h. Used on
 * its own for declarations in headers and as the function head inside
 * MAKE_RESAMPLE_FUNC. */
#define DECL_RESAMPLE_FUNC(type,inter,channels,arch) \
void \
resample_ ##type## _ ##inter## _ ##channels## _ ##arch (GstAudioResampler * resampler, \
gpointer in[], gsize in_len, gpointer out[], gsize out_len, \
gsize * consumed)
/* Defines resample_<type>_<inter>_<channels>_<arch>().
 *
 * A function named inner_product_<type>_<inter>_<channels>_<arch> must be
 * visible at the point of expansion; it is called as
 * (op, ipp, taps, n_taps, icoeff, taps_stride).
 *
 * For each of resampler->blocks blocks c:
 *  - input is read from in[c]; output goes to out[c] when ostride is 1,
 *    otherwise to (type *) out[0] + c, advancing by ostride per sample;
 *  - samp_index/samp_phase restart from the values stored in the resampler,
 *    so every block walks the same positions;
 *  - for each of the out_len output samples, get_taps_<type>_<inter>() is
 *    called with pointers to samp_index/samp_phase (the input pointer ipp is
 *    computed before that call) and its result is fed to the inner product;
 *  - afterwards, if in_len > samp_index, the input from samp_index onwards
 *    is moved to the start of in[c].
 *
 * On return *consumed is the distance samp_index advanced (as left by the
 * last block), resampler->samp_index is reset to 0 and
 * resampler->samp_phase holds the final phase.
 *
 * NOTE(review): di and samp_index are gint while out_len and in_len are
 * gsize, so the comparisons below mix signed and unsigned operands, and
 * taps_stride is narrowed from gsize to gint. */
#define MAKE_RESAMPLE_FUNC(type,inter,channels,arch) \
DECL_RESAMPLE_FUNC (type, inter, channels, arch) \
{ \
gint c, di = 0; \
gint n_taps = resampler->n_taps; \
gint blocks = resampler->blocks; \
gint ostride = resampler->ostride; \
gint taps_stride = resampler->taps_stride; \
gint samp_index = 0; \
gint samp_phase = 0; \
\
for (c = 0; c < blocks; c++) { \
type *ip = in[c]; \
type *op = ostride == 1 ? out[c] : (type *)out[0] + c; \
\
samp_index = resampler->samp_index; \
samp_phase = resampler->samp_phase; \
\
for (di = 0; di < out_len; di++) { \
type *ipp, icoeff[4], *taps; \
\
ipp = &ip[samp_index * channels]; \
\
taps = get_taps_ ##type##_##inter \
(resampler, &samp_index, &samp_phase, icoeff); \
inner_product_ ##type##_##inter##_##channels##_##arch \
(op, ipp, taps, n_taps, icoeff, taps_stride); \
op += ostride; \
} \
if (in_len > samp_index) \
memmove (ip, &ip[samp_index * channels], \
(in_len - samp_index) * sizeof(type) * channels); \
} \
*consumed = samp_index - resampler->samp_index; \
\
resampler->samp_index = 0; \
resampler->samp_phase = samp_phase; \
}
/* File-local variants: the same prototype / definition as DECL_RESAMPLE_FUNC
 * and MAKE_RESAMPLE_FUNC, prefixed with `static` so the generated function
 * has internal linkage. */
#define DECL_RESAMPLE_FUNC_STATIC(type,inter,channels,arch) \
static DECL_RESAMPLE_FUNC (type, inter, channels, arch)
#define MAKE_RESAMPLE_FUNC_STATIC(type,inter,channels,arch) \
static MAKE_RESAMPLE_FUNC (type, inter, channels, arch)
#endif /* __GST_AUDIO_RESAMPLER_MACROS_H__ */

View file

@ -650,17 +650,17 @@ interpolate_gfloat_cubic_neon (gpointer op, const gpointer ap,
"q10", "q11", "q12", "q13", "q14", "q15", "memory"); "q10", "q11", "q12", "q13", "q14", "q15", "memory");
} }
MAKE_RESAMPLE_FUNC (gint16, full, 1, neon); MAKE_RESAMPLE_FUNC_STATIC (gint16, full, 1, neon);
MAKE_RESAMPLE_FUNC (gint16, linear, 1, neon); MAKE_RESAMPLE_FUNC_STATIC (gint16, linear, 1, neon);
MAKE_RESAMPLE_FUNC (gint16, cubic, 1, neon); MAKE_RESAMPLE_FUNC_STATIC (gint16, cubic, 1, neon);
MAKE_RESAMPLE_FUNC (gint32, full, 1, neon); MAKE_RESAMPLE_FUNC_STATIC (gint32, full, 1, neon);
MAKE_RESAMPLE_FUNC (gint32, linear, 1, neon); MAKE_RESAMPLE_FUNC_STATIC (gint32, linear, 1, neon);
MAKE_RESAMPLE_FUNC (gint32, cubic, 1, neon); MAKE_RESAMPLE_FUNC_STATIC (gint32, cubic, 1, neon);
MAKE_RESAMPLE_FUNC (gfloat, full, 1, neon); MAKE_RESAMPLE_FUNC_STATIC (gfloat, full, 1, neon);
MAKE_RESAMPLE_FUNC (gfloat, linear, 1, neon); MAKE_RESAMPLE_FUNC_STATIC (gfloat, linear, 1, neon);
MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, neon); MAKE_RESAMPLE_FUNC_STATIC (gfloat, cubic, 1, neon);
static void static void
audio_resampler_check_neon (const gchar *option) audio_resampler_check_neon (const gchar *option)

View file

@ -0,0 +1,113 @@
/* GStreamer
* Copyright (C) <2015> Wim Taymans <wim.taymans@gmail.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301, USA.
*/
#ifndef __GST_AUDIO_RESAMPLER_PRIVATE_H__
#define __GST_AUDIO_RESAMPLER_PRIVATE_H__
#include "audio-resampler.h"
/* Contains a collection of all things found in other resamplers:
* speex (filter construction, optimizations), ffmpeg (fixed phase filter, blackman filter),
* SRC (linear interpolation, fixed precomputed tables),...
*
* Supports:
* - S16, S32, F32 and F64 formats
* - nearest, linear and cubic interpolation
* - sinc based interpolation with Kaiser or Blackman-Nuttall windows
* - fully configurable kaiser parameters
* - dynamic linear or cubic interpolation of filter table, this can
* use less memory but more CPU
* - full filter table, generated from optionally linear or cubic
* interpolation of filter table
* - fixed filter table size with nearest neighbour phase, optionally
* using precomputed tables
* - dynamic samplerate changes
* - x86 and neon optimizations
*/
/* Callback types stored in struct _GstAudioResampler. */

/* NOTE(review): presumably converts n_taps coefficients from the gdouble
 * scratch array tmp_taps into the sample format at taps, using weight --
 * the implementations are not visible here; confirm. */
typedef void (*ConvertTapsFunc) (gdouble * tmp_taps, gpointer taps,
gdouble weight, gint n_taps);
/* Signature shared by the interpolate_<type>_<inter>_<arch>() functions:
 * output o, source rows starting at a and spaced astride bytes apart,
 * len elements, per-row coefficients in icoeff. */
typedef void (*InterpolateFunc) (gpointer o, const gpointer a, gint len,
const gpointer icoeff, gint astride);
/* Signature of the functions generated by MAKE_RESAMPLE_FUNC. */
typedef void (*ResampleFunc) (GstAudioResampler * resampler, gpointer in[],
gsize in_len, gpointer out[], gsize out_len, gsize * consumed);
/* NOTE(review): presumably splits in_frames frames from in[] into the
 * per-channel buffers sbuf -- implementation not visible here; confirm. */
typedef void (*DeinterleaveFunc) (GstAudioResampler * resampler,
gpointer * sbuf, gpointer in[], gsize in_frames);
/* Private definition of GstAudioResampler. It lives in this header so that
 * the functions generated by MAKE_RESAMPLE_FUNC (audio-resampler-macros.h),
 * including the ones built in separate per-SIMD source files, can access the
 * fields directly. */
struct _GstAudioResampler
{
GstAudioResamplerMethod method;
GstAudioResamplerFlags flags;
GstAudioFormat format;
GstStructure *options;
gint format_index;
gint channels;
gint in_rate;
gint out_rate;
gint bps;
/* distance, in samples, between successive output samples of one block;
 * MAKE_RESAMPLE_FUNC writes to out[c] when this is 1 and to out[0] + c
 * otherwise */
gint ostride;
GstAudioResamplerFilterMode filter_mode;
guint filter_threshold;
GstAudioResamplerFilterInterpolation filter_interpolation;
gdouble cutoff;
gdouble kaiser_beta;
/* for cubic */
gdouble b, c;
/* temp taps */
gpointer tmp_taps;
/* oversampled main filter table */
gint oversample;
/* passed as the length argument to the inner product functions */
gint n_taps;
gpointer taps;
gpointer taps_mem;
/* passed as bstride to the inner product functions, which add it as a byte
 * offset to reach the next coefficient row */
gsize taps_stride;
gint n_phases;
gint alloc_taps;
gint alloc_phases;
/* cached taps */
gpointer *cached_phases;
gpointer cached_taps;
gpointer cached_taps_mem;
gsize cached_taps_stride;
/* per-format / per-arch implementations */
ConvertTapsFunc convert_taps;
InterpolateFunc interpolate;
DeinterleaveFunc deinterleave;
ResampleFunc resample;
/* number of in[]/out[] entries a resample function iterates over */
gint blocks;
gint inc;
gint samp_inc;
gint samp_frac;
/* input position and phase carried between resample calls; a resample
 * function resets samp_index to 0 and stores the final phase on return */
gint samp_index;
gint samp_phase;
gint skip;
gpointer samples;
gsize samples_len;
gsize samples_avail;
gpointer *sbuf;
};
#endif /* __GST_AUDIO_RESAMPLER_PRIVATE_H__ */

View file

@ -0,0 +1,168 @@
/* GStreamer
* Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301, USA.
*/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include "audio-resampler-x86-sse.h"
#if defined (HAVE_XMMINTRIN_H) && defined(__SSE__)
#include <xmmintrin.h>
/* Dot product of len floats from a (unaligned loads) and b (aligned loads,
 * so b must be 16-byte aligned); the scalar result is stored to *o.
 * icoeff and bstride are unused; they are only there to match the common
 * inner-product signature.
 * The loop consumes 8 elements per iteration.
 * NOTE(review): presumably the caller pads len to a multiple of 8 -- confirm. */
static inline void
inner_product_gfloat_full_1_sse (gfloat * o, const gfloat * a,
    const gfloat * b, gint len, const gfloat * icoeff, gint bstride)
{
  __m128 acc = _mm_setzero_ps ();
  __m128 prod;
  gint pos;

  for (pos = 0; pos < len; pos += 8) {
    prod = _mm_mul_ps (_mm_loadu_ps (a + pos), _mm_load_ps (b + pos));
    acc = _mm_add_ps (acc, prod);
    prod = _mm_mul_ps (_mm_loadu_ps (a + pos + 4), _mm_load_ps (b + pos + 4));
    acc = _mm_add_ps (acc, prod);
  }

  /* horizontal add: fold the upper two lanes onto the lower two, then lane 1
   * onto lane 0 */
  acc = _mm_add_ps (acc, _mm_movehl_ps (acc, acc));
  acc = _mm_add_ss (acc, _mm_shuffle_ps (acc, acc, 0x55));
  _mm_store_ss (o, acc);
}
/* Dot products of a against two coefficient rows (b and b + bstride bytes),
 * blended as (s0 - s1) * icoeff[0] + s1 and stored to *o.
 * a is loaded unaligned; both coefficient rows use aligned loads.
 * The loop consumes 8 elements per iteration. */
static inline void
inner_product_gfloat_linear_1_sse (gfloat * o, const gfloat * a,
    const gfloat * b, gint len, const gfloat * icoeff, gint bstride)
{
  const gfloat *taps0 = b;
  const gfloat *taps1 = (const gfloat *) ((const gint8 *) b + bstride);
  __m128 acc0 = _mm_setzero_ps ();
  __m128 acc1 = _mm_setzero_ps ();
  __m128 in;
  gint pos;

  for (pos = 0; pos < len; pos += 8) {
    in = _mm_loadu_ps (a + pos);
    acc0 = _mm_add_ps (acc0, _mm_mul_ps (in, _mm_load_ps (taps0 + pos)));
    acc1 = _mm_add_ps (acc1, _mm_mul_ps (in, _mm_load_ps (taps1 + pos)));

    in = _mm_loadu_ps (a + pos + 4);
    acc0 = _mm_add_ps (acc0, _mm_mul_ps (in, _mm_load_ps (taps0 + pos + 4)));
    acc1 = _mm_add_ps (acc1, _mm_mul_ps (in, _mm_load_ps (taps1 + pos + 4)));
  }

  /* blend the two rows lane-wise, then reduce the four lanes to one */
  acc0 = _mm_mul_ps (_mm_sub_ps (acc0, acc1), _mm_load1_ps (icoeff));
  acc0 = _mm_add_ps (acc0, acc1);
  acc0 = _mm_add_ps (acc0, _mm_movehl_ps (acc0, acc0));
  acc0 = _mm_add_ss (acc0, _mm_shuffle_ps (acc0, acc0, 0x55));
  _mm_store_ss (o, acc0);
}
/* Dot products of a against four coefficient rows (b + k * bstride bytes,
 * k = 0..3); row k is weighted by icoeff[k] and the weighted sums are added
 * into the single float stored to *o.
 * a and icoeff are loaded unaligned; the coefficient rows use aligned loads.
 * The loop consumes 4 elements per iteration. */
static inline void
inner_product_gfloat_cubic_1_sse (gfloat * o, const gfloat * a,
    const gfloat * b, gint len, const gfloat * icoeff, gint bstride)
{
  const __m128 weights = _mm_loadu_ps (icoeff);
  const gint8 *base = (const gint8 *) b;
  const gfloat *taps0 = (const gfloat *) (base + 0 * bstride);
  const gfloat *taps1 = (const gfloat *) (base + 1 * bstride);
  const gfloat *taps2 = (const gfloat *) (base + 2 * bstride);
  const gfloat *taps3 = (const gfloat *) (base + 3 * bstride);
  __m128 acc0, acc1, acc2, acc3, in;
  gint pos;

  acc0 = acc1 = acc2 = acc3 = _mm_setzero_ps ();

  for (pos = 0; pos < len; pos += 4) {
    in = _mm_loadu_ps (a + pos);
    acc0 = _mm_add_ps (acc0, _mm_mul_ps (in, _mm_load_ps (taps0 + pos)));
    acc1 = _mm_add_ps (acc1, _mm_mul_ps (in, _mm_load_ps (taps1 + pos)));
    acc2 = _mm_add_ps (acc2, _mm_mul_ps (in, _mm_load_ps (taps2 + pos)));
    acc3 = _mm_add_ps (acc3, _mm_mul_ps (in, _mm_load_ps (taps3 + pos)));
  }

  /* scale each row by its own coefficient (broadcast from weights) */
  acc0 = _mm_mul_ps (acc0, _mm_shuffle_ps (weights, weights, 0x00));
  acc1 = _mm_mul_ps (acc1, _mm_shuffle_ps (weights, weights, 0x55));
  acc2 = _mm_mul_ps (acc2, _mm_shuffle_ps (weights, weights, 0xaa));
  acc3 = _mm_mul_ps (acc3, _mm_shuffle_ps (weights, weights, 0xff));

  /* combine the rows pairwise, then reduce the four lanes to one */
  acc0 = _mm_add_ps (acc0, acc1);
  acc2 = _mm_add_ps (acc2, acc3);
  acc0 = _mm_add_ps (acc0, acc2);
  acc0 = _mm_add_ps (acc0, _mm_movehl_ps (acc0, acc0));
  acc0 = _mm_add_ss (acc0, _mm_shuffle_ps (acc0, acc0, 0x55));
  _mm_store_ss (o, acc0);
}
/* Instantiate the non-static resample_gfloat_{full,linear,cubic}_1_sse()
 * functions declared in audio-resampler-x86-sse.h; each one drives the
 * matching inner_product_gfloat_*_1_sse() defined above. */
MAKE_RESAMPLE_FUNC (gfloat, full, 1, sse);
MAKE_RESAMPLE_FUNC (gfloat, linear, 1, sse);
MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, sse);
/* Writes o[i] = row0[i] * ic[0] + row1[i] * ic[1] for i in [0, len), where
 * row0 starts at ap and row1 starts astride bytes after it.
 * Uses aligned loads and stores throughout, so op and both rows must be
 * 16-byte aligned. Elements are processed in groups of 8. */
void
interpolate_gfloat_linear_sse (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gfloat *dst = op;
  const gfloat *coeff = icp;
  const gfloat *row0 = (const gfloat *) ((const gint8 *) ap + 0 * astride);
  const gfloat *row1 = (const gfloat *) ((const gint8 *) ap + 1 * astride);
  const __m128 w0 = _mm_load1_ps (coeff + 0);
  const __m128 w1 = _mm_load1_ps (coeff + 1);
  gint pos, k;

  for (pos = 0; pos < len; pos += 8) {
    /* two vectors of four floats per outer iteration */
    for (k = pos; k < pos + 8; k += 4) {
      __m128 part0 = _mm_mul_ps (_mm_load_ps (row0 + k), w0);
      __m128 part1 = _mm_mul_ps (_mm_load_ps (row1 + k), w1);

      _mm_store_ps (dst + k, _mm_add_ps (part0, part1));
    }
  }
}
/* Writes o[i] = sum over k = 0..3 of row_k[i] * ic[k] for i in [0, len),
 * where row_k starts k * astride bytes after ap.
 * Uses aligned loads and stores throughout; 4 elements per iteration. */
void
interpolate_gfloat_cubic_sse (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gfloat *dst = op;
  const gfloat *coeff = icp;
  const gint8 *base = (const gint8 *) ap;
  const gfloat *row0 = (const gfloat *) (base + 0 * astride);
  const gfloat *row1 = (const gfloat *) (base + 1 * astride);
  const gfloat *row2 = (const gfloat *) (base + 2 * astride);
  const gfloat *row3 = (const gfloat *) (base + 3 * astride);
  const __m128 w0 = _mm_load1_ps (coeff + 0);
  const __m128 w1 = _mm_load1_ps (coeff + 1);
  const __m128 w2 = _mm_load1_ps (coeff + 2);
  const __m128 w3 = _mm_load1_ps (coeff + 3);
  gint pos;

  for (pos = 0; pos < len; pos += 4) {
    __m128 p0 = _mm_mul_ps (_mm_load_ps (row0 + pos), w0);
    __m128 p1 = _mm_mul_ps (_mm_load_ps (row1 + pos), w1);
    __m128 p2 = _mm_mul_ps (_mm_load_ps (row2 + pos), w2);
    __m128 p3 = _mm_mul_ps (_mm_load_ps (row3 + pos), w3);

    /* (p0 + p1) + (p2 + p3) */
    p0 = _mm_add_ps (p0, p1);
    p2 = _mm_add_ps (p2, p3);
    _mm_store_ps (dst + pos, _mm_add_ps (p0, p2));
  }
}
#endif

View file

@ -0,0 +1,35 @@
/* GStreamer
* Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301, USA.
*/
#ifndef AUDIO_RESAMPLER_X86_SSE_H
#define AUDIO_RESAMPLER_X86_SSE_H
#include "audio-resampler-macros.h"
/* Prototypes of the SSE float resamplers
 * (resample_gfloat_{full,linear,cubic}_1_sse). The definitions in
 * audio-resampler-x86-sse.c are only compiled when HAVE_XMMINTRIN_H and
 * __SSE__ are both defined. */
DECL_RESAMPLE_FUNC (gfloat, full, 1, sse);
DECL_RESAMPLE_FUNC (gfloat, linear, 1, sse);
DECL_RESAMPLE_FUNC (gfloat, cubic, 1, sse);
/* Blend two (linear) or four (cubic) rows, spaced astride bytes apart from
 * ap, into op using the coefficients at icp; len is the element count. */
void interpolate_gfloat_linear_sse (gpointer op, const gpointer ap,
gint len, const gpointer icp, gint astride);
void interpolate_gfloat_cubic_sse (gpointer op, const gpointer ap,
gint len, const gpointer icp, gint astride);
#endif /* AUDIO_RESAMPLER_X86_SSE_H */

View file

@ -0,0 +1,399 @@
/* GStreamer
* Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301, USA.
*/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include "audio-resampler-x86-sse2.h"
#if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__)
#include <emmintrin.h>
/* Fixed-point dot product of len gint16 values from a (unaligned loads) and
 * b (aligned loads). Products are accumulated in 32-bit lanes, rounded,
 * shifted right by PRECISION_S16, saturated to 16 bits and written to *o.
 * icoeff and bstride are unused. 16 elements are consumed per iteration. */
static inline void
inner_product_gint16_full_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  __m128i acc = _mm_setzero_si128 ();
  __m128i in, taps;
  gint pos;

  for (pos = 0; pos < len; pos += 16) {
    in = _mm_loadu_si128 ((const __m128i *) (a + pos));
    taps = _mm_load_si128 ((const __m128i *) (b + pos + 0));
    acc = _mm_add_epi32 (acc, _mm_madd_epi16 (in, taps));

    in = _mm_loadu_si128 ((const __m128i *) (a + pos + 8));
    taps = _mm_load_si128 ((const __m128i *) (b + pos + 8));
    acc = _mm_add_epi32 (acc, _mm_madd_epi16 (in, taps));
  }

  /* horizontal add of the four 32-bit lanes into lane 0 */
  acc = _mm_add_epi32 (acc, _mm_shuffle_epi32 (acc, _MM_SHUFFLE (2, 3, 2, 3)));
  acc = _mm_add_epi32 (acc, _mm_shuffle_epi32 (acc, _MM_SHUFFLE (1, 1, 1, 1)));

  /* round, scale back and saturate to gint16 */
  acc = _mm_add_epi32 (acc, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  acc = _mm_srai_epi32 (acc, PRECISION_S16);
  acc = _mm_packs_epi32 (acc, acc);
  *o = _mm_extract_epi16 (acc, 0);
}
/* Fixed-point dot products of a against two coefficient rows (b and
 * b + bstride bytes). Each row's 32-bit partial sums are shifted right by
 * PRECISION_S16, multiplied by icoeff[0] / icoeff[1] respectively, added,
 * reduced to one lane, rounded, shifted again, saturated to 16 bits and
 * written to *o.
 *
 * a is loaded unaligned; the coefficient rows use aligned loads. 16 elements
 * are consumed per iteration. 8 bytes are read from icoeff (the generic
 * resample loop passes a 4-element array). */
static inline void
inner_product_gint16_linear_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i = 0;
  __m128i sum[2], t;
  /* Load the low 64 bits with the dedicated intrinsic instead of
   * dereferencing icoeff through a gint64 pointer: same value, but no
   * const-dropping cast and no strict-aliasing violation. */
  __m128i f = _mm_loadl_epi64 ((const __m128i *) icoeff);
  const gint16 *c[2] = { (const gint16 *) ((const gint8 *) b + 0 * bstride),
    (const gint16 *) ((const gint8 *) b + 1 * bstride)
  };

  sum[0] = sum[1] = _mm_setzero_si128 ();
  /* interleave with zero: each coefficient now sits in the low half of its
   * own 32-bit lane */
  f = _mm_unpacklo_epi16 (f, sum[0]);

  for (; i < len; i += 16) {
    t = _mm_loadu_si128 ((const __m128i *) (a + i + 0));
    sum[0] =
        _mm_add_epi32 (sum[0], _mm_madd_epi16 (t,
            _mm_load_si128 ((const __m128i *) (c[0] + i + 0))));
    sum[1] =
        _mm_add_epi32 (sum[1], _mm_madd_epi16 (t,
            _mm_load_si128 ((const __m128i *) (c[1] + i + 0))));

    t = _mm_loadu_si128 ((const __m128i *) (a + i + 8));
    sum[0] =
        _mm_add_epi32 (sum[0], _mm_madd_epi16 (t,
            _mm_load_si128 ((const __m128i *) (c[0] + i + 8))));
    sum[1] =
        _mm_add_epi32 (sum[1], _mm_madd_epi16 (t,
            _mm_load_si128 ((const __m128i *) (c[1] + i + 8))));
  }

  /* scale the partial sums down before weighting them */
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[1] = _mm_srai_epi32 (sum[1], PRECISION_S16);

  /* row 0 * icoeff[0] + row 1 * icoeff[1], lane-wise */
  sum[0] =
      _mm_madd_epi16 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
  sum[1] =
      _mm_madd_epi16 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
  sum[0] = _mm_add_epi32 (sum[0], sum[1]);

  /* horizontal add of the four lanes into lane 0 */
  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
              3)));
  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1,
              1)));

  /* round, scale back and saturate to gint16 */
  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
  *o = _mm_extract_epi16 (sum[0], 0);
}
/* Fixed-point dot products of a against four coefficient rows
 * (b + k * bstride bytes, k = 0..3). The four 32-bit accumulators are
 * transposed and added so that lane k holds the total of row k, shifted
 * right by PRECISION_S16, multiplied by icoeff[k], reduced to one lane,
 * rounded, shifted again, saturated to 16 bits and written to *o.
 *
 * a is loaded unaligned; the coefficient rows use aligned loads. 8 elements
 * are consumed per iteration. 8 bytes (4 coefficients) are read from icoeff. */
static inline void
inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i = 0;
  __m128i sum[4], t[4];
  /* Load the low 64 bits with the dedicated intrinsic instead of
   * dereferencing icoeff through a long long pointer: same value, but no
   * const-dropping cast and no strict-aliasing violation. */
  __m128i f = _mm_loadl_epi64 ((const __m128i *) icoeff);
  const gint16 *c[4] = { (const gint16 *) ((const gint8 *) b + 0 * bstride),
    (const gint16 *) ((const gint8 *) b + 1 * bstride),
    (const gint16 *) ((const gint8 *) b + 2 * bstride),
    (const gint16 *) ((const gint8 *) b + 3 * bstride)
  };

  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();
  /* interleave with zero: coefficient k now sits in the low half of 32-bit
   * lane k */
  f = _mm_unpacklo_epi16 (f, sum[0]);

  for (; i < len; i += 8) {
    t[0] = _mm_loadu_si128 ((const __m128i *) (a + i));
    sum[0] =
        _mm_add_epi32 (sum[0], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((const __m128i *) (c[0] + i))));
    sum[1] =
        _mm_add_epi32 (sum[1], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((const __m128i *) (c[1] + i))));
    sum[2] =
        _mm_add_epi32 (sum[2], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((const __m128i *) (c[2] + i))));
    sum[3] =
        _mm_add_epi32 (sum[3], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((const __m128i *) (c[3] + i))));
  }

  /* 4x4 transpose-and-add: afterwards lane k of sum[0] is the horizontal
   * total of accumulator k */
  t[0] = _mm_unpacklo_epi32 (sum[0], sum[1]);
  t[1] = _mm_unpacklo_epi32 (sum[2], sum[3]);
  t[2] = _mm_unpackhi_epi32 (sum[0], sum[1]);
  t[3] = _mm_unpackhi_epi32 (sum[2], sum[3]);
  sum[0] =
      _mm_add_epi32 (_mm_unpacklo_epi64 (t[0], t[1]), _mm_unpackhi_epi64 (t[0],
          t[1]));
  sum[2] =
      _mm_add_epi32 (_mm_unpacklo_epi64 (t[2], t[3]), _mm_unpackhi_epi64 (t[2],
          t[3]));
  sum[0] = _mm_add_epi32 (sum[0], sum[2]);

  /* scale down, then weight lane k by icoeff[k] */
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_madd_epi16 (sum[0], f);

  /* horizontal add of the four lanes into lane 0 */
  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
              3)));
  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1,
              1)));

  /* round, scale back and saturate to gint16 */
  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
  *o = _mm_extract_epi16 (sum[0], 0);
}
/* Dot product of len doubles from a (unaligned loads) and b (aligned
 * loads); the scalar result is stored to *o. icoeff and bstride are unused.
 * 8 elements (four 2-wide vectors) are consumed per outer iteration. */
static inline void
inner_product_gdouble_full_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  __m128d acc = _mm_setzero_pd ();
  gint pos, k;

  for (pos = 0; pos < len; pos += 8) {
    for (k = pos; k < pos + 8; k += 2) {
      __m128d prod = _mm_mul_pd (_mm_loadu_pd (a + k), _mm_load_pd (b + k));

      acc = _mm_add_pd (acc, prod);
    }
  }

  /* add the high lane onto the low lane */
  acc = _mm_add_sd (acc, _mm_unpackhi_pd (acc, acc));
  _mm_store_sd (o, acc);
}
/* Dot products of a against two coefficient rows (b and b + bstride bytes),
 * blended as (s0 - s1) * icoeff[0] + s1 and stored to *o.
 * a is loaded unaligned; the coefficient rows use aligned loads.
 * 4 elements are consumed per iteration. */
static inline void
inner_product_gdouble_linear_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  const gdouble *taps0 = b;
  const gdouble *taps1 = (const gdouble *) ((const gint8 *) b + bstride);
  __m128d acc0 = _mm_setzero_pd ();
  __m128d acc1 = _mm_setzero_pd ();
  __m128d in;
  gint pos;

  for (pos = 0; pos < len; pos += 4) {
    in = _mm_loadu_pd (a + pos);
    acc0 = _mm_add_pd (acc0, _mm_mul_pd (in, _mm_load_pd (taps0 + pos)));
    acc1 = _mm_add_pd (acc1, _mm_mul_pd (in, _mm_load_pd (taps1 + pos)));

    in = _mm_loadu_pd (a + pos + 2);
    acc0 = _mm_add_pd (acc0, _mm_mul_pd (in, _mm_load_pd (taps0 + pos + 2)));
    acc1 = _mm_add_pd (acc1, _mm_mul_pd (in, _mm_load_pd (taps1 + pos + 2)));
  }

  /* blend the two rows lane-wise, then add the high lane onto the low one */
  acc0 = _mm_mul_pd (_mm_sub_pd (acc0, acc1), _mm_load1_pd (icoeff));
  acc0 = _mm_add_pd (acc0, acc1);
  acc0 = _mm_add_sd (acc0, _mm_unpackhi_pd (acc0, acc0));
  _mm_store_sd (o, acc0);
}
/* Dot products of a against four coefficient rows (b + k * bstride bytes,
 * k = 0..3); row k is weighted by icoeff[k] and the weighted sums are added
 * into the single double stored to *o.
 * a and icoeff are loaded unaligned; the coefficient rows use aligned loads.
 * 2 elements are consumed per iteration. */
static inline void
inner_product_gdouble_cubic_1_sse2 (gdouble * o, const gdouble * a,
    const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
  const gint8 *base = (const gint8 *) b;
  const gdouble *taps0 = (const gdouble *) (base + 0 * bstride);
  const gdouble *taps1 = (const gdouble *) (base + 1 * bstride);
  const gdouble *taps2 = (const gdouble *) (base + 2 * bstride);
  const gdouble *taps3 = (const gdouble *) (base + 3 * bstride);
  const __m128d w01 = _mm_loadu_pd (icoeff + 0);
  const __m128d w23 = _mm_loadu_pd (icoeff + 2);
  __m128d acc0, acc1, acc2, acc3, in;
  gint pos;

  acc0 = acc1 = acc2 = acc3 = _mm_setzero_pd ();

  for (pos = 0; pos < len; pos += 2) {
    in = _mm_loadu_pd (a + pos + 0);
    acc0 = _mm_add_pd (acc0, _mm_mul_pd (in, _mm_load_pd (taps0 + pos)));
    acc1 = _mm_add_pd (acc1, _mm_mul_pd (in, _mm_load_pd (taps1 + pos)));
    acc2 = _mm_add_pd (acc2, _mm_mul_pd (in, _mm_load_pd (taps2 + pos)));
    acc3 = _mm_add_pd (acc3, _mm_mul_pd (in, _mm_load_pd (taps3 + pos)));
  }

  /* scale each row by its own coefficient (broadcast to both lanes) */
  acc0 = _mm_mul_pd (acc0, _mm_shuffle_pd (w01, w01, _MM_SHUFFLE2 (0, 0)));
  acc1 = _mm_mul_pd (acc1, _mm_shuffle_pd (w01, w01, _MM_SHUFFLE2 (1, 1)));
  acc2 = _mm_mul_pd (acc2, _mm_shuffle_pd (w23, w23, _MM_SHUFFLE2 (0, 0)));
  acc3 = _mm_mul_pd (acc3, _mm_shuffle_pd (w23, w23, _MM_SHUFFLE2 (1, 1)));

  /* combine the rows pairwise, then add the high lane onto the low one */
  acc0 = _mm_add_pd (acc0, acc1);
  acc2 = _mm_add_pd (acc2, acc3);
  acc0 = _mm_add_pd (acc0, acc2);
  acc0 = _mm_add_sd (acc0, _mm_unpackhi_pd (acc0, acc0));
  _mm_store_sd (o, acc0);
}
/* Instantiate the non-static resample_{gint16,gdouble}_{full,linear,cubic}_1_sse2()
 * functions declared in audio-resampler-x86-sse2.h; each one drives the
 * matching inner_product_*_1_sse2() defined above. */
MAKE_RESAMPLE_FUNC (gint16, full, 1, sse2);
MAKE_RESAMPLE_FUNC (gint16, linear, 1, sse2);
MAKE_RESAMPLE_FUNC (gint16, cubic, 1, sse2);
MAKE_RESAMPLE_FUNC (gdouble, full, 1, sse2);
MAKE_RESAMPLE_FUNC (gdouble, linear, 1, sse2);
MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, sse2);
/* Writes o[i] = sat16 ((row0[i] * ic[0] + row1[i] * ic[1] + round)
 * >> PRECISION_S16) for i in [0, len), where row0 starts at ap and row1
 * starts astride bytes after it.
 *
 * Uses aligned loads and stores throughout, so op and both rows must be
 * 16-byte aligned. 8 elements are processed per iteration.
 *
 * Only ic[0] and ic[1] are read. */
void
interpolate_gint16_linear_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i = 0;
  gint16 *o = op, *a = ap, *ic = icp;
  __m128i ta, tb, t1, t2;
  /* Coefficient pair repeated in every 32-bit lane, built the same way as in
   * interpolate_gint16_cubic_sse2(). This replaces a 64-bit read through a
   * gint64 pointer, which touched ic[2] and ic[3] without using them and
   * violated strict aliasing. */
  __m128i f =
      _mm_set_epi16 (ic[1], ic[0], ic[1], ic[0], ic[1], ic[0], ic[1], ic[0]);
  const gint16 *c[2] = { (gint16 *) ((gint8 *) a + 0 * astride),
    (gint16 *) ((gint8 *) a + 1 * astride)
  };

  for (; i < len; i += 8) {
    ta = _mm_load_si128 ((__m128i *) (c[0] + i));
    tb = _mm_load_si128 ((__m128i *) (c[1] + i));

    /* interleave the rows so that madd computes row0 * ic[0] + row1 * ic[1]
     * per 32-bit lane */
    t1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f);
    t2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f);

    /* round, scale back and saturate to gint16 */
    t1 = _mm_add_epi32 (t1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
    t2 = _mm_add_epi32 (t2, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
    t1 = _mm_srai_epi32 (t1, PRECISION_S16);
    t2 = _mm_srai_epi32 (t2, PRECISION_S16);
    t1 = _mm_packs_epi32 (t1, t2);
    _mm_store_si128 ((__m128i *) (o + i), t1);
  }
}
/* Writes o[i] = sat16 ((sum over k = 0..3 of row_k[i] * ic[k] + round)
 * >> PRECISION_S16) for i in [0, len), where row_k starts k * astride bytes
 * after ap.
 * Uses aligned loads and stores throughout; 8 elements per iteration. */
void
interpolate_gint16_cubic_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint16 *dst = op;
  const gint16 *coeff = icp;
  const gint8 *base = (const gint8 *) ap;
  const gint16 *row0 = (const gint16 *) (base + 0 * astride);
  const gint16 *row1 = (const gint16 *) (base + 1 * astride);
  const gint16 *row2 = (const gint16 *) (base + 2 * astride);
  const gint16 *row3 = (const gint16 *) (base + 3 * astride);
  /* coefficient pairs (0,1) and (2,3) repeated in every 32-bit lane */
  const __m128i w01 = _mm_set_epi16 (coeff[1], coeff[0], coeff[1], coeff[0],
      coeff[1], coeff[0], coeff[1], coeff[0]);
  const __m128i w23 = _mm_set_epi16 (coeff[3], coeff[2], coeff[3], coeff[2],
      coeff[3], coeff[2], coeff[3], coeff[2]);
  const __m128i rounding = _mm_set1_epi32 (1 << (PRECISION_S16 - 1));
  __m128i ra, rb, lo, hi, lo2, hi2;
  gint pos;

  for (pos = 0; pos < len; pos += 8) {
    /* rows 0 and 1, interleaved so madd yields row0 * c0 + row1 * c1 */
    ra = _mm_load_si128 ((const __m128i *) (row0 + pos));
    rb = _mm_load_si128 ((const __m128i *) (row1 + pos));
    lo = _mm_madd_epi16 (_mm_unpacklo_epi16 (ra, rb), w01);
    hi = _mm_madd_epi16 (_mm_unpackhi_epi16 (ra, rb), w01);

    /* rows 2 and 3, same scheme with coefficients 2 and 3 */
    ra = _mm_load_si128 ((const __m128i *) (row2 + pos));
    rb = _mm_load_si128 ((const __m128i *) (row3 + pos));
    lo2 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ra, rb), w23);
    hi2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ra, rb), w23);

    lo = _mm_add_epi32 (lo, lo2);
    hi = _mm_add_epi32 (hi, hi2);

    /* round, scale back and saturate to gint16 */
    lo = _mm_add_epi32 (lo, rounding);
    hi = _mm_add_epi32 (hi, rounding);
    lo = _mm_srai_epi32 (lo, PRECISION_S16);
    hi = _mm_srai_epi32 (hi, PRECISION_S16);
    _mm_store_si128 ((__m128i *) (dst + pos), _mm_packs_epi32 (lo, hi));
  }
}
/* Writes o[i] = row0[i] * ic[0] + row1[i] * ic[1] for i in [0, len), where
 * row0 starts at ap and row1 starts astride bytes after it.
 * Uses aligned loads and stores throughout; elements are processed in groups
 * of 4 (two 2-wide vectors). */
void
interpolate_gdouble_linear_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gdouble *dst = op;
  const gdouble *coeff = icp;
  const gdouble *row0 = (const gdouble *) ((const gint8 *) ap + 0 * astride);
  const gdouble *row1 = (const gdouble *) ((const gint8 *) ap + 1 * astride);
  const __m128d w0 = _mm_load1_pd (coeff + 0);
  const __m128d w1 = _mm_load1_pd (coeff + 1);
  gint pos, k;

  for (pos = 0; pos < len; pos += 4) {
    for (k = pos; k < pos + 4; k += 2) {
      __m128d part0 = _mm_mul_pd (_mm_load_pd (row0 + k), w0);
      __m128d part1 = _mm_mul_pd (_mm_load_pd (row1 + k), w1);

      _mm_store_pd (dst + k, _mm_add_pd (part0, part1));
    }
  }
}
/* Writes o[i] = sum over k = 0..3 of row_k[i] * ic[k] for i in [0, len),
 * where row_k starts k * astride bytes after ap.
 * Uses aligned loads and stores throughout; 2 elements per iteration. */
void
interpolate_gdouble_cubic_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gdouble *dst = op;
  const gdouble *coeff = icp;
  const gint8 *base = (const gint8 *) ap;
  const gdouble *row0 = (const gdouble *) (base + 0 * astride);
  const gdouble *row1 = (const gdouble *) (base + 1 * astride);
  const gdouble *row2 = (const gdouble *) (base + 2 * astride);
  const gdouble *row3 = (const gdouble *) (base + 3 * astride);
  const __m128d w0 = _mm_load1_pd (coeff + 0);
  const __m128d w1 = _mm_load1_pd (coeff + 1);
  const __m128d w2 = _mm_load1_pd (coeff + 2);
  const __m128d w3 = _mm_load1_pd (coeff + 3);
  gint pos;

  for (pos = 0; pos < len; pos += 2) {
    __m128d p0 = _mm_mul_pd (_mm_load_pd (row0 + pos), w0);
    __m128d p1 = _mm_mul_pd (_mm_load_pd (row1 + pos), w1);
    __m128d p2 = _mm_mul_pd (_mm_load_pd (row2 + pos), w2);
    __m128d p3 = _mm_mul_pd (_mm_load_pd (row3 + pos), w3);

    /* (p0 + p1) + (p2 + p3) */
    p0 = _mm_add_pd (p0, p1);
    p2 = _mm_add_pd (p2, p3);
    _mm_store_pd (dst + pos, _mm_add_pd (p0, p2));
  }
}
#endif

View file

@ -0,0 +1,49 @@
/* GStreamer
* Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301, USA.
*/
#ifndef AUDIO_RESAMPLER_X86_SSE2_H
#define AUDIO_RESAMPLER_X86_SSE2_H
#include "audio-resampler-macros.h"
/* Resampler entry points built with SSE2 for S16 and F64 samples.
 * DECL_RESAMPLE_FUNC comes from audio-resampler-macros.h; presumably it
 * declares resample_<type>_<kind>_1_sse2 -- confirm against the macro. */
DECL_RESAMPLE_FUNC (gint16, full, 1, sse2);
DECL_RESAMPLE_FUNC (gint16, linear, 1, sse2);
DECL_RESAMPLE_FUNC (gint16, cubic, 1, sse2);
DECL_RESAMPLE_FUNC (gdouble, full, 1, sse2);
DECL_RESAMPLE_FUNC (gdouble, linear, 1, sse2);
DECL_RESAMPLE_FUNC (gdouble, cubic, 1, sse2);
/* Tap interpolation helpers: blend rows of taps that start at `ap` and are
 * `astride` bytes apart, weighted by the coefficients in `icp`, into `op`.
 * The linear variants use 2 rows, the cubic variants 4. */
void
interpolate_gint16_linear_sse2 (gpointer op, const gpointer ap,
gint len, const gpointer icp, gint astride);
void
interpolate_gint16_cubic_sse2 (gpointer op, const gpointer ap,
gint len, const gpointer icp, gint astride);
void
interpolate_gdouble_linear_sse2 (gpointer op, const gpointer ap,
gint len, const gpointer icp, gint astride);
void
interpolate_gdouble_cubic_sse2 (gpointer op, const gpointer ap,
gint len, const gpointer icp, gint astride);
#endif /* AUDIO_RESAMPLER_X86_SSE2_H */

View file

@ -0,0 +1,185 @@
/* GStreamer
* Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301, USA.
*/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include "audio-resampler-x86-sse41.h"
#if 0
#define __SSE4_1__
#pragma GCC target("sse4.1")
#endif
#if defined (HAVE_SMMINTRIN_H) && defined (HAVE_EMMINTRIN_H) && defined(__SSE4_1__)
#include <emmintrin.h>
#include <smmintrin.h>
/* Inner product of `len` S32 samples (`a`) with one row of filter taps (`b`).
 *
 * The 64-bit products are accumulated in two lanes, folded together, rounded
 * and shifted down by PRECISION_S32, clamped to the gint32 range and written
 * to *o.  `icoeff` and `bstride` are not used by this variant.
 *
 * `a` is read with unaligned loads; `b` is read with _mm_load_si128 and must
 * therefore be 16-byte aligned.  The loop consumes 8 samples per iteration
 * with no scalar tail, so it assumes `len` is a multiple of 8 or that both
 * buffers are padded -- TODO confirm with the caller.
 */
static inline void
inner_product_gint32_full_1_sse41 (gint32 * o, const gint32 * a,
    const gint32 * b, gint len, const gint32 * icoeff, gint bstride)
{
  gint i = 0;
  __m128i sum, ta, tb;
  gint64 res;

  sum = _mm_setzero_si128 ();

  for (; i < len; i += 8) {
    /* _mm_mul_epi32 multiplies the low signed 32 bits of each 64-bit lane,
     * so duplicating every element with unpacklo/unpackhi feeds it elements
     * 0,1 and then 2,3 of the loaded vector. */
    ta = _mm_loadu_si128 ((__m128i *) (a + i));
    tb = _mm_load_si128 ((__m128i *) (b + i));
    sum =
        _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum =
        _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));

    ta = _mm_loadu_si128 ((__m128i *) (a + i + 4));
    tb = _mm_load_si128 ((__m128i *) (b + i + 4));
    sum =
        _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum =
        _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));
  }
  /* fold the high 64-bit lane onto the low one and extract it */
  sum = _mm_add_epi64 (sum, _mm_unpackhi_epi64 (sum, sum));
  res = _mm_cvtsi128_si64 (sum);

  res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
  /* Build the bounds in 64 bits: `1L << 31` overflows wherever long is only
   * 32 bits wide (32-bit x86, 64-bit Windows). */
  *o = CLAMP (res, -((gint64) 1 << 31), ((gint64) 1 << 31) - 1);
}
/* Linear-interpolated inner product for S32 samples.
 *
 * Accumulates the dot product of `a` with two tap rows (`b` and
 * `b + bstride` bytes), scales each row sum down by PRECISION_S32, weights
 * row k with icoeff[k], adds the results, rounds, shifts down by
 * PRECISION_S32 again, clamps to the gint32 range and stores into *o.
 *
 * `a` and `icoeff` are read unaligned; the tap rows use aligned loads and
 * must be 16-byte aligned.  `icoeff` is fetched as a full 16-byte vector, so
 * four gint32 must be readable even though only two are used.  The loop
 * handles 4 samples per iteration with no scalar tail -- assumes `len` is a
 * multiple of 4 or the buffers are padded, TODO confirm with the caller.
 */
static inline void
inner_product_gint32_linear_1_sse41 (gint32 * o, const gint32 * a,
    const gint32 * b, gint len, const gint32 * icoeff, gint bstride)
{
  gint i = 0;
  gint64 res;
  __m128i sum[2], ta, tb;
  __m128i f = _mm_loadu_si128 ((__m128i *) icoeff);
  const gint32 *c[2] = { (gint32 *) ((gint8 *) b + 0 * bstride),
    (gint32 *) ((gint8 *) b + 1 * bstride)
  };

  sum[0] = sum[1] = _mm_setzero_si128 ();

  for (; i < len; i += 4) {
    ta = _mm_loadu_si128 ((__m128i *) (a + i));

    tb = _mm_load_si128 ((__m128i *) (c[0] + i));
    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));

    tb = _mm_load_si128 ((__m128i *) (c[1] + i));
    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));
  }
  /* The shift is logical, but _mm_mul_epi32 below only consumes the low
   * 32 bits of each lane and reads them as signed. */
  sum[0] = _mm_srli_epi64 (sum[0], PRECISION_S32);
  sum[1] = _mm_srli_epi64 (sum[1], PRECISION_S32);
  sum[0] =
      _mm_mul_epi32 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
  sum[1] =
      _mm_mul_epi32 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
  sum[0] = _mm_add_epi64 (sum[0], sum[1]);
  /* fold the high 64-bit lane onto the low one and extract it */
  sum[0] = _mm_add_epi64 (sum[0], _mm_unpackhi_epi64 (sum[0], sum[0]));
  res = _mm_cvtsi128_si64 (sum[0]);

  res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
  /* Build the bounds in 64 bits: `1L << 31` overflows wherever long is only
   * 32 bits wide (32-bit x86, 64-bit Windows). */
  *o = CLAMP (res, -((gint64) 1 << 31), ((gint64) 1 << 31) - 1);
}
/* Cubic-interpolated inner product for S32 samples.
 *
 * Accumulates the dot product of `a` with four tap rows (`b + k * bstride`
 * bytes, k = 0..3), scales each row sum down by PRECISION_S32, weights row k
 * with icoeff[k], adds the four results, rounds, shifts down by
 * PRECISION_S32 again, clamps to the gint32 range and stores into *o.
 *
 * `a` and `icoeff` are read unaligned; the tap rows use aligned loads and
 * must be 16-byte aligned.  The loop handles 4 samples per iteration with no
 * scalar tail -- assumes `len` is a multiple of 4 or the buffers are padded,
 * TODO confirm with the caller.
 */
static inline void
inner_product_gint32_cubic_1_sse41 (gint32 * o, const gint32 * a,
    const gint32 * b, gint len, const gint32 * icoeff, gint bstride)
{
  gint i = 0;
  gint64 res;
  __m128i sum[4], ta, tb;
  __m128i f = _mm_loadu_si128 ((__m128i *) icoeff);
  const gint32 *c[4] = { (gint32 *) ((gint8 *) b + 0 * bstride),
    (gint32 *) ((gint8 *) b + 1 * bstride),
    (gint32 *) ((gint8 *) b + 2 * bstride),
    (gint32 *) ((gint8 *) b + 3 * bstride)
  };

  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();

  for (; i < len; i += 4) {
    ta = _mm_loadu_si128 ((__m128i *) (a + i));

    tb = _mm_load_si128 ((__m128i *) (c[0] + i));
    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));

    tb = _mm_load_si128 ((__m128i *) (c[1] + i));
    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));

    tb = _mm_load_si128 ((__m128i *) (c[2] + i));
    sum[2] = _mm_add_epi64 (sum[2], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum[2] = _mm_add_epi64 (sum[2], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));

    tb = _mm_load_si128 ((__m128i *) (c[3] + i));
    sum[3] = _mm_add_epi64 (sum[3], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum[3] = _mm_add_epi64 (sum[3], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));
  }
  /* The shift is logical, but _mm_mul_epi32 below only consumes the low
   * 32 bits of each lane and reads them as signed. */
  sum[0] = _mm_srli_epi64 (sum[0], PRECISION_S32);
  sum[1] = _mm_srli_epi64 (sum[1], PRECISION_S32);
  sum[2] = _mm_srli_epi64 (sum[2], PRECISION_S32);
  sum[3] = _mm_srli_epi64 (sum[3], PRECISION_S32);
  sum[0] =
      _mm_mul_epi32 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
  sum[1] =
      _mm_mul_epi32 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
  sum[2] =
      _mm_mul_epi32 (sum[2], _mm_shuffle_epi32 (f, _MM_SHUFFLE (2, 2, 2, 2)));
  sum[3] =
      _mm_mul_epi32 (sum[3], _mm_shuffle_epi32 (f, _MM_SHUFFLE (3, 3, 3, 3)));
  sum[0] = _mm_add_epi64 (sum[0], sum[1]);
  sum[2] = _mm_add_epi64 (sum[2], sum[3]);
  sum[0] = _mm_add_epi64 (sum[0], sum[2]);
  /* fold the high 64-bit lane onto the low one and extract it */
  sum[0] = _mm_add_epi64 (sum[0], _mm_unpackhi_epi64 (sum[0], sum[0]));
  res = _mm_cvtsi128_si64 (sum[0]);

  res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
  /* Build the bounds in 64 bits: `1L << 31` overflows wherever long is only
   * 32 bits wide (32-bit x86, 64-bit Windows). */
  *o = CLAMP (res, -((gint64) 1 << 31), ((gint64) 1 << 31) - 1);
}
/* Instantiate the S32 resampler entry points for this SSE4.1 build.
 * MAKE_RESAMPLE_FUNC is defined outside this file; presumably it wraps the
 * matching inner_product_gint32_<kind>_1_sse41 function -- confirm against
 * audio-resampler-macros.h. */
MAKE_RESAMPLE_FUNC (gint32, full, 1, sse41);
MAKE_RESAMPLE_FUNC (gint32, linear, 1, sse41);
MAKE_RESAMPLE_FUNC (gint32, cubic, 1, sse41);
#endif

View file

@ -0,0 +1,29 @@
/* GStreamer
* Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301, USA.
*/
#ifndef AUDIO_RESAMPLER_X86_SSE41_H
#define AUDIO_RESAMPLER_X86_SSE41_H
#include "audio-resampler-macros.h"
/* Resampler entry points built with SSE4.1 for S32 samples.
 * DECL_RESAMPLE_FUNC comes from audio-resampler-macros.h; presumably it
 * declares resample_gint32_<kind>_1_sse41 -- confirm against the macro. */
DECL_RESAMPLE_FUNC (gint32, full, 1, sse41);
DECL_RESAMPLE_FUNC (gint32, linear, 1, sse41);
DECL_RESAMPLE_FUNC (gint32, cubic, 1, sse41);
#endif /* AUDIO_RESAMPLER_X86_SSE41_H */

View file

@ -17,631 +17,16 @@
* Boston, MA 02110-1301, USA. * Boston, MA 02110-1301, USA.
*/ */
#if defined (HAVE_XMMINTRIN_H) && defined(__SSE__) #include "audio-resampler-macros.h"
#include <xmmintrin.h> #include "audio-resampler-x86-sse.h"
#include "audio-resampler-x86-sse2.h"
static inline void #include "audio-resampler-x86-sse41.h"
inner_product_gfloat_full_1_sse (gfloat * o, const gfloat * a,
const gfloat * b, gint len, const gfloat * icoeff, gint bstride)
{
gint i = 0;
__m128 sum = _mm_setzero_ps ();
for (; i < len; i += 8) {
sum =
_mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 0),
_mm_load_ps (b + i + 0)));
sum =
_mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 4),
_mm_load_ps (b + i + 4)));
}
sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum));
sum = _mm_add_ss (sum, _mm_shuffle_ps (sum, sum, 0x55));
_mm_store_ss (o, sum);
}
/* Linear-interpolated inner product for F32 samples.
 * Accumulates the dot product of `a` with two tap rows (`b` and
 * `b + bstride` bytes), blends them as s1 + (s0 - s1) * icoeff[0] and stores
 * the horizontal sum in *o. */
static inline void
inner_product_gfloat_linear_1_sse (gfloat * o, const gfloat * a,
const gfloat * b, gint len, const gfloat * icoeff, gint bstride)
{
gint i = 0;
__m128 sum[2], t;
const gfloat *c[2] = {(gfloat*)((gint8*)b + 0*bstride),
(gfloat*)((gint8*)b + 1*bstride)};
sum[0] = sum[1] = _mm_setzero_ps ();
/* 8 samples per iteration, no scalar tail: assumes len is a multiple of 8
 * or the buffers are padded -- TODO confirm with the caller.  `a` is read
 * unaligned, the tap rows with aligned loads. */
for (; i < len; i += 8) {
t = _mm_loadu_ps (a + i + 0);
sum[0] = _mm_add_ps (sum[0], _mm_mul_ps (t, _mm_load_ps (c[0] + i + 0)));
sum[1] = _mm_add_ps (sum[1], _mm_mul_ps (t, _mm_load_ps (c[1] + i + 0)));
t = _mm_loadu_ps (a + i + 4);
sum[0] = _mm_add_ps (sum[0], _mm_mul_ps (t, _mm_load_ps (c[0] + i + 4)));
sum[1] = _mm_add_ps (sum[1], _mm_mul_ps (t, _mm_load_ps (c[1] + i + 4)));
}
/* Only icoeff[0] is read here. */
sum[0] = _mm_mul_ps (_mm_sub_ps (sum[0], sum[1]), _mm_load1_ps (icoeff));
sum[0] = _mm_add_ps (sum[0], sum[1]);
/* Horizontal add of the four lanes into lane 0. */
sum[0] = _mm_add_ps (sum[0], _mm_movehl_ps (sum[0], sum[0]));
sum[0] = _mm_add_ss (sum[0], _mm_shuffle_ps (sum[0], sum[0], 0x55));
_mm_store_ss (o, sum[0]);
}
/* Cubic-interpolated inner product for F32 samples.
 * Accumulates the dot product of `a` with four tap rows
 * (`b + k * bstride` bytes, k = 0..3), weights row k with icoeff[k], and
 * stores the horizontal sum of the weighted rows in *o. */
static inline void
inner_product_gfloat_cubic_1_sse (gfloat * o, const gfloat * a,
const gfloat * b, gint len, const gfloat * icoeff, gint bstride)
{
gint i = 0;
__m128 sum[4];
__m128 t, f = _mm_loadu_ps(icoeff);
const gfloat *c[4] = {(gfloat*)((gint8*)b + 0*bstride),
(gfloat*)((gint8*)b + 1*bstride),
(gfloat*)((gint8*)b + 2*bstride),
(gfloat*)((gint8*)b + 3*bstride)};
sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_ps ();
/* 4 samples per iteration, no scalar tail: assumes len is a multiple of 4
 * or the buffers are padded -- TODO confirm with the caller. */
for (; i < len; i += 4) {
t = _mm_loadu_ps (a + i);
sum[0] = _mm_add_ps (sum[0], _mm_mul_ps (t, _mm_load_ps (c[0] + i)));
sum[1] = _mm_add_ps (sum[1], _mm_mul_ps (t, _mm_load_ps (c[1] + i)));
sum[2] = _mm_add_ps (sum[2], _mm_mul_ps (t, _mm_load_ps (c[2] + i)));
sum[3] = _mm_add_ps (sum[3], _mm_mul_ps (t, _mm_load_ps (c[3] + i)));
}
/* Shuffle masks 0x00/0x55/0xaa/0xff broadcast icoeff[0..3] respectively. */
sum[0] = _mm_mul_ps (sum[0], _mm_shuffle_ps (f, f, 0x00));
sum[1] = _mm_mul_ps (sum[1], _mm_shuffle_ps (f, f, 0x55));
sum[2] = _mm_mul_ps (sum[2], _mm_shuffle_ps (f, f, 0xaa));
sum[3] = _mm_mul_ps (sum[3], _mm_shuffle_ps (f, f, 0xff));
sum[0] = _mm_add_ps (sum[0], sum[1]);
sum[2] = _mm_add_ps (sum[2], sum[3]);
sum[0] = _mm_add_ps (sum[0], sum[2]);
/* Horizontal add of the four lanes into lane 0. */
sum[0] = _mm_add_ps (sum[0], _mm_movehl_ps (sum[0], sum[0]));
sum[0] = _mm_add_ss (sum[0], _mm_shuffle_ps (sum[0], sum[0], 0x55));
_mm_store_ss (o, sum[0]);
}
/* Instantiate the F32 resampler entry points for SSE.  MAKE_RESAMPLE_FUNC is
 * defined outside this file; presumably it wraps the matching
 * inner_product_gfloat_<kind>_1_sse function -- confirm against the macro. */
MAKE_RESAMPLE_FUNC (gfloat, full, 1, sse);
MAKE_RESAMPLE_FUNC (gfloat, linear, 1, sse);
MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, sse);
/* Blend two rows of F32 taps: o[i] = row0[i] * ic[0] + row1[i] * ic[1].
 * The rows start at `ap` and are `astride` bytes apart. */
static void
interpolate_gfloat_linear_sse (gpointer op, const gpointer ap,
gint len, const gpointer icp, gint astride)
{
gint i;
gfloat *o = op, *a = ap, *ic = icp;
__m128 f[2], t1, t2;
const gfloat *c[2] = {(gfloat*)((gint8*)a + 0*astride),
(gfloat*)((gint8*)a + 1*astride)};
f[0] = _mm_load1_ps (ic+0);
f[1] = _mm_load1_ps (ic+1);
/* 8 floats per iteration with aligned loads and stores; no scalar tail, so
 * len is assumed to be a multiple of 8 or the buffers padded -- TODO
 * confirm with the caller. */
for (i = 0; i < len; i += 8) {
t1 = _mm_mul_ps (_mm_load_ps (c[0] + i + 0), f[0]);
t2 = _mm_mul_ps (_mm_load_ps (c[1] + i + 0), f[1]);
_mm_store_ps (o + i + 0, _mm_add_ps (t1, t2));
t1 = _mm_mul_ps (_mm_load_ps (c[0] + i + 4), f[0]);
t2 = _mm_mul_ps (_mm_load_ps (c[1] + i + 4), f[1]);
_mm_store_ps (o + i + 4, _mm_add_ps (t1, t2));
}
}
/* Blend four rows of F32 taps: o[i] = sum over k of row_k[i] * ic[k],
 * k = 0..3, added as (r0 + r1) + (r2 + r3).  The rows start at `ap` and are
 * `astride` bytes apart. */
static void
interpolate_gfloat_cubic_sse (gpointer op, const gpointer ap,
gint len, const gpointer icp, gint astride)
{
gint i;
gfloat *o = op, *a = ap, *ic = icp;
__m128 f[4], t[4];
const gfloat *c[4] = {(gfloat*)((gint8*)a + 0*astride),
(gfloat*)((gint8*)a + 1*astride),
(gfloat*)((gint8*)a + 2*astride),
(gfloat*)((gint8*)a + 3*astride)};
f[0] = _mm_load1_ps (ic+0);
f[1] = _mm_load1_ps (ic+1);
f[2] = _mm_load1_ps (ic+2);
f[3] = _mm_load1_ps (ic+3);
/* 4 floats per iteration with aligned loads and stores; no scalar tail, so
 * len is assumed to be a multiple of 4 or the buffers padded -- TODO
 * confirm with the caller. */
for (i = 0; i < len; i += 4) {
t[0] = _mm_mul_ps (_mm_load_ps (c[0] + i + 0), f[0]);
t[1] = _mm_mul_ps (_mm_load_ps (c[1] + i + 0), f[1]);
t[2] = _mm_mul_ps (_mm_load_ps (c[2] + i + 0), f[2]);
t[3] = _mm_mul_ps (_mm_load_ps (c[3] + i + 0), f[3]);
t[0] = _mm_add_ps (t[0], t[1]);
t[2] = _mm_add_ps (t[2], t[3]);
_mm_store_ps (o + i + 0, _mm_add_ps (t[0], t[2]));
}
}
#endif
#if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__)
#include <emmintrin.h>
/* Inner product of `len` S16 samples (`a`) with one row of taps (`b`).
 * Products are accumulated in 32-bit lanes, summed horizontally, rounded,
 * shifted down by PRECISION_S16 and saturated to 16 bits before being
 * written to *o.  `icoeff` and `bstride` are not used by this variant. */
static inline void
inner_product_gint16_full_1_sse2 (gint16 * o, const gint16 * a,
const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
gint i;
__m128i sum, t;
sum = _mm_setzero_si128 ();
/* 16 samples per iteration, no scalar tail: assumes len is a multiple of
 * 16 or the buffers are padded -- TODO confirm with the caller.  `a` is
 * read unaligned, `b` with aligned loads. */
for (i = 0; i < len; i += 16) {
t = _mm_loadu_si128 ((__m128i *) (a + i));
sum = _mm_add_epi32 (sum, _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (b + i + 0))));
t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
sum = _mm_add_epi32 (sum, _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (b + i + 8))));
}
/* Horizontal add: lane 0 ends up holding the sum of all four lanes. */
sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, 3)));
sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1, 1)));
/* Round, scale down, then saturate to int16 via the pack. */
sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
sum = _mm_srai_epi32 (sum, PRECISION_S16);
sum = _mm_packs_epi32 (sum, sum);
*o = _mm_extract_epi16 (sum, 0);
}
/* Linear-interpolated inner product for S16 samples.
 *
 * Accumulates the dot product of `a` with two tap rows (`b` and
 * `b + bstride` bytes) in 32-bit lanes, scales each row down by
 * PRECISION_S16, weights row k with icoeff[k], sums everything horizontally,
 * rounds, scales down again and saturates to 16 bits before writing *o.
 *
 * Eight bytes (icoeff[0..3]) are fetched from `icoeff` although only the
 * first two coefficients are used.  `a` is read unaligned; the tap rows use
 * aligned loads.  The loop handles 16 samples per iteration with no scalar
 * tail -- assumes `len` is a multiple of 16 or the buffers are padded, TODO
 * confirm with the caller.
 */
static inline void
inner_product_gint16_linear_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i = 0;
  __m128i sum[2], t;
  /* _mm_loadl_epi64 loads the same low 64 bits (upper half zeroed) as the
   * former `_mm_set_epi64x (0, *((gint64*)icoeff))`, without reading a
   * gint16 array through a gint64 lvalue (strict-aliasing and alignment
   * hazard). */
  __m128i f = _mm_loadl_epi64 ((const __m128i *) icoeff);
  const gint16 *c[2] = { (gint16 *) ((gint8 *) b + 0 * bstride),
    (gint16 *) ((gint8 *) b + 1 * bstride)
  };

  sum[0] = sum[1] = _mm_setzero_si128 ();
  /* widen each coefficient to a 32-bit lane whose upper 16 bits are zero */
  f = _mm_unpacklo_epi16 (f, sum[0]);

  for (; i < len; i += 16) {
    t = _mm_loadu_si128 ((__m128i *) (a + i + 0));
    sum[0] = _mm_add_epi32 (sum[0],
        _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (c[0] + i + 0))));
    sum[1] = _mm_add_epi32 (sum[1],
        _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (c[1] + i + 0))));

    t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
    sum[0] = _mm_add_epi32 (sum[0],
        _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (c[0] + i + 8))));
    sum[1] = _mm_add_epi32 (sum[1],
        _mm_madd_epi16 (t, _mm_load_si128 ((__m128i *) (c[1] + i + 8))));
  }
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[1] = _mm_srai_epi32 (sum[1], PRECISION_S16);

  /* madd multiplies the low 16 bits of every lane by the coefficient; the
   * high half is multiplied by the zero inserted above. */
  sum[0] =
      _mm_madd_epi16 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
  sum[1] =
      _mm_madd_epi16 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
  sum[0] = _mm_add_epi32 (sum[0], sum[1]);

  /* horizontal add into lane 0 */
  sum[0] = _mm_add_epi32 (sum[0],
      _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, 3)));
  sum[0] = _mm_add_epi32 (sum[0],
      _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1, 1)));

  /* round, scale down, saturate to int16 */
  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
  *o = _mm_extract_epi16 (sum[0], 0);
}
/* Cubic-interpolated inner product for S16 samples.
 *
 * Accumulates the dot product of `a` with four tap rows
 * (`b + k * bstride` bytes, k = 0..3) in 32-bit lanes, transposes the four
 * accumulators so that lane k holds the total for row k, scales down by
 * PRECISION_S16, weights lane k with icoeff[k], sums horizontally, rounds,
 * scales down again and saturates to 16 bits before writing *o.
 *
 * `a` is read unaligned; the tap rows use aligned loads.  The loop handles
 * 8 samples per iteration with no scalar tail -- assumes `len` is a multiple
 * of 8 or the buffers are padded, TODO confirm with the caller.
 */
static inline void
inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i = 0;
  __m128i sum[4], t[4];
  /* _mm_loadl_epi64 loads the same low 64 bits (upper half zeroed) as the
   * former `_mm_set_epi64x (0, *((long long*)icoeff))`, without reading a
   * gint16 array through a 64-bit lvalue (strict-aliasing and alignment
   * hazard). */
  __m128i f = _mm_loadl_epi64 ((const __m128i *) icoeff);
  const gint16 *c[4] = { (gint16 *) ((gint8 *) b + 0 * bstride),
    (gint16 *) ((gint8 *) b + 1 * bstride),
    (gint16 *) ((gint8 *) b + 2 * bstride),
    (gint16 *) ((gint8 *) b + 3 * bstride)
  };

  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();
  /* widen each coefficient to a 32-bit lane whose upper 16 bits are zero */
  f = _mm_unpacklo_epi16 (f, sum[0]);

  for (; i < len; i += 8) {
    t[0] = _mm_loadu_si128 ((__m128i *) (a + i));
    sum[0] = _mm_add_epi32 (sum[0],
        _mm_madd_epi16 (t[0], _mm_load_si128 ((__m128i *) (c[0] + i))));
    sum[1] = _mm_add_epi32 (sum[1],
        _mm_madd_epi16 (t[0], _mm_load_si128 ((__m128i *) (c[1] + i))));
    sum[2] = _mm_add_epi32 (sum[2],
        _mm_madd_epi16 (t[0], _mm_load_si128 ((__m128i *) (c[2] + i))));
    sum[3] = _mm_add_epi32 (sum[3],
        _mm_madd_epi16 (t[0], _mm_load_si128 ((__m128i *) (c[3] + i))));
  }
  /* 4x4 transpose-and-add: afterwards lane k of sum[0] is the total of the
   * four lanes of the original sum[k]. */
  t[0] = _mm_unpacklo_epi32 (sum[0], sum[1]);
  t[1] = _mm_unpacklo_epi32 (sum[2], sum[3]);
  t[2] = _mm_unpackhi_epi32 (sum[0], sum[1]);
  t[3] = _mm_unpackhi_epi32 (sum[2], sum[3]);

  sum[0] = _mm_add_epi32 (_mm_unpacklo_epi64 (t[0], t[1]),
      _mm_unpackhi_epi64 (t[0], t[1]));
  sum[2] = _mm_add_epi32 (_mm_unpacklo_epi64 (t[2], t[3]),
      _mm_unpackhi_epi64 (t[2], t[3]));
  sum[0] = _mm_add_epi32 (sum[0], sum[2]);

  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  /* lane k * icoeff[k]; the high 16 bits meet the zero inserted above */
  sum[0] = _mm_madd_epi16 (sum[0], f);

  /* horizontal add into lane 0 */
  sum[0] = _mm_add_epi32 (sum[0],
      _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, 3)));
  sum[0] = _mm_add_epi32 (sum[0],
      _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1, 1)));

  /* round, scale down, saturate to int16 */
  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
  *o = _mm_extract_epi16 (sum[0], 0);
}
/* Inner product of `len` F64 samples (`a`) with one row of taps (`b`); the
 * two accumulator lanes are added and the result stored in *o.
 * `icoeff` and `bstride` are not used by this variant. */
static inline void
inner_product_gdouble_full_1_sse2 (gdouble * o, const gdouble * a,
const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
gint i = 0;
__m128d sum = _mm_setzero_pd ();
/* 8 doubles per iteration (four 2-wide steps), no scalar tail: assumes len
 * is a multiple of 8 or the buffers are padded -- TODO confirm with the
 * caller.  `a` is read unaligned, `b` with aligned loads. */
for (; i < len; i += 8) {
sum =
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0),
_mm_load_pd (b + i + 0)));
sum =
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2),
_mm_load_pd (b + i + 2)));
sum =
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 4),
_mm_load_pd (b + i + 4)));
sum =
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6),
_mm_load_pd (b + i + 6)));
}
/* Add the high lane onto the low lane and store the scalar. */
sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum));
_mm_store_sd (o, sum);
}
/* Linear-interpolated inner product for F64 samples.
 * Accumulates the dot product of `a` with two tap rows (`b` and
 * `b + bstride` bytes), blends them as s1 + (s0 - s1) * icoeff[0] and stores
 * the sum of both lanes in *o. */
static inline void
inner_product_gdouble_linear_1_sse2 (gdouble * o, const gdouble * a,
const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
gint i = 0;
__m128d sum[2], t;
const gdouble *c[2] = {(gdouble*)((gint8*)b + 0*bstride),
(gdouble*)((gint8*)b + 1*bstride)};
sum[0] = sum[1] = _mm_setzero_pd ();
/* 4 doubles per iteration, no scalar tail: assumes len is a multiple of 4
 * or the buffers are padded -- TODO confirm with the caller. */
for (; i < len; i += 4) {
t = _mm_loadu_pd (a + i + 0);
sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 0)));
sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 0)));
t = _mm_loadu_pd (a + i + 2);
sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 2)));
sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 2)));
}
/* Only icoeff[0] is read here. */
sum[0] = _mm_mul_pd (_mm_sub_pd (sum[0], sum[1]), _mm_load1_pd (icoeff));
sum[0] = _mm_add_pd (sum[0], sum[1]);
sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
_mm_store_sd (o, sum[0]);
}
/* Cubic-interpolated inner product for F64 samples.
 * Accumulates the dot product of `a` with four tap rows
 * (`b + k * bstride` bytes, k = 0..3), weights row k with icoeff[k], and
 * stores the sum of both lanes of the combined result in *o. */
static inline void
inner_product_gdouble_cubic_1_sse2 (gdouble * o, const gdouble * a,
const gdouble * b, gint len, const gdouble * icoeff, gint bstride)
{
gint i;
__m128d f[2], sum[4], t;
const gdouble *c[4] = {(gdouble*)((gint8*)b + 0*bstride),
(gdouble*)((gint8*)b + 1*bstride),
(gdouble*)((gint8*)b + 2*bstride),
(gdouble*)((gint8*)b + 3*bstride)};
/* f[0] holds icoeff[0..1], f[1] holds icoeff[2..3] (unaligned loads). */
f[0] = _mm_loadu_pd (icoeff + 0);
f[1] = _mm_loadu_pd (icoeff + 2);
sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_pd ();
/* 2 doubles per iteration, no scalar tail: assumes len is even or the
 * buffers are padded -- TODO confirm with the caller. */
for (i = 0; i < len; i += 2) {
t = _mm_loadu_pd (a + i + 0);
sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i)));
sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i)));
sum[2] = _mm_add_pd (sum[2], _mm_mul_pd (t, _mm_load_pd (c[2] + i)));
sum[3] = _mm_add_pd (sum[3], _mm_mul_pd (t, _mm_load_pd (c[3] + i)));
}
/* _MM_SHUFFLE2 (0, 0) broadcasts the low element, (1, 1) the high one. */
sum[0] = _mm_mul_pd (sum[0], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (0, 0)));
sum[1] = _mm_mul_pd (sum[1], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (1, 1)));
sum[2] = _mm_mul_pd (sum[2], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (0, 0)));
sum[3] = _mm_mul_pd (sum[3], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (1, 1)));
sum[0] = _mm_add_pd (sum[0], sum[1]);
sum[2] = _mm_add_pd (sum[2], sum[3]);
sum[0] = _mm_add_pd (sum[0], sum[2]);
sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0]));
_mm_store_sd (o, sum[0]);
}
/* Instantiate the S16 and F64 resampler entry points for SSE2.
 * MAKE_RESAMPLE_FUNC is defined outside this file; presumably it wraps the
 * matching inner_product_<type>_<kind>_1_sse2 function -- confirm against
 * the macro. */
MAKE_RESAMPLE_FUNC (gint16, full, 1, sse2);
MAKE_RESAMPLE_FUNC (gint16, linear, 1, sse2);
MAKE_RESAMPLE_FUNC (gint16, cubic, 1, sse2);
MAKE_RESAMPLE_FUNC (gdouble, full, 1, sse2);
MAKE_RESAMPLE_FUNC (gdouble, linear, 1, sse2);
MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, sse2);
/* Blend two rows of S16 taps into one output row:
 *   o[i] = sat16 ((row0[i] * ic[0] + row1[i] * ic[1] + round) >> PRECISION_S16)
 * The rows start at `ap` and are `astride` bytes apart.
 *
 * Eight bytes (ic[0..3]) are fetched from `icp` although only ic[0] and
 * ic[1] survive the unpacks.  Rows and output use aligned 16-byte accesses.
 * The loop handles 8 samples per iteration with no scalar tail -- assumes
 * `len` is a multiple of 8 or the buffers are padded, TODO confirm with the
 * caller.
 */
static inline void
interpolate_gint16_linear_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i = 0;
  gint16 *o = op, *a = ap, *ic = icp;
  __m128i ta, tb, t1, t2;
  /* _mm_loadl_epi64 loads the same low 64 bits (upper half zeroed) as the
   * former `_mm_set_epi64x (0, *((gint64*)ic))`, without reading a gint16
   * array through a gint64 lvalue (strict-aliasing and alignment hazard). */
  __m128i f = _mm_loadl_epi64 ((const __m128i *) ic);
  const gint16 *c[2] = { (gint16 *) ((gint8 *) a + 0 * astride),
    (gint16 *) ((gint8 *) a + 1 * astride)
  };

  /* replicate the (ic[0], ic[1]) pair into all four 32-bit lanes */
  f = _mm_unpacklo_epi32 (f, f);
  f = _mm_unpacklo_epi64 (f, f);

  for (; i < len; i += 8) {
    ta = _mm_load_si128 ((__m128i *) (c[0] + i));
    tb = _mm_load_si128 ((__m128i *) (c[1] + i));

    /* interleave the rows so madd yields row0 * ic[0] + row1 * ic[1] */
    t1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f);
    t2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f);

    t1 = _mm_add_epi32 (t1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
    t2 = _mm_add_epi32 (t2, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));

    t1 = _mm_srai_epi32 (t1, PRECISION_S16);
    t2 = _mm_srai_epi32 (t2, PRECISION_S16);

    /* saturating pack back to eight int16 values */
    t1 = _mm_packs_epi32 (t1, t2);
    _mm_store_si128 ((__m128i *) (o + i), t1);
  }
}
/* Blend four rows of S16 taps into one output row:
 *   o[i] = sat16 ((sum over k of row_k[i] * ic[k] + round) >> PRECISION_S16)
 * for k = 0..3.  The rows start at `ap` and are `astride` bytes apart. */
static inline void
interpolate_gint16_cubic_sse2 (gpointer op, const gpointer ap,
gint len, const gpointer icp, gint astride)
{
gint i = 0;
gint16 *o = op, *a = ap, *ic = icp;
__m128i ta, tb, tl1, tl2, th1, th2;
__m128i f[2];
const gint16 *c[4] = {(gint16*)((gint8*)a + 0*astride),
(gint16*)((gint8*)a + 1*astride),
(gint16*)((gint8*)a + 2*astride),
(gint16*)((gint8*)a + 3*astride)};
/* _mm_set_epi16 takes its arguments from the highest lane down, so these
 * vectors repeat the pairs (ic[0], ic[1]) and (ic[2], ic[3]). */
f[0] = _mm_set_epi16 (ic[1], ic[0], ic[1], ic[0], ic[1], ic[0], ic[1], ic[0]);
f[1] = _mm_set_epi16 (ic[3], ic[2], ic[3], ic[2], ic[3], ic[2], ic[3], ic[2]);
/* 8 samples per iteration with aligned loads and stores; no scalar tail, so
 * len is assumed to be a multiple of 8 or the buffers padded -- TODO
 * confirm with the caller. */
for (; i < len; i += 8) {
ta = _mm_load_si128 ((__m128i *) (c[0] + i));
tb = _mm_load_si128 ((__m128i *) (c[1] + i));
tl1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[0]);
th1 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[0]);
ta = _mm_load_si128 ((__m128i *) (c[2] + i));
tb = _mm_load_si128 ((__m128i *) (c[3] + i));
tl2 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[1]);
th2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[1]);
tl1 = _mm_add_epi32 (tl1, tl2);
th1 = _mm_add_epi32 (th1, th2);
tl1 = _mm_add_epi32 (tl1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
th1 = _mm_add_epi32 (th1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
tl1 = _mm_srai_epi32 (tl1, PRECISION_S16);
th1 = _mm_srai_epi32 (th1, PRECISION_S16);
/* Saturating pack back to eight int16 values. */
tl1 = _mm_packs_epi32 (tl1, th1);
_mm_store_si128 ((__m128i *) (o + i), tl1);
}
}
/* Blend two rows of F64 taps: o[i] = row0[i] * ic[0] + row1[i] * ic[1].
 * The rows start at `ap` and are `astride` bytes apart. */
static void
interpolate_gdouble_linear_sse2 (gpointer op, const gpointer ap,
gint len, const gpointer icp, gint astride)
{
gint i;
gdouble *o = op, *a = ap, *ic = icp;
__m128d f[2], t1, t2;
const gdouble *c[2] = {(gdouble*)((gint8*)a + 0*astride),
(gdouble*)((gint8*)a + 1*astride)};
f[0] = _mm_load1_pd (ic+0);
f[1] = _mm_load1_pd (ic+1);
/* 4 doubles per iteration with aligned loads and stores; no scalar tail, so
 * len is assumed to be a multiple of 4 or the buffers padded -- TODO
 * confirm with the caller. */
for (i = 0; i < len; i += 4) {
t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]);
t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]);
_mm_store_pd (o + i + 0, _mm_add_pd (t1, t2));
t1 = _mm_mul_pd (_mm_load_pd (c[0] + i + 2), f[0]);
t2 = _mm_mul_pd (_mm_load_pd (c[1] + i + 2), f[1]);
_mm_store_pd (o + i + 2, _mm_add_pd (t1, t2));
}
}
/* Blend four rows of F64 taps: o[i] = sum over k of row_k[i] * ic[k],
 * k = 0..3, added as (r0 + r1) + (r2 + r3).  The rows start at `ap` and are
 * `astride` bytes apart. */
static void
interpolate_gdouble_cubic_sse2 (gpointer op, const gpointer ap,
gint len, const gpointer icp, gint astride)
{
gint i;
gdouble *o = op, *a = ap, *ic = icp;
__m128d f[4], t[4];
const gdouble *c[4] = {(gdouble*)((gint8*)a + 0*astride),
(gdouble*)((gint8*)a + 1*astride),
(gdouble*)((gint8*)a + 2*astride),
(gdouble*)((gint8*)a + 3*astride)};
f[0] = _mm_load1_pd (ic+0);
f[1] = _mm_load1_pd (ic+1);
f[2] = _mm_load1_pd (ic+2);
f[3] = _mm_load1_pd (ic+3);
/* 2 doubles per iteration with aligned loads and stores; no scalar tail, so
 * len is assumed to be even or the buffers padded -- TODO confirm with the
 * caller. */
for (i = 0; i < len; i += 2) {
t[0] = _mm_mul_pd (_mm_load_pd (c[0] + i + 0), f[0]);
t[1] = _mm_mul_pd (_mm_load_pd (c[1] + i + 0), f[1]);
t[2] = _mm_mul_pd (_mm_load_pd (c[2] + i + 0), f[2]);
t[3] = _mm_mul_pd (_mm_load_pd (c[3] + i + 0), f[3]);
t[0] = _mm_add_pd (t[0], t[1]);
t[2] = _mm_add_pd (t[2], t[3]);
_mm_store_pd (o + i + 0, _mm_add_pd (t[0], t[2]));
}
}
#endif
#if 0
#define __SSE4_1__
#pragma GCC target("sse4.1")
#endif
#if defined (HAVE_SMMINTRIN_H) && defined(__SSE4_1__)
#include <smmintrin.h>
/* Inner product of `len` S32 samples (`a`) with one row of filter taps (`b`).
 *
 * The 64-bit products are accumulated in two lanes, folded together, rounded
 * and shifted down by PRECISION_S32, clamped to the gint32 range and written
 * to *o.  `icoeff` and `bstride` are not used by this variant.
 *
 * `a` is read unaligned; `b` uses aligned loads and must be 16-byte aligned.
 * The loop consumes 8 samples per iteration with no scalar tail, so it
 * assumes `len` is a multiple of 8 or that both buffers are padded -- TODO
 * confirm with the caller.
 */
static inline void
inner_product_gint32_full_1_sse41 (gint32 * o, const gint32 * a,
    const gint32 * b, gint len, const gint32 * icoeff, gint bstride)
{
  gint i = 0;
  __m128i sum, ta, tb;
  gint64 res;

  sum = _mm_setzero_si128 ();

  for (; i < len; i += 8) {
    /* _mm_mul_epi32 multiplies the low signed 32 bits of each 64-bit lane,
     * so duplicating every element with unpacklo/unpackhi feeds it elements
     * 0,1 and then 2,3 of the loaded vector. */
    ta = _mm_loadu_si128 ((__m128i *) (a + i));
    tb = _mm_load_si128 ((__m128i *) (b + i));
    sum =
        _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum =
        _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));

    ta = _mm_loadu_si128 ((__m128i *) (a + i + 4));
    tb = _mm_load_si128 ((__m128i *) (b + i + 4));
    sum =
        _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum =
        _mm_add_epi64 (sum, _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));
  }
  /* fold the high 64-bit lane onto the low one and extract it */
  sum = _mm_add_epi64 (sum, _mm_unpackhi_epi64 (sum, sum));
  res = _mm_cvtsi128_si64 (sum);

  res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
  /* Build the bounds in 64 bits: `1L << 31` overflows wherever long is only
   * 32 bits wide (32-bit x86, 64-bit Windows). */
  *o = CLAMP (res, -((gint64) 1 << 31), ((gint64) 1 << 31) - 1);
}
/* Linear-interpolated inner product for S32 samples.
 *
 * Accumulates the dot product of `a` with two tap rows (`b` and
 * `b + bstride` bytes), scales each row sum down by PRECISION_S32, weights
 * row k with icoeff[k], adds the results, rounds, shifts down by
 * PRECISION_S32 again, clamps to the gint32 range and stores into *o.
 *
 * `icoeff` is fetched as a full 16-byte vector, so four gint32 must be
 * readable even though only two are used.  The loop handles 4 samples per
 * iteration with no scalar tail -- assumes `len` is a multiple of 4 or the
 * buffers are padded, TODO confirm with the caller.
 */
static inline void
inner_product_gint32_linear_1_sse41 (gint32 * o, const gint32 * a,
    const gint32 * b, gint len, const gint32 * icoeff, gint bstride)
{
  gint i = 0;
  gint64 res;
  __m128i sum[2], ta, tb;
  __m128i f = _mm_loadu_si128 ((__m128i *) icoeff);
  const gint32 *c[2] = { (gint32 *) ((gint8 *) b + 0 * bstride),
    (gint32 *) ((gint8 *) b + 1 * bstride)
  };

  sum[0] = sum[1] = _mm_setzero_si128 ();

  for (; i < len; i += 4) {
    ta = _mm_loadu_si128 ((__m128i *) (a + i));

    tb = _mm_load_si128 ((__m128i *) (c[0] + i));
    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));

    tb = _mm_load_si128 ((__m128i *) (c[1] + i));
    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));
  }
  /* The shift is logical, but _mm_mul_epi32 below only consumes the low
   * 32 bits of each lane and reads them as signed. */
  sum[0] = _mm_srli_epi64 (sum[0], PRECISION_S32);
  sum[1] = _mm_srli_epi64 (sum[1], PRECISION_S32);
  sum[0] =
      _mm_mul_epi32 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
  sum[1] =
      _mm_mul_epi32 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
  sum[0] = _mm_add_epi64 (sum[0], sum[1]);
  /* fold the high 64-bit lane onto the low one and extract it */
  sum[0] = _mm_add_epi64 (sum[0], _mm_unpackhi_epi64 (sum[0], sum[0]));
  res = _mm_cvtsi128_si64 (sum[0]);

  res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
  /* Build the bounds in 64 bits: `1L << 31` overflows wherever long is only
   * 32 bits wide (32-bit x86, 64-bit Windows). */
  *o = CLAMP (res, -((gint64) 1 << 31), ((gint64) 1 << 31) - 1);
}
/* Cubic-interpolated inner product for S32 samples.
 *
 * Accumulates the dot product of `a` with four tap rows
 * (`b + k * bstride` bytes, k = 0..3), scales each row sum down by
 * PRECISION_S32, weights row k with icoeff[k], adds the four results,
 * rounds, shifts down by PRECISION_S32 again, clamps to the gint32 range and
 * stores into *o.
 *
 * The loop handles 4 samples per iteration with no scalar tail -- assumes
 * `len` is a multiple of 4 or the buffers are padded, TODO confirm with the
 * caller.
 */
static inline void
inner_product_gint32_cubic_1_sse41 (gint32 * o, const gint32 * a,
    const gint32 * b, gint len, const gint32 * icoeff, gint bstride)
{
  gint i = 0;
  gint64 res;
  __m128i sum[4], ta, tb;
  __m128i f = _mm_loadu_si128 ((__m128i *) icoeff);
  const gint32 *c[4] = { (gint32 *) ((gint8 *) b + 0 * bstride),
    (gint32 *) ((gint8 *) b + 1 * bstride),
    (gint32 *) ((gint8 *) b + 2 * bstride),
    (gint32 *) ((gint8 *) b + 3 * bstride)
  };

  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();

  for (; i < len; i += 4) {
    ta = _mm_loadu_si128 ((__m128i *) (a + i));

    tb = _mm_load_si128 ((__m128i *) (c[0] + i));
    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum[0] = _mm_add_epi64 (sum[0], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));

    tb = _mm_load_si128 ((__m128i *) (c[1] + i));
    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum[1] = _mm_add_epi64 (sum[1], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));

    tb = _mm_load_si128 ((__m128i *) (c[2] + i));
    sum[2] = _mm_add_epi64 (sum[2], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum[2] = _mm_add_epi64 (sum[2], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));

    tb = _mm_load_si128 ((__m128i *) (c[3] + i));
    sum[3] = _mm_add_epi64 (sum[3], _mm_mul_epi32 (_mm_unpacklo_epi32 (ta, ta),
            _mm_unpacklo_epi32 (tb, tb)));
    sum[3] = _mm_add_epi64 (sum[3], _mm_mul_epi32 (_mm_unpackhi_epi32 (ta, ta),
            _mm_unpackhi_epi32 (tb, tb)));
  }
  /* The shift is logical, but _mm_mul_epi32 below only consumes the low
   * 32 bits of each lane and reads them as signed. */
  sum[0] = _mm_srli_epi64 (sum[0], PRECISION_S32);
  sum[1] = _mm_srli_epi64 (sum[1], PRECISION_S32);
  sum[2] = _mm_srli_epi64 (sum[2], PRECISION_S32);
  sum[3] = _mm_srli_epi64 (sum[3], PRECISION_S32);
  sum[0] =
      _mm_mul_epi32 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
  sum[1] =
      _mm_mul_epi32 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
  sum[2] =
      _mm_mul_epi32 (sum[2], _mm_shuffle_epi32 (f, _MM_SHUFFLE (2, 2, 2, 2)));
  sum[3] =
      _mm_mul_epi32 (sum[3], _mm_shuffle_epi32 (f, _MM_SHUFFLE (3, 3, 3, 3)));
  sum[0] = _mm_add_epi64 (sum[0], sum[1]);
  sum[2] = _mm_add_epi64 (sum[2], sum[3]);
  sum[0] = _mm_add_epi64 (sum[0], sum[2]);
  /* fold the high 64-bit lane onto the low one and extract it */
  sum[0] = _mm_add_epi64 (sum[0], _mm_unpackhi_epi64 (sum[0], sum[0]));
  res = _mm_cvtsi128_si64 (sum[0]);

  res = (res + (1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
  /* Build the bounds in 64 bits: `1L << 31` overflows wherever long is only
   * 32 bits wide (32-bit x86, 64-bit Windows). */
  *o = CLAMP (res, -((gint64) 1 << 31), ((gint64) 1 << 31) - 1);
}
/* Instantiate the S32 resampler entry points for SSE4.1.  MAKE_RESAMPLE_FUNC
 * is defined outside this file; presumably it wraps the matching
 * inner_product_gint32_<kind>_1_sse41 function -- confirm against the
 * macro. */
MAKE_RESAMPLE_FUNC (gint32, full, 1, sse41);
MAKE_RESAMPLE_FUNC (gint32, linear, 1, sse41);
MAKE_RESAMPLE_FUNC (gint32, cubic, 1, sse41);
#endif
static void static void
audio_resampler_check_x86 (const gchar *option) audio_resampler_check_x86 (const gchar *option)
{ {
if (!strcmp (option, "sse")) { if (!strcmp (option, "sse")) {
#if defined (HAVE_XMMINTRIN_H) && defined(__SSE__) #if defined (HAVE_XMMINTRIN_H) && HAVE_SSE
GST_DEBUG ("enable SSE optimisations"); GST_DEBUG ("enable SSE optimisations");
resample_gfloat_full_1 = resample_gfloat_full_1_sse; resample_gfloat_full_1 = resample_gfloat_full_1_sse;
resample_gfloat_linear_1 = resample_gfloat_linear_1_sse; resample_gfloat_linear_1 = resample_gfloat_linear_1_sse;
@ -653,7 +38,7 @@ audio_resampler_check_x86 (const gchar *option)
GST_DEBUG ("SSE optimisations not enabled"); GST_DEBUG ("SSE optimisations not enabled");
#endif #endif
} else if (!strcmp (option, "sse2")) { } else if (!strcmp (option, "sse2")) {
#if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__) #if defined (HAVE_EMMINTRIN_H) && HAVE_SSE2
GST_DEBUG ("enable SSE2 optimisations"); GST_DEBUG ("enable SSE2 optimisations");
resample_gint16_full_1 = resample_gint16_full_1_sse2; resample_gint16_full_1 = resample_gint16_full_1_sse2;
resample_gint16_linear_1 = resample_gint16_linear_1_sse2; resample_gint16_linear_1 = resample_gint16_linear_1_sse2;
@ -672,7 +57,7 @@ audio_resampler_check_x86 (const gchar *option)
GST_DEBUG ("SSE2 optimisations not enabled"); GST_DEBUG ("SSE2 optimisations not enabled");
#endif #endif
} else if (!strcmp (option, "sse41")) { } else if (!strcmp (option, "sse41")) {
#if defined (HAVE_SMMINTRIN_H) && defined(__SSE4_1__) #if defined (HAVE_SMMINTRIN_H) && defined (HAVE_EMMINTRIN_H) && HAVE_SSE41
GST_DEBUG ("enable SSE41 optimisations"); GST_DEBUG ("enable SSE41 optimisations");
resample_gint32_full_1 = resample_gint32_full_1_sse41; resample_gint32_full_1 = resample_gint32_full_1_sse41;
resample_gint32_linear_1 = resample_gint32_linear_1_sse41; resample_gint32_linear_1 = resample_gint32_linear_1_sse41;

View file

@ -30,99 +30,13 @@
#endif #endif
#include "audio-resampler.h" #include "audio-resampler.h"
#include "audio-resampler-private.h"
/* Contains a collection of all things found in other resamplers: #include "audio-resampler-macros.h"
* speex (filter construction, optimizations), ffmpeg (fixed phase filter, blackman filter),
* SRC (linear interpolation, fixed precomputed tables),...
*
* Supports:
* - S16, S32, F32 and F64 formats
* - nearest, linear and cubic interpolation
* - sinc based interpolation with kaiser or blackman-nuttall windows
* - fully configurable kaiser parameters
* - dynamic linear or cubic interpolation of filter table, this can
* use less memory but more CPU
* - full filter table, generated from optionally linear or cubic
* interpolation of filter table
* - fixed filter table size with nearest neighbour phase, optionally
* using precomputed tables
* - dynamic samplerate changes
* - x86 and neon optimizations
*/
/* Function-pointer types for the per-format hooks stored in
 * struct _GstAudioResampler (convert_taps, interpolate, resample and
 * deinterleave members). */

/* Same signature as the convert_taps_<type>_c functions generated by
 * MAKE_CONVERT_TAPS_INT_FUNC: takes n_taps double-precision taps in tmp_taps
 * plus a weight and fills the format-specific buffer `taps`.
 * NOTE(review): exact scaling/rounding depends on the implementation. */
typedef void (*ConvertTapsFunc) (gdouble * tmp_taps, gpointer taps,
gdouble weight, gint n_taps);
/* NOTE(review): presumably combines `len` entries read from `a` (rows
 * `astride` apart) using the coefficients in `icoeff` and writes them to `o`;
 * confirm against the interpolate implementations. */
typedef void (*InterpolateFunc) (gpointer o, const gpointer a, gint len,
const gpointer icoeff, gint astride);
/* Same signature as the resample_<type>_<inter>_<channels>_<arch> functions
 * generated by MAKE_RESAMPLE_FUNC: reads in[] (in_len samples per block),
 * writes out_len samples per block to out[] and stores the number of input
 * samples used in *consumed. */
typedef void (*ResampleFunc) (GstAudioResampler * resampler, gpointer in[],
gsize in_len, gpointer out[], gsize out_len, gsize * consumed);
/* NOTE(review): by its name and arguments this moves in_frames frames from
 * in[] into the per-channel buffers sbuf[]; confirm against the
 * implementations. */
typedef void (*DeinterleaveFunc) (GstAudioResampler * resampler,
gpointer * sbuf, gpointer in[], gsize in_frames);
#define MEM_ALIGN(m,a) ((gint8 *)((guintptr)((gint8 *)(m) + ((a)-1)) & ~((a)-1))) #define MEM_ALIGN(m,a) ((gint8 *)((guintptr)((gint8 *)(m) + ((a)-1)) & ~((a)-1)))
#define ALIGN 16 #define ALIGN 16
#define TAPS_OVERREAD 16 #define TAPS_OVERREAD 16
/* Private state of one resampler instance.
 *
 * Field notes below describe only what the resample and get_taps macros in
 * this file show; anything marked NOTE(review) is an assumption to confirm. */
struct _GstAudioResampler
{
/* configuration */
GstAudioResamplerMethod method;
GstAudioResamplerFlags flags;
GstAudioFormat format;
GstStructure *options;
gint format_index;
gint channels;
gint in_rate;
/* read by the get_taps_<type>_* helpers when advancing the sample position */
gint out_rate;
gint bps;
/* distance, in samples, between consecutive output samples of one block:
 * when 1 each block writes to its own out[c], otherwise block c writes to
 * out[0] starting at offset c */
gint ostride;

/* filter construction parameters */
GstAudioResamplerFilterMode filter_mode;
guint filter_threshold;
GstAudioResamplerFilterInterpolation filter_interpolation;
gdouble cutoff;
/* NOTE(review): presumably the beta handed to get_kaiser_tap() -- confirm */
gdouble kaiser_beta;
/* for cubic */
gdouble b, c;

/* temp taps */
gpointer tmp_taps;

/* oversampled main filter table */
gint oversample;
/* tap count handed to the inner_product_* functions */
gint n_taps;
/* NOTE(review): taps is presumably the MEM_ALIGN'ed pointer into taps_mem */
gpointer taps;
gpointer taps_mem;
/* handed to the inner_product_* functions as the taps stride */
gsize taps_stride;
gint n_phases;
gint alloc_taps;
gint alloc_phases;

/* cached taps */
gpointer *cached_phases;
gpointer cached_taps;
gpointer cached_taps_mem;
gsize cached_taps_stride;

/* per-format function hooks, see the typedefs above */
ConvertTapsFunc convert_taps;
InterpolateFunc interpolate;
DeinterleaveFunc deinterleave;
ResampleFunc resample;

/* number of in[]/out[] entries a ResampleFunc iterates over */
gint blocks;
gint inc;
gint samp_inc;
gint samp_frac;
/* input sample index at which the next resample call starts; the generated
 * resample functions reset it to 0 after moving unconsumed input to the front */
gint samp_index;
/* filter phase carried over from one resample call to the next */
gint samp_phase;
gint skip;

/* NOTE(review): presumably the internal sample history and its size/fill
 * level, with sbuf holding one pointer per block -- confirm */
gpointer samples;
gsize samples_len;
gsize samples_avail;
gpointer *sbuf;
};
GST_DEBUG_CATEGORY_STATIC (audio_resampler_debug); GST_DEBUG_CATEGORY_STATIC (audio_resampler_debug);
#define GST_CAT_DEFAULT audio_resampler_debug #define GST_CAT_DEFAULT audio_resampler_debug
@ -303,9 +217,6 @@ get_kaiser_tap (gdouble x, gint n_taps, gdouble Fc, gdouble beta)
return s * bessel (beta * sqrt (MAX (1 - w * w, 0))); return s * bessel (beta * sqrt (MAX (1 - w * w, 0)));
} }
#define PRECISION_S16 15
#define PRECISION_S32 31
#define MAKE_CONVERT_TAPS_INT_FUNC(type, precision) \ #define MAKE_CONVERT_TAPS_INT_FUNC(type, precision) \
static void \ static void \
convert_taps_##type##_c (gdouble *tmp_taps, gpointer taps, \ convert_taps_##type##_c (gdouble *tmp_taps, gpointer taps, \
@ -593,9 +504,7 @@ GET_TAPS_NEAREST_FUNC (gdouble);
#define get_taps_gdouble_nearest get_taps_gdouble_nearest #define get_taps_gdouble_nearest get_taps_gdouble_nearest
#define GET_TAPS_FULL_FUNC(type) \ #define GET_TAPS_FULL_FUNC(type) \
static inline gpointer \ DECL_GET_TAPS_FULL_FUNC(type) \
get_taps_##type##_full (GstAudioResampler * resampler, \
gint *samp_index, gint *samp_phase, type icoeff[4]) \
{ \ { \
gpointer res; \ gpointer res; \
gint out_rate = resampler->out_rate; \ gint out_rate = resampler->out_rate; \
@ -659,9 +568,7 @@ GET_TAPS_FULL_FUNC (gfloat);
GET_TAPS_FULL_FUNC (gdouble); GET_TAPS_FULL_FUNC (gdouble);
#define GET_TAPS_INTERPOLATE_FUNC(type,inter) \ #define GET_TAPS_INTERPOLATE_FUNC(type,inter) \
static inline gpointer \ DECL_GET_TAPS_INTERPOLATE_FUNC (type, inter) \
get_taps_##type##_##inter (GstAudioResampler * resampler, \
gint *samp_index, gint *samp_phase, type icoeff[4]) \
{ \ { \
gpointer res; \ gpointer res; \
gint out_rate = resampler->out_rate; \ gint out_rate = resampler->out_rate; \
@ -852,67 +759,25 @@ inner_product_##type##_cubic_1_c (type * o, const type * a, \
INNER_PRODUCT_FLOAT_CUBIC_FUNC (gfloat); INNER_PRODUCT_FLOAT_CUBIC_FUNC (gfloat);
INNER_PRODUCT_FLOAT_CUBIC_FUNC (gdouble); INNER_PRODUCT_FLOAT_CUBIC_FUNC (gdouble);
#define MAKE_RESAMPLE_FUNC(type,inter,channels,arch) \ MAKE_RESAMPLE_FUNC_STATIC (gint16, nearest, 1, c);
static void \ MAKE_RESAMPLE_FUNC_STATIC (gint32, nearest, 1, c);
resample_ ##type## _ ##inter## _ ##channels## _ ##arch (GstAudioResampler * resampler, \ MAKE_RESAMPLE_FUNC_STATIC (gfloat, nearest, 1, c);
gpointer in[], gsize in_len, gpointer out[], gsize out_len, \ MAKE_RESAMPLE_FUNC_STATIC (gdouble, nearest, 1, c);
gsize * consumed) \
{ \
gint c, di = 0; \
gint n_taps = resampler->n_taps; \
gint blocks = resampler->blocks; \
gint ostride = resampler->ostride; \
gint taps_stride = resampler->taps_stride; \
gint samp_index = 0; \
gint samp_phase = 0; \
\
for (c = 0; c < blocks; c++) { \
type *ip = in[c]; \
type *op = ostride == 1 ? out[c] : (type *)out[0] + c; \
\
samp_index = resampler->samp_index; \
samp_phase = resampler->samp_phase; \
\
for (di = 0; di < out_len; di++) { \
type *ipp, icoeff[4], *taps; \
\
ipp = &ip[samp_index * channels]; \
\
taps = get_taps_ ##type##_##inter \
(resampler, &samp_index, &samp_phase, icoeff); \
inner_product_ ##type##_##inter##_##channels##_##arch \
(op, ipp, taps, n_taps, icoeff, taps_stride); \
op += ostride; \
} \
if (in_len > samp_index) \
memmove (ip, &ip[samp_index * channels], \
(in_len - samp_index) * sizeof(type) * channels); \
} \
*consumed = samp_index - resampler->samp_index; \
\
resampler->samp_index = 0; \
resampler->samp_phase = samp_phase; \
}
MAKE_RESAMPLE_FUNC (gint16, nearest, 1, c); MAKE_RESAMPLE_FUNC_STATIC (gint16, full, 1, c);
MAKE_RESAMPLE_FUNC (gint32, nearest, 1, c); MAKE_RESAMPLE_FUNC_STATIC (gint32, full, 1, c);
MAKE_RESAMPLE_FUNC (gfloat, nearest, 1, c); MAKE_RESAMPLE_FUNC_STATIC (gfloat, full, 1, c);
MAKE_RESAMPLE_FUNC (gdouble, nearest, 1, c); MAKE_RESAMPLE_FUNC_STATIC (gdouble, full, 1, c);
MAKE_RESAMPLE_FUNC (gint16, full, 1, c); MAKE_RESAMPLE_FUNC_STATIC (gint16, linear, 1, c);
MAKE_RESAMPLE_FUNC (gint32, full, 1, c); MAKE_RESAMPLE_FUNC_STATIC (gint32, linear, 1, c);
MAKE_RESAMPLE_FUNC (gfloat, full, 1, c); MAKE_RESAMPLE_FUNC_STATIC (gfloat, linear, 1, c);
MAKE_RESAMPLE_FUNC (gdouble, full, 1, c); MAKE_RESAMPLE_FUNC_STATIC (gdouble, linear, 1, c);
MAKE_RESAMPLE_FUNC (gint16, linear, 1, c); MAKE_RESAMPLE_FUNC_STATIC (gint16, cubic, 1, c);
MAKE_RESAMPLE_FUNC (gint32, linear, 1, c); MAKE_RESAMPLE_FUNC_STATIC (gint32, cubic, 1, c);
MAKE_RESAMPLE_FUNC (gfloat, linear, 1, c); MAKE_RESAMPLE_FUNC_STATIC (gfloat, cubic, 1, c);
MAKE_RESAMPLE_FUNC (gdouble, linear, 1, c); MAKE_RESAMPLE_FUNC_STATIC (gdouble, cubic, 1, c);
MAKE_RESAMPLE_FUNC (gint16, cubic, 1, c);
MAKE_RESAMPLE_FUNC (gint32, cubic, 1, c);
MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, c);
MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, c);
static ResampleFunc resample_funcs[] = { static ResampleFunc resample_funcs[] = {
resample_gint16_nearest_1_c, resample_gint16_nearest_1_c,