audio-resampler: make pluggable optimized functions

Add support for x86 specialized functions and select them at runtime.
This commit is contained in:
Wim Taymans 2016-01-12 18:55:19 +01:00
parent 819c4c26c7
commit f55a67ca7c
2 changed files with 285 additions and 151 deletions

View file

@ -17,97 +17,16 @@
* Boston, MA 02110-1301, USA.
*/
/* Right-shift amounts used by the integer inner products: after accumulation
 * the sum is rounded (half of 1 << PRECISION is added) and shifted right by
 * PRECISION_S16 for gint16 data or PRECISION_S32 for gint32 data before being
 * clamped to the output type.
 * NOTE(review): presumably this is the fixed-point scale of the filter taps;
 * confirm against the code that generates the taps. */
#define PRECISION_S16 15
#define PRECISION_S32 30
#ifdef HAVE_EMMINTRIN_H
#include <emmintrin.h>
#endif
/* Inner product of two gdouble arrays.
 *
 * Stores sum (a[k] * b[k]) for k in [0, len) into *o.
 *
 * When HAVE_EMMINTRIN_H is defined, groups of 8 elements are accumulated with
 * SSE2 intrinsics (two doubles per vector, unaligned loads) and the two vector
 * lanes are folded into one scalar; the remaining elements are then added by
 * the scalar loop. Without it, only the scalar loop runs. */
static inline void
inner_product_gdouble_1 (gdouble * o, const gdouble * a, const gdouble * b,
    gint len)
{
  gint pos = 0;
  gdouble acc;

#ifdef HAVE_EMMINTRIN_H
  {
    __m128d vacc = _mm_setzero_pd ();
    gint k;

    /* 8 doubles per outer iteration, consumed as 4 vectors of 2 */
    while (len - pos >= 8) {
      for (k = 0; k < 8; k += 2) {
        __m128d prod = _mm_mul_pd (_mm_loadu_pd (a + pos + k),
            _mm_loadu_pd (b + pos + k));

        vacc = _mm_add_pd (vacc, prod);
      }
      pos += 8;
    }

    /* fold the high lane onto the low lane and extract the low lane */
    vacc = _mm_add_sd (vacc, _mm_unpackhi_pd (vacc, vacc));
    _mm_store_sd (&acc, vacc);
  }
#else
  acc = 0.0;
#endif

  /* scalar tail (or the whole input when SSE2 is not compiled in) */
  while (pos < len) {
    acc += a[pos] * b[pos];
    pos++;
  }

  *o = acc;
}
/* Inner product of two gfloat arrays.
 *
 * Stores sum (a[k] * b[k]) for k in [0, len) into *o.
 *
 * When HAVE_EMMINTRIN_H is defined, groups of 8 elements are accumulated with
 * SSE intrinsics (four floats per vector, unaligned loads) and the four lanes
 * are folded into one scalar; the remaining elements are then added by the
 * scalar loop. Without it, only the scalar loop runs. */
static inline void
inner_product_gfloat_1 (gfloat * o, const gfloat * a, const gfloat * b, gint len)
{
  gint pos = 0;
  gfloat acc;

#ifdef HAVE_EMMINTRIN_H
  {
    __m128 vacc = _mm_setzero_ps ();
    gint k;

    /* 8 floats per outer iteration, consumed as 2 vectors of 4 */
    while (len - pos >= 8) {
      for (k = 0; k < 8; k += 4) {
        __m128 prod = _mm_mul_ps (_mm_loadu_ps (a + pos + k),
            _mm_loadu_ps (b + pos + k));

        vacc = _mm_add_ps (vacc, prod);
      }
      pos += 8;
    }

    /* lanes 2,3 are added onto lanes 0,1, then lane 1 onto lane 0 */
    vacc = _mm_add_ps (vacc, _mm_movehl_ps (vacc, vacc));
    vacc = _mm_add_ss (vacc, _mm_shuffle_ps (vacc, vacc, 0x55));
    _mm_store_ss (&acc, vacc);
  }
#else
  acc = 0.0;
#endif

  /* scalar tail (or the whole input when SSE is not compiled in) */
  while (pos < len) {
    acc += a[pos] * b[pos];
    pos++;
  }

  *o = acc;
}
/* Inner product of two gint32 arrays.
 *
 * Accumulates a[k] * b[k] for k in [0, len) in 64 bits, rounds the sum by
 * adding half of (1 << PRECISION_S32), shifts it right by PRECISION_S32 and
 * stores the result in *o, clamped to the signed 32-bit range.
 *
 * The rounding constant and the clamp bounds are built from gint64 values:
 * writing the bounds as (1L << 31) overflows on platforms where long is only
 * 32 bits wide, which is undefined behaviour. */
static inline void
inner_product_gint32_1 (gint32 * o, const gint32 * a, const gint32 * b, gint len)
{
  gint i;
  gint64 res = 0;

  for (i = 0; i < len; i++)
    res += (gint64) a[i] * (gint64) b[i];

  res = (res + ((gint64) 1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
  *o = CLAMP (res, -((gint64) 1 << 31), ((gint64) 1 << 31) - 1);
}
static inline void
inner_product_gint16_1 (gint16 * o, const gint16 * a, const gint16 * b, gint len)
inner_product_gint16_1_sse (gint16 * o, const gint16 * a, const gint16 * b, gint len)
{
gint i = 0;
gint32 res = 0;
#ifdef HAVE_EMMINTRIN_H
__m128i sum[2], ta, tb;
__m128i t1[2];
@ -132,9 +51,6 @@ inner_product_gint16_1 (gint16 * o, const gint16 * a, const gint16 * b, gint len
_mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1,
1)));
res = _mm_cvtsi128_si32 (sum[0]);
#else
res = 0;
#endif
for (; i < len; i++)
res += (gint32) a[i] * (gint32) b[i];
@ -142,53 +58,74 @@ inner_product_gint16_1 (gint16 * o, const gint16 * a, const gint16 * b, gint len
res = (res + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16;
*o = CLAMP (res, -(1L << 15), (1L << 15) - 1);
}
#endif
#ifdef HAVE_EMMINTRIN_H
static inline void
inner_product_gdouble_2 (gdouble * o, const gdouble * a, const gdouble * b,
inner_product_gfloat_1_sse (gfloat * o, const gfloat * a, const gfloat * b, gint len)
{
gint i = 0;
gfloat res;
__m128 sum = _mm_setzero_ps ();
for (; i < len - 7; i += 8) {
sum =
_mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 0),
_mm_loadu_ps (b + i + 0)));
sum =
_mm_add_ps (sum, _mm_mul_ps (_mm_loadu_ps (a + i + 4),
_mm_loadu_ps (b + i + 4)));
}
sum = _mm_add_ps (sum, _mm_movehl_ps (sum, sum));
sum = _mm_add_ss (sum, _mm_shuffle_ps (sum, sum, 0x55));
_mm_store_ss (&res, sum);
for (; i < len; i++)
res += a[i] * b[i];
*o = res;
}
#endif
#ifdef HAVE_EMMINTRIN_H
static inline void
inner_product_gdouble_1_sse (gdouble * o, const gdouble * a, const gdouble * b,
gint len)
{
gint i = 0;
gdouble r[2];
#ifdef HAVE_EMMINTRIN_H
__m128d sum = _mm_setzero_pd (), t;
gdouble res;
__m128d sum = _mm_setzero_pd ();
for (; i < len - 3; i += 4) {
t = _mm_loadu_pd (b + i);
for (; i < len - 7; i += 8) {
sum =
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i),
_mm_unpacklo_pd (t, t)));
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0),
_mm_loadu_pd (b + i + 0)));
sum =
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 2),
_mm_unpackhi_pd (t, t)));
t = _mm_loadu_pd (b + i + 2);
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2),
_mm_loadu_pd (b + i + 2)));
sum =
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 4),
_mm_unpacklo_pd (t, t)));
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 4),
_mm_loadu_pd (b + i + 4)));
sum =
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 6),
_mm_unpackhi_pd (t, t)));
_mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6),
_mm_loadu_pd (b + i + 6)));
}
_mm_store_pd (r, sum);
#else
r[0] = 0.0;
r[1] = 0.0;
sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum));
_mm_store_sd (&res, sum);
for (; i < len; i++)
res += a[i] * b[i];
*o = res;
}
#endif
for (; i < len; i++) {
r[0] += a[2 * i] * b[i];
r[1] += a[2 * i + 1] * b[i];
}
o[0] = r[0];
o[1] = r[1];
}
#ifdef HAVE_EMMINTRIN_H
static inline void
inner_product_gint16_2 (gint16 * o, const gint16 * a, const gint16 * b, gint len)
inner_product_gint16_2_sse (gint16 * o, const gint16 * a, const gint16 * b, gint len)
{
gint i = 0;
gint32 r[2];
#ifdef HAVE_EMMINTRIN_H
guint64 r64;
__m128i sum[2], ta, tb;
__m128i t1[2];
@ -224,10 +161,6 @@ inner_product_gint16_2 (gint16 * o, const gint16 * a, const gint16 * b, gint len
r64 = _mm_cvtsi128_si64 (sum[0]);
r[0] = r64 >> 32;
r[1] = r64 & 0xffffffff;
#else
r[0] = 0;
r[1] = 0;
#endif
for (; i < len; i++) {
r[0] += (gint32) a[2 * i] * (gint32) b[i];
@ -238,3 +171,62 @@ inner_product_gint16_2 (gint16 * o, const gint16 * a, const gint16 * b, gint len
o[0] = CLAMP (r[0], -(1L << 15), (1L << 15) - 1);
o[1] = CLAMP (r[1], -(1L << 15), (1L << 15) - 1);
}
#endif
#ifdef HAVE_EMMINTRIN_H
/* SSE2 inner product of one coefficient array against pairs of gdouble values.
 *
 * a is read as len pairs (a[2 * k], a[2 * k + 1]) and b as len values.
 * On return:
 *   o[0] = sum (a[2 * k]     * b[k])
 *   o[1] = sum (a[2 * k + 1] * b[k])      for k in [0, len)
 *
 * The vector loop handles 4 values of b per iteration: each b value is
 * duplicated into both lanes and multiplied with the matching pair from a.
 * Leftover elements are handled by the scalar loop. All loads are unaligned. */
static inline void
inner_product_gdouble_2_sse (gdouble * o, const gdouble * a, const gdouble * b,
    gint len)
{
  gint i = 0;
  gdouble r[2];
  __m128d sum = _mm_setzero_pd (), t;

  for (; i < len - 3; i += 4) {
    t = _mm_loadu_pd (b + i);
    sum =
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i),
            _mm_unpacklo_pd (t, t)));
    sum =
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 2),
            _mm_unpackhi_pd (t, t)));

    t = _mm_loadu_pd (b + i + 2);
    sum =
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 4),
            _mm_unpacklo_pd (t, t)));
    sum =
        _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + 2 * i + 6),
            _mm_unpackhi_pd (t, t)));
  }

  /* r is an ordinary stack array with no guaranteed 16-byte alignment, so the
   * unaligned store must be used; _mm_store_pd requires an aligned address. */
  _mm_storeu_pd (r, sum);

  for (; i < len; i++) {
    r[0] += a[2 * i] * b[i];
    r[1] += a[2 * i + 1] * b[i];
  }

  o[0] = r[0];
  o[1] = r[1];
}
#endif
/* Instantiate the SSE resample functions. MAKE_RESAMPLE_FUNC (type, channels,
 * arch) is provided by the file that includes this header; each invocation
 * defines resample_<type>_<channels>_sse, which calls the matching
 * inner_product_<type>_<channels>_sse. Only built when <emmintrin.h> is
 * available. */
#ifdef HAVE_EMMINTRIN_H
MAKE_RESAMPLE_FUNC (gint16, 1, sse);
MAKE_RESAMPLE_FUNC (gfloat, 1, sse);
MAKE_RESAMPLE_FUNC (gdouble, 1, sse);
MAKE_RESAMPLE_FUNC (gint16, 2, sse);
MAKE_RESAMPLE_FUNC (gdouble, 2, sse);
#endif
/* Enable x86 specific resample functions for one CPU feature name.
 *
 * @option: name of a CPU feature flag; may be NULL, in which case nothing
 *          happens.
 *
 * When @option is "sse2", the resample_* dispatch entries are pointed at the
 * SSE implementations. Those implementations only exist when
 * HAVE_EMMINTRIN_H is defined, so the whole body is compiled under the same
 * condition; without it this function is a no-op instead of a build failure. */
static void
audio_resampler_check_x86 (const gchar *option)
{
#ifdef HAVE_EMMINTRIN_H
  /* NOTE(review): the caller passes the flag name reported by ORC, which is
   * assumed to be NULL for flags it has no name for - guard before strcmp. */
  if (option != NULL && !strcmp (option, "sse2")) {
    GST_DEBUG ("enable SSE2 optimisations");
    resample_gint16_1 = resample_gint16_1_sse;
    resample_gfloat_1 = resample_gfloat_1_sse;
    resample_gdouble_1 = resample_gdouble_1_sse;
    resample_gint16_2 = resample_gint16_2_sse;
    resample_gdouble_2 = resample_gdouble_2_sse;
  }
#else
  (void) option;
#endif
}

View file

@ -25,6 +25,10 @@
#include <stdio.h>
#include <math.h>
#ifdef HAVE_ORC
#include <orc/orc.h>
#endif
#include "audio-resampler.h"
typedef struct _Tap
@ -84,27 +88,8 @@ struct _GstAudioResampler
gpointer *sbuf;
};
#ifndef GST_DISABLE_GST_DEBUG
#define GST_CAT_DEFAULT ensure_debug_category()
static GstDebugCategory *
ensure_debug_category (void)
{
static gsize cat_gonce = 0;
if (g_once_init_enter (&cat_gonce)) {
gsize cat_done;
cat_done = (gsize) _gst_debug_category_new ("audio-resampler", 0,
"audio-resampler object");
g_once_init_leave (&cat_gonce, cat_done);
}
return (GstDebugCategory *) cat_gonce;
}
#else
#define ensure_debug_category() /* NOOP */
#endif /* GST_DISABLE_GST_DEBUG */
GST_DEBUG_CATEGORY_STATIC (audio_resampler_debug);
#define GST_CAT_DEFAULT audio_resampler_debug
/**
* SECTION:gstaudioresampler
@ -305,7 +290,8 @@ G_STMT_START { \
GST_WARNING ("can't find exact taps"); \
} G_STMT_END
#include "audio-resampler-core.h"
/* Right-shift amounts used by the integer inner products: after accumulation
 * the sum is rounded (half of 1 << PRECISION is added) and shifted right by
 * PRECISION_S16 for gint16 data or PRECISION_S32 for gint32 data before being
 * clamped to the output type.
 * NOTE(review): presumably this is the fixed-point scale of the filter taps;
 * confirm against the code that generates the taps. */
#define PRECISION_S16 15
#define PRECISION_S32 30
static void
make_taps (GstAudioResampler * resampler, Tap * t, gint j)
@ -375,10 +361,98 @@ make_taps (GstAudioResampler * resampler, Tap * t, gint j)
}
}
#define MAKE_RESAMPLE_FUNC(type,channels) \
/* Plain C inner product of two gint16 arrays.
 *
 * Accumulates a[k] * b[k] for k in [0, len) in 32 bits, rounds the sum by
 * adding half of (1 << PRECISION_S16), shifts it right by PRECISION_S16 and
 * stores the result in *o, clamped to the signed 16-bit range. */
static inline void
inner_product_gint16_1_c (gint16 * o, const gint16 * a, const gint16 * b,
    gint len)
{
  gint32 acc = 0;
  gint k = 0;

  while (k < len) {
    acc += (gint32) a[k] * (gint32) b[k];
    k++;
  }

  /* round to nearest, then drop the fractional bits */
  acc += 1 << (PRECISION_S16 - 1);
  acc >>= PRECISION_S16;

  *o = CLAMP (acc, -(1L << 15), (1L << 15) - 1);
}
/* Plain C inner product of one gint16 coefficient array against pairs of
 * gint16 values.
 *
 * a is read as len pairs (a[2 * k], a[2 * k + 1]) and b as len values. Each
 * of the two 32-bit sums is rounded, shifted right by PRECISION_S16 and
 * clamped to the signed 16-bit range:
 *   o[0] <- sum (a[2 * k]     * b[k])
 *   o[1] <- sum (a[2 * k + 1] * b[k])      for k in [0, len) */
static inline void
inner_product_gint16_2_c (gint16 * o, const gint16 * a, const gint16 * b,
    gint len)
{
  gint32 acc0 = 0;
  gint32 acc1 = 0;
  gint k;

  for (k = 0; k < len; k++) {
    const gint32 coeff = (gint32) b[k];

    acc0 += (gint32) a[2 * k] * coeff;
    acc1 += (gint32) a[2 * k + 1] * coeff;
  }

  /* round to nearest, then drop the fractional bits */
  acc0 = (acc0 + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16;
  acc1 = (acc1 + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16;

  o[0] = CLAMP (acc0, -(1L << 15), (1L << 15) - 1);
  o[1] = CLAMP (acc1, -(1L << 15), (1L << 15) - 1);
}
/* Plain C inner product of two gint32 arrays.
 *
 * Accumulates a[k] * b[k] for k in [0, len) in 64 bits, rounds the sum by
 * adding half of (1 << PRECISION_S32), shifts it right by PRECISION_S32 and
 * stores the result in *o, clamped to the signed 32-bit range.
 *
 * The rounding constant and the clamp bounds are built from gint64 values:
 * writing the bounds as (1L << 31) overflows on platforms where long is only
 * 32 bits wide, which is undefined behaviour. */
static inline void
inner_product_gint32_1_c (gint32 * o, const gint32 * a, const gint32 * b,
    gint len)
{
  gint i;
  gint64 res = 0;

  for (i = 0; i < len; i++)
    res += (gint64) a[i] * (gint64) b[i];

  res = (res + ((gint64) 1 << (PRECISION_S32 - 1))) >> PRECISION_S32;
  *o = CLAMP (res, -((gint64) 1 << 31), ((gint64) 1 << 31) - 1);
}
/* Plain C inner product of two gfloat arrays: stores
 * sum (a[k] * b[k]) for k in [0, len) into *o, accumulating in index order. */
static inline void
inner_product_gfloat_1_c (gfloat * o, const gfloat * a, const gfloat * b,
    gint len)
{
  gfloat acc = 0.0;
  gint k = 0;

  while (k < len) {
    acc += a[k] * b[k];
    k++;
  }

  *o = acc;
}
/* Plain C inner product of two gdouble arrays: stores
 * sum (a[k] * b[k]) for k in [0, len) into *o, accumulating in index order. */
static inline void
inner_product_gdouble_1_c (gdouble * o, const gdouble * a, const gdouble * b,
    gint len)
{
  gdouble acc = 0.0;
  gint k = 0;

  while (k < len) {
    acc += a[k] * b[k];
    k++;
  }

  *o = acc;
}
/* Plain C inner product of one gdouble coefficient array against pairs of
 * gdouble values.
 *
 * a is read as len pairs (a[2 * k], a[2 * k + 1]) and b as len values:
 *   o[0] <- sum (a[2 * k]     * b[k])
 *   o[1] <- sum (a[2 * k + 1] * b[k])      for k in [0, len) */
static inline void
inner_product_gdouble_2_c (gdouble * o, const gdouble * a, const gdouble * b,
    gint len)
{
  gdouble acc0 = 0.0;
  gdouble acc1 = 0.0;
  gint k = 0;

  while (k < len) {
    acc0 += a[2 * k] * b[k];
    acc1 += a[2 * k + 1] * b[k];
    k++;
  }

  o[0] = acc0;
  o[1] = acc1;
}
#define MAKE_RESAMPLE_FUNC(type,channels,arch) \
static void \
resample_ ##type## _ ##channels (GstAudioResampler * resampler, gpointer in[], gsize in_len, \
gpointer out[], gsize out_len, gsize * consumed, gboolean move) \
resample_ ##type## _ ##channels## _ ##arch (GstAudioResampler * resampler, \
gpointer in[], gsize in_len, gpointer out[], gsize out_len, \
gsize * consumed, gboolean move) \
{ \
gint c, di = 0; \
gint n_taps = resampler->n_taps; \
@ -401,7 +475,7 @@ resample_ ##type## _ ##channels (GstAudioResampler * resampler, gpointer in[], g
if (t->taps == NULL) \
make_taps (resampler, t, samp_phase); \
\
inner_product_ ##type## _##channels (op, ipp, t->taps, n_taps); \
inner_product_ ##type## _##channels##_##arch (op, ipp, t->taps, n_taps); \
op += ostride; \
\
samp_phase = t->next_phase; \
@ -417,12 +491,78 @@ resample_ ##type## _ ##channels (GstAudioResampler * resampler, gpointer in[], g
resampler->samp_phase = samp_phase; \
}
MAKE_RESAMPLE_FUNC (gdouble, 1);
MAKE_RESAMPLE_FUNC (gfloat, 1);
MAKE_RESAMPLE_FUNC (gint32, 1);
MAKE_RESAMPLE_FUNC (gint16, 1);
MAKE_RESAMPLE_FUNC (gdouble, 2);
MAKE_RESAMPLE_FUNC (gint16, 2);
/* Instantiate the plain C resample functions: each invocation defines
 * resample_<type>_<channels>_c, built on inner_product_<type>_<channels>_c. */
MAKE_RESAMPLE_FUNC (gint16, 1, c);
MAKE_RESAMPLE_FUNC (gint32, 1, c);
MAKE_RESAMPLE_FUNC (gfloat, 1, c);
MAKE_RESAMPLE_FUNC (gdouble, 1, c);
MAKE_RESAMPLE_FUNC (gint16, 2, c);
MAKE_RESAMPLE_FUNC (gdouble, 2, c);
/* Signature shared by every function generated by MAKE_RESAMPLE_FUNC. */
typedef void (*ResampleFunc) (GstAudioResampler * resampler,
gpointer in[], gsize in_len, gpointer out[], gsize out_len,
gsize * consumed, gboolean move);
/* Dispatch table, initialised with the C implementations. Entries are
 * writable so that they can be replaced at runtime (see
 * audio_resampler_check_x86() in audio-resampler-x86.h). The order of the
 * entries must match the index macros below. */
static ResampleFunc resample_funcs[] = {
resample_gint16_1_c,
resample_gint32_1_c,
resample_gfloat_1_c,
resample_gdouble_1_c,
resample_gint16_2_c,
resample_gdouble_2_c,
};
/* Named accessors for the table slots; usable both to call the selected
 * implementation and as the target of an assignment. */
#define resample_gint16_1 resample_funcs[0]
#define resample_gint32_1 resample_funcs[1]
#define resample_gfloat_1 resample_funcs[2]
#define resample_gdouble_1 resample_funcs[3]
#define resample_gint16_2 resample_funcs[4]
#define resample_gdouble_2 resample_funcs[5]
/* Pull in the x86 implementations only when ORC is available and not
 * disabled, and the compiler targets i386 or x86_64. CHECK_X86 tells
 * audio_resampler_init() that audio_resampler_check_x86() exists. */
#if defined HAVE_ORC && !defined DISABLE_ORC
# if defined (__i386__) || defined (__x86_64__)
# define CHECK_X86
# include "audio-resampler-x86.h"
# endif
#endif
/* One-time initialisation for the resampler code.
 *
 * Guarded by g_once_init_enter() / g_once_init_leave(), so the body runs only
 * once. It registers the "audio-resampler" debug category and, when ORC is
 * compiled in and not disabled, initialises ORC, logs the default target and
 * each flag set in its default flags, and (when CHECK_X86 is defined) passes
 * every such flag name to audio_resampler_check_x86(). */
static void
audio_resampler_init (void)
{
  static gsize once_token = 0;

  if (!g_once_init_enter (&once_token))
    return;

  GST_DEBUG_CATEGORY_INIT (audio_resampler_debug, "audio-resampler", 0,
      "audio-resampler object");

#if defined HAVE_ORC && !defined DISABLE_ORC
  orc_init ();
  {
    OrcTarget *orc_target = orc_target_get_default ();

    if (orc_target != NULL) {
      unsigned int target_flags = orc_target_get_default_flags (orc_target);
      const gchar *label = orc_target_get_name (orc_target);
      gint bit;

      GST_DEBUG ("target %s, default flags %08x", label, target_flags);

      /* walk every bit of the flag word and report the ones that are set */
      for (bit = 0; bit < 32; ++bit) {
        if ((target_flags & (1U << bit)) == 0)
          continue;

        label = orc_target_get_flag_name (orc_target, bit);
        GST_DEBUG ("target flag %s", label);
#ifdef CHECK_X86
        audio_resampler_check_x86 (label);
#endif
      }
    }
  }
#endif

  g_once_init_leave (&once_token, 1);
}
#define MAKE_DEINTERLEAVE_FUNC(type) \
static void \
@ -790,6 +930,8 @@ gst_audio_resampler_new (GstAudioResamplerMethod method,
g_return_val_if_fail (in_rate != 0, FALSE);
g_return_val_if_fail (out_rate != 0, FALSE);
audio_resampler_init ();
resampler = g_slice_new0 (GstAudioResampler);
resampler->method = method;
resampler->flags = flags;