audioresample: use SSE/SSE2 when possible

Compile in the code on i386 and x86_64, and use ORC to determine
when the runtime platform can run the code.

https://bugzilla.gnome.org/show_bug.cgi?id=636562
This commit is contained in:
Vincent Penquerc'h 2011-08-11 15:54:15 +01:00 committed by Sebastian Dröge
parent 58fd202b7d
commit 746415a6e3
5 changed files with 99 additions and 9 deletions

View file

@ -213,6 +213,8 @@ LIBS="$save_libs"
dnl used in gst-libs/gst/pbutils and associated unit test
AC_CHECK_HEADERS([process.h sys/types.h sys/wait.h sys/stat.h])
AC_CHECK_HEADERS([xmmintrin.h emmintrin.h])
dnl ffmpegcolorspace includes _stdint.h
dnl also, Windows does not have long long
AX_CREATE_STDINT_H

View file

@ -64,10 +64,30 @@
#ifdef OUTSIDE_SPEEX
#include <stdlib.h>
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include <glib.h>
#ifdef HAVE_ORC
#include <orc/orc.h>
#endif
#define EXPORT G_GNUC_INTERNAL
#ifdef _USE_SSE
#ifndef HAVE_XMMINTRIN_H
#undef _USE_SSE
#endif
#endif
#ifdef _USE_SSE2
#ifndef HAVE_EMMINTRIN_H
#undef _USE_SSE2
#endif
#endif
static inline void *
speex_alloc (int size)
{
@ -110,7 +130,7 @@ speex_free (void *ptr)
#define NULL 0
#endif
#ifdef _USE_SSE
#if defined _USE_SSE || defined _USE_SSE2
#include "resample_sse.h"
#endif
@ -121,6 +141,28 @@ speex_free (void *ptr)
#define FIXED_STACK_ALLOC 1024
#endif
/* Allow selecting SSE or not when compiled with SSE support */
#ifdef _USE_SSE
#define SSE_FALLBACK(macro) \
if (st->use_sse) goto sse_##macro##_sse; {
#define SSE_IMPLEMENTATION(macro) \
goto sse_##macro##_end; } sse_##macro##_sse: {
#define SSE_END(macro) sse_##macro##_end:; }
#else
#define SSE_FALLBACK(macro)
#endif
#ifdef _USE_SSE2
#define SSE2_FALLBACK(macro) \
if (st->use_sse2) goto sse2_##macro##_sse2; {
#define SSE2_IMPLEMENTATION(macro) \
goto sse2_##macro##_end; } sse2_##macro##_sse2: {
#define SSE2_END(macro) sse2_##macro##_end:; }
#else
#define SSE2_FALLBACK(macro)
#endif
typedef int (*resampler_basic_func) (SpeexResamplerState *, spx_uint32_t,
const spx_word16_t *, spx_uint32_t *, spx_word16_t *, spx_uint32_t *);
@ -155,6 +197,9 @@ struct SpeexResamplerState_
int in_stride;
int out_stride;
int use_sse:1;
int use_sse2:1;
};
static double kaiser12_table[68] = {
@ -410,7 +455,7 @@ resampler_basic_direct_single (SpeexResamplerState * st,
const spx_word16_t *sinc = &sinc_table[samp_frac_num * N];
const spx_word16_t *iptr = &in[last_sample];
#ifndef OVERRIDE_INNER_PRODUCT_SINGLE
SSE_FALLBACK (INNER_PRODUCT_SINGLE)
sum = 0;
for (j = 0; j < N; j++)
sum += MULT16_16 (sinc[j], iptr[j]);
@ -427,8 +472,10 @@ resampler_basic_direct_single (SpeexResamplerState * st,
}
sum = accum[0] + accum[1] + accum[2] + accum[3];
*/
#else
#ifdef OVERRIDE_INNER_PRODUCT_SINGLE
SSE_IMPLEMENTATION (INNER_PRODUCT_SINGLE)
sum = inner_product_single (sinc, iptr, N);
SSE_END(INNER_PRODUCT_SINGLE)
#endif
out[out_stride * out_sample++] = SATURATE32 (PSHR32 (sum, 15), 32767);
@ -471,7 +518,7 @@ resampler_basic_direct_double (SpeexResamplerState * st,
const spx_word16_t *sinc = &sinc_table[samp_frac_num * N];
const spx_word16_t *iptr = &in[last_sample];
#ifndef OVERRIDE_INNER_PRODUCT_DOUBLE
SSE2_FALLBACK (INNER_PRODUCT_DOUBLE)
double accum[4] = { 0, 0, 0, 0 };
for (j = 0; j < N; j += 4) {
@ -481,8 +528,10 @@ resampler_basic_direct_double (SpeexResamplerState * st,
accum[3] += sinc[j + 3] * iptr[j + 3];
}
sum = accum[0] + accum[1] + accum[2] + accum[3];
#else
#ifdef OVERRIDE_INNER_PRODUCT_DOUBLE
SSE2_IMPLEMENTATION (INNER_PRODUCT_DOUBLE)
sum = inner_product_double (sinc, iptr, N);
SSE2_END (INNER_PRODUCT_DOUBLE)
#endif
out[out_stride * out_sample++] = PSHR32 (sum, 15);
@ -534,7 +583,7 @@ resampler_basic_interpolate_single (SpeexResamplerState * st,
spx_word16_t interp[4];
#ifndef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
SSE_FALLBACK (INTERPOLATE_PRODUCT_SINGLE)
spx_word32_t accum[4] = { 0, 0, 0, 0 };
for (j = 0; j < N; j++) {
@ -559,12 +608,14 @@ resampler_basic_interpolate_single (SpeexResamplerState * st,
1)) + MULT16_32_Q15 (interp[1], SHR32 (accum[1],
1)) + MULT16_32_Q15 (interp[2], SHR32 (accum[2],
1)) + MULT16_32_Q15 (interp[3], SHR32 (accum[3], 1));
#else
#ifdef OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
SSE_IMPLEMENTATION (INTERPOLATE_PRODUCT_SINGLE)
cubic_coef (frac, interp);
sum =
interpolate_product_single (iptr,
st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample,
interp);
SSE_END (INTERPOLATE_PRODUCT_SINGLE)
#endif
out[out_stride * out_sample++] = SATURATE32 (PSHR32 (sum, 14), 32767);
@ -624,7 +675,7 @@ resampler_basic_interpolate_double (SpeexResamplerState * st,
spx_word16_t interp[4];
#ifndef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
SSE2_FALLBACK (INTERPOLATE_PRODUCT_DOUBLE)
double accum[4] = { 0, 0, 0, 0 };
for (j = 0; j < N; j++) {
@ -648,12 +699,14 @@ resampler_basic_interpolate_double (SpeexResamplerState * st,
MULT16_32_Q15 (interp[0], accum[0]) + MULT16_32_Q15 (interp[1],
accum[1]) + MULT16_32_Q15 (interp[2],
accum[2]) + MULT16_32_Q15 (interp[3], accum[3]);
#else
#ifdef OVERRIDE_INTERPOLATE_PRODUCT_DOUBLE
SSE2_IMPLEMENTATION (INTERPOLATE_PRODUCT_DOUBLE)
cubic_coef (frac, interp);
sum =
interpolate_product_double (iptr,
st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample,
interp);
SSE2_END (INTERPOLATE_PRODUCT_DOUBLE)
#endif
out[out_stride * out_sample++] = PSHR32 (sum, 15);
@ -875,6 +928,17 @@ speex_resampler_init (spx_uint32_t nb_channels, spx_uint32_t in_rate,
out_rate, quality, err);
}
static void
check_insn_set (SpeexResamplerState * st, const char *name)
{
if (!name)
return;
if (!strcmp (name, "sse"))
st->use_sse = 1;
if (!strcmp (name, "sse2"))
st->use_sse = st->use_sse2 = 1;
}
EXPORT SpeexResamplerState *
speex_resampler_init_frac (spx_uint32_t nb_channels, spx_uint32_t ratio_num,
spx_uint32_t ratio_den, spx_uint32_t in_rate, spx_uint32_t out_rate,
@ -912,6 +976,23 @@ speex_resampler_init_frac (spx_uint32_t nb_channels, spx_uint32_t ratio_num,
st->buffer_size = 160;
#endif
st->use_sse = st->use_sse2 = 0;
#if defined HAVE_ORC && !defined DISABLE_ORC
orc_init ();
{
OrcTarget *target = orc_target_get_default ();
if (target) {
unsigned int flags = orc_target_get_default_flags (target);
check_insn_set (st, orc_target_get_name (target));
for (i = 0; i < 32; ++i) {
if (flags & (1 << i)) {
check_insn_set (st, orc_target_get_flag_name (target, i));
}
}
}
}
#endif
/* Per channel data */
st->last_sample = (spx_int32_t *) speex_alloc (nb_channels * sizeof (int));
st->magic_samples = (spx_uint32_t *) speex_alloc (nb_channels * sizeof (int));

View file

@ -34,7 +34,9 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_XMMINTRIN_H
#include <xmmintrin.h>
#endif
#define OVERRIDE_INNER_PRODUCT_SINGLE
static inline float inner_product_single(const float *a, const float *b, unsigned int len)
@ -72,7 +74,9 @@ static inline float interpolate_product_single(const float *a, const float *b, u
}
#ifdef _USE_SSE2
#ifdef HAVE_EMMINTRIN_H
#include <emmintrin.h>
#endif
#define OVERRIDE_INNER_PRODUCT_DOUBLE
#ifdef DOUBLE_PRECISION

View file

@ -17,6 +17,7 @@
* Boston, MA 02111-1307, USA.
*/
#define _USE_SSE2
#define FLOATING_POINT
#define DOUBLE_PRECISION
#define OUTSIDE_SPEEX

View file

@ -17,6 +17,8 @@
* Boston, MA 02111-1307, USA.
*/
#define _USE_SSE
#define _USE_SSE2
#define FLOATING_POINT
#define OUTSIDE_SPEEX
#define RANDOM_PREFIX resample_float