mirror of
https://gitlab.freedesktop.org/gstreamer/gstreamer.git
synced 2025-01-11 18:05:37 +00:00
audio-resampler: add neon optimizations
Unroll some more loops in the fallback code that seems to work fine for ARM. Add some simple ARM optimizations taken from speex.
This commit is contained in:
parent
25d81ffb55
commit
d5abdd83c9
3 changed files with 300 additions and 32 deletions
253
gst-libs/gst/audio/audio-resampler-neon.h
Normal file
253
gst-libs/gst/audio/audio-resampler-neon.h
Normal file
|
@ -0,0 +1,253 @@
|
|||
/* GStreamer
|
||||
* Copyright (C) <2016> Wim Taymans <wim.taymans@gmail.com>
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Library General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Library General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Library General Public
|
||||
* License along with this library; if not, write to the
|
||||
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
|
||||
* Boston, MA 02110-1301, USA.
|
||||
*/
|
||||
|
||||
static inline void
|
||||
inner_product_gint16_none_1_neon (gint16 * o, const gint16 * a,
|
||||
const gint16 * b, gint len, const gint16 * icoeff)
|
||||
{
|
||||
uint32_t remainder = len % 16;
|
||||
len = len - remainder;
|
||||
|
||||
asm volatile (" cmp %[len], #0\n"
|
||||
" bne 1f\n"
|
||||
" vld1.16 {d16}, [%[b]]!\n"
|
||||
" vld1.16 {d20}, [%[a]]!\n"
|
||||
" subs %[remainder], %[remainder], #4\n"
|
||||
" vmull.s16 q0, d16, d20\n"
|
||||
" beq 5f\n"
|
||||
" b 4f\n"
|
||||
"1:"
|
||||
" vld1.16 {d16, d17, d18, d19}, [%[b]]!\n"
|
||||
" vld1.16 {d20, d21, d22, d23}, [%[a]]!\n"
|
||||
" subs %[len], %[len], #16\n"
|
||||
" vmull.s16 q0, d16, d20\n"
|
||||
" vmlal.s16 q0, d17, d21\n"
|
||||
" vmlal.s16 q0, d18, d22\n"
|
||||
" vmlal.s16 q0, d19, d23\n"
|
||||
" beq 3f\n"
|
||||
"2:"
|
||||
" vld1.16 {d16, d17, d18, d19}, [%[b]]!\n"
|
||||
" vld1.16 {d20, d21, d22, d23}, [%[a]]!\n"
|
||||
" subs %[len], %[len], #16\n"
|
||||
" vmlal.s16 q0, d16, d20\n"
|
||||
" vmlal.s16 q0, d17, d21\n"
|
||||
" vmlal.s16 q0, d18, d22\n"
|
||||
" vmlal.s16 q0, d19, d23\n"
|
||||
" bne 2b\n"
|
||||
"3:"
|
||||
" cmp %[remainder], #0\n"
|
||||
" beq 5f\n"
|
||||
"4:"
|
||||
" vld1.16 {d16}, [%[b]]!\n"
|
||||
" vld1.16 {d20}, [%[a]]!\n"
|
||||
" subs %[remainder], %[remainder], #4\n"
|
||||
" vmlal.s16 q0, d16, d20\n"
|
||||
" bne 4b\n"
|
||||
"5:"
|
||||
" vaddl.s32 q0, d0, d1\n"
|
||||
" vadd.s64 d0, d0, d1\n"
|
||||
" vqmovn.s64 d0, q0\n"
|
||||
" vqrshrn.s32 d0, q0, #15\n"
|
||||
" vst1.s16 d0[0], [%[o]]\n"
|
||||
: [a] "+r" (a), [b] "+r" (b),
|
||||
[len] "+r" (len), [remainder] "+r" (remainder)
|
||||
: [o] "r" (o)
|
||||
: "cc", "q0",
|
||||
"d16", "d17", "d18", "d19",
|
||||
"d20", "d21", "d22", "d23");
|
||||
}
|
||||
|
||||
static inline void
|
||||
inner_product_gint16_linear_1_neon (gint16 * o, const gint16 * a,
|
||||
const gint16 * b, gint len, const gint16 * icoeff)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void
|
||||
inner_product_gint16_cubic_1_neon (gint16 * o, const gint16 * a,
|
||||
const gint16 * b, gint len, const gint16 * icoeff)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void
|
||||
inner_product_gint32_none_1_neon (gint32 * o, const gint32 * a,
|
||||
const gint32 * b, gint len, const gint32 * icoeff)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void
|
||||
inner_product_gint32_linear_1_neon (gint32 * o, const gint32 * a,
|
||||
const gint32 * b, gint len, const gint32 * icoeff)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void
|
||||
inner_product_gint32_cubic_1_neon (gint32 * o, const gint32 * a,
|
||||
const gint32 * b, gint len, const gint32 * icoeff)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void
|
||||
inner_product_gfloat_none_1_neon (gfloat * o, const gfloat * a,
|
||||
const gfloat * b, gint len, const gfloat * icoeff)
|
||||
{
|
||||
uint32_t remainder = len % 16;
|
||||
len = len - remainder;
|
||||
|
||||
asm volatile (" cmp %[len], #0\n"
|
||||
" bne 1f\n"
|
||||
" vld1.32 {q4}, [%[b]]!\n"
|
||||
" vld1.32 {q8}, [%[a]]!\n"
|
||||
" subs %[remainder], %[remainder], #4\n"
|
||||
" vmul.f32 q0, q4, q8\n"
|
||||
" bne 4f\n"
|
||||
" b 5f\n"
|
||||
"1:"
|
||||
" vld1.32 {q4, q5}, [%[b]]!\n"
|
||||
" vld1.32 {q8, q9}, [%[a]]!\n"
|
||||
" vld1.32 {q6, q7}, [%[b]]!\n"
|
||||
" vld1.32 {q10, q11}, [%[a]]!\n"
|
||||
" subs %[len], %[len], #16\n"
|
||||
" vmul.f32 q0, q4, q8\n"
|
||||
" vmul.f32 q1, q5, q9\n"
|
||||
" vmul.f32 q2, q6, q10\n"
|
||||
" vmul.f32 q3, q7, q11\n"
|
||||
" beq 3f\n"
|
||||
"2:"
|
||||
" vld1.32 {q4, q5}, [%[b]]!\n"
|
||||
" vld1.32 {q8, q9}, [%[a]]!\n"
|
||||
" vld1.32 {q6, q7}, [%[b]]!\n"
|
||||
" vld1.32 {q10, q11}, [%[a]]!\n"
|
||||
" subs %[len], %[len], #16\n"
|
||||
" vmla.f32 q0, q4, q8\n"
|
||||
" vmla.f32 q1, q5, q9\n"
|
||||
" vmla.f32 q2, q6, q10\n"
|
||||
" vmla.f32 q3, q7, q11\n"
|
||||
" bne 2b\n"
|
||||
"3:"
|
||||
" vadd.f32 q4, q0, q1\n"
|
||||
" vadd.f32 q5, q2, q3\n"
|
||||
" cmp %[remainder], #0\n"
|
||||
" vadd.f32 q0, q4, q5\n"
|
||||
" beq 5f\n"
|
||||
"4:"
|
||||
" vld1.32 {q6}, [%[b]]!\n"
|
||||
" vld1.32 {q10}, [%[a]]!\n"
|
||||
" subs %[remainder], %[remainder], #4\n"
|
||||
" vmla.f32 q0, q6, q10\n"
|
||||
" bne 4b\n"
|
||||
"5:"
|
||||
" vadd.f32 d0, d0, d1\n"
|
||||
" vpadd.f32 d0, d0, d0\n"
|
||||
" vst1.f32 d0[0], [%[o]]\n"
|
||||
: [a] "+r" (a), [b] "+r" (b),
|
||||
[len] "+r" (len), [remainder] "+r" (remainder)
|
||||
: [o] "r" (o)
|
||||
: "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
|
||||
"q9", "q10", "q11");
|
||||
|
||||
}
|
||||
|
||||
static inline void
|
||||
inner_product_gfloat_linear_1_neon (gfloat * o, const gfloat * a,
|
||||
const gfloat * b, gint len, const gfloat * icoeff)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void
|
||||
inner_product_gfloat_cubic_1_neon (gfloat * o, const gfloat * a,
|
||||
const gfloat * b, gint len, const gfloat * icoeff)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void
|
||||
inner_product_gdouble_none_1_neon (gdouble * o, const gdouble * a,
|
||||
const gdouble * b, gint len, const gdouble * icoeff)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void
|
||||
inner_product_gdouble_linear_1_neon (gdouble * o, const gdouble * a,
|
||||
const gdouble * b, gint len, const gdouble * icoeff)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void
|
||||
inner_product_gdouble_cubic_1_neon (gdouble * o, const gdouble * a,
|
||||
const gdouble * b, gint len, const gdouble * icoeff)
|
||||
{
|
||||
}
|
||||
|
||||
static void
|
||||
interpolate_gdouble_linear_neon (gdouble * o, const gdouble * a,
|
||||
gint len, const gdouble * icoeff)
|
||||
{
|
||||
}
|
||||
|
||||
static void
|
||||
interpolate_gdouble_cubic_neon (gdouble * o, const gdouble * a,
|
||||
gint len, const gdouble * icoeff)
|
||||
{
|
||||
}
|
||||
|
||||
/* One MAKE_RESAMPLE_FUNC expansion per sample type and interpolation mode.
 * NOTE(review): the macro is defined outside this header; presumably each
 * expansion produces the matching resample_<type>_<mode>_1_neon function
 * that audio_resampler_check_neon() assigns -- confirm against the macro. */

/* gint16 */
MAKE_RESAMPLE_FUNC (gint16, none, 1, neon);
MAKE_RESAMPLE_FUNC (gint16, linear, 1, neon);
MAKE_RESAMPLE_FUNC (gint16, cubic, 1, neon);

/* gint32 */
MAKE_RESAMPLE_FUNC (gint32, none, 1, neon);
MAKE_RESAMPLE_FUNC (gint32, linear, 1, neon);
MAKE_RESAMPLE_FUNC (gint32, cubic, 1, neon);

/* gfloat */
MAKE_RESAMPLE_FUNC (gfloat, none, 1, neon);
MAKE_RESAMPLE_FUNC (gfloat, linear, 1, neon);
MAKE_RESAMPLE_FUNC (gfloat, cubic, 1, neon);

/* gdouble */
MAKE_RESAMPLE_FUNC (gdouble, none, 1, neon);
MAKE_RESAMPLE_FUNC (gdouble, linear, 1, neon);
MAKE_RESAMPLE_FUNC (gdouble, cubic, 1, neon);
|
||||
|
||||
static void
|
||||
audio_resampler_check_neon (const gchar *target_name, const gchar *option)
|
||||
{
|
||||
if (!strcmp (target_name, "neon")) {
|
||||
GST_DEBUG ("enable NEON optimisations");
|
||||
resample_gint16_none_1 = resample_gint16_none_1_neon;
|
||||
|
||||
resample_gfloat_none_1 = resample_gfloat_none_1_neon;
|
||||
|
||||
if (0) {
|
||||
resample_gint16_linear_1 = resample_gint16_linear_1_neon;
|
||||
resample_gint16_cubic_1 = resample_gint16_cubic_1_neon;
|
||||
|
||||
resample_gint32_none_1 = resample_gint32_none_1_neon;
|
||||
resample_gint32_linear_1 = resample_gint32_linear_1_neon;
|
||||
resample_gint32_cubic_1 = resample_gint32_cubic_1_neon;
|
||||
|
||||
resample_gfloat_linear_1 = resample_gfloat_linear_1_neon;
|
||||
resample_gfloat_cubic_1 = resample_gfloat_cubic_1_neon;
|
||||
|
||||
resample_gdouble_none_1 = resample_gdouble_none_1_neon;
|
||||
resample_gdouble_linear_1 = resample_gdouble_linear_1_neon;
|
||||
resample_gdouble_cubic_1 = resample_gdouble_cubic_1_neon;
|
||||
|
||||
interpolate_gdouble_linear = interpolate_gdouble_linear_neon;
|
||||
interpolate_gdouble_cubic = interpolate_gdouble_cubic_neon;
|
||||
}
|
||||
}
|
||||
}
|
|
@ -546,9 +546,9 @@ MAKE_RESAMPLE_FUNC (gint32, cubic, 1, sse41);
|
|||
#endif
|
||||
|
||||
static void
|
||||
audio_resampler_check_x86 (const gchar *option)
|
||||
audio_resampler_check_x86 (const gchar *target_name, const gchar *option)
|
||||
{
|
||||
if (!strcmp (option, "sse")) {
|
||||
if (!strcmp (target_name, "sse")) {
|
||||
#if defined (HAVE_XMMINTRIN_H) && defined(__SSE__)
|
||||
GST_DEBUG ("enable SSE optimisations");
|
||||
resample_gfloat_none_1 = resample_gfloat_none_1_sse;
|
||||
|
@ -559,23 +559,19 @@ audio_resampler_check_x86 (const gchar *option)
|
|||
#else
|
||||
GST_DEBUG ("SSE optimisations not enabled");
|
||||
#endif
|
||||
} else if (!strcmp (option, "sse2")) {
|
||||
}
|
||||
if (!strcmp (option, "sse2")) {
|
||||
#if defined (HAVE_EMMINTRIN_H) && defined(__SSE2__)
|
||||
GST_DEBUG ("enable SSE2 optimisations");
|
||||
resample_gint16_none_1 = resample_gint16_none_1_sse2;
|
||||
resample_gint16_linear_1 = resample_gint16_linear_1_sse2;
|
||||
resample_gint16_cubic_1 = resample_gint16_cubic_1_sse2;
|
||||
|
||||
resample_gfloat_none_1 = resample_gfloat_none_1_sse;
|
||||
resample_gfloat_linear_1 = resample_gfloat_linear_1_sse;
|
||||
resample_gfloat_cubic_1 = resample_gfloat_cubic_1_sse;
|
||||
|
||||
resample_gdouble_none_1 = resample_gdouble_none_1_sse2;
|
||||
resample_gdouble_linear_1 = resample_gdouble_linear_1_sse2;
|
||||
resample_gdouble_cubic_1 = resample_gdouble_cubic_1_sse2;
|
||||
|
||||
resample_gint16_none_2 = resample_gint16_none_2_sse2;
|
||||
resample_gfloat_none_2 = resample_gfloat_none_2_sse;
|
||||
resample_gdouble_none_2 = resample_gdouble_none_2_sse2;
|
||||
|
||||
interpolate_gdouble_linear = interpolate_gdouble_linear_sse2;
|
||||
|
|
|
@ -637,14 +637,17 @@ inner_product_##type##_none_1_c (type * o, const type * a, \
|
|||
const type * b, gint len, const type *ic) \
|
||||
{ \
|
||||
gint i; \
|
||||
type2 res = 0; \
|
||||
type2 res[4] = { 0, 0, 0, 0 }; \
|
||||
\
|
||||
for (i = 0; i < len; i += 2) { \
|
||||
res += (type2) a[2*i+0] * (type2) b[2*i+0]; \
|
||||
res += (type2) a[2*i+1] * (type2) b[2*i+1]; \
|
||||
for (i = 0; i < len; i += 4) { \
|
||||
res[0] += (type2) a[i + 0] * (type2) b[i + 0]; \
|
||||
res[1] += (type2) a[i + 1] * (type2) b[i + 1]; \
|
||||
res[2] += (type2) a[i + 2] * (type2) b[i + 2]; \
|
||||
res[3] += (type2) a[i + 3] * (type2) b[i + 3]; \
|
||||
} \
|
||||
res = (res + ((type2)1 << ((prec) - 1))) >> (prec); \
|
||||
*o = CLAMP (res, -(limit), (limit) - 1); \
|
||||
res[0] = res[0] + res[1] + res[2] + res[3]; \
|
||||
res[0] = (res[0] + ((type2)1 << ((prec) - 1))) >> (prec); \
|
||||
*o = CLAMP (res[0], -(limit), (limit) - 1); \
|
||||
}
|
||||
|
||||
INNER_PRODUCT_INT_NONE_FUNC (gint16, gint32, PRECISION_S16, (gint32) 1 << 15);
|
||||
|
@ -656,14 +659,18 @@ inner_product_##type##_linear_1_c (type * o, const type * a, \
|
|||
const type * b, gint len, const type *ic) \
|
||||
{ \
|
||||
gint i; \
|
||||
type2 res[2] = { 0, 0 }; \
|
||||
type2 res[4] = { 0, 0, 0, 0 }; \
|
||||
\
|
||||
for (i = 0; i < len; i++) { \
|
||||
res[0] += (type2) a[i] * (type2) b[2 * i + 0]; \
|
||||
res[1] += (type2) a[i] * (type2) b[2 * i + 1]; \
|
||||
for (i = 0; i < len; i += 2) { \
|
||||
res[0] += (type2) a[i + 0] * (type2) b[2 * i + 0]; \
|
||||
res[1] += (type2) a[i + 0] * (type2) b[2 * i + 1]; \
|
||||
res[2] += (type2) a[i + 1] * (type2) b[2 * i + 2]; \
|
||||
res[3] += (type2) a[i + 1] * (type2) b[2 * i + 3]; \
|
||||
} \
|
||||
res[0] = (type2)(type)(res[0] >> (prec)) * (type2) ic[0] + \
|
||||
(type2)(type)(res[1] >> (prec)) * (type2) ic[1]; \
|
||||
res[0] = (res[0] + res[2]) >> (prec); \
|
||||
res[1] = (res[1] + res[3]) >> (prec); \
|
||||
res[0] = (type2)(type)res[0] * (type2) ic[0] + \
|
||||
(type2)(type)res[1] * (type2) ic[1]; \
|
||||
res[0] = (res[0] + ((type2)1 << ((prec) - 1))) >> (prec); \
|
||||
*o = CLAMP (res[0], -(limit), (limit) - 1); \
|
||||
}
|
||||
|
@ -702,13 +709,15 @@ inner_product_##type##_none_1_c (type * o, const type * a, \
|
|||
const type * b, gint len, const type *ic) \
|
||||
{ \
|
||||
gint i; \
|
||||
type res = 0.0; \
|
||||
type res[4] = { 0.0, 0.0, 0.0, 0.0 }; \
|
||||
\
|
||||
for (i = 0; i < len; i += 2) { \
|
||||
res += a[2 * i + 0] * b[2 * i + 0]; \
|
||||
res += a[2 * i + 1] * b[2 * i + 1]; \
|
||||
for (i = 0; i < len; i += 4) { \
|
||||
res[0] += a[i + 0] * b[i + 0]; \
|
||||
res[1] += a[i + 1] * b[i + 1]; \
|
||||
res[2] += a[i + 2] * b[i + 2]; \
|
||||
res[3] += a[i + 3] * b[i + 3]; \
|
||||
} \
|
||||
*o = res; \
|
||||
*o = res[0] + res[1] + res[2] + res[3]; \
|
||||
}
|
||||
|
||||
INNER_PRODUCT_FLOAT_NONE_FUNC (gfloat);
|
||||
|
@ -720,13 +729,16 @@ inner_product_##type##_linear_1_c (type * o, const type * a, \
|
|||
const type * b, gint len, const type *ic) \
|
||||
{ \
|
||||
gint i; \
|
||||
type res[2] = { 0.0, 0.0 }; \
|
||||
type res[4] = { 0.0, 0.0, 0.0, 0.0 }; \
|
||||
\
|
||||
for (i = 0; i < len; i++) { \
|
||||
for (i = 0; i < len; i += 2) { \
|
||||
res[0] += a[i] * b[2 * i + 0]; \
|
||||
res[1] += a[i] * b[2 * i + 1]; \
|
||||
res[2] += a[i] * b[2 * i + 2]; \
|
||||
res[3] += a[i] * b[2 * i + 3]; \
|
||||
} \
|
||||
*o = res[0] * ic[0] + res[1] * ic[1]; \
|
||||
*o = (res[0] + res[2]) * ic[0] + \
|
||||
(res[1] + res[3]) * ic[1]; \
|
||||
}
|
||||
INNER_PRODUCT_FLOAT_LINEAR_FUNC (gfloat);
|
||||
INNER_PRODUCT_FLOAT_LINEAR_FUNC (gdouble);
|
||||
|
@ -856,6 +868,10 @@ static ResampleFunc resample_funcs[] = {
|
|||
#define resample_gdouble_cubic_1 resample_funcs[19]
|
||||
|
||||
#if defined HAVE_ORC && !defined DISABLE_ORC
|
||||
# if defined (__ARM_NEON__)
|
||||
# define CHECK_NEON
|
||||
# include "audio-resampler-neon.h"
|
||||
# endif
|
||||
# if defined (__i386__) || defined (__x86_64__)
|
||||
# define CHECK_X86
|
||||
# include "audio-resampler-x86.h"
|
||||
|
@ -880,17 +896,20 @@ audio_resampler_init (void)
|
|||
|
||||
if (target) {
|
||||
unsigned int flags = orc_target_get_default_flags (target);
|
||||
const gchar *name;
|
||||
const gchar *tname, *name;
|
||||
|
||||
name = orc_target_get_name (target);
|
||||
GST_DEBUG ("target %s, default flags %08x", name, flags);
|
||||
tname = orc_target_get_name (target);
|
||||
GST_DEBUG ("target %s, default flags %08x", tname, flags);
|
||||
|
||||
for (i = 0; i < 32; ++i) {
|
||||
if (flags & (1U << i)) {
|
||||
name = orc_target_get_flag_name (target, i);
|
||||
GST_DEBUG ("target flag %s", name);
|
||||
#ifdef CHECK_X86
|
||||
audio_resampler_check_x86 (name);
|
||||
audio_resampler_check_x86 (tname, name);
|
||||
#endif
|
||||
#ifdef CHECK_NEON
|
||||
audio_resampler_check_neon (tname, name);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue