gstreamer/gst-libs/gst/fft/kiss_fft_f32.c
Sebastian Dröge ad68f71d9a fft: Update our kiss fft version
This fixes thread-safety issues and various other minor issues. Our
previous version was about 13 years old.

Fixes https://gitlab.freedesktop.org/gstreamer/gst-plugins-base/issues/715
2019-12-27 11:53:14 +02:00

442 lines
11 KiB
C

/*
* Copyright (c) 2003-2010, Mark Borgerding. All rights reserved.
* This file is part of KISS FFT - https://github.com/mborgerding/kissfft
*
* SPDX-License-Identifier: BSD-3-Clause
* See COPYING file for more information.
*/
#include "_kiss_fft_guts_f32.h"
/* The guts header contains all the multiplication and addition macros that are defined for
fixed or floating point complex numbers. It also delares the kf_ internal functions.
*/
static void
kf_bfly2 (kiss_fft_f32_cpx * Fout,
const size_t fstride, const kiss_fft_f32_cfg st, int m)
{
kiss_fft_f32_cpx *Fout2;
kiss_fft_f32_cpx *tw1 = st->twiddles;
kiss_fft_f32_cpx t;
Fout2 = Fout + m;
do {
C_FIXDIV (*Fout, 2);
C_FIXDIV (*Fout2, 2);
C_MUL (t, *Fout2, *tw1);
tw1 += fstride;
C_SUB (*Fout2, *Fout, t);
C_ADDTO (*Fout, t);
++Fout2;
++Fout;
} while (--m);
}
static void
kf_bfly4 (kiss_fft_f32_cpx * Fout,
const size_t fstride, const kiss_fft_f32_cfg st, const size_t m)
{
kiss_fft_f32_cpx *tw1, *tw2, *tw3;
kiss_fft_f32_cpx scratch[6];
size_t k = m;
const size_t m2 = 2 * m;
const size_t m3 = 3 * m;
tw3 = tw2 = tw1 = st->twiddles;
do {
C_FIXDIV (*Fout, 4);
C_FIXDIV (Fout[m], 4);
C_FIXDIV (Fout[m2], 4);
C_FIXDIV (Fout[m3], 4);
C_MUL (scratch[0], Fout[m], *tw1);
C_MUL (scratch[1], Fout[m2], *tw2);
C_MUL (scratch[2], Fout[m3], *tw3);
C_SUB (scratch[5], *Fout, scratch[1]);
C_ADDTO (*Fout, scratch[1]);
C_ADD (scratch[3], scratch[0], scratch[2]);
C_SUB (scratch[4], scratch[0], scratch[2]);
C_SUB (Fout[m2], *Fout, scratch[3]);
tw1 += fstride;
tw2 += fstride * 2;
tw3 += fstride * 3;
C_ADDTO (*Fout, scratch[3]);
if (st->inverse) {
Fout[m].r = scratch[5].r - scratch[4].i;
Fout[m].i = scratch[5].i + scratch[4].r;
Fout[m3].r = scratch[5].r + scratch[4].i;
Fout[m3].i = scratch[5].i - scratch[4].r;
} else {
Fout[m].r = scratch[5].r + scratch[4].i;
Fout[m].i = scratch[5].i - scratch[4].r;
Fout[m3].r = scratch[5].r - scratch[4].i;
Fout[m3].i = scratch[5].i + scratch[4].r;
}
++Fout;
} while (--k);
}
static void
kf_bfly3 (kiss_fft_f32_cpx * Fout,
const size_t fstride, const kiss_fft_f32_cfg st, size_t m)
{
size_t k = m;
const size_t m2 = 2 * m;
kiss_fft_f32_cpx *tw1, *tw2;
kiss_fft_f32_cpx scratch[5];
kiss_fft_f32_cpx epi3;
epi3 = st->twiddles[fstride * m];
tw1 = tw2 = st->twiddles;
do {
C_FIXDIV (*Fout, 3);
C_FIXDIV (Fout[m], 3);
C_FIXDIV (Fout[m2], 3);
C_MUL (scratch[1], Fout[m], *tw1);
C_MUL (scratch[2], Fout[m2], *tw2);
C_ADD (scratch[3], scratch[1], scratch[2]);
C_SUB (scratch[0], scratch[1], scratch[2]);
tw1 += fstride;
tw2 += fstride * 2;
Fout[m].r = Fout->r - HALF_OF (scratch[3].r);
Fout[m].i = Fout->i - HALF_OF (scratch[3].i);
C_MULBYSCALAR (scratch[0], epi3.i);
C_ADDTO (*Fout, scratch[3]);
Fout[m2].r = Fout[m].r + scratch[0].i;
Fout[m2].i = Fout[m].i - scratch[0].r;
Fout[m].r -= scratch[0].i;
Fout[m].i += scratch[0].r;
++Fout;
} while (--k);
}
static void
kf_bfly5 (kiss_fft_f32_cpx * Fout,
const size_t fstride, const kiss_fft_f32_cfg st, int m)
{
kiss_fft_f32_cpx *Fout0, *Fout1, *Fout2, *Fout3, *Fout4;
int u;
kiss_fft_f32_cpx scratch[13];
kiss_fft_f32_cpx *twiddles = st->twiddles;
kiss_fft_f32_cpx *tw;
kiss_fft_f32_cpx ya, yb;
ya = twiddles[fstride * m];
yb = twiddles[fstride * 2 * m];
Fout0 = Fout;
Fout1 = Fout0 + m;
Fout2 = Fout0 + 2 * m;
Fout3 = Fout0 + 3 * m;
Fout4 = Fout0 + 4 * m;
tw = st->twiddles;
for (u = 0; u < m; ++u) {
C_FIXDIV (*Fout0, 5);
C_FIXDIV (*Fout1, 5);
C_FIXDIV (*Fout2, 5);
C_FIXDIV (*Fout3, 5);
C_FIXDIV (*Fout4, 5);
scratch[0] = *Fout0;
C_MUL (scratch[1], *Fout1, tw[u * fstride]);
C_MUL (scratch[2], *Fout2, tw[2 * u * fstride]);
C_MUL (scratch[3], *Fout3, tw[3 * u * fstride]);
C_MUL (scratch[4], *Fout4, tw[4 * u * fstride]);
C_ADD (scratch[7], scratch[1], scratch[4]);
C_SUB (scratch[10], scratch[1], scratch[4]);
C_ADD (scratch[8], scratch[2], scratch[3]);
C_SUB (scratch[9], scratch[2], scratch[3]);
Fout0->r += scratch[7].r + scratch[8].r;
Fout0->i += scratch[7].i + scratch[8].i;
scratch[5].r =
scratch[0].r + S_MUL (scratch[7].r, ya.r) + S_MUL (scratch[8].r, yb.r);
scratch[5].i =
scratch[0].i + S_MUL (scratch[7].i, ya.r) + S_MUL (scratch[8].i, yb.r);
scratch[6].r = S_MUL (scratch[10].i, ya.i) + S_MUL (scratch[9].i, yb.i);
scratch[6].i = -S_MUL (scratch[10].r, ya.i) - S_MUL (scratch[9].r, yb.i);
C_SUB (*Fout1, scratch[5], scratch[6]);
C_ADD (*Fout4, scratch[5], scratch[6]);
scratch[11].r =
scratch[0].r + S_MUL (scratch[7].r, yb.r) + S_MUL (scratch[8].r, ya.r);
scratch[11].i =
scratch[0].i + S_MUL (scratch[7].i, yb.r) + S_MUL (scratch[8].i, ya.r);
scratch[12].r = -S_MUL (scratch[10].i, yb.i) + S_MUL (scratch[9].i, ya.i);
scratch[12].i = S_MUL (scratch[10].r, yb.i) - S_MUL (scratch[9].r, ya.i);
C_ADD (*Fout2, scratch[11], scratch[12]);
C_SUB (*Fout3, scratch[11], scratch[12]);
++Fout0;
++Fout1;
++Fout2;
++Fout3;
++Fout4;
}
}
/* perform the butterfly for one stage of a mixed radix FFT */
static void
kf_bfly_generic (kiss_fft_f32_cpx * Fout,
const size_t fstride, const kiss_fft_f32_cfg st, int m, int p)
{
int u, k, q1, q;
kiss_fft_f32_cpx *twiddles = st->twiddles;
kiss_fft_f32_cpx t;
int Norig = st->nfft;
kiss_fft_f32_cpx *scratch =
(kiss_fft_f32_cpx *) KISS_FFT_F32_TMP_ALLOC (sizeof (kiss_fft_f32_cpx) *
p);
for (u = 0; u < m; ++u) {
k = u;
for (q1 = 0; q1 < p; ++q1) {
scratch[q1] = Fout[k];
C_FIXDIV (scratch[q1], p);
k += m;
}
k = u;
for (q1 = 0; q1 < p; ++q1) {
int twidx = 0;
Fout[k] = scratch[0];
for (q = 1; q < p; ++q) {
twidx += fstride * k;
if (twidx >= Norig)
twidx -= Norig;
C_MUL (t, scratch[q], twiddles[twidx]);
C_ADDTO (Fout[k], t);
}
k += m;
}
}
KISS_FFT_F32_TMP_FREE (scratch);
}
static void
kf_work (kiss_fft_f32_cpx * Fout,
const kiss_fft_f32_cpx * f,
const size_t fstride, int in_stride, int *factors,
const kiss_fft_f32_cfg st)
{
kiss_fft_f32_cpx *Fout_beg = Fout;
const int p = *factors++; /* the radix */
const int m = *factors++; /* stage's fft length/p */
const kiss_fft_f32_cpx *Fout_end = Fout + p * m;
#ifdef _OPENMP
// use openmp extensions at the
// top-level (not recursive)
if (fstride == 1 && p <= 5 && m != 1) {
int k;
// execute the p different work units in different threads
# pragma omp parallel for
for (k = 0; k < p; ++k)
kf_work (Fout + k * m, f + fstride * in_stride * k, fstride * p,
in_stride, factors, st);
// all threads have joined by this point
switch (p) {
case 2:
kf_bfly2 (Fout, fstride, st, m);
break;
case 3:
kf_bfly3 (Fout, fstride, st, m);
break;
case 4:
kf_bfly4 (Fout, fstride, st, m);
break;
case 5:
kf_bfly5 (Fout, fstride, st, m);
break;
default:
kf_bfly_generic (Fout, fstride, st, m, p);
break;
}
return;
}
#endif
if (m == 1) {
do {
*Fout = *f;
f += fstride * in_stride;
} while (++Fout != Fout_end);
} else {
do {
// recursive call:
// DFT of size m*p performed by doing
// p instances of smaller DFTs of size m,
// each one takes a decimated version of the input
kf_work (Fout, f, fstride * p, in_stride, factors, st);
f += fstride * in_stride;
} while ((Fout += m) != Fout_end);
}
Fout = Fout_beg;
// recombine the p smaller DFTs
switch (p) {
case 2:
kf_bfly2 (Fout, fstride, st, m);
break;
case 3:
kf_bfly3 (Fout, fstride, st, m);
break;
case 4:
kf_bfly4 (Fout, fstride, st, m);
break;
case 5:
kf_bfly5 (Fout, fstride, st, m);
break;
default:
kf_bfly_generic (Fout, fstride, st, m, p);
break;
}
}
/* facbuf is populated by p1,m1,p2,m2, ...
where
p[i] * m[i] = m[i-1]
m0 = n */
static void
kf_factor (int n, int *facbuf)
{
int p = 4;
double floor_sqrt;
floor_sqrt = floor (sqrt ((double) n));
/*factor out powers of 4, powers of 2, then any remaining primes */
do {
while (n % p) {
switch (p) {
case 4:
p = 2;
break;
case 2:
p = 3;
break;
default:
p += 2;
break;
}
if (p > floor_sqrt)
p = n; /* no more factors, skip to end */
}
n /= p;
*facbuf++ = p;
*facbuf++ = n;
} while (n > 1);
}
/*
*
* User-callable function to allocate all necessary storage space for the fft.
*
* The return value is a contiguous block of memory, allocated with malloc. As such,
* It can be freed with free(), rather than a kiss_fft_f32-specific function.
* */
kiss_fft_f32_cfg
kiss_fft_f32_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem)
{
kiss_fft_f32_cfg st = NULL;
size_t memneeded = sizeof (struct kiss_fft_f32_state)
+ sizeof (kiss_fft_f32_cpx) * (nfft - 1); /* twiddle factors */
if (lenmem == NULL) {
st = (kiss_fft_f32_cfg) KISS_FFT_F32_MALLOC (memneeded);
} else {
if (mem != NULL && *lenmem >= memneeded)
st = (kiss_fft_f32_cfg) mem;
*lenmem = memneeded;
}
if (st) {
int i;
st->nfft = nfft;
st->inverse = inverse_fft;
for (i = 0; i < nfft; ++i) {
const double pi =
3.141592653589793238462643383279502884197169399375105820974944;
double phase = -2 * pi * i / nfft;
if (st->inverse)
phase *= -1;
kf_cexp (st->twiddles + i, phase);
}
kf_factor (nfft, st->factors);
}
return st;
}
void
kiss_fft_f32_stride (kiss_fft_f32_cfg st, const kiss_fft_f32_cpx * fin,
kiss_fft_f32_cpx * fout, int in_stride)
{
if (fin == fout) {
//NOTE: this is not really an in-place FFT algorithm.
//It just performs an out-of-place FFT into a temp buffer
kiss_fft_f32_cpx *tmpbuf =
(kiss_fft_f32_cpx *) KISS_FFT_F32_TMP_ALLOC (sizeof (kiss_fft_f32_cpx) *
st->nfft);
kf_work (tmpbuf, fin, 1, in_stride, st->factors, st);
memcpy (fout, tmpbuf, sizeof (kiss_fft_f32_cpx) * st->nfft);
KISS_FFT_F32_TMP_FREE (tmpbuf);
} else {
kf_work (fout, fin, 1, in_stride, st->factors, st);
}
}
void
kiss_fft_f32 (kiss_fft_f32_cfg cfg, const kiss_fft_f32_cpx * fin,
kiss_fft_f32_cpx * fout)
{
kiss_fft_f32_stride (cfg, fin, fout, 1);
}
void
kiss_fft_f32_cleanup (void)
{
// nothing needed any more
}
int
kiss_fft_f32_next_fast_size (int n)
{
while (1) {
int m = n;
while ((m % 2) == 0)
m /= 2;
while ((m % 3) == 0)
m /= 3;
while ((m % 5) == 0)
m /= 5;
if (m <= 1)
break; /* n is completely factorable by twos, threes, and fives */
n++;
}
return n;
}