/* * Copyright (c) 2003-2010, Mark Borgerding. All rights reserved. * This file is part of KISS FFT - https://github.com/mborgerding/kissfft * * SPDX-License-Identifier: BSD-3-Clause * See COPYING file for more information. */ #include "_kiss_fft_guts_f32.h" /* The guts header contains all the multiplication and addition macros that are defined for fixed or floating point complex numbers. It also delares the kf_ internal functions. */ static void kf_bfly2 (kiss_fft_f32_cpx * Fout, const size_t fstride, const kiss_fft_f32_cfg st, int m) { kiss_fft_f32_cpx *Fout2; kiss_fft_f32_cpx *tw1 = st->twiddles; kiss_fft_f32_cpx t; Fout2 = Fout + m; do { C_FIXDIV (*Fout, 2); C_FIXDIV (*Fout2, 2); C_MUL (t, *Fout2, *tw1); tw1 += fstride; C_SUB (*Fout2, *Fout, t); C_ADDTO (*Fout, t); ++Fout2; ++Fout; } while (--m); } static void kf_bfly4 (kiss_fft_f32_cpx * Fout, const size_t fstride, const kiss_fft_f32_cfg st, const size_t m) { kiss_fft_f32_cpx *tw1, *tw2, *tw3; kiss_fft_f32_cpx scratch[6]; size_t k = m; const size_t m2 = 2 * m; const size_t m3 = 3 * m; tw3 = tw2 = tw1 = st->twiddles; do { C_FIXDIV (*Fout, 4); C_FIXDIV (Fout[m], 4); C_FIXDIV (Fout[m2], 4); C_FIXDIV (Fout[m3], 4); C_MUL (scratch[0], Fout[m], *tw1); C_MUL (scratch[1], Fout[m2], *tw2); C_MUL (scratch[2], Fout[m3], *tw3); C_SUB (scratch[5], *Fout, scratch[1]); C_ADDTO (*Fout, scratch[1]); C_ADD (scratch[3], scratch[0], scratch[2]); C_SUB (scratch[4], scratch[0], scratch[2]); C_SUB (Fout[m2], *Fout, scratch[3]); tw1 += fstride; tw2 += fstride * 2; tw3 += fstride * 3; C_ADDTO (*Fout, scratch[3]); if (st->inverse) { Fout[m].r = scratch[5].r - scratch[4].i; Fout[m].i = scratch[5].i + scratch[4].r; Fout[m3].r = scratch[5].r + scratch[4].i; Fout[m3].i = scratch[5].i - scratch[4].r; } else { Fout[m].r = scratch[5].r + scratch[4].i; Fout[m].i = scratch[5].i - scratch[4].r; Fout[m3].r = scratch[5].r - scratch[4].i; Fout[m3].i = scratch[5].i + scratch[4].r; } ++Fout; } while (--k); } static void kf_bfly3 (kiss_fft_f32_cpx * Fout, const size_t fstride, const kiss_fft_f32_cfg st, size_t m) { size_t k = m; const size_t m2 = 2 * m; kiss_fft_f32_cpx *tw1, *tw2; kiss_fft_f32_cpx scratch[5]; kiss_fft_f32_cpx epi3; epi3 = st->twiddles[fstride * m]; tw1 = tw2 = st->twiddles; do { C_FIXDIV (*Fout, 3); C_FIXDIV (Fout[m], 3); C_FIXDIV (Fout[m2], 3); C_MUL (scratch[1], Fout[m], *tw1); C_MUL (scratch[2], Fout[m2], *tw2); C_ADD (scratch[3], scratch[1], scratch[2]); C_SUB (scratch[0], scratch[1], scratch[2]); tw1 += fstride; tw2 += fstride * 2; Fout[m].r = Fout->r - HALF_OF (scratch[3].r); Fout[m].i = Fout->i - HALF_OF (scratch[3].i); C_MULBYSCALAR (scratch[0], epi3.i); C_ADDTO (*Fout, scratch[3]); Fout[m2].r = Fout[m].r + scratch[0].i; Fout[m2].i = Fout[m].i - scratch[0].r; Fout[m].r -= scratch[0].i; Fout[m].i += scratch[0].r; ++Fout; } while (--k); } static void kf_bfly5 (kiss_fft_f32_cpx * Fout, const size_t fstride, const kiss_fft_f32_cfg st, int m) { kiss_fft_f32_cpx *Fout0, *Fout1, *Fout2, *Fout3, *Fout4; int u; kiss_fft_f32_cpx scratch[13]; kiss_fft_f32_cpx *twiddles = st->twiddles; kiss_fft_f32_cpx *tw; kiss_fft_f32_cpx ya, yb; ya = twiddles[fstride * m]; yb = twiddles[fstride * 2 * m]; Fout0 = Fout; Fout1 = Fout0 + m; Fout2 = Fout0 + 2 * m; Fout3 = Fout0 + 3 * m; Fout4 = Fout0 + 4 * m; tw = st->twiddles; for (u = 0; u < m; ++u) { C_FIXDIV (*Fout0, 5); C_FIXDIV (*Fout1, 5); C_FIXDIV (*Fout2, 5); C_FIXDIV (*Fout3, 5); C_FIXDIV (*Fout4, 5); scratch[0] = *Fout0; C_MUL (scratch[1], *Fout1, tw[u * fstride]); C_MUL (scratch[2], *Fout2, tw[2 * u * fstride]); C_MUL (scratch[3], *Fout3, tw[3 * u * fstride]); C_MUL (scratch[4], *Fout4, tw[4 * u * fstride]); C_ADD (scratch[7], scratch[1], scratch[4]); C_SUB (scratch[10], scratch[1], scratch[4]); C_ADD (scratch[8], scratch[2], scratch[3]); C_SUB (scratch[9], scratch[2], scratch[3]); Fout0->r += scratch[7].r + scratch[8].r; Fout0->i += scratch[7].i + scratch[8].i; scratch[5].r = scratch[0].r + S_MUL (scratch[7].r, ya.r) + S_MUL (scratch[8].r, yb.r); scratch[5].i = scratch[0].i + S_MUL (scratch[7].i, ya.r) + S_MUL (scratch[8].i, yb.r); scratch[6].r = S_MUL (scratch[10].i, ya.i) + S_MUL (scratch[9].i, yb.i); scratch[6].i = -S_MUL (scratch[10].r, ya.i) - S_MUL (scratch[9].r, yb.i); C_SUB (*Fout1, scratch[5], scratch[6]); C_ADD (*Fout4, scratch[5], scratch[6]); scratch[11].r = scratch[0].r + S_MUL (scratch[7].r, yb.r) + S_MUL (scratch[8].r, ya.r); scratch[11].i = scratch[0].i + S_MUL (scratch[7].i, yb.r) + S_MUL (scratch[8].i, ya.r); scratch[12].r = -S_MUL (scratch[10].i, yb.i) + S_MUL (scratch[9].i, ya.i); scratch[12].i = S_MUL (scratch[10].r, yb.i) - S_MUL (scratch[9].r, ya.i); C_ADD (*Fout2, scratch[11], scratch[12]); C_SUB (*Fout3, scratch[11], scratch[12]); ++Fout0; ++Fout1; ++Fout2; ++Fout3; ++Fout4; } } /* perform the butterfly for one stage of a mixed radix FFT */ static void kf_bfly_generic (kiss_fft_f32_cpx * Fout, const size_t fstride, const kiss_fft_f32_cfg st, int m, int p) { int u, k, q1, q; kiss_fft_f32_cpx *twiddles = st->twiddles; kiss_fft_f32_cpx t; int Norig = st->nfft; kiss_fft_f32_cpx *scratch = (kiss_fft_f32_cpx *) KISS_FFT_F32_TMP_ALLOC (sizeof (kiss_fft_f32_cpx) * p); for (u = 0; u < m; ++u) { k = u; for (q1 = 0; q1 < p; ++q1) { scratch[q1] = Fout[k]; C_FIXDIV (scratch[q1], p); k += m; } k = u; for (q1 = 0; q1 < p; ++q1) { int twidx = 0; Fout[k] = scratch[0]; for (q = 1; q < p; ++q) { twidx += fstride * k; if (twidx >= Norig) twidx -= Norig; C_MUL (t, scratch[q], twiddles[twidx]); C_ADDTO (Fout[k], t); } k += m; } } KISS_FFT_F32_TMP_FREE (scratch); } static void kf_work (kiss_fft_f32_cpx * Fout, const kiss_fft_f32_cpx * f, const size_t fstride, int in_stride, int *factors, const kiss_fft_f32_cfg st) { kiss_fft_f32_cpx *Fout_beg = Fout; const int p = *factors++; /* the radix */ const int m = *factors++; /* stage's fft length/p */ const kiss_fft_f32_cpx *Fout_end = Fout + p * m; #ifdef _OPENMP // use openmp extensions at the // top-level (not recursive) if (fstride == 1 && p <= 5 && m != 1) { int k; // execute the p different work units in different threads # pragma omp parallel for for (k = 0; k < p; ++k) kf_work (Fout + k * m, f + fstride * in_stride * k, fstride * p, in_stride, factors, st); // all threads have joined by this point switch (p) { case 2: kf_bfly2 (Fout, fstride, st, m); break; case 3: kf_bfly3 (Fout, fstride, st, m); break; case 4: kf_bfly4 (Fout, fstride, st, m); break; case 5: kf_bfly5 (Fout, fstride, st, m); break; default: kf_bfly_generic (Fout, fstride, st, m, p); break; } return; } #endif if (m == 1) { do { *Fout = *f; f += fstride * in_stride; } while (++Fout != Fout_end); } else { do { // recursive call: // DFT of size m*p performed by doing // p instances of smaller DFTs of size m, // each one takes a decimated version of the input kf_work (Fout, f, fstride * p, in_stride, factors, st); f += fstride * in_stride; } while ((Fout += m) != Fout_end); } Fout = Fout_beg; // recombine the p smaller DFTs switch (p) { case 2: kf_bfly2 (Fout, fstride, st, m); break; case 3: kf_bfly3 (Fout, fstride, st, m); break; case 4: kf_bfly4 (Fout, fstride, st, m); break; case 5: kf_bfly5 (Fout, fstride, st, m); break; default: kf_bfly_generic (Fout, fstride, st, m, p); break; } } /* facbuf is populated by p1,m1,p2,m2, ... where p[i] * m[i] = m[i-1] m0 = n */ static void kf_factor (int n, int *facbuf) { int p = 4; double floor_sqrt; floor_sqrt = floor (sqrt ((double) n)); /*factor out powers of 4, powers of 2, then any remaining primes */ do { while (n % p) { switch (p) { case 4: p = 2; break; case 2: p = 3; break; default: p += 2; break; } if (p > floor_sqrt) p = n; /* no more factors, skip to end */ } n /= p; *facbuf++ = p; *facbuf++ = n; } while (n > 1); } /* * * User-callable function to allocate all necessary storage space for the fft. * * The return value is a contiguous block of memory, allocated with malloc. As such, * It can be freed with free(), rather than a kiss_fft_f32-specific function. * */ kiss_fft_f32_cfg kiss_fft_f32_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem) { kiss_fft_f32_cfg st = NULL; size_t memneeded = sizeof (struct kiss_fft_f32_state) + sizeof (kiss_fft_f32_cpx) * (nfft - 1); /* twiddle factors */ if (lenmem == NULL) { st = (kiss_fft_f32_cfg) KISS_FFT_F32_MALLOC (memneeded); } else { if (mem != NULL && *lenmem >= memneeded) st = (kiss_fft_f32_cfg) mem; *lenmem = memneeded; } if (st) { int i; st->nfft = nfft; st->inverse = inverse_fft; for (i = 0; i < nfft; ++i) { const double pi = 3.141592653589793238462643383279502884197169399375105820974944; double phase = -2 * pi * i / nfft; if (st->inverse) phase *= -1; kf_cexp (st->twiddles + i, phase); } kf_factor (nfft, st->factors); } return st; } void kiss_fft_f32_stride (kiss_fft_f32_cfg st, const kiss_fft_f32_cpx * fin, kiss_fft_f32_cpx * fout, int in_stride) { if (fin == fout) { //NOTE: this is not really an in-place FFT algorithm. //It just performs an out-of-place FFT into a temp buffer kiss_fft_f32_cpx *tmpbuf = (kiss_fft_f32_cpx *) KISS_FFT_F32_TMP_ALLOC (sizeof (kiss_fft_f32_cpx) * st->nfft); kf_work (tmpbuf, fin, 1, in_stride, st->factors, st); memcpy (fout, tmpbuf, sizeof (kiss_fft_f32_cpx) * st->nfft); KISS_FFT_F32_TMP_FREE (tmpbuf); } else { kf_work (fout, fin, 1, in_stride, st->factors, st); } } void kiss_fft_f32 (kiss_fft_f32_cfg cfg, const kiss_fft_f32_cpx * fin, kiss_fft_f32_cpx * fout) { kiss_fft_f32_stride (cfg, fin, fout, 1); } void kiss_fft_f32_cleanup (void) { // nothing needed any more } int kiss_fft_f32_next_fast_size (int n) { while (1) { int m = n; while ((m % 2) == 0) m /= 2; while ((m % 3) == 0) m /= 3; while ((m % 5) == 0) m /= 5; if (m <= 1) break; /* n is completely factorable by twos, threes, and fives */ n++; } return n; }