mirror of
https://gitlab.freedesktop.org/gstreamer/gstreamer.git
synced 2024-11-18 15:51:11 +00:00
346 lines
9.9 KiB
C
346 lines
9.9 KiB
C
/* Karatsuba convolution
|
|
*
|
|
* Copyright (C) 1999 Ralph Loader <suckfish@ihug.co.nz>
|
|
*
|
|
* This library is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Library General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2 of the License, or (at your option) any later version.
|
|
*
|
|
* This library is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Library General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Library General Public
|
|
* License along with this library; if not, write to the
|
|
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
|
|
* Boston, MA 02110-1301, USA.
|
|
*
|
|
*
|
|
* Note: 7th December 2004: This file used to be licensed under the GPL,
|
|
* but we got permission from Ralp Loader to relicense it to LGPL.
|
|
*
|
|
* $Id$
|
|
*
|
|
*/
|
|
|
|
/* The algorithm is based on the following. For the convolution of a pair
|
|
* of pairs, (a,b) * (c,d) = (0, a.c, a.d+b.c, b.d), we can reduce the four
|
|
* multiplications to three, by the formulae a.d+b.c = (a+b).(c+d) - a.c -
|
|
* b.d. A similar relation enables us to compute a 2n by 2n convolution
|
|
* using 3 n by n convolutions, and thus a 2^n by 2^n convolution using 3^n
|
|
* multiplications (as opposed to the 4^n that the quadratic algorithm
|
|
* takes. */
|
|
|
|
/* For large n, this is slower than the O(n log n) that the FFT method
|
|
* takes, but we avoid using complex numbers, and we only have to compute
|
|
* one convolution, as opposed to 3 FFTs. We have good locality-of-
|
|
* reference as well, which will help on CPUs with tiny caches. */
|
|
|
|
/* E.g., for a 512 x 512 convolution, the FFT method takes 55 * 512 = 28160
|
|
* (real) multiplications, as opposed to 3^9 = 19683 for the Karatsuba
|
|
* algorithm. We actually want 257 outputs of a 256 x 512 convolution;
|
|
* that doesn't appear to give an easy advantage for the FFT algorithm, but
|
|
* for the Karatsuba algorithm, it's easy to use two 256 x 256
|
|
* convolutions, taking 2 x 3^8 = 12312 multiplications. [This difference
|
|
* is that the FFT method "wraps" the arrays, doing a 2^n x 2^n -> 2^n,
|
|
* while the Karatsuba algorithm pads with zeros, doing 2^n x 2^n -> 2.2^n
|
|
* - 1]. */
|
|
|
|
/* There's a big lie above, actually... for a 4x4 convolution, it's quicker
|
|
* to do it using 16 multiplications than the more complex Karatsuba
|
|
* algorithm... So the recursion bottoms out at 4x4s. This increases the
|
|
* number of multiplications by a factor of 16/9, but reduces the overheads
|
|
* dramatically. */
|
|
|
|
/* The convolution algorithm is implemented as a stack machine. We have a
|
|
* stack of commands, each in one of the forms "do a 2^n x 2^n
|
|
* convolution", or "combine these three length 2^n outputs into one
|
|
* 2^{n+1} output." */
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
#include "config.h"
|
|
#endif
|
|
|
|
#include <stdlib.h>
|
|
#include "convolve.h"
|
|
|
|
typedef union stack_entry_s
|
|
{
|
|
struct
|
|
{
|
|
const double *left, *right;
|
|
double *out;
|
|
}
|
|
v;
|
|
struct
|
|
{
|
|
double *main, *null;
|
|
}
|
|
b;
|
|
|
|
}
|
|
stack_entry;
|
|
|
|
#define STACK_SIZE (CONVOLVE_DEPTH * 3)
|
|
|
|
struct _struct_convolve_state
|
|
{
|
|
double left[CONVOLVE_BIG];
|
|
double right[CONVOLVE_SMALL * 3];
|
|
double scratch[CONVOLVE_SMALL * 3];
|
|
stack_entry stack[STACK_SIZE + 1];
|
|
};
|
|
|
|
/*
|
|
* Initialisation routine - sets up tables and space to work in.
|
|
* Returns a pointer to internal state, to be used when performing calls.
|
|
* On error, returns NULL.
|
|
* The pointer should be freed when it is finished with, by convolve_close().
|
|
*/
|
|
convolve_state *
|
|
convolve_init (void)
|
|
{
|
|
return (convolve_state *) calloc (1, sizeof (convolve_state));
|
|
}
|
|
|
|
/*
|
|
* Free the state allocated with convolve_init().
|
|
*/
|
|
void
|
|
convolve_close (convolve_state * state)
|
|
{
|
|
free (state);
|
|
}
|
|
|
|
static void
|
|
convolve_4 (double *out, const double *left, const double *right)
|
|
/* This does a 4x4 -> 7 convolution. For what it's worth, the slightly odd
|
|
* ordering gives about a 1% speed up on my Pentium II. */
|
|
{
|
|
double l0, l1, l2, l3, r0, r1, r2, r3;
|
|
double a;
|
|
|
|
l0 = left[0];
|
|
r0 = right[0];
|
|
a = l0 * r0;
|
|
l1 = left[1];
|
|
r1 = right[1];
|
|
out[0] = a;
|
|
a = (l0 * r1) + (l1 * r0);
|
|
l2 = left[2];
|
|
r2 = right[2];
|
|
out[1] = a;
|
|
a = (l0 * r2) + (l1 * r1) + (l2 * r0);
|
|
l3 = left[3];
|
|
r3 = right[3];
|
|
out[2] = a;
|
|
|
|
out[3] = (l0 * r3) + (l1 * r2) + (l2 * r1) + (l3 * r0);
|
|
out[4] = (l1 * r3) + (l2 * r2) + (l3 * r1);
|
|
out[5] = (l2 * r3) + (l3 * r2);
|
|
out[6] = l3 * r3;
|
|
}
|
|
|
|
static void
|
|
convolve_run (stack_entry * top, unsigned size, double *scratch)
|
|
/* Interpret a stack of commands. The stack starts with two entries; the
|
|
* convolution to do, and an illegal entry used to mark the stack top. The
|
|
* size is the number of entries in each input, and must be a power of 2,
|
|
* and at least 8. It is OK to have out equal to left and/or right.
|
|
* scratch must have length 3*size. The number of stack entries needed is
|
|
* 3n-4 where size=2^n. */
|
|
{
|
|
do {
|
|
const double *left;
|
|
const double *right;
|
|
double *out;
|
|
|
|
/* When we get here, the stack top is always a convolve,
|
|
* with size > 4. So we will split it. We repeatedly split
|
|
* the top entry until we get to size = 4. */
|
|
|
|
left = top->v.left;
|
|
right = top->v.right;
|
|
out = top->v.out;
|
|
top++;
|
|
|
|
do {
|
|
double *s_left, *s_right;
|
|
int i;
|
|
|
|
/* Halve the size. */
|
|
size >>= 1;
|
|
|
|
/* Allocate the scratch areas. */
|
|
s_left = scratch + size * 3;
|
|
/* s_right is a length 2*size buffer also used for
|
|
* intermediate output. */
|
|
s_right = scratch + size * 4;
|
|
|
|
/* Create the intermediate factors. */
|
|
for (i = 0; i < size; i++) {
|
|
double l = left[i] + left[i + size];
|
|
double r = right[i] + right[i + size];
|
|
|
|
s_left[i + size] = r;
|
|
s_left[i] = l;
|
|
}
|
|
|
|
/* Push the combine entry onto the stack. */
|
|
top -= 3;
|
|
top[2].b.main = out;
|
|
top[2].b.null = NULL;
|
|
|
|
/* Push the low entry onto the stack. This must be
|
|
* the last of the three sub-convolutions, because
|
|
* it may overwrite the arguments. */
|
|
top[1].v.left = left;
|
|
top[1].v.right = right;
|
|
top[1].v.out = out;
|
|
|
|
/* Push the mid entry onto the stack. */
|
|
top[0].v.left = s_left;
|
|
top[0].v.right = s_right;
|
|
top[0].v.out = s_right;
|
|
|
|
/* Leave the high entry in variables. */
|
|
left += size;
|
|
right += size;
|
|
out += size * 2;
|
|
|
|
} while (size > 4);
|
|
|
|
/* When we get here, the stack top is a group of 3
|
|
* convolves, with size = 4, followed by some combines. */
|
|
convolve_4 (out, left, right);
|
|
convolve_4 (top[0].v.out, top[0].v.left, top[0].v.right);
|
|
convolve_4 (top[1].v.out, top[1].v.left, top[1].v.right);
|
|
top += 2;
|
|
|
|
/* Now process combines. */
|
|
do {
|
|
/* b.main is the output buffer, mid is the middle
|
|
* part which needs to be adjusted in place, and
|
|
* then folded back into the output. We do this in
|
|
* a slightly strange way, so as to avoid having
|
|
* two loops. */
|
|
double *out = top->b.main;
|
|
double *mid = scratch + size * 4;
|
|
unsigned int i;
|
|
|
|
top++;
|
|
out[size * 2 - 1] = 0;
|
|
for (i = 0; i < size - 1; i++) {
|
|
double lo;
|
|
double hi;
|
|
|
|
lo = mid[0] - (out[0] + out[2 * size]) + out[size];
|
|
hi = mid[size] - (out[size] + out[3 * size]) + out[2 * size];
|
|
out[size] = lo;
|
|
out[2 * size] = hi;
|
|
out++;
|
|
mid++;
|
|
}
|
|
size <<= 1;
|
|
} while (top->b.null == NULL);
|
|
} while (top->b.main != NULL);
|
|
}
|
|
|
|
int
|
|
convolve_match (const int *lastchoice,
|
|
const short *input, convolve_state * state)
|
|
/* lastchoice is a 256 sized array. input is a 512 array. We find the
|
|
* contiguous length 256 sub-array of input that best matches lastchoice.
|
|
* A measure of how good a sub-array is compared with the lastchoice is
|
|
* given by the sum of the products of each pair of entries. We maximise
|
|
* that, by taking an appropriate convolution, and then finding the maximum
|
|
* entry in the convolutions. state is a (non-NULL) pointer returned by
|
|
* convolve_init. */
|
|
{
|
|
double avg;
|
|
double best;
|
|
int p = 0;
|
|
int i;
|
|
double *left = state->left;
|
|
double *right = state->right;
|
|
double *scratch = state->scratch;
|
|
stack_entry *top = state->stack + (STACK_SIZE - 1);
|
|
|
|
#if 1
|
|
for (i = 0; i < 512; i++)
|
|
left[i] = input[i];
|
|
|
|
avg = 0;
|
|
for (i = 0; i < 256; i++) {
|
|
double a = lastchoice[255 - i];
|
|
|
|
right[i] = a;
|
|
avg += a;
|
|
}
|
|
#endif
|
|
/* We adjust the smaller of the two input arrays to have average
|
|
* value 0. This makes the eventual result insensitive to both
|
|
* constant offsets and positive multipliers of the inputs. */
|
|
avg /= 256;
|
|
for (i = 0; i < 256; i++)
|
|
right[i] -= avg;
|
|
/* End-of-stack marker. */
|
|
top[1].b.null = scratch;
|
|
top[1].b.main = NULL;
|
|
/* The low 256x256, of which we want the high 256 outputs. */
|
|
top->v.left = left;
|
|
top->v.right = right;
|
|
top->v.out = right + 256;
|
|
convolve_run (top, 256, scratch);
|
|
|
|
/* The high 256x256, of which we want the low 256 outputs. */
|
|
top->v.left = left + 256;
|
|
top->v.right = right;
|
|
top->v.out = right;
|
|
convolve_run (top, 256, scratch);
|
|
|
|
/* Now find the best position amoungs this. Apart from the first
|
|
* and last, the required convolution outputs are formed by adding
|
|
* outputs from the two convolutions above. */
|
|
best = right[511];
|
|
right[767] = 0;
|
|
p = -1;
|
|
for (i = 0; i < 256; i++) {
|
|
double a = right[i] + right[i + 512];
|
|
|
|
if (a > best) {
|
|
best = a;
|
|
p = i;
|
|
}
|
|
}
|
|
p++;
|
|
|
|
#if 0
|
|
{
|
|
/* This is some debugging code... */
|
|
int bad = 0;
|
|
|
|
best = 0;
|
|
for (i = 0; i < 256; i++)
|
|
best += ((double) input[i + p]) * ((double) lastchoice[i] - avg);
|
|
|
|
for (i = 0; i < 257; i++) {
|
|
double tot = 0;
|
|
unsigned int j;
|
|
|
|
for (j = 0; j < 256; j++)
|
|
tot += ((double) input[i + j]) * ((double) lastchoice[j] - avg);
|
|
if (tot > best)
|
|
printf ("(%i)", i);
|
|
if (tot != left[i + 255])
|
|
printf ("!");
|
|
}
|
|
|
|
printf ("%i\n", p);
|
|
}
|
|
#endif
|
|
|
|
return p;
|
|
}
|