audiofxbasefirfilter: Optimize time-domain convolution

Remove some redundant calculations, move comparisions out of inner loops, etc. This makes the convolution about 3 (!) times faster but processing time is of course still proportional to the filter size.
2025-01-29 18:48:44 +00:00 · 2009-11-26 15:17:27 +01:00 · 2009-11-26 15:17:27 +01:00 · 43576fb0cf
commit 43576fb0cf
parent c5f955a3b6
1 changed files with 28 additions and 15 deletions
--- a/gst/audiofx/audiofxbasefirfilter.c
+++ b/gst/audiofx/audiofxbasefirfilter.c
@ -149,37 +149,50 @@ process_##width (GstAudioFXBaseFIRFilter * self, const g##ctype * src, g##ctype
  gint i, j, k, l; \
  gint channels = GST_AUDIO_FILTER_CAST (self)->format.channels; \
  gint res_start; \
+  gint from_input; \
+  gint off; \
+  gdouble *buffer = self->buffer; \
+  gdouble *kernel = self->kernel; \
+  guint buffer_length = self->buffer_length; \
  \
  /* convolution */ \
  for (i = 0; i < input_samples; i++) { \
    dst[i] = 0.0; \
    k = i % channels; \
    l = i / channels; \
-    for (j = 0; j < kernel_length; j++) \
-      if (l < j) \
-        dst[i] += \
-            self->buffer[(kernel_length + l - j) * channels + \
-            k] * self->kernel[j]; \
-      else \
-        dst[i] += src[(l - j) * channels + k] * self->kernel[j]; \
+    from_input = MIN (l, kernel_length-1); \
+    off = l * channels + k; \
+    for (j = 0; j <= from_input; j++) { \
+      dst[i] += src[off] * kernel[j]; \
+      off -= channels; \
+    } \
+    /* j == from_input && off == (l - j) * channels + k */ \
+    off += kernel_length * channels; \
+    for (; j < kernel_length; j++) { \
+      dst[i] += buffer[off] * kernel[j]; \
+      off -= channels; \
+    } \
  } \
  \
  /* copy the tail of the current input buffer to the residue, while \
   * keeping parts of the residue if the input buffer is smaller than \
   * the kernel length */ \
-  if (input_samples < kernel_length * channels) \
-    res_start = kernel_length * channels - input_samples; \
+  /* from now on take kernel length as length over all channels */ \
+  kernel_length *= channels; \
+  if (input_samples < kernel_length) \
+    res_start = kernel_length - input_samples; \
  else \
    res_start = 0; \
  \
  for (i = 0; i < res_start; i++) \
-    self->buffer[i] = self->buffer[i + input_samples]; \
-  for (i = res_start; i < kernel_length * channels; i++) \
-    self->buffer[i] = src[input_samples - kernel_length * channels + i]; \
+    buffer[i] = buffer[i + input_samples]; \
+  /* i == res_start */ \
+  for (; i < kernel_length; i++) \
+    buffer[i] = src[input_samples - kernel_length + i]; \
  \
-  self->buffer_fill += kernel_length * channels - res_start; \
-  if (self->buffer_fill > kernel_length * channels) \
-    self->buffer_fill = kernel_length * channels; \
+  self->buffer_fill += kernel_length - res_start; \
+  if (self->buffer_fill > kernel_length) \
+    self->buffer_fill = kernel_length; \
 }

 DEFINE_PROCESS_FUNC (32, float);