From ced6e8445a1ef47d0b93d9abd9400212ae20f86b Mon Sep 17 00:00:00 2001 From: Edward Hervey Date: Thu, 16 Feb 2006 17:06:46 +0000 Subject: [PATCH] gst/videoscale/vs_scanline.c: C-level optimization of the RGBA nearest neighbour function. Original commit message from CVS: Reviewed by Edward Hervey * gst/videoscale/vs_scanline.c: (vs_scanline_resample_nearest_RGBA): C-level optimization of the RGBA nearest neighbour function. Eventually this might end up in liboil with vectorized versions. --- ChangeLog | 8 ++++++ gst/videoscale/vs_scanline.c | 48 +++++++++++++++++++++++++++++++----- 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/ChangeLog b/ChangeLog index 7eaa33ebd4..01da4b04ee 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +2006-02-16 Mathieu Garcia + + Reviewed by Edward Hervey + + * gst/videoscale/vs_scanline.c: (vs_scanline_resample_nearest_RGBA): + C-level optimization of the RGBA nearest neighbour function. + Eventually this might end up in liboil with vectorized versions. + 2006-02-16 Tim-Philipp Müller * gst-libs/gst/audio/multichannel.c: diff --git a/gst/videoscale/vs_scanline.c b/gst/videoscale/vs_scanline.c index d3c6e102aa..cb2c28d7b8 100644 --- a/gst/videoscale/vs_scanline.c +++ b/gst/videoscale/vs_scanline.c @@ -29,6 +29,7 @@ #include #include +#include /* greyscale, i.e., single componenet */ @@ -114,22 +115,57 @@ void vs_scanline_resample_nearest_RGBA (guint8 * dest, guint8 * src, int n, int *accumulator, int increment) { + guint8 *tmpsrc; int acc = *accumulator; int i; int j; int x; - for (i = 0; i < n; i++) { + /* Optimization pass #1: + * - One pixel per iteration (NOTE(review): the "unroll by 16" originally listed here is not implemented) + * - Pointer arithmetic instead of indexed accesses (most CPUs have DAGs!) + * - Branch once per pixel instead of once per component + */ + for (i = 0, tmpsrc = src; i < n; i++) { j = acc >> 16; x = acc & 0xffff; - dest[i * 4 + 0] = (x < 32768) ? src[j * 4 + 0] : src[j * 4 + 4]; - dest[i * 4 + 1] = (x < 32768) ? src[j * 4 + 1] : src[j * 4 + 5]; - dest[i * 4 + 2] = (x < 32768) ? 
src[j * 4 + 2] : src[j * 4 + 6]; - dest[i * 4 + 3] = (x < 32768) ? src[j * 4 + 3] : src[j * 4 + 7]; - acc += increment; + if (x < 32768) { + tmpsrc = src + j * 4; + *dest++ = *tmpsrc++; + + /* Advance acc between the byte copies (meant to avoid pipeline stalls); j and x were already taken from acc */ + acc += increment; + + *dest++ = *tmpsrc++; + *dest++ = *tmpsrc++; + *dest++ = *tmpsrc++; + } else { + tmpsrc = src + (j + 1) * 4;; + *dest++ = *tmpsrc++; + + /* Advance acc between the byte copies (meant to avoid pipeline stalls); j and x were already taken from acc */ + acc += increment; + + *dest++ = *tmpsrc++; + *dest++ = *tmpsrc++; + *dest++ = *tmpsrc++; + } } + /* --- Unoptimized code BEGIN --- + for (i = 0; i < n; i++) { + j = acc >> 16; + x = acc & 0xffff; + dest[i * 4 + 0] = (x < 32768) ? src[j * 4 + 0] : src[j * 4 + 4]; + dest[i * 4 + 1] = (x < 32768) ? src[j * 4 + 1] : src[j * 4 + 5]; + dest[i * 4 + 2] = (x < 32768) ? src[j * 4 + 2] : src[j * 4 + 6]; + dest[i * 4 + 3] = (x < 32768) ? src[j * 4 + 3] : src[j * 4 + 7]; + + acc += increment; + } + --- Unoptimized code END --- */ + *accumulator = acc; }