gst/deinterlace2/tvtime/greedy.c: Fix the C implementation to produce correct results and optimize the MMXEXT implementation

Original commit message from CVS:
* gst/deinterlace2/tvtime/greedy.c:
(deinterlace_greedy_packed422_scanline_c),
(deinterlace_greedy_packed422_scanline_mmxext),
(deinterlace_greedy_packed422_scanline):
Fix the C implementation to produce correct results and optimize the
MMXEXT implementation.
Handle odd widths and don't read over array boundaries in the MMXEXT
implementation.
* gst/deinterlace2/tvtime/vfir.c: (deinterlace_line_c),
(deinterlace_line_mmx), (deinterlace_scanline_vfir):
Fix a small rounding bug in the MMX implementation, the MMX
implementation doesn't actually need MMXEXT instructions so don't mark
it as such.
Handle odd widths in both implementations.
This commit is contained in:
Sebastian Dröge 2008-06-24 09:10:46 +00:00
parent d7cca01553
commit 6fd4ed3965
3 changed files with 191 additions and 182 deletions

View file

@ -1,3 +1,23 @@
2008-06-24 Sebastian Dröge <sebastian.droege@collabora.co.uk>
* gst/deinterlace2/tvtime/greedy.c:
(deinterlace_greedy_packed422_scanline_c),
(deinterlace_greedy_packed422_scanline_mmxext),
(deinterlace_greedy_packed422_scanline):
Fix the C implementation to produce correct results and optimize the
MMXEXT implementation.
Handle odd widths and don't read over array boundaries in the MMXEXT
implementation.
* gst/deinterlace2/tvtime/vfir.c: (deinterlace_line_c),
(deinterlace_line_mmx), (deinterlace_scanline_vfir):
Fix a small rounding bug in the MMX implementation, the MMX
implementation doesn't actually need MMXEXT instructions so don't mark
it as such.
Handle odd widths in both implementations.
2008-06-22 Stefan Kost <ensonic@users.sf.net>
* ext/resindvd/rsnbasesrc.c:

View file

@ -60,135 +60,14 @@ copy_scanline (GstDeinterlace2 * object,
blit_packed422_scanline (output, data->m1, object->frame_width);
}
static int GreedyMaxComb = 15;
static const int GreedyMaxComb = 15;
#ifdef HAVE_CPU_I386
#include "mmx.h"
#include "sse.h"
/*
 * Greedy deinterlacing of one scanline using MMX registers plus the
 * pavgb / movntq instructions.
 *
 * For every byte, whichever of data->m0 (L2) or data->m2 (LP2) lies closer
 * to the rounded average of data->t1 (L1) and data->b1 (L3) is selected;
 * on a tie LP2 wins. The selected value is then clamped, with unsigned
 * saturation, to
 *   [min(L1, L3) - GreedyMaxComb, max(L1, L3) + GreedyMaxComb].
 *
 * object: only object->frame_width is read.
 * data:   supplies the four input line pointers m0, t1, b1 and m2.
 * output: destination line; written 8 bytes at a time.
 *
 * The loop runs frame_width / 4 times and consumes 8 bytes per iteration
 * (presumably 2 bytes per pixel for packed 4:2:2 - confirm). When
 * frame_width is not a multiple of 4 the remaining bytes of the line are
 * neither read nor written by this function.
 */
static void
deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object,
deinterlace_scanline_data_t * data, uint8_t * output)
{
mmx_t MaxComb;
uint8_t *m0 = data->m0;
uint8_t *t1 = data->t1;
uint8_t *b1 = data->b1;
uint8_t *m2 = data->m2;
int width = object->frame_width;
// How badly do we let it weave? 0-255
// The tolerance is replicated into all 8 byte lanes of MaxComb.
MaxComb.ub[0] = GreedyMaxComb;
MaxComb.ub[1] = GreedyMaxComb;
MaxComb.ub[2] = GreedyMaxComb;
MaxComb.ub[3] = GreedyMaxComb;
MaxComb.ub[4] = GreedyMaxComb;
MaxComb.ub[5] = GreedyMaxComb;
MaxComb.ub[6] = GreedyMaxComb;
MaxComb.ub[7] = GreedyMaxComb;
// Naming used in the comments below:
// L2 == m0
// L1 == t1
// L3 == b1
// LP2 == m2
// Number of 8-byte groups; a remainder of frame_width % 4 is dropped.
width /= 4;
while (width--) {
movq_m2r (*t1, mm1); // L1
movq_m2r (*m0, mm2); // L2
movq_m2r (*b1, mm3); // L3
movq_m2r (*m2, mm0); // LP2
// average L1 and L3 leave result in mm4
movq_r2r (mm1, mm4); // L1
pavgb_r2r (mm3, mm4); // (L1 + L3 + 1)/2, i.e. rounded average
// get abs value of possible L2 comb
// (two saturating subtractions OR-ed together give the absolute difference)
movq_r2r (mm2, mm7); // L2
psubusb_r2r (mm4, mm7); // L2 - avg
movq_r2r (mm4, mm5); // avg
psubusb_r2r (mm2, mm5); // avg - L2
por_r2r (mm7, mm5); // abs(avg-L2)
movq_r2r (mm4, mm6); // copy of avg; mm6 is not read again in this function
// get abs value of possible LP2 comb
movq_r2r (mm0, mm7); // LP2
psubusb_r2r (mm4, mm7); // LP2 - avg
psubusb_r2r (mm0, mm4); // avg - LP2
por_r2r (mm7, mm4); // abs(avg-LP2)
// use L2 or LP2 depending upon which makes smaller comb
psubusb_r2r (mm5, mm4); // abs(avg-LP2) - abs(avg-L2); 0 when Comb(LP2) <= Comb(L2)
psubusb_r2r (mm5, mm5); // 0
pcmpeqb_r2r (mm5, mm4); // if (mm4=0) then FF else 0
pcmpeqb_r2r (mm4, mm5); // opposite of mm4
// if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5=ff
pand_r2r (mm2, mm5); // use L2 if mm5 == ff, else 0
pand_r2r (mm0, mm4); // use LP2 if mm4 = ff, else 0
por_r2r (mm5, mm4); // may the best win
// Now lets clip our chosen value to be not outside of the range
// spanned by L1 and L3 by more than MaxComb.
// This allows some comb but limits the damages and also allows more
// detail than a boring oversmoothed clip.
movq_r2r (mm1, mm2); // copy L1
psubusb_r2r (mm3, mm2); // - L3, with saturation
paddusb_r2r (mm3, mm2); // now = Max(L1,L3)
pcmpeqb_r2r (mm7, mm7); // all ffffffff
psubusb_r2r (mm1, mm7); // 255 - L1
paddusb_r2r (mm7, mm3); // add, may sat at fff..
psubusb_r2r (mm7, mm3); // now = Min(L1,L3)
// allow the value to be above the high or below the low by amt of MaxComb
paddusb_m2r (MaxComb, mm2); // raise max by MaxComb (saturates at 255)
psubusb_m2r (MaxComb, mm3); // lower min by MaxComb (saturates at 0)
psubusb_r2r (mm3, mm4); // best - low, with saturation
paddusb_r2r (mm3, mm4); // now = Max(best, low)
pcmpeqb_r2r (mm7, mm7); // all ffffffff
psubusb_r2r (mm4, mm7); // 255 - Max(best, low)
paddusb_r2r (mm7, mm2); // add may sat at FFF..
psubusb_r2r (mm7, mm2); // now = Min(Max(best, low), high) = clipped best
movntq_r2m (mm2, *output); // store our clipped best with movntq
// Advance to the next set of pixels.
output += 8;
m0 += 8;
t1 += 8;
b1 += 8;
m2 += 8;
}
// Fence after the movntq stores, then leave MMX state.
sfence ();
emms ();
}
#endif
static void
static inline void
deinterlace_greedy_packed422_scanline_c (GstDeinterlace2 * object,
deinterlace_scanline_data_t * data, uint8_t * output)
uint8_t * m0, uint8_t * t1, uint8_t * b1, uint8_t * m2, uint8_t * output,
int width)
{
uint8_t *m0 = data->m0;
uint8_t *t1 = data->t1;
uint8_t *b1 = data->b1;
uint8_t *m2 = data->m2;
int width = 2 * object->frame_width;
uint16_t avg, l2_diff, lp2_diff, max, min, best;
int avg, l2_diff, lp2_diff, max, min, best;
// L2 == m0
// L1 == t1
@ -211,10 +90,15 @@ deinterlace_greedy_packed422_scanline_c (GstDeinterlace2 * object,
if (max < 256 - GreedyMaxComb)
max += GreedyMaxComb;
else
max = 255;
if (min > GreedyMaxComb)
min -= GreedyMaxComb;
else
min = 0;
*output = MIN (MAX (best, min), max);
*output = CLAMP (best, min, max);
// Advance to the next set of pixels.
output += 1;
@ -225,18 +109,118 @@ deinterlace_greedy_packed422_scanline_c (GstDeinterlace2 * object,
}
}
#ifdef HAVE_CPU_I386
#include "mmx.h"
#include "sse.h"
/*
 * Greedy deinterlacing of one scanline using MMX registers plus the
 * pavgb / pmaxub / pminub instructions.
 *
 * For every byte, whichever of m0 (L2) or m2 (LP2) lies closer to the
 * rounded average of t1 (L1) and b1 (L3) is selected; on a tie LP2 wins.
 * The selected value is then clamped, with unsigned saturation, to
 *   [min(L1, L3) - GreedyMaxComb, max(L1, L3) + GreedyMaxComb].
 *
 * object: not used by the vector loop; forwarded to the C fallback.
 * m0, t1, b1, m2: input lines, read 8 bytes per iteration.
 * output: destination line.
 * width:  number of bytes to process.
 *
 * Whole groups of 8 bytes are handled here; the remaining 0-7 bytes are
 * passed to deinterlace_greedy_packed422_scanline_c(), so no access goes
 * past the given width.
 */
static void
deinterlace_greedy_packed422_scanline_mmxext (GstDeinterlace2 * object,
uint8_t * m0, uint8_t * t1, uint8_t * b1, uint8_t * m2, uint8_t * output,
int width)
{
mmx_t MaxComb;
// How badly do we let it weave? 0-255
// The tolerance is replicated into all 8 byte lanes of MaxComb.
MaxComb.ub[0] = GreedyMaxComb;
MaxComb.ub[1] = GreedyMaxComb;
MaxComb.ub[2] = GreedyMaxComb;
MaxComb.ub[3] = GreedyMaxComb;
MaxComb.ub[4] = GreedyMaxComb;
MaxComb.ub[5] = GreedyMaxComb;
MaxComb.ub[6] = GreedyMaxComb;
MaxComb.ub[7] = GreedyMaxComb;
// Naming used in the comments below:
// L2 == m0
// L1 == t1
// L3 == b1
// LP2 == m2
// Process as long as at least 8 bytes remain.
for (; width > 7; width -= 8) {
movq_m2r (*t1, mm1); // L1
movq_m2r (*m0, mm2); // L2
movq_m2r (*b1, mm3); // L3
movq_m2r (*m2, mm0); // LP2
// average L1 and L3 leave result in mm4
movq_r2r (mm1, mm4); // L1
pavgb_r2r (mm3, mm4); // (L1 + L3 + 1)/2, i.e. rounded average
// get abs value of possible L2 comb
// (two saturating subtractions OR-ed together give the absolute difference)
movq_r2r (mm2, mm7); // L2
psubusb_r2r (mm4, mm7); // L2 - avg
movq_r2r (mm4, mm5); // avg
psubusb_r2r (mm2, mm5); // avg - L2
por_r2r (mm7, mm5); // abs(avg-L2)
// get abs value of possible LP2 comb
movq_r2r (mm0, mm7); // LP2
psubusb_r2r (mm4, mm7); // LP2 - avg
psubusb_r2r (mm0, mm4); // avg - LP2
por_r2r (mm7, mm4); // abs(avg-LP2)
// use L2 or LP2 depending upon which makes smaller comb
psubusb_r2r (mm5, mm4); // abs(avg-LP2) - abs(avg-L2); 0 when Comb(LP2) <= Comb(L2)
pxor_r2r (mm5, mm5); // 0
pcmpeqb_r2r (mm5, mm4); // if (mm4=0) then FF else 0
pcmpeqb_r2r (mm4, mm5); // opposite of mm4
// if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5=ff
pand_r2r (mm2, mm5); // use L2 if mm5 == ff, else 0
pand_r2r (mm0, mm4); // use LP2 if mm4 = ff, else 0
por_r2r (mm5, mm4); // may the best win
// Now lets clip our chosen value to be not outside of the range
// spanned by L1 and L3 by more than MaxComb.
// This allows some comb but limits the damages and also allows more
// detail than a boring oversmoothed clip.
movq_r2r (mm1, mm2); // copy L1
pmaxub_r2r (mm3, mm2); // now = Max(L1,L3)
pminub_r2r (mm1, mm3); // now = Min(L1,L3)
// allow the value to be above the high or below the low by amt of MaxComb
paddusb_m2r (MaxComb, mm2); // raise max by MaxComb (saturates at 255)
psubusb_m2r (MaxComb, mm3); // lower min by MaxComb (saturates at 0)
pmaxub_r2r (mm3, mm4); // now = Max(best, low)
pminub_r2r (mm4, mm2); // now = Min(Max(best, low), high) = clipped best
movq_r2m (mm2, *output); // move in our clipped best
// Advance to the next set of pixels.
output += 8;
m0 += 8;
t1 += 8;
b1 += 8;
m2 += 8;
}
// NOTE(review): the loop stores with movq rather than movntq, so this
// sfence looks unnecessary - confirm before removing.
sfence ();
// Leave MMX state before running the plain C code below.
emms ();
// Handle the trailing bytes (width not a multiple of 8) in C.
if (width > 0)
deinterlace_greedy_packed422_scanline_c (object, m0, t1, b1, m2, output,
width);
}
#endif
static void
deinterlace_greedy_packed422_scanline (GstDeinterlace2 * object,
deinterlace_scanline_data_t * data, uint8_t * output)
{
#ifdef HAVE_CPU_I386
if (object->cpu_feature_flags & OIL_IMPL_FLAG_SSE) {
deinterlace_greedy_packed422_scanline_sse (object, data, output);
if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMXEXT) {
deinterlace_greedy_packed422_scanline_mmxext (object, data->m0, data->t1,
data->b1, data->m2, output, 2 * object->frame_width);
} else {
deinterlace_greedy_packed422_scanline_c (object, data, output);
deinterlace_greedy_packed422_scanline_c (object, data->m0, data->t1,
data->b1, data->m2, output, 2 * object->frame_width);
}
#else
deinterlace_greedy_packed422_scanline_c (object, data, output);
deinterlace_greedy_packed422_scanline_c (object, data->m0, data->t1, data->b1,
data->m2, output, 2 * object->frame_width);
#endif
}

View file

@ -49,58 +49,10 @@
* filter taps here are: [-1 4 2 4 -1].
*/
#ifdef HAVE_CPU_I386
#include "mmx.h"
/*
 * Vertical 5-tap filter over one line, 4 bytes per iteration.
 *
 * For every byte position the result is
 *   (4 * (lum_m3 + lum_m1) + 2 * lum_m2 + 4 - (lum_m4 + lum)) >> 3
 * computed in unsigned 16-bit lanes: the subtraction saturates at 0 and the
 * final pack saturates at 255.
 *
 * dst:    destination line.
 * lum_m4, lum_m3, lum_m2, lum_m1, lum: the five input lines, in tap order.
 * size:   number of bytes in the line.
 *
 * Only whole groups of 4 bytes are processed; when size is not a multiple
 * of 4 the last size % 4 bytes of dst are not written by this function.
 */
static void
deinterlace_line_mmxext (uint8_t * dst, uint8_t * lum_m4,
uint8_t * lum_m3, uint8_t * lum_m2,
uint8_t * lum_m1, uint8_t * lum, int size)
{
mmx_t rounder;
// Rounding term (+4 before the shift by 3) for each of the four word lanes.
rounder.uw[0] = 4;
rounder.uw[1] = 4;
rounder.uw[2] = 4;
rounder.uw[3] = 4;
// mm7 = 0, used to widen bytes to words; mm6 = rounder in all four lanes.
pxor_r2r (mm7, mm7);
movq_m2r (rounder, mm6);
for (; size > 3; size -= 4) {
// Load 4 bytes from each of the five input lines.
movd_m2r (lum_m4[0], mm0);
movd_m2r (lum_m3[0], mm1);
movd_m2r (lum_m2[0], mm2);
movd_m2r (lum_m1[0], mm3);
movd_m2r (lum[0], mm4);
// Zero-extend each byte to a 16-bit word.
punpcklbw_r2r (mm7, mm0);
punpcklbw_r2r (mm7, mm1);
punpcklbw_r2r (mm7, mm2);
punpcklbw_r2r (mm7, mm3);
punpcklbw_r2r (mm7, mm4);
paddw_r2r (mm3, mm1); // lum_m3 + lum_m1
psllw_i2r (1, mm2); // 2 * lum_m2
paddw_r2r (mm4, mm0); // lum_m4 + lum
psllw_i2r (2, mm1); // 4 * (lum_m3 + lum_m1)
paddw_r2r (mm6, mm2); // 2 * lum_m2 + 4
paddw_r2r (mm2, mm1); // sum of the positive taps plus rounding
psubusw_r2r (mm0, mm1); // subtract the outer taps, saturating at 0
psrlw_i2r (3, mm1); // divide by 8
packuswb_r2r (mm7, mm1); // back to bytes, saturating at 255
movd_r2m (mm1, dst[0]); // store 4 result bytes
lum_m4 += 4;
lum_m3 += 4;
lum_m2 += 4;
lum_m1 += 4;
lum += 4;
dst += 4;
}
emms ();
}
#endif
/**
* C implementation.
*/
static void
static inline void
deinterlace_line_c (uint8_t * dst, uint8_t * lum_m4,
uint8_t * lum_m3, uint8_t * lum_m2,
uint8_t * lum_m1, uint8_t * lum, int size)
@ -123,6 +75,59 @@ deinterlace_line_c (uint8_t * dst, uint8_t * lum_m4,
}
}
#ifdef HAVE_CPU_I386
#include "mmx.h"
static void
deinterlace_line_mmx (uint8_t * dst, uint8_t * lum_m4,
uint8_t * lum_m3, uint8_t * lum_m2,
uint8_t * lum_m1, uint8_t * lum, int size)
{
mmx_t rounder;
rounder.uw[0] = 4;
rounder.uw[1] = 4;
rounder.uw[2] = 4;
rounder.uw[3] = 4;
pxor_r2r (mm7, mm7);
movd_m2r (rounder, mm6);
punpcklbw_r2r (mm7, mm6);
for (; size > 3; size -= 4) {
movd_m2r (*lum_m4, mm0);
movd_m2r (*lum_m3, mm1);
movd_m2r (*lum_m2, mm2);
movd_m2r (*lum_m1, mm3);
movd_m2r (*lum, mm4);
punpcklbw_r2r (mm7, mm0);
punpcklbw_r2r (mm7, mm1);
punpcklbw_r2r (mm7, mm2);
punpcklbw_r2r (mm7, mm3);
punpcklbw_r2r (mm7, mm4);
paddw_r2r (mm3, mm1);
psllw_i2r (1, mm2);
paddw_r2r (mm4, mm0);
psllw_i2r (2, mm1); // 2
paddw_r2r (mm6, mm2);
paddw_r2r (mm2, mm1);
psubusw_r2r (mm0, mm1);
psrlw_i2r (3, mm1); // 3
packuswb_r2r (mm7, mm1);
movd_r2m (mm1, *dst);
lum_m4 += 4;
lum_m3 += 4;
lum_m2 += 4;
lum_m1 += 4;
lum += 4;
dst += 4;
}
emms ();
/* Handle odd widths */
if (size > 0)
deinterlace_line_c (dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size);
}
#endif
/*
* The commented-out method below that uses the bottom_field member is more
* like the filter as specified in the MPEG2 spec, but it doesn't seem to
@ -134,8 +139,8 @@ deinterlace_scanline_vfir (GstDeinterlace2 * object,
deinterlace_scanline_data_t * data, uint8_t * output)
{
#ifdef HAVE_CPU_I386
if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMXEXT) {
deinterlace_line_mmxext (output, data->tt1, data->t0, data->m1, data->b0,
if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMX) {
deinterlace_line_mmx (output, data->tt1, data->t0, data->m1, data->b0,
data->bb1, object->frame_width * 2);
} else {
deinterlace_line_c (output, data->tt1, data->t0, data->m1, data->b0,