mirror of
https://gitlab.freedesktop.org/gstreamer/gstreamer.git
synced 2025-03-28 11:55:39 +00:00
gst/deinterlace2/tvtime/greedy.c: Fix the C implementation to produce correct results and optimize the MMXEXT implementation.
Original commit message from CVS: * gst/deinterlace2/tvtime/greedy.c: (deinterlace_greedy_packed422_scanline_c), (deinterlace_greedy_packed422_scanline_mmxext), (deinterlace_greedy_packed422_scanline): Fix the C implementation to produce correct results and optimize the MMXEXT implementation. Handle odd widths and don't read over array boundaries in the MMXEXT implementation. * gst/deinterlace2/tvtime/vfir.c: (deinterlace_line_c), (deinterlace_line_mmx), (deinterlace_scanline_vfir): Fix a small rounding bug in the MMX implementation, the MMX implementation doesn't actually need MMXEXT instructions so don't mark it as such. Handle odd widths in both implementations.
This commit is contained in:
parent
d7cca01553
commit
6fd4ed3965
3 changed files with 191 additions and 182 deletions
20
ChangeLog
20
ChangeLog
|
@ -1,3 +1,23 @@
|
|||
2008-06-24 Sebastian Dröge <sebastian.droege@collabora.co.uk>
|
||||
|
||||
* gst/deinterlace2/tvtime/greedy.c:
|
||||
(deinterlace_greedy_packed422_scanline_c),
|
||||
(deinterlace_greedy_packed422_scanline_mmxext),
|
||||
(deinterlace_greedy_packed422_scanline):
|
||||
Fix the C implementation to produce correct results and optimize the
|
||||
MMXEXT implementation.
|
||||
|
||||
Handle odd widths and don't read over array boundaries in the MMXEXT
|
||||
implementation.
|
||||
|
||||
* gst/deinterlace2/tvtime/vfir.c: (deinterlace_line_c),
|
||||
(deinterlace_line_mmx), (deinterlace_scanline_vfir):
|
||||
Fix a small rounding bug in the MMX implementation, the MMX
|
||||
implementation doesn't actually need MMXEXT instructions so don't mark
|
||||
it as such.
|
||||
|
||||
Handle odd widths in both implementations.
|
||||
|
||||
2008-06-22 Stefan Kost <ensonic@users.sf.net>
|
||||
|
||||
* ext/resindvd/rsnbasesrc.c:
|
||||
|
|
|
@ -60,135 +60,14 @@ copy_scanline (GstDeinterlace2 * object,
|
|||
blit_packed422_scanline (output, data->m1, object->frame_width);
|
||||
}
|
||||
|
||||
static int GreedyMaxComb = 15;
|
||||
static const int GreedyMaxComb = 15;
|
||||
|
||||
#ifdef HAVE_CPU_I386
|
||||
#include "mmx.h"
|
||||
#include "sse.h"
|
||||
/*
 * Greedy deinterlacer for one scanline, MMX register implementation.
 *
 * For every byte it takes the average of t1 (L1) and b1 (L3), picks whichever
 * of m0 (L2) and m2 (LP2) is closer to that average (LP2 wins ties), clamps
 * the pick to [Min(L1,L3) - GreedyMaxComb, Max(L1,L3) + GreedyMaxComb] using
 * saturating byte arithmetic, and stores the result to output.
 *
 * The loop runs object->frame_width / 4 times and handles 8 bytes per
 * iteration, so any bytes beyond 8 * (frame_width / 4) are left unwritten.
 *
 * The per-line comments assume that each *_r2r / *_m2r / *_r2m macro emits
 * the x86 instruction it is named after and that the second argument is the
 * destination (the macros come from mmx.h / sse.h, which are not shown here).
 */
static void
|
||||
deinterlace_greedy_packed422_scanline_sse (GstDeinterlace2 * object,
|
||||
deinterlace_scanline_data_t * data, uint8_t * output)
|
||||
{
|
||||
mmx_t MaxComb;
|
||||
|
||||
uint8_t *m0 = data->m0;
|
||||
|
||||
uint8_t *t1 = data->t1;
|
||||
|
||||
uint8_t *b1 = data->b1;
|
||||
|
||||
uint8_t *m2 = data->m2;
|
||||
|
||||
int width = object->frame_width;
|
||||
|
||||
// How badly do we let it weave? 0-255
|
||||
// The same tolerance is replicated into all 8 byte lanes.
MaxComb.ub[0] = GreedyMaxComb;
|
||||
MaxComb.ub[1] = GreedyMaxComb;
|
||||
MaxComb.ub[2] = GreedyMaxComb;
|
||||
MaxComb.ub[3] = GreedyMaxComb;
|
||||
MaxComb.ub[4] = GreedyMaxComb;
|
||||
MaxComb.ub[5] = GreedyMaxComb;
|
||||
MaxComb.ub[6] = GreedyMaxComb;
|
||||
MaxComb.ub[7] = GreedyMaxComb;
|
||||
|
||||
// L2 == m0
|
||||
// L1 == t1
|
||||
// L3 == b1
|
||||
// LP2 == m2
|
||||
|
||||
// One iteration per 8 bytes; the remainder of frame_width / 4 is dropped.
width /= 4;
|
||||
while (width--) {
|
||||
movq_m2r (*t1, mm1); // L1
|
||||
movq_m2r (*m0, mm2); // L2
|
||||
movq_m2r (*b1, mm3); // L3
|
||||
movq_m2r (*m2, mm0); // LP2
|
||||
|
||||
// average L1 and L3 leave result in mm4
|
||||
movq_r2r (mm1, mm4); // L1
|
||||
pavgb_r2r (mm3, mm4); // (L1 + L3)/2
|
||||
|
||||
|
||||
// get abs value of possible L2 comb
|
||||
movq_r2r (mm2, mm7); // L2
|
||||
psubusb_r2r (mm4, mm7); // L2 - avg (saturates to 0 when negative)
|
||||
movq_r2r (mm4, mm5); // avg
|
||||
psubusb_r2r (mm2, mm5); // avg - L2 (saturates to 0 when negative)
|
||||
por_r2r (mm7, mm5); // abs(avg-L2): one of the two differences is 0
|
||||
movq_r2r (mm4, mm6); // copy of avg (mm6 is not read again in this function)
|
||||
|
||||
|
||||
// get abs value of possible LP2 comb
|
||||
movq_r2r (mm0, mm7); // LP2
|
||||
psubusb_r2r (mm4, mm7); // LP2 - avg
|
||||
psubusb_r2r (mm0, mm4); // avg - LP2
|
||||
por_r2r (mm7, mm4); // abs(avg-LP2)
|
||||
|
||||
// use L2 or LP2 depending upon which makes smaller comb
|
||||
psubusb_r2r (mm5, mm4); // Comb(LP2) - Comb(L2), 0 if Comb(LP2) <= Comb(L2)
|
||||
psubusb_r2r (mm5, mm5); // 0
|
||||
pcmpeqb_r2r (mm5, mm4); // if (mm4=0) then FF else 0
|
||||
pcmpeqb_r2r (mm4, mm5); // opposite of mm4
|
||||
|
||||
// if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5=ff
|
||||
pand_r2r (mm2, mm5); // use L2 if mm5 == ff, else 0
|
||||
pand_r2r (mm0, mm4); // use LP2 if mm4 = ff, else 0
|
||||
por_r2r (mm5, mm4); // may the best win
|
||||
|
||||
// Now lets clip our chosen value to be not outside of the range
|
||||
// of the high/low range L1-L3 by more than GreedyMaxComb
|
||||
// This allows some comb but limits the damages and also allows more
|
||||
// detail than a boring oversmoothed clip.
|
||||
|
||||
movq_r2r (mm1, mm2); // copy L1
|
||||
psubusb_r2r (mm3, mm2); // - L3, with saturation
|
||||
paddusb_r2r (mm3, mm2); // now = Max(L1,L3)
|
||||
|
||||
pcmpeqb_r2r (mm7, mm7); // all ffffffff
|
||||
psubusb_r2r (mm1, mm7); // ff - L1
|
||||
paddusb_r2r (mm7, mm3); // add, may sat at fff..
|
||||
psubusb_r2r (mm7, mm3); // now = Min(L1,L3)
|
||||
|
||||
// allow the value to be above the high or below the low by amt of MaxComb
|
||||
paddusb_m2r (MaxComb, mm2); // raise max by MaxComb (saturates at ff)
|
||||
psubusb_m2r (MaxComb, mm3); // lower min by MaxComb (saturates at 0)
|
||||
|
||||
psubusb_r2r (mm3, mm4); // best - lowered Min
|
||||
paddusb_r2r (mm3, mm4); // now = Max(best, lowered Min)
|
||||
|
||||
pcmpeqb_r2r (mm7, mm7); // all ffffffff
|
||||
psubusb_r2r (mm4, mm7); // ff - Max(best, lowered Min)
|
||||
paddusb_r2r (mm7, mm2); // add may sat at FFF..
|
||||
psubusb_r2r (mm7, mm2); // now = Min(Max(best, lowered Min), raised Max) = best clipped
|
||||
|
||||
movntq_r2m (mm2, *output); // store our clipped best (movntq: presumably a non-temporal store, paired with the sfence after the loop)
|
||||
|
||||
// Advance to the next set of pixels.
|
||||
output += 8;
|
||||
m0 += 8;
|
||||
t1 += 8;
|
||||
b1 += 8;
|
||||
m2 += 8;
|
||||
}
|
||||
sfence ();
|
||||
emms ();
|
||||
}
|
||||
#endif
|
||||
|
||||
static void
|
||||
static inline void
|
||||
deinterlace_greedy_packed422_scanline_c (GstDeinterlace2 * object,
|
||||
deinterlace_scanline_data_t * data, uint8_t * output)
|
||||
uint8_t * m0, uint8_t * t1, uint8_t * b1, uint8_t * m2, uint8_t * output,
|
||||
int width)
|
||||
{
|
||||
uint8_t *m0 = data->m0;
|
||||
|
||||
uint8_t *t1 = data->t1;
|
||||
|
||||
uint8_t *b1 = data->b1;
|
||||
|
||||
uint8_t *m2 = data->m2;
|
||||
|
||||
int width = 2 * object->frame_width;
|
||||
|
||||
uint16_t avg, l2_diff, lp2_diff, max, min, best;
|
||||
int avg, l2_diff, lp2_diff, max, min, best;
|
||||
|
||||
// L2 == m0
|
||||
// L1 == t1
|
||||
|
@ -211,10 +90,15 @@ deinterlace_greedy_packed422_scanline_c (GstDeinterlace2 * object,
|
|||
|
||||
if (max < 256 - GreedyMaxComb)
|
||||
max += GreedyMaxComb;
|
||||
else
|
||||
max = 255;
|
||||
|
||||
if (min > GreedyMaxComb)
|
||||
min -= GreedyMaxComb;
|
||||
else
|
||||
min = 0;
|
||||
|
||||
*output = MIN (MAX (best, min), max);
|
||||
*output = CLAMP (best, min, max);
|
||||
|
||||
// Advance to the next set of pixels.
|
||||
output += 1;
|
||||
|
@ -225,18 +109,118 @@ deinterlace_greedy_packed422_scanline_c (GstDeinterlace2 * object,
|
|||
}
|
||||
}
|
||||
|
||||
#ifdef HAVE_CPU_I386
|
||||
#include "mmx.h"
|
||||
#include "sse.h"
|
||||
|
||||
/*
 * Greedy deinterlacer for one scanline, MMXEXT implementation.
 *
 * width is a byte count (the caller in this file passes
 * 2 * object->frame_width). Eight bytes are processed per loop iteration;
 * whatever is left (fewer than 8 bytes) is handed to
 * deinterlace_greedy_packed422_scanline_c with the advanced pointers.
 *
 * For every byte: take the average of t1 (L1) and b1 (L3), pick whichever of
 * m0 (L2) and m2 (LP2) is closer to it (LP2 wins ties), then clamp the pick
 * to [Min(L1,L3) - GreedyMaxComb, Max(L1,L3) + GreedyMaxComb] with
 * saturating byte arithmetic.
 *
 * The per-line comments assume that each *_r2r / *_m2r / *_r2m macro emits
 * the x86 instruction it is named after and that the second argument is the
 * destination (the macros come from mmx.h / sse.h, which are not shown here).
 */
static void
|
||||
deinterlace_greedy_packed422_scanline_mmxext (GstDeinterlace2 * object,
|
||||
uint8_t * m0, uint8_t * t1, uint8_t * b1, uint8_t * m2, uint8_t * output,
|
||||
int width)
|
||||
{
|
||||
mmx_t MaxComb;
|
||||
|
||||
// How badly do we let it weave? 0-255
|
||||
// The same tolerance is replicated into all 8 byte lanes.
MaxComb.ub[0] = GreedyMaxComb;
|
||||
MaxComb.ub[1] = GreedyMaxComb;
|
||||
MaxComb.ub[2] = GreedyMaxComb;
|
||||
MaxComb.ub[3] = GreedyMaxComb;
|
||||
MaxComb.ub[4] = GreedyMaxComb;
|
||||
MaxComb.ub[5] = GreedyMaxComb;
|
||||
MaxComb.ub[6] = GreedyMaxComb;
|
||||
MaxComb.ub[7] = GreedyMaxComb;
|
||||
|
||||
// L2 == m0
|
||||
// L1 == t1
|
||||
// L3 == b1
|
||||
// LP2 == m2
|
||||
|
||||
// Only full 8-byte groups are handled here, so no load or store goes past width bytes.
for (; width > 7; width -= 8) {
|
||||
movq_m2r (*t1, mm1); // L1
|
||||
movq_m2r (*m0, mm2); // L2
|
||||
movq_m2r (*b1, mm3); // L3
|
||||
movq_m2r (*m2, mm0); // LP2
|
||||
|
||||
// average L1 and L3 leave result in mm4
|
||||
movq_r2r (mm1, mm4); // L1
|
||||
pavgb_r2r (mm3, mm4); // (L1 + L3)/2
|
||||
|
||||
// get abs value of possible L2 comb
|
||||
movq_r2r (mm2, mm7); // L2
|
||||
psubusb_r2r (mm4, mm7); // L2 - avg (saturates to 0 when negative)
|
||||
movq_r2r (mm4, mm5); // avg
|
||||
psubusb_r2r (mm2, mm5); // avg - L2 (saturates to 0 when negative)
|
||||
por_r2r (mm7, mm5); // abs(avg-L2): one of the two differences is 0
|
||||
|
||||
// get abs value of possible LP2 comb
|
||||
movq_r2r (mm0, mm7); // LP2
|
||||
psubusb_r2r (mm4, mm7); // LP2 - avg
|
||||
psubusb_r2r (mm0, mm4); // avg - LP2
|
||||
por_r2r (mm7, mm4); // abs(avg-LP2)
|
||||
|
||||
// use L2 or LP2 depending upon which makes smaller comb
|
||||
psubusb_r2r (mm5, mm4); // Comb(LP2) - Comb(L2), 0 if Comb(LP2) <= Comb(L2)
|
||||
pxor_r2r (mm5, mm5); // 0
|
||||
pcmpeqb_r2r (mm5, mm4); // if (mm4=0) then FF else 0
|
||||
pcmpeqb_r2r (mm4, mm5); // opposite of mm4
|
||||
|
||||
// if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5=ff
|
||||
pand_r2r (mm2, mm5); // use L2 if mm5 == ff, else 0
|
||||
pand_r2r (mm0, mm4); // use LP2 if mm4 = ff, else 0
|
||||
por_r2r (mm5, mm4); // may the best win
|
||||
|
||||
// Now lets clip our chosen value to be not outside of the range
|
||||
// of the high/low range L1-L3 by more than GreedyMaxComb
|
||||
// This allows some comb but limits the damages and also allows more
|
||||
// detail than a boring oversmoothed clip.
|
||||
|
||||
movq_r2r (mm1, mm2); // copy L1
|
||||
pmaxub_r2r (mm3, mm2); // now = Max(L1,L3)
|
||||
|
||||
pminub_r2r (mm1, mm3); // now = Min(L1,L3)
|
||||
|
||||
// allow the value to be above the high or below the low by amt of MaxComb
|
||||
paddusb_m2r (MaxComb, mm2); // raise max by MaxComb (saturates at ff)
|
||||
psubusb_m2r (MaxComb, mm3); // lower min by MaxComb (saturates at 0)
|
||||
|
||||
|
||||
pmaxub_r2r (mm3, mm4); // now = Max(best, lowered Min)
|
||||
pminub_r2r (mm4, mm2); // now = Min(Max(best, lowered Min), raised Max) = best clipped
|
||||
|
||||
movq_r2m (mm2, *output); // store our clipped best
|
||||
|
||||
// Advance to the next set of pixels.
|
||||
output += 8;
|
||||
m0 += 8;
|
||||
t1 += 8;
|
||||
b1 += 8;
|
||||
m2 += 8;
|
||||
}
|
||||
// NOTE(review): the loop stores with a plain movq, so this sfence may be unnecessary - confirm before removing.
sfence ();
|
||||
emms ();
|
||||
|
||||
// Fewer than 8 bytes remain: let the C implementation finish the line.
if (width > 0)
|
||||
deinterlace_greedy_packed422_scanline_c (object, m0, t1, b1, m2, output,
|
||||
width);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
static void
|
||||
deinterlace_greedy_packed422_scanline (GstDeinterlace2 * object,
|
||||
deinterlace_scanline_data_t * data, uint8_t * output)
|
||||
{
|
||||
#ifdef HAVE_CPU_I386
|
||||
if (object->cpu_feature_flags & OIL_IMPL_FLAG_SSE) {
|
||||
deinterlace_greedy_packed422_scanline_sse (object, data, output);
|
||||
if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMXEXT) {
|
||||
deinterlace_greedy_packed422_scanline_mmxext (object, data->m0, data->t1,
|
||||
data->b1, data->m2, output, 2 * object->frame_width);
|
||||
} else {
|
||||
deinterlace_greedy_packed422_scanline_c (object, data, output);
|
||||
deinterlace_greedy_packed422_scanline_c (object, data->m0, data->t1,
|
||||
data->b1, data->m2, output, 2 * object->frame_width);
|
||||
}
|
||||
#else
|
||||
deinterlace_greedy_packed422_scanline_c (object, data, output);
|
||||
deinterlace_greedy_packed422_scanline_c (object, data->m0, data->t1, data->b1,
|
||||
data->m2, output, 2 * object->frame_width);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
|
|
@ -49,58 +49,10 @@
|
|||
* filter taps here are: [-1 4 2 4 -1].
|
||||
*/
|
||||
|
||||
#ifdef HAVE_CPU_I386
|
||||
#include "mmx.h"
|
||||
/*
 * MMX implementation of the vertical filter whose taps are [-1 4 2 4 -1].
 *
 * For each group of four bytes it computes, in 16-bit lanes,
 *   (4 * (lum_m3 + lum_m1) + 2 * lum_m2 + 4) - (lum_m4 + lum)
 * with the subtraction saturating at 0, shifts the result right by 3 and
 * packs it back to bytes with unsigned saturation into dst.
 *
 * The loop only runs while size > 3, so the last size % 4 bytes of the line
 * are not written by this function.
 *
 * The per-line comments assume that each macro emits the x86 MMX instruction
 * it is named after and that the second argument is the destination (the
 * macros come from mmx.h, which is not shown here).
 */
static void
|
||||
deinterlace_line_mmxext (uint8_t * dst, uint8_t * lum_m4,
|
||||
uint8_t * lum_m3, uint8_t * lum_m2,
|
||||
uint8_t * lum_m1, uint8_t * lum, int size)
|
||||
{
|
||||
mmx_t rounder;
|
||||
|
||||
// Rounding constant 4 (half of the divisor 8) in each of the four 16-bit lanes.
rounder.uw[0] = 4;
|
||||
rounder.uw[1] = 4;
|
||||
rounder.uw[2] = 4;
|
||||
rounder.uw[3] = 4;
|
||||
pxor_r2r (mm7, mm7); // mm7 = 0, used to widen bytes to words
|
||||
movq_m2r (rounder, mm6); // mm6 = {4, 4, 4, 4}
|
||||
|
||||
for (; size > 3; size -= 4) {
|
||||
// Load four bytes from each of the five input lines.
movd_m2r (lum_m4[0], mm0);
|
||||
movd_m2r (lum_m3[0], mm1);
|
||||
movd_m2r (lum_m2[0], mm2);
|
||||
movd_m2r (lum_m1[0], mm3);
|
||||
movd_m2r (lum[0], mm4);
|
||||
// Zero-extend the bytes to 16-bit words.
punpcklbw_r2r (mm7, mm0);
|
||||
punpcklbw_r2r (mm7, mm1);
|
||||
punpcklbw_r2r (mm7, mm2);
|
||||
punpcklbw_r2r (mm7, mm3);
|
||||
punpcklbw_r2r (mm7, mm4);
|
||||
paddw_r2r (mm3, mm1); // lum_m3 + lum_m1
|
||||
psllw_i2r (1, mm2); // 2 * lum_m2
|
||||
paddw_r2r (mm4, mm0); // lum_m4 + lum (the two -1 taps)
|
||||
psllw_i2r (2, mm1); // 4 * (lum_m3 + lum_m1)
|
||||
paddw_r2r (mm6, mm2); // 2 * lum_m2 + rounder
|
||||
paddw_r2r (mm2, mm1); // sum of the positive taps plus rounder
|
||||
psubusw_r2r (mm0, mm1); // subtract the negative taps, saturating at 0
|
||||
psrlw_i2r (3, mm1); // divide by 8
|
||||
packuswb_r2r (mm7, mm1); // pack words to bytes with unsigned saturation
|
||||
movd_r2m (mm1, dst[0]); // store four result bytes
|
||||
lum_m4 += 4;
|
||||
lum_m3 += 4;
|
||||
lum_m2 += 4;
|
||||
lum_m1 += 4;
|
||||
lum += 4;
|
||||
dst += 4;
|
||||
}
|
||||
emms ();
|
||||
}
|
||||
#endif
|
||||
|
||||
/**
|
||||
* C implementation.
|
||||
*/
|
||||
static void
|
||||
static inline void
|
||||
deinterlace_line_c (uint8_t * dst, uint8_t * lum_m4,
|
||||
uint8_t * lum_m3, uint8_t * lum_m2,
|
||||
uint8_t * lum_m1, uint8_t * lum, int size)
|
||||
|
@ -123,6 +75,59 @@ deinterlace_line_c (uint8_t * dst, uint8_t * lum_m4,
|
|||
}
|
||||
}
|
||||
|
||||
#ifdef HAVE_CPU_I386
|
||||
#include "mmx.h"
|
||||
static void
|
||||
deinterlace_line_mmx (uint8_t * dst, uint8_t * lum_m4,
|
||||
uint8_t * lum_m3, uint8_t * lum_m2,
|
||||
uint8_t * lum_m1, uint8_t * lum, int size)
|
||||
{
|
||||
mmx_t rounder;
|
||||
|
||||
rounder.uw[0] = 4;
|
||||
rounder.uw[1] = 4;
|
||||
rounder.uw[2] = 4;
|
||||
rounder.uw[3] = 4;
|
||||
pxor_r2r (mm7, mm7);
|
||||
movd_m2r (rounder, mm6);
|
||||
punpcklbw_r2r (mm7, mm6);
|
||||
|
||||
for (; size > 3; size -= 4) {
|
||||
movd_m2r (*lum_m4, mm0);
|
||||
movd_m2r (*lum_m3, mm1);
|
||||
movd_m2r (*lum_m2, mm2);
|
||||
movd_m2r (*lum_m1, mm3);
|
||||
movd_m2r (*lum, mm4);
|
||||
punpcklbw_r2r (mm7, mm0);
|
||||
punpcklbw_r2r (mm7, mm1);
|
||||
punpcklbw_r2r (mm7, mm2);
|
||||
punpcklbw_r2r (mm7, mm3);
|
||||
punpcklbw_r2r (mm7, mm4);
|
||||
paddw_r2r (mm3, mm1);
|
||||
psllw_i2r (1, mm2);
|
||||
paddw_r2r (mm4, mm0);
|
||||
psllw_i2r (2, mm1); // 2
|
||||
paddw_r2r (mm6, mm2);
|
||||
paddw_r2r (mm2, mm1);
|
||||
psubusw_r2r (mm0, mm1);
|
||||
psrlw_i2r (3, mm1); // 3
|
||||
packuswb_r2r (mm7, mm1);
|
||||
movd_r2m (mm1, *dst);
|
||||
lum_m4 += 4;
|
||||
lum_m3 += 4;
|
||||
lum_m2 += 4;
|
||||
lum_m1 += 4;
|
||||
lum += 4;
|
||||
dst += 4;
|
||||
}
|
||||
emms ();
|
||||
|
||||
/* Handle odd widths */
|
||||
if (size > 0)
|
||||
deinterlace_line_c (dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size);
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* The commented-out method below that uses the bottom_field member is more
|
||||
* like the filter as specified in the MPEG2 spec, but it doesn't seem to
|
||||
|
@ -134,8 +139,8 @@ deinterlace_scanline_vfir (GstDeinterlace2 * object,
|
|||
deinterlace_scanline_data_t * data, uint8_t * output)
|
||||
{
|
||||
#ifdef HAVE_CPU_I386
|
||||
if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMXEXT) {
|
||||
deinterlace_line_mmxext (output, data->tt1, data->t0, data->m1, data->b0,
|
||||
if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMX) {
|
||||
deinterlace_line_mmx (output, data->tt1, data->t0, data->m1, data->b0,
|
||||
data->bb1, object->frame_width * 2);
|
||||
} else {
|
||||
deinterlace_line_c (output, data->tt1, data->t0, data->m1, data->b0,
|
||||
|
|
Loading…
Reference in a new issue