[MOVED FROM BAD 11/56] gst/deinterlace2/tvtime/greedy.c: Add plain MMX implementation for the greedyl method.

Original commit message from CVS:
* gst/deinterlace2/tvtime/greedy.c:
(deinterlace_greedy_packed422_scanline_mmx),
(deinterlace_greedy_packed422_scanline):
Add plain MMX implementation for the greedyl method.
This commit is contained in:
Sebastian Dröge 2008-06-24 10:12:08 +00:00 committed by Sebastian Dröge
parent bd35b53558
commit df1798ddf9

View file

@ -111,6 +111,107 @@ deinterlace_greedy_packed422_scanline_c (GstDeinterlace2 * object,
#ifdef HAVE_CPU_I386
#include "mmx.h"
static void
deinterlace_greedy_packed422_scanline_mmx (GstDeinterlace2 * object,
uint8_t * m0, uint8_t * t1, uint8_t * b1, uint8_t * m2, uint8_t * output,
int width)
{
mmx_t MaxComb;
// How badly do we let it weave? 0-255
MaxComb.ub[0] = GreedyMaxComb;
MaxComb.ub[1] = GreedyMaxComb;
MaxComb.ub[2] = GreedyMaxComb;
MaxComb.ub[3] = GreedyMaxComb;
MaxComb.ub[4] = GreedyMaxComb;
MaxComb.ub[5] = GreedyMaxComb;
MaxComb.ub[6] = GreedyMaxComb;
MaxComb.ub[7] = GreedyMaxComb;
// L2 == m0
// L1 == t1
// L3 == b1
// LP2 == m2
for (; width > 7; width -= 8) {
movq_m2r (*t1, mm1); // L1
movq_m2r (*m0, mm2); // L2
movq_m2r (*b1, mm3); // L3
movq_m2r (*m2, mm0); // LP2
// average L1 and L3 leave result in mm4
movq_r2r (mm1, mm4); // L1
movq_r2r (mm3, mm5); // L3
psrlw_i2r (1, mm4); // L1/2
psrlw_i2r (1, mm5); // L3/2
paddusb_r2r (mm5, mm4); // (L1 + L3) / 2
// get abs value of possible L2 comb
movq_r2r (mm2, mm7); // L2
psubusb_r2r (mm4, mm7); // L2 - avg
movq_r2r (mm4, mm5); // avg
psubusb_r2r (mm2, mm5); // avg - L2
por_r2r (mm7, mm5); // abs(avg-L2)
// get abs value of possible LP2 comb
movq_r2r (mm0, mm7); // LP2
psubusb_r2r (mm4, mm7); // LP2 - avg
psubusb_r2r (mm0, mm4); // avg - LP2
por_r2r (mm7, mm4); // abs(avg-LP2)
// use L2 or LP2 depending upon which makes smaller comb
psubusb_r2r (mm5, mm4); // see if it goes to zero
psubusb_r2r (mm5, mm5); // 0
pcmpeqb_r2r (mm5, mm4); // if (mm4=0) then FF else 0
pcmpeqb_r2r (mm4, mm5); // opposite of mm4
// if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55
pand_r2r (mm2, mm5); // use L2 if mm5 == ff, else 0
pand_r2r (mm0, mm4); // use LP2 if mm4 = ff, else 0
por_r2r (mm5, mm4); // may the best win
// Now lets clip our chosen value to be not outside of the range
// of the high/low range L1-L3 by more than abs(L1-L3)
// This allows some comb but limits the damages and also allows more
// detail than a boring oversmoothed clip.
movq_r2r (mm1, mm2); // copy L1
psubusb_r2r (mm3, mm2); // - L3, with saturation
paddusb_r2r (mm3, mm2); // now = Max(L1,L3)
pcmpeqb_r2r (mm7, mm7); // all ffffffff
psubusb_r2r (mm1, mm7); // - L1
paddusb_r2r (mm7, mm3); // add, may sat at fff..
psubusb_r2r (mm7, mm3); // now = Min(L1,L3)
// allow the value to be above the high or below the low by amt of MaxComb
paddusb_m2r (MaxComb, mm2); // increase max by diff
psubusb_m2r (MaxComb, mm3); // lower min by diff
psubusb_r2r (mm3, mm4); // best - Min
paddusb_r2r (mm3, mm4); // now = Max(best,Min(L1,L3)
pcmpeqb_r2r (mm7, mm7); // all ffffffff
psubusb_r2r (mm4, mm7); // - Max(best,Min(best,L3)
paddusb_r2r (mm7, mm2); // add may sat at FFF..
psubusb_r2r (mm7, mm2); // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped
movq_r2m (mm2, *output); // move in our clipped best
// Advance to the next set of pixels.
output += 8;
m0 += 8;
t1 += 8;
b1 += 8;
m2 += 8;
}
emms ();
if (width > 0)
deinterlace_greedy_packed422_scanline_c (object, m0, t1, b1, m2, output,
width);
}
#include "sse.h"
static void
@ -214,6 +315,9 @@ deinterlace_greedy_packed422_scanline (GstDeinterlace2 * object,
if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMXEXT) {
deinterlace_greedy_packed422_scanline_mmxext (object, data->m0, data->t1,
data->b1, data->m2, output, 2 * object->frame_width);
} else if (object->cpu_feature_flags & OIL_IMPL_FLAG_MMX) {
deinterlace_greedy_packed422_scanline_mmx (object, data->m0, data->t1,
data->b1, data->m2, output, 2 * object->frame_width);
} else {
deinterlace_greedy_packed422_scanline_c (object, data->m0, data->t1,
data->b1, data->m2, output, 2 * object->frame_width);