deinterlace: Fix greedyl Orc implementation

To agree with the previous C/asm code.
This commit is contained in:
David Schleef 2010-09-05 18:40:48 -07:00
parent de8fda9cc2
commit 6143a60bdb
3 changed files with 152 additions and 458 deletions

View file

@ -688,10 +688,6 @@ deinterlace_line_greedy (orc_uint8 * d1, const orc_uint8 * s1,
const orc_int8 *ORC_RESTRICT ptr5;
const orc_int8 *ORC_RESTRICT ptr6;
const orc_int8 *ORC_RESTRICT ptr7;
orc_int8 var40;
orc_int8 var41;
orc_int8 var42;
orc_int8 var43;
orc_int8 var44;
orc_int8 var45;
orc_int8 var46;
@ -719,10 +715,6 @@ deinterlace_line_greedy (orc_uint8 * d1, const orc_uint8 * s1,
orc_int8 var68;
orc_int8 var69;
orc_int8 var70;
orc_int8 var71;
orc_int8 var72;
orc_int8 var73;
orc_int8 var74;
ptr0 = (orc_int8 *) d1;
ptr4 = (orc_int8 *) s1;
@ -730,80 +722,64 @@ deinterlace_line_greedy (orc_uint8 * d1, const orc_uint8 * s1,
ptr6 = (orc_int8 *) s3;
ptr7 = (orc_int8 *) s4;
/* 11: loadpb */
var44 = 0x00000080; /* 128 or 6.32404e-322f */
/* 13: loadpb */
var46 = 0x00000080; /* 128 or 6.32404e-322f */
/* 15: loadpb */
var47 = 0x00000080; /* 128 or 6.32404e-322f */
/* 29: loadpb */
var54 = p1;
/* 31: loadpb */
var55 = p1;
var45 = 0x00000080; /* 128 or 6.32404e-322f */
/* 21: loadpb */
var46 = p1;
/* 23: loadpb */
var47 = p1;
for (i = 0; i < n; i++) {
/* 0: loadb */
var40 = ptr5[i];
var49 = ptr4[i];
/* 1: loadb */
var41 = ptr6[i];
/* 2: avgub */
var57 = ((orc_uint8) var40 + (orc_uint8) var41 + 1) >> 1;
/* 3: loadb */
var42 = ptr4[i];
/* 4: maxub */
var58 = ORC_MAX ((orc_uint8) var42, (orc_uint8) var57);
/* 5: loadb */
var43 = ptr4[i];
/* 6: minub */
var59 = ORC_MIN ((orc_uint8) var43, (orc_uint8) var57);
/* 7: subb */
var60 = var58 - var59;
/* 8: loadb */
var44 = ptr7[i];
/* 9: maxub */
var61 = ORC_MAX ((orc_uint8) var44, (orc_uint8) var57);
/* 10: loadb */
var45 = ptr7[i];
/* 11: minub */
var62 = ORC_MIN ((orc_uint8) var45, (orc_uint8) var57);
/* 12: subb */
var63 = var61 - var62;
/* 14: xorb */
var64 = var60 ^ var46;
/* 16: xorb */
var65 = var63 ^ var47;
/* 17: cmpgtsb */
var66 = (var64 > var65) ? (~0) : 0;
/* 18: loadb */
var48 = ptr4[i];
/* 19: andb */
var67 = var48 & var66;
/* 20: loadb */
var49 = ptr7[i];
/* 21: andnb */
var68 = (~var49) & var66;
/* 22: orb */
var69 = var67 | var68;
/* 23: loadb */
var50 = ptr5[i];
/* 24: loadb */
var50 = ptr7[i];
/* 2: loadb */
var51 = ptr6[i];
/* 25: maxub */
var70 = ORC_MAX ((orc_uint8) var50, (orc_uint8) var51);
/* 26: loadb */
/* 3: loadb */
var52 = ptr5[i];
/* 27: loadb */
var53 = ptr6[i];
/* 28: minub */
var71 = ORC_MIN ((orc_uint8) var52, (orc_uint8) var53);
/* 30: addusb */
var72 = ORC_CLAMP_UB ((orc_uint8) var70 + (orc_uint8) var54);
/* 32: subusb */
var73 = ORC_CLAMP_UB ((orc_uint8) var71 - (orc_uint8) var55);
/* 33: minub */
var74 = ORC_MIN ((orc_uint8) var69, (orc_uint8) var72);
/* 34: maxub */
var56 = ORC_MAX ((orc_uint8) var74, (orc_uint8) var73);
/* 35: storeb */
ptr0[i] = var56;
/* 4: avgub */
var53 = ((orc_uint8) var52 + (orc_uint8) var51 + 1) >> 1;
/* 5: maxub */
var54 = ORC_MAX ((orc_uint8) var49, (orc_uint8) var53);
/* 6: minub */
var55 = ORC_MIN ((orc_uint8) var49, (orc_uint8) var53);
/* 7: subb */
var56 = var54 - var55;
/* 8: maxub */
var57 = ORC_MAX ((orc_uint8) var50, (orc_uint8) var53);
/* 9: minub */
var58 = ORC_MIN ((orc_uint8) var50, (orc_uint8) var53);
/* 10: subb */
var59 = var57 - var58;
/* 12: xorb */
var60 = var56 ^ var44;
/* 14: xorb */
var61 = var59 ^ var45;
/* 15: cmpgtsb */
var62 = (var60 > var61) ? (~0) : 0;
/* 16: andb */
var63 = var50 & var62;
/* 17: andnb */
var64 = (~var62) & var49;
/* 18: orb */
var65 = var63 | var64;
/* 19: maxub */
var66 = ORC_MAX ((orc_uint8) var52, (orc_uint8) var51);
/* 20: minub */
var67 = ORC_MIN ((orc_uint8) var52, (orc_uint8) var51);
/* 22: addusb */
var68 = ORC_CLAMP_UB ((orc_uint8) var66 + (orc_uint8) var46);
/* 24: subusb */
var69 = ORC_CLAMP_UB ((orc_uint8) var67 - (orc_uint8) var47);
/* 25: minub */
var70 = ORC_MIN ((orc_uint8) var65, (orc_uint8) var68);
/* 26: maxub */
var48 = ORC_MAX ((orc_uint8) var70, (orc_uint8) var69);
/* 27: storeb */
ptr0[i] = var48;
}
}
@ -819,10 +795,6 @@ _backup_deinterlace_line_greedy (OrcExecutor * ORC_RESTRICT ex)
const orc_int8 *ORC_RESTRICT ptr5;
const orc_int8 *ORC_RESTRICT ptr6;
const orc_int8 *ORC_RESTRICT ptr7;
orc_int8 var40;
orc_int8 var41;
orc_int8 var42;
orc_int8 var43;
orc_int8 var44;
orc_int8 var45;
orc_int8 var46;
@ -850,10 +822,6 @@ _backup_deinterlace_line_greedy (OrcExecutor * ORC_RESTRICT ex)
orc_int8 var68;
orc_int8 var69;
orc_int8 var70;
orc_int8 var71;
orc_int8 var72;
orc_int8 var73;
orc_int8 var74;
ptr0 = (orc_int8 *) ex->arrays[0];
ptr4 = (orc_int8 *) ex->arrays[4];
@ -861,80 +829,64 @@ _backup_deinterlace_line_greedy (OrcExecutor * ORC_RESTRICT ex)
ptr6 = (orc_int8 *) ex->arrays[6];
ptr7 = (orc_int8 *) ex->arrays[7];
/* 11: loadpb */
var44 = 0x00000080; /* 128 or 6.32404e-322f */
/* 13: loadpb */
var46 = 0x00000080; /* 128 or 6.32404e-322f */
/* 15: loadpb */
var47 = 0x00000080; /* 128 or 6.32404e-322f */
/* 29: loadpb */
var54 = ex->params[24];
/* 31: loadpb */
var55 = ex->params[24];
var45 = 0x00000080; /* 128 or 6.32404e-322f */
/* 21: loadpb */
var46 = ex->params[24];
/* 23: loadpb */
var47 = ex->params[24];
for (i = 0; i < n; i++) {
/* 0: loadb */
var40 = ptr5[i];
var49 = ptr4[i];
/* 1: loadb */
var41 = ptr6[i];
/* 2: avgub */
var57 = ((orc_uint8) var40 + (orc_uint8) var41 + 1) >> 1;
/* 3: loadb */
var42 = ptr4[i];
/* 4: maxub */
var58 = ORC_MAX ((orc_uint8) var42, (orc_uint8) var57);
/* 5: loadb */
var43 = ptr4[i];
/* 6: minub */
var59 = ORC_MIN ((orc_uint8) var43, (orc_uint8) var57);
/* 7: subb */
var60 = var58 - var59;
/* 8: loadb */
var44 = ptr7[i];
/* 9: maxub */
var61 = ORC_MAX ((orc_uint8) var44, (orc_uint8) var57);
/* 10: loadb */
var45 = ptr7[i];
/* 11: minub */
var62 = ORC_MIN ((orc_uint8) var45, (orc_uint8) var57);
/* 12: subb */
var63 = var61 - var62;
/* 14: xorb */
var64 = var60 ^ var46;
/* 16: xorb */
var65 = var63 ^ var47;
/* 17: cmpgtsb */
var66 = (var64 > var65) ? (~0) : 0;
/* 18: loadb */
var48 = ptr4[i];
/* 19: andb */
var67 = var48 & var66;
/* 20: loadb */
var49 = ptr7[i];
/* 21: andnb */
var68 = (~var49) & var66;
/* 22: orb */
var69 = var67 | var68;
/* 23: loadb */
var50 = ptr5[i];
/* 24: loadb */
var50 = ptr7[i];
/* 2: loadb */
var51 = ptr6[i];
/* 25: maxub */
var70 = ORC_MAX ((orc_uint8) var50, (orc_uint8) var51);
/* 26: loadb */
/* 3: loadb */
var52 = ptr5[i];
/* 27: loadb */
var53 = ptr6[i];
/* 28: minub */
var71 = ORC_MIN ((orc_uint8) var52, (orc_uint8) var53);
/* 30: addusb */
var72 = ORC_CLAMP_UB ((orc_uint8) var70 + (orc_uint8) var54);
/* 32: subusb */
var73 = ORC_CLAMP_UB ((orc_uint8) var71 - (orc_uint8) var55);
/* 33: minub */
var74 = ORC_MIN ((orc_uint8) var69, (orc_uint8) var72);
/* 34: maxub */
var56 = ORC_MAX ((orc_uint8) var74, (orc_uint8) var73);
/* 35: storeb */
ptr0[i] = var56;
/* 4: avgub */
var53 = ((orc_uint8) var52 + (orc_uint8) var51 + 1) >> 1;
/* 5: maxub */
var54 = ORC_MAX ((orc_uint8) var49, (orc_uint8) var53);
/* 6: minub */
var55 = ORC_MIN ((orc_uint8) var49, (orc_uint8) var53);
/* 7: subb */
var56 = var54 - var55;
/* 8: maxub */
var57 = ORC_MAX ((orc_uint8) var50, (orc_uint8) var53);
/* 9: minub */
var58 = ORC_MIN ((orc_uint8) var50, (orc_uint8) var53);
/* 10: subb */
var59 = var57 - var58;
/* 12: xorb */
var60 = var56 ^ var44;
/* 14: xorb */
var61 = var59 ^ var45;
/* 15: cmpgtsb */
var62 = (var60 > var61) ? (~0) : 0;
/* 16: andb */
var63 = var50 & var62;
/* 17: andnb */
var64 = (~var62) & var49;
/* 18: orb */
var65 = var63 | var64;
/* 19: maxub */
var66 = ORC_MAX ((orc_uint8) var52, (orc_uint8) var51);
/* 20: minub */
var67 = ORC_MIN ((orc_uint8) var52, (orc_uint8) var51);
/* 22: addusb */
var68 = ORC_CLAMP_UB ((orc_uint8) var66 + (orc_uint8) var46);
/* 24: subusb */
var69 = ORC_CLAMP_UB ((orc_uint8) var67 - (orc_uint8) var47);
/* 25: minub */
var70 = ORC_MIN ((orc_uint8) var65, (orc_uint8) var68);
/* 26: maxub */
var48 = ORC_MAX ((orc_uint8) var70, (orc_uint8) var69);
/* 27: storeb */
ptr0[i] = var48;
}
}
@ -972,44 +924,56 @@ deinterlace_line_greedy (orc_uint8 * d1, const orc_uint8 * s1,
orc_program_add_temporary (p, 1, "t6");
orc_program_add_temporary (p, 1, "t7");
orc_program_add_temporary (p, 1, "t8");
orc_program_add_temporary (p, 1, "t9");
orc_program_add_temporary (p, 1, "t10");
orc_program_add_temporary (p, 1, "t11");
orc_program_add_temporary (p, 1, "t12");
orc_program_append_2 (p, "avgub", 0, ORC_VAR_T1, ORC_VAR_S2, ORC_VAR_S3,
orc_program_append_2 (p, "loadb", 0, ORC_VAR_T1, ORC_VAR_S1, ORC_VAR_D1,
ORC_VAR_D1);
orc_program_append_2 (p, "maxub", 0, ORC_VAR_T4, ORC_VAR_S1, ORC_VAR_T1,
orc_program_append_2 (p, "loadb", 0, ORC_VAR_T2, ORC_VAR_S4, ORC_VAR_D1,
ORC_VAR_D1);
orc_program_append_2 (p, "minub", 0, ORC_VAR_T5, ORC_VAR_S1, ORC_VAR_T1,
orc_program_append_2 (p, "loadb", 0, ORC_VAR_T3, ORC_VAR_S3, ORC_VAR_D1,
ORC_VAR_D1);
orc_program_append_2 (p, "subb", 0, ORC_VAR_T2, ORC_VAR_T4, ORC_VAR_T5,
orc_program_append_2 (p, "loadb", 0, ORC_VAR_T4, ORC_VAR_S2, ORC_VAR_D1,
ORC_VAR_D1);
orc_program_append_2 (p, "maxub", 0, ORC_VAR_T4, ORC_VAR_S4, ORC_VAR_T1,
orc_program_append_2 (p, "avgub", 0, ORC_VAR_T5, ORC_VAR_T4, ORC_VAR_T3,
ORC_VAR_D1);
orc_program_append_2 (p, "minub", 0, ORC_VAR_T5, ORC_VAR_S4, ORC_VAR_T1,
orc_program_append_2 (p, "maxub", 0, ORC_VAR_T8, ORC_VAR_T1, ORC_VAR_T5,
ORC_VAR_D1);
orc_program_append_2 (p, "subb", 0, ORC_VAR_T3, ORC_VAR_T4, ORC_VAR_T5,
orc_program_append_2 (p, "minub", 0, ORC_VAR_T9, ORC_VAR_T1, ORC_VAR_T5,
ORC_VAR_D1);
orc_program_append_2 (p, "xorb", 0, ORC_VAR_T2, ORC_VAR_T2, ORC_VAR_C1,
orc_program_append_2 (p, "subb", 0, ORC_VAR_T6, ORC_VAR_T8, ORC_VAR_T9,
ORC_VAR_D1);
orc_program_append_2 (p, "xorb", 0, ORC_VAR_T3, ORC_VAR_T3, ORC_VAR_C1,
orc_program_append_2 (p, "maxub", 0, ORC_VAR_T8, ORC_VAR_T2, ORC_VAR_T5,
ORC_VAR_D1);
orc_program_append_2 (p, "cmpgtsb", 0, ORC_VAR_T5, ORC_VAR_T2, ORC_VAR_T3,
orc_program_append_2 (p, "minub", 0, ORC_VAR_T9, ORC_VAR_T2, ORC_VAR_T5,
ORC_VAR_D1);
orc_program_append_2 (p, "andb", 0, ORC_VAR_T4, ORC_VAR_S1, ORC_VAR_T5,
orc_program_append_2 (p, "subb", 0, ORC_VAR_T7, ORC_VAR_T8, ORC_VAR_T9,
ORC_VAR_D1);
orc_program_append_2 (p, "andnb", 0, ORC_VAR_T5, ORC_VAR_S4, ORC_VAR_T5,
orc_program_append_2 (p, "xorb", 0, ORC_VAR_T6, ORC_VAR_T6, ORC_VAR_C1,
ORC_VAR_D1);
orc_program_append_2 (p, "orb", 0, ORC_VAR_T6, ORC_VAR_T4, ORC_VAR_T5,
orc_program_append_2 (p, "xorb", 0, ORC_VAR_T7, ORC_VAR_T7, ORC_VAR_C1,
ORC_VAR_D1);
orc_program_append_2 (p, "maxub", 0, ORC_VAR_T8, ORC_VAR_S2, ORC_VAR_S3,
orc_program_append_2 (p, "cmpgtsb", 0, ORC_VAR_T9, ORC_VAR_T6, ORC_VAR_T7,
ORC_VAR_D1);
orc_program_append_2 (p, "minub", 0, ORC_VAR_T7, ORC_VAR_S2, ORC_VAR_S3,
orc_program_append_2 (p, "andb", 0, ORC_VAR_T8, ORC_VAR_T2, ORC_VAR_T9,
ORC_VAR_D1);
orc_program_append_2 (p, "addusb", 0, ORC_VAR_T8, ORC_VAR_T8, ORC_VAR_P1,
orc_program_append_2 (p, "andnb", 0, ORC_VAR_T9, ORC_VAR_T9, ORC_VAR_T1,
ORC_VAR_D1);
orc_program_append_2 (p, "subusb", 0, ORC_VAR_T7, ORC_VAR_T7, ORC_VAR_P1,
orc_program_append_2 (p, "orb", 0, ORC_VAR_T10, ORC_VAR_T8, ORC_VAR_T9,
ORC_VAR_D1);
orc_program_append_2 (p, "minub", 0, ORC_VAR_T6, ORC_VAR_T6, ORC_VAR_T8,
orc_program_append_2 (p, "maxub", 0, ORC_VAR_T12, ORC_VAR_T4, ORC_VAR_T3,
ORC_VAR_D1);
orc_program_append_2 (p, "maxub", 0, ORC_VAR_D1, ORC_VAR_T6, ORC_VAR_T7,
orc_program_append_2 (p, "minub", 0, ORC_VAR_T11, ORC_VAR_T4, ORC_VAR_T3,
ORC_VAR_D1);
orc_program_append_2 (p, "addusb", 0, ORC_VAR_T12, ORC_VAR_T12,
ORC_VAR_P1, ORC_VAR_D1);
orc_program_append_2 (p, "subusb", 0, ORC_VAR_T11, ORC_VAR_T11,
ORC_VAR_P1, ORC_VAR_D1);
orc_program_append_2 (p, "minub", 0, ORC_VAR_T10, ORC_VAR_T10,
ORC_VAR_T12, ORC_VAR_D1);
orc_program_append_2 (p, "maxub", 0, ORC_VAR_D1, ORC_VAR_T10, ORC_VAR_T11,
ORC_VAR_D1);
result = orc_program_compile (p);

View file

@ -61,6 +61,10 @@ convsuswb d1, t1
.source 1 b1
.source 1 m2
.param 1 max_comb
.temp 1 tm0
.temp 1 tm2
.temp 1 tb1
.temp 1 tt1
.temp 1 avg
.temp 1 l2_diff
.temp 1 lp2_diff
@ -71,29 +75,31 @@ convsuswb d1, t1
.temp 1 max
avgub avg, t1, b1
#absdiffb l2_diff, m0, avg
maxub t2, m0, avg
minub t3, m0, avg
loadb tm0, m0
loadb tm2, m2
loadb tb1, b1
loadb tt1, t1
avgub avg, tt1, tb1
maxub t2, tm0, avg
minub t3, tm0, avg
subb l2_diff, t2, t3
#absdiffb lp2_diff, m2, avg
maxub t2, m2, avg
minub t3, m2, avg
maxub t2, tm2, avg
minub t3, tm2, avg
subb lp2_diff, t2, t3
#cmpgtub t1, l2_diff, lp2_diff
xorb l2_diff, l2_diff, 0x80
xorb lp2_diff, lp2_diff, 0x80
cmpgtsb t3, l2_diff, lp2_diff
#selectb best, m0, m2, t3
andb t2, m0, t3
andnb t3, m2, t3
andb t2, tm2, t3
andnb t3, t3, tm0
orb best, t2, t3
maxub max, t1, b1
minub min, t1, b1
maxub max, tt1, tb1
minub min, tt1, tb1
addusb max, max, max_comb
subusb min, min, max_comb
minub best, best, max

View file

@ -34,9 +34,6 @@
#include "gstdeinterlacemethod.h"
#include <string.h>
#ifdef HAVE_ORC
#include <orc/orc.h>
#endif
#include "tvtime.h"
@ -79,54 +76,6 @@ typedef struct
// I'd intended this to be part of a larger, more elaborate method added to
// Blended Clip, but this gives results that are too good for the CPU to ignore here.
// Plain C greedy scanline.
//
// For each of the `width` pixels: take the average of t1 and b1, pick
// whichever of m0 / m2 is closer to that average (m0 wins ties), then clamp
// the pick to the range [MIN (t1, b1) - max_comb, MAX (t1, b1) + max_comb],
// where the lower bound is floored at 0 and the upper bound capped at 255.
//
// Line naming used by the other implementations:
//   L2 == m0, L1 == t1, L3 == b1, LP2 == m2
static inline void
deinterlace_greedy_scanline_c (GstDeinterlaceMethodGreedyL * self,
    const guint8 * m0, const guint8 * t1,
    const guint8 * b1, const guint8 * m2, guint8 * output, gint width)
{
  guint max_comb = self->max_comb;
  gint i = 0;

  for (; width != 0; width--, i++) {
    gint mid, cur_comb, prev_comb, pick, hi, lo;

    // Reference value: truncating average of the lines above and below.
    mid = (t1[i] + b1[i]) / 2;

    // Comb strength of each candidate relative to the reference.
    cur_comb = ABS (m0[i] - mid);
    prev_comb = ABS (m2[i] - mid);
    pick = (cur_comb > prev_comb) ? m2[i] : m0[i];

    // Allowed range: the t1/b1 span widened by max_comb on both sides.
    hi = MAX (t1[i], b1[i]);
    lo = MIN (t1[i], b1[i]);
    hi = (hi < 256 - max_comb) ? hi + max_comb : 255;
    lo = (lo > max_comb) ? lo - max_comb : 0;

    output[i] = CLAMP (pick, lo, hi);
  }
}
static inline void
deinterlace_greedy_scanline_orc (GstDeinterlaceMethodGreedyL * self,
const guint8 * m0, const guint8 * t1,
@ -135,216 +84,6 @@ deinterlace_greedy_scanline_orc (GstDeinterlaceMethodGreedyL * self,
deinterlace_line_greedy (output, m0, t1, b1, m2, self->max_comb, width);
}
#ifdef BUILD_X86_ASM
#include "mmx.h"
// MMX greedy scanline: processes 8 pixels per iteration and hands any
// remaining (< 8) pixels to deinterlace_greedy_scanline_c().
//
// Per pixel it picks L2 (m0) or LP2 (m2), whichever combs less against the
// average of L1 (t1) and L3 (b1), then clips the pick to
// [Min(L1,L3) - MaxComb, Max(L1,L3) + MaxComb] using saturating byte math.
// Note that on equal comb values this version picks LP2.
// The register comments below read the *_r2r (src, dst) macros as
// "dst = dst op src" (NOTE(review): confirm against mmx.h).
static void
deinterlace_greedy_scanline_mmx (GstDeinterlaceMethodGreedyL * self,
const guint8 * m0, const guint8 * t1,
const guint8 * b1, const guint8 * m2, guint8 * output, gint width)
{
mmx_t MaxComb;
mmx_t ShiftMask;
// How badly do we let it weave? 0-255
MaxComb.ub[0] = self->max_comb;
MaxComb.ub[1] = self->max_comb;
MaxComb.ub[2] = self->max_comb;
MaxComb.ub[3] = self->max_comb;
MaxComb.ub[4] = self->max_comb;
MaxComb.ub[5] = self->max_comb;
MaxComb.ub[6] = self->max_comb;
MaxComb.ub[7] = self->max_comb;
// 0x7f in every byte: clears the top bit of each byte after the word-wide
// right shifts below, so the shifts act as per-byte divide-by-two.
ShiftMask.ub[0] = 0x7f;
ShiftMask.ub[1] = 0x7f;
ShiftMask.ub[2] = 0x7f;
ShiftMask.ub[3] = 0x7f;
ShiftMask.ub[4] = 0x7f;
ShiftMask.ub[5] = 0x7f;
ShiftMask.ub[6] = 0x7f;
ShiftMask.ub[7] = 0x7f;
// L2 == m0
// L1 == t1
// L3 == b1
// LP2 == m2
movq_m2r (MaxComb, mm6);
for (; width > 7; width -= 8) {
movq_m2r (*t1, mm1); // L1
movq_m2r (*m0, mm2); // L2
movq_m2r (*b1, mm3); // L3
movq_m2r (*m2, mm0); // LP2
// average L1 and L3 leave result in mm4
movq_r2r (mm1, mm4); // L1
movq_r2r (mm3, mm5); // L3
psrlw_i2r (1, mm4); // L1/2
pand_m2r (ShiftMask, mm4);
psrlw_i2r (1, mm5); // L3/2
pand_m2r (ShiftMask, mm5);
paddusb_r2r (mm5, mm4); // L1/2 + L3/2, i.e. approximately (L1 + L3) / 2
// get abs value of possible L2 comb
movq_r2r (mm2, mm7); // L2
psubusb_r2r (mm4, mm7); // L2 - avg
movq_r2r (mm4, mm5); // avg
psubusb_r2r (mm2, mm5); // avg - L2
por_r2r (mm7, mm5); // abs(avg-L2)
// get abs value of possible LP2 comb
movq_r2r (mm0, mm7); // LP2
psubusb_r2r (mm4, mm7); // LP2 - avg
psubusb_r2r (mm0, mm4); // avg - LP2
por_r2r (mm7, mm4); // abs(avg-LP2)
// use L2 or LP2 depending upon which makes smaller comb
psubusb_r2r (mm5, mm4); // see if it goes to zero
psubusb_r2r (mm5, mm5); // 0
pcmpeqb_r2r (mm5, mm4); // if (mm4=0) then FF else 0
pcmpeqb_r2r (mm4, mm5); // opposite of mm4
// if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5=ff
pand_r2r (mm2, mm5); // use L2 if mm5 == ff, else 0
pand_r2r (mm0, mm4); // use LP2 if mm4 = ff, else 0
por_r2r (mm5, mm4); // may the best win
// Now lets clip our chosen value to be not outside of the range
// of the high/low range L1-L3 by more than MaxComb.
// This allows some comb but limits the damages and also allows more
// detail than a boring oversmoothed clip.
movq_r2r (mm1, mm2); // copy L1
psubusb_r2r (mm3, mm2); // - L3, with saturation
paddusb_r2r (mm3, mm2); // now = Max(L1,L3)
pcmpeqb_r2r (mm7, mm7); // all ffffffff
psubusb_r2r (mm1, mm7); // - L1
paddusb_r2r (mm7, mm3); // add, may sat at fff..
psubusb_r2r (mm7, mm3); // now = Min(L1,L3)
// allow the value to be above the high or below the low by amt of MaxComb
paddusb_r2r (mm6, mm2); // increase max by MaxComb
psubusb_r2r (mm6, mm3); // lower min by MaxComb
psubusb_r2r (mm3, mm4); // best - Min
paddusb_r2r (mm3, mm4); // now = Max(best, Min(L1,L3) - MaxComb)
pcmpeqb_r2r (mm7, mm7); // all ffffffff
psubusb_r2r (mm4, mm7); // ff - Max(best, Min(L1,L3) - MaxComb)
paddusb_r2r (mm7, mm2); // add may sat at FFF..
psubusb_r2r (mm7, mm2); // now = Min(Max(best, Min(L1,L3) - MaxComb), Max(L1,L3) + MaxComb) = best clipped
movq_r2m (mm2, *output); // move in our clipped best
// Advance to the next set of pixels.
output += 8;
m0 += 8;
t1 += 8;
b1 += 8;
m2 += 8;
}
emms ();
// Fewer than 8 pixels left: finish them with the scalar C version.
if (width > 0)
deinterlace_greedy_scanline_c (self, m0, t1, b1, m2, output, width);
}
#include "sse.h"
// MMXEXT greedy scanline: same algorithm as the plain MMX version, but uses
// the pavgb / pmaxub / pminub macros instead of emulating average, max and
// min with saturating add/subtract. Processes 8 pixels per iteration and
// hands any remaining (< 8) pixels to deinterlace_greedy_scanline_c().
//
// Per pixel it picks L2 (m0) or LP2 (m2), whichever combs less against the
// average of L1 (t1) and L3 (b1), then clips the pick to
// [Min(L1,L3) - MaxComb, Max(L1,L3) + MaxComb].
// Note that on equal comb values this version picks LP2.
// The register comments below read the *_r2r (src, dst) macros as
// "dst = dst op src" (NOTE(review): confirm against mmx.h / sse.h).
static void
deinterlace_greedy_scanline_mmxext (GstDeinterlaceMethodGreedyL *
self, const guint8 * m0, const guint8 * t1, const guint8 * b1,
const guint8 * m2, guint8 * output, gint width)
{
mmx_t MaxComb;
// How badly do we let it weave? 0-255
MaxComb.ub[0] = self->max_comb;
MaxComb.ub[1] = self->max_comb;
MaxComb.ub[2] = self->max_comb;
MaxComb.ub[3] = self->max_comb;
MaxComb.ub[4] = self->max_comb;
MaxComb.ub[5] = self->max_comb;
MaxComb.ub[6] = self->max_comb;
MaxComb.ub[7] = self->max_comb;
// L2 == m0
// L1 == t1
// L3 == b1
// LP2 == m2
movq_m2r (MaxComb, mm6);
for (; width > 7; width -= 8) {
movq_m2r (*t1, mm1); // L1
movq_m2r (*m0, mm2); // L2
movq_m2r (*b1, mm3); // L3
movq_m2r (*m2, mm0); // LP2
// average L1 and L3 leave result in mm4
movq_r2r (mm1, mm4); // L1
// NOTE(review): if pavgb_r2r emits PAVGB, this average rounds up,
// i.e. (L1 + L3 + 1) >> 1, unlike the truncating C version - confirm.
pavgb_r2r (mm3, mm4); // (L1 + L3)/2
// get abs value of possible L2 comb
movq_r2r (mm2, mm7); // L2
psubusb_r2r (mm4, mm7); // L2 - avg
movq_r2r (mm4, mm5); // avg
psubusb_r2r (mm2, mm5); // avg - L2
por_r2r (mm7, mm5); // abs(avg-L2)
// get abs value of possible LP2 comb
movq_r2r (mm0, mm7); // LP2
psubusb_r2r (mm4, mm7); // LP2 - avg
psubusb_r2r (mm0, mm4); // avg - LP2
por_r2r (mm7, mm4); // abs(avg-LP2)
// use L2 or LP2 depending upon which makes smaller comb
psubusb_r2r (mm5, mm4); // see if it goes to zero
pxor_r2r (mm5, mm5); // 0
pcmpeqb_r2r (mm5, mm4); // if (mm4=0) then FF else 0
pcmpeqb_r2r (mm4, mm5); // opposite of mm4
// if Comb(LP2) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5=ff
pand_r2r (mm2, mm5); // use L2 if mm5 == ff, else 0
pand_r2r (mm0, mm4); // use LP2 if mm4 = ff, else 0
por_r2r (mm5, mm4); // may the best win
// Now lets clip our chosen value to be not outside of the range
// of the high/low range L1-L3 by more than MaxComb.
// This allows some comb but limits the damages and also allows more
// detail than a boring oversmoothed clip.
movq_r2r (mm1, mm2); // copy L1
pmaxub_r2r (mm3, mm2); // now = Max(L1,L3)
pminub_r2r (mm1, mm3); // now = Min(L1,L3)
// allow the value to be above the high or below the low by amt of MaxComb
paddusb_r2r (mm6, mm2); // increase max by MaxComb
psubusb_r2r (mm6, mm3); // lower min by MaxComb
pmaxub_r2r (mm3, mm4); // now = Max(best, Min(L1,L3) - MaxComb)
pminub_r2r (mm4, mm2); // now = Min(Max(best, Min(L1,L3) - MaxComb), Max(L1,L3) + MaxComb) = best clipped
movq_r2m (mm2, *output); // move in our clipped best
// Advance to the next set of pixels.
output += 8;
m0 += 8;
t1 += 8;
b1 += 8;
m2 += 8;
}
emms ();
// Fewer than 8 pixels left: finish them with the scalar C version.
if (width > 0)
deinterlace_greedy_scanline_c (self, m0, t1, b1, m2, output, width);
}
#endif
static void
deinterlace_frame_di_greedy_packed (GstDeinterlaceMethod * method,
const GstDeinterlaceField * history, guint history_count,
@ -561,10 +300,6 @@ gst_deinterlace_method_greedy_l_class_init (GstDeinterlaceMethodGreedyLClass *
{
GstDeinterlaceMethodClass *dim_class = (GstDeinterlaceMethodClass *) klass;
GObjectClass *gobject_class = (GObjectClass *) klass;
#ifdef BUILD_X86_ASM
guint cpu_flags =
orc_target_get_default_flags (orc_target_get_by_name ("mmx"));
#endif
gobject_class->set_property = gst_deinterlace_method_greedy_l_set_property;
gobject_class->get_property = gst_deinterlace_method_greedy_l_get_property;
@ -596,18 +331,7 @@ gst_deinterlace_method_greedy_l_class_init (GstDeinterlaceMethodGreedyLClass *
dim_class->deinterlace_frame_rgb = deinterlace_frame_di_greedy_packed;
dim_class->deinterlace_frame_bgr = deinterlace_frame_di_greedy_packed;
#ifdef BUILD_X86_ASM
if (cpu_flags & ORC_TARGET_MMX_MMXEXT) {
klass->scanline = deinterlace_greedy_scanline_mmxext;
} else if (cpu_flags & ORC_TARGET_MMX_MMX) {
klass->scanline = deinterlace_greedy_scanline_mmx;
} else {
klass->scanline = deinterlace_greedy_scanline_c;
}
#else
klass->scanline = deinterlace_greedy_scanline_c;
klass->scanline = deinterlace_greedy_scanline_orc;
#endif
}
static void