mirror of
https://gitlab.freedesktop.org/gstreamer/gstreamer.git
synced 2025-01-27 09:38:17 +00:00
deinterlace: Add MMX/3DNow implementations of greedyh for UYVY
This commit is contained in:
parent
2096cf6e55
commit
dc6dd62824
2 changed files with 235 additions and 2 deletions
|
@ -185,7 +185,7 @@ FUNCT_NAME_YUY2 (GstDeinterlaceMethodGreedyH *self, const guint8 * L1, const gui
|
|||
V_PMINUB ("%%mm4", "%%mm2", "%%mm7")
|
||||
|
||||
// Blend weave pixel with bob pixel, depending on motion val in mm0
|
||||
"psubusb %[MotionThreshold], %%mm0\n\t" // test Threshold, clear chroma change >>>??
|
||||
"psubusb %[MotionThreshold], %%mm0\n\t" // test Threshold, clear chroma change
|
||||
"pmullw %[MotionSense], %%mm0\n\t" // mul by user factor, keep low 16 bits
|
||||
"movq %[QW256], %%mm7\n\t"
|
||||
#ifdef IS_MMXEXT
|
||||
|
@ -248,3 +248,225 @@ FUNCT_NAME_YUY2 (GstDeinterlaceMethodGreedyH *self, const guint8 * L1, const gui
|
|||
"memory", "cc");
|
||||
}
|
||||
|
||||
static void
|
||||
FUNCT_NAME_UYVY (GstDeinterlaceMethodGreedyH *self, const guint8 * L1, const guint8 * L2, const guint8 * L3, const guint8 * L2P, guint8 * Dest, gint width)
|
||||
{
|
||||
|
||||
// in tight loop some vars are accessed faster in local storage
|
||||
gint64 YMask = 0xff00ff00ff00ff00ull; // to keep only luma
|
||||
gint64 UVMask = 0x00ff00ff00ff00ffull; // to keep only chroma
|
||||
gint64 ShiftMask = 0xfefefefefefefefeull; // to avoid shifting chroma to luma
|
||||
gint64 QW256 = 0x0100010001000100ull; // 4 256's
|
||||
gint64 MaxComb;
|
||||
gint64 MotionThreshold;
|
||||
gint64 MotionSense;
|
||||
gint64 i;
|
||||
glong LoopCtr;
|
||||
glong oldbx;
|
||||
|
||||
gint64 QW256B;
|
||||
gint64 LastAvg = 0; //interp value from left qword
|
||||
|
||||
// FIXME: Use C implementation if the width is not a multiple of 4
|
||||
// Do something more optimal later
|
||||
if (width % 4 != 0)
|
||||
C_FUNCT_UYVY (self, L1, L2, L3, L2P, Dest, width);
|
||||
|
||||
// Set up our two parms that are actually evaluated for each pixel
|
||||
i = self->max_comb;
|
||||
MaxComb =
|
||||
i << 56 | i << 48 | i << 40 | i << 32 | i << 24 | i << 16 | i << 8 | i;
|
||||
|
||||
i = self->motion_threshold; // scale to range of 0-257
|
||||
MotionThreshold = i << 48 | i << 32 | i << 16 | i | UVMask;
|
||||
|
||||
i = self->motion_sense; // scale to range of 0-257
|
||||
MotionSense = i << 48 | i << 32 | i << 16 | i;
|
||||
|
||||
i = 0xffffffff - 256;
|
||||
QW256B = i << 48 | i << 32 | i << 16 | i; // save a couple instr on PMINSW instruct.
|
||||
|
||||
LoopCtr = width / 8 - 1; // there are LineLength / 4 qwords per line but do 1 less, adj at end of loop
|
||||
|
||||
// For ease of reading, the comments below assume that we're operating on an odd
|
||||
// field (i.e., that InfoIsOdd is true). Assume the obvious for even lines..
|
||||
__asm__ __volatile__ (
|
||||
// save ebx (-fPIC)
|
||||
MOVX " %%" XBX ", %[oldbx]\n\t"
|
||||
MOVX " %[L1], %%" XAX "\n\t"
|
||||
LEAX " 8(%%" XAX "), %%" XBX "\n\t" // next qword needed by DJR
|
||||
MOVX " %[L3], %%" XCX "\n\t"
|
||||
SUBX " %%" XAX ", %%" XCX "\n\t" // carry L3 addr as an offset
|
||||
MOVX " %[L2P], %%" XDX "\n\t"
|
||||
MOVX " %[L2], %%" XSI "\n\t"
|
||||
MOVX " %[Dest], %%" XDI "\n\t" // DL1 if Odd or DL2 if Even
|
||||
|
||||
".align 8\n\t"
|
||||
"1:\n\t"
|
||||
"movq (%%" XSI "), %%mm0\n\t" // L2 - the newest weave pixel value
|
||||
"movq (%%" XAX "), %%mm1\n\t" // L1 - the top pixel
|
||||
"movq (%%" XDX "), %%mm2\n\t" // L2P - the prev weave pixel
|
||||
"movq (%%" XAX ", %%" XCX "), %%mm3\n\t" // L3, next odd row
|
||||
"movq %%mm1, %%mm6\n\t" // L1 - get simple single pixel interp
|
||||
|
||||
// pavgb mm6, mm3 // use macro below
|
||||
V_PAVGB ("%%mm6", "%%mm3", "%%mm4", "%[ShiftMask]")
|
||||
|
||||
// DJR - Diagonal Jaggie Reduction
|
||||
// In the event that we are going to use an average (Bob) pixel we do not want a jagged
|
||||
// stair step effect. To combat this we avg in the 2 horizontally adjacen pixels into the
|
||||
// interpolated Bob mix. This will do horizontal smoothing for only the Bob'd pixels.
|
||||
|
||||
"movq %[LastAvg], %%mm4\n\t" // the bob value from prev qword in row
|
||||
"movq %%mm6, %[LastAvg]\n\t" // save for next pass
|
||||
"psrlq $48, %%mm4\n\t" // right justify 1 pixel
|
||||
"movq %%mm6, %%mm7\n\t" // copy of simple bob pixel
|
||||
"psllq $16, %%mm7\n\t" // left justify 3 pixels
|
||||
"por %%mm7, %%mm4\n\t" // and combine
|
||||
"movq (%%" XBX "), %%mm5\n\t" // next horiz qword from L1
|
||||
// pavgb mm5, qword ptr[ebx+ecx] // next horiz qword from L3, use macro below
|
||||
|
||||
V_PAVGB ("%%mm5", "(%%" XBX ",%%" XCX ")", "%%mm7", "%[ShiftMask]")
|
||||
"psllq $48, %%mm5\n\t" // left just 1 pixel
|
||||
"movq %%mm6, %%mm7\n\t" // another copy of simple bob pixel
|
||||
"psrlq $16, %%mm7\n\t" // right just 3 pixels
|
||||
"por %%mm7, %%mm5\n\t" // combine
|
||||
// pavgb mm4, mm5 // avg of forward and prev by 1 pixel, use macro
|
||||
V_PAVGB ("%%mm4", "%%mm5", "%%mm5", "%[ShiftMask]") // mm5 gets modified if MMX
|
||||
// pavgb mm6, mm4 // avg of center and surround interp vals, use macro
|
||||
V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]")
|
||||
|
||||
// Don't do any more averaging than needed for mmx. It hurts performance and causes rounding errors.
|
||||
#ifndef IS_MMX
|
||||
// pavgb mm4, mm6 // 1/4 center, 3/4 adjacent
|
||||
V_PAVGB ("%%mm4", "%%mm6", "%%mm7", "%[ShiftMask]")
|
||||
// pavgb mm6, mm4 // 3/8 center, 5/8 adjacent
|
||||
V_PAVGB ("%%mm6", "%%mm4", "%%mm7", "%[ShiftMask]")
|
||||
#endif
|
||||
|
||||
// get abs value of possible L2 comb
|
||||
"movq %%mm6, %%mm4\n\t" // work copy of interp val
|
||||
"movq %%mm2, %%mm7\n\t" // L2
|
||||
"psubusb %%mm4, %%mm7\n\t" // L2 - avg
|
||||
"movq %%mm4, %%mm5\n\t" // avg
|
||||
"psubusb %%mm2, %%mm5\n\t" // avg - L2
|
||||
"por %%mm7, %%mm5\n\t" // abs(avg-L2)
|
||||
|
||||
// get abs value of possible L2P comb
|
||||
"movq %%mm0, %%mm7\n\t" // L2P
|
||||
"psubusb %%mm4, %%mm7\n\t" // L2P - avg
|
||||
"psubusb %%mm0, %%mm4\n\t" // avg - L2P
|
||||
"por %%mm7, %%mm4\n\t" // abs(avg-L2P)
|
||||
|
||||
// use L2 or L2P depending upon which makes smaller comb
|
||||
"psubusb %%mm5, %%mm4\n\t" // see if it goes to zero
|
||||
"psubusb %%mm5, %%mm5\n\t" // 0
|
||||
"pcmpeqb %%mm5, %%mm4\n\t" // if (mm4=0) then FF else 0
|
||||
"pcmpeqb %%mm4, %%mm5\n\t" // opposite of mm4
|
||||
|
||||
// if Comb(L2P) <= Comb(L2) then mm4=ff, mm5=0 else mm4=0, mm5 = 55
|
||||
"pand %%mm2, %%mm5\n\t" // use L2 if mm5 == ff, else 0
|
||||
"pand %%mm0, %%mm4\n\t" // use L2P if mm4 = ff, else 0
|
||||
"por %%mm5, %%mm4\n\t" // may the best win
|
||||
|
||||
// Inventory: at this point we have the following values:
|
||||
// mm0 = L2P (or L2)
|
||||
// mm1 = L1
|
||||
// mm2 = L2 (or L2P)
|
||||
// mm3 = L3
|
||||
// mm4 = the best of L2,L2P weave pixel, base upon comb
|
||||
// mm6 = the avg interpolated value, if we need to use it
|
||||
// Let's measure movement, as how much the weave pixel has changed
|
||||
|
||||
"movq %%mm2, %%mm7\n\t"
|
||||
"psubusb %%mm0, %%mm2\n\t"
|
||||
"psubusb %%mm7, %%mm0\n\t"
|
||||
"por %%mm2, %%mm0\n\t" // abs value of change, used later
|
||||
|
||||
// Now lets clip our chosen value to be not outside of the range
|
||||
// of the high/low range L1-L3 by more than MaxComb.
|
||||
// This allows some comb but limits the damages and also allows more
|
||||
// detail than a boring oversmoothed clip.
|
||||
|
||||
"movq %%mm1, %%mm2\n\t" // copy L1
|
||||
// pmaxub mm2, mm3 // use macro
|
||||
V_PMAXUB ("%%mm2", "%%mm3") // now = Max(L1,L3)
|
||||
"movq %%mm1, %%mm5\n\t" // copy L1
|
||||
// pminub mm5, mm3 // now = Min(L1,L3), use macro
|
||||
V_PMINUB ("%%mm5", "%%mm3", "%%mm7")
|
||||
|
||||
// allow the value to be above the high or below the low by amt of MaxComb
|
||||
"psubusb %[MaxComb], %%mm5\n\t" // lower min by diff
|
||||
"paddusb %[MaxComb], %%mm2\n\t" // increase max by diff
|
||||
// pmaxub mm4, mm5 // now = Max(best,Min(L1,L3) use macro
|
||||
V_PMAXUB ("%%mm4", "%%mm5")
|
||||
// pminub mm4, mm2 // now = Min( Max(best, Min(L1,L3), L2 )=L2 clipped
|
||||
V_PMINUB ("%%mm4", "%%mm2", "%%mm7")
|
||||
|
||||
// Blend weave pixel with bob pixel, depending on motion val in mm0
|
||||
"psubusb %[MotionThreshold], %%mm0\n\t" // test Threshold, clear chroma change
|
||||
"psrlw $8, %%mm0\n\t" // div by 256 to get weighted avg
|
||||
"pmullw %[MotionSense], %%mm0\n\t" // mul by user factor, keep low 16 bits
|
||||
"movq %[QW256], %%mm7\n\t"
|
||||
#ifdef IS_MMXEXT
|
||||
"pminsw %%mm7, %%mm0\n\t" // max = 256
|
||||
#else
|
||||
"paddusw %[QW256B], %%mm0\n\t" // add, may sat at fff..
|
||||
"psubusw %[QW256B], %%mm0\n\t" // now = Min(L1,256)
|
||||
#endif
|
||||
"psubusw %%mm0, %%mm7\n\t" // so the 2 sum to 256, weighted avg
|
||||
"movq %%mm4, %%mm2\n\t" // save weave chroma info before trashing
|
||||
"pand %[YMask], %%mm4\n\t" // keep only luma from calc'd value
|
||||
"psrlw $8, %%mm4\n\t" // div by 256 to get weighted avg
|
||||
"pmullw %%mm7, %%mm4\n\t" // use more weave for less motion
|
||||
"pand %[YMask], %%mm6\n\t" // keep only luma from calc'd value
|
||||
"psrlw $8, %%mm6\n\t" // div by 256 to get weighted avg
|
||||
"pmullw %%mm0, %%mm6\n\t" // use more bob for large motion
|
||||
"paddusw %%mm6, %%mm4\n\t" // combine
|
||||
"pand %[YMask], %%mm4\n\t" // keep only luma from calc'd value
|
||||
// chroma comes from weave pixel
|
||||
"pand %[UVMask], %%mm2\n\t" // keep chroma
|
||||
"por %%mm4, %%mm2\n\t" // and combine
|
||||
V_MOVNTQ ("(%%" XDI ")", "%%mm2") // move in our clipped best, use macro
|
||||
// bump ptrs and loop
|
||||
LEAX " 8(%%" XAX "), %%" XAX "\n\t"
|
||||
LEAX " 8(%%" XBX "), %%" XBX "\n\t"
|
||||
LEAX " 8(%%" XDX "), %%" XDX "\n\t"
|
||||
LEAX " 8(%%" XDI "), %%" XDI "\n\t"
|
||||
LEAX " 8(%%" XSI "), %%" XSI "\n\t"
|
||||
DECX " %[LoopCtr]\n\t"
|
||||
|
||||
"jg 1b\n\t" // loop if not to last line
|
||||
// note P-III default assumes backward branches taken
|
||||
"jl 1f\n\t" // done
|
||||
MOVX " %%" XAX ", %%" XBX "\n\t" // sharpness lookahead 1 byte only, be wrong on 1
|
||||
"jmp 1b\n\t"
|
||||
|
||||
"1:\n\t"
|
||||
MOVX " %[oldbx], %%" XBX "\n\t"
|
||||
"emms\n\t": /* no outputs */
|
||||
|
||||
:[LastAvg] "m" (LastAvg),
|
||||
[L1] "m" (L1),
|
||||
[L3] "m" (L3),
|
||||
[L2P] "m" (L2P),
|
||||
[L2] "m" (L2),
|
||||
[Dest] "m" (Dest),
|
||||
[ShiftMask] "m" (ShiftMask),
|
||||
[MaxComb] "m" (MaxComb),
|
||||
[MotionThreshold] "m" (MotionThreshold),
|
||||
[MotionSense] "m" (MotionSense),
|
||||
[QW256B] "m" (QW256B),
|
||||
[YMask] "m" (YMask),
|
||||
[UVMask] "m" (UVMask),
|
||||
[LoopCtr] "m" (LoopCtr),
|
||||
[QW256] "m" (QW256),
|
||||
[oldbx] "m" (oldbx)
|
||||
: XAX, XCX, XDX, XSI, XDI,
|
||||
"st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)",
|
||||
#ifdef __MMX__
|
||||
"mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7",
|
||||
#endif
|
||||
"memory", "cc");
|
||||
}
|
||||
|
||||
|
|
|
@ -665,39 +665,46 @@ greedyh_scanline_C_planar_uv (GstDeinterlaceMethodGreedyH * self,
|
|||
#define IS_MMXEXT
|
||||
#define SIMD_TYPE MMXEXT
|
||||
#define C_FUNCT_YUY2 greedyh_scanline_C_yuy2
|
||||
#define C_FUNCT_UYVY greedyh_scanline_C_uyvy
|
||||
#define C_FUNCT_PLANAR_Y greedyh_scanline_C_planar_y
|
||||
#define C_FUNCT_PLANAR_UV greedyh_scanline_C_planar_uv
|
||||
#define FUNCT_NAME_YUY2 greedyh_scanline_MMXEXT_yuy2
|
||||
#define FUNCT_NAME_UYVY greedyh_scanline_MMXEXT_uyvy
|
||||
#define FUNCT_NAME_PLANAR_Y greedyh_scanline_MMXEXT_planar_y
|
||||
#define FUNCT_NAME_PLANAR_UV greedyh_scanline_MMXEXT_planar_uv
|
||||
#include "greedyh.asm"
|
||||
#undef SIMD_TYPE
|
||||
#undef IS_MMXEXT
|
||||
#undef FUNCT_NAME_YUY2
|
||||
#undef FUNCT_NAME_UYVY
|
||||
#undef FUNCT_NAME_PLANAR_Y
|
||||
#undef FUNCT_NAME_PLANAR_UV
|
||||
|
||||
#define IS_3DNOW
|
||||
#define SIMD_TYPE 3DNOW
|
||||
#define FUNCT_NAME_YUY2 greedyh_scanline_3DNOW_yuy2
|
||||
#define FUNCT_NAME_UYVY greedyh_scanline_3DNOW_uyvy
|
||||
#define FUNCT_NAME_PLANAR_Y greedyh_scanline_3DNOW_planar_y
|
||||
#define FUNCT_NAME_PLANAR_UV greedyh_scanline_3DNOW_planar_uv
|
||||
#include "greedyh.asm"
|
||||
#undef SIMD_TYPE
|
||||
#undef IS_3DNOW
|
||||
#undef FUNCT_NAME_YUY2
|
||||
#undef FUNCT_NAME_UYVY
|
||||
#undef FUNCT_NAME_PLANAR_Y
|
||||
#undef FUNCT_NAME_PLANAR_UV
|
||||
|
||||
#define IS_MMX
|
||||
#define SIMD_TYPE MMX
|
||||
#define FUNCT_NAME_YUY2 greedyh_scanline_MMX_yuy2
|
||||
#define FUNCT_NAME_UYVY greedyh_scanline_MMX_uyvy
|
||||
#define FUNCT_NAME_PLANAR_Y greedyh_scanline_MMX_planar_y
|
||||
#define FUNCT_NAME_PLANAR_UV greedyh_scanline_MMX_planar_uv
|
||||
#include "greedyh.asm"
|
||||
#undef SIMD_TYPE
|
||||
#undef IS_MMX
|
||||
#undef FUNCT_NAME_YUY2
|
||||
#undef FUNCT_NAME_UYVY
|
||||
#undef FUNCT_NAME_PLANAR_Y
|
||||
#undef FUNCT_NAME_PLANAR_UV
|
||||
#undef C_FUNCT_YUY2
|
||||
|
@ -1003,19 +1010,23 @@ gst_deinterlace_method_greedy_h_class_init (GstDeinterlaceMethodGreedyHClass *
|
|||
#ifdef BUILD_X86_ASM
|
||||
if (cpu_flags & OIL_IMPL_FLAG_MMXEXT) {
|
||||
klass->scanline_yuy2 = greedyh_scanline_MMXEXT_yuy2;
|
||||
klass->scanline_uyvy = greedyh_scanline_MMXEXT_uyvy;
|
||||
} else if (cpu_flags & OIL_IMPL_FLAG_3DNOW) {
|
||||
klass->scanline_yuy2 = greedyh_scanline_3DNOW_yuy2;
|
||||
klass->scanline_uyvy = greedyh_scanline_3DNOW_uyvy;
|
||||
} else if (cpu_flags & OIL_IMPL_FLAG_MMX) {
|
||||
klass->scanline_yuy2 = greedyh_scanline_MMX_yuy2;
|
||||
klass->scanline_uyvy = greedyh_scanline_MMX_uyvy;
|
||||
} else {
|
||||
klass->scanline_yuy2 = greedyh_scanline_C_yuy2;
|
||||
klass->scanline_uyvy = greedyh_scanline_C_uyvy;
|
||||
}
|
||||
#else
|
||||
klass->scanline_yuy2 = greedyh_scanline_C_yuy2;
|
||||
klass->scanline_uyvy = greedyh_scanline_C_uyvy;
|
||||
#endif
|
||||
/* TODO: MMX implementation of these two */
|
||||
klass->scanline_ayuv = greedyh_scanline_C_ayuv;
|
||||
klass->scanline_uyvy = greedyh_scanline_C_uyvy;
|
||||
klass->scanline_planar_y = greedyh_scanline_C_planar_y;
|
||||
klass->scanline_planar_uv = greedyh_scanline_C_planar_uv;
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue