videomixer: Add MMX implementations for I420 and all non-alpha RGB formats

This commit is contained in:
Sebastian Dröge 2010-01-11 18:35:47 +01:00
parent 2950262186
commit 6158f401a1
2 changed files with 273 additions and 20 deletions

View file

@ -34,6 +34,8 @@
#include <string.h>
#define BLEND(D,S,alpha) (((D) * (256 - (alpha)) + (S) * (alpha)) >> 8)
#ifdef HAVE_GCC_ASM
#if defined(HAVE_CPU_I386) || defined(HAVE_CPU_X86_64)
#define BUILD_X86_ASM
@ -46,8 +48,6 @@
/* Below are the implementations of everything */
#define BLEND(D,S,alpha) (((D) * (255 - (alpha)) + (S) * (alpha)) >> 8)
inline static void
_blend_u8_c (guint8 * dest, const guint8 * src,
gint src_stride, gint dest_stride, gint src_width, gint src_height,
@ -459,11 +459,9 @@ fill_color_##name (guint8 * dest, gint width, gint height, \
gint i; \
gint dest_stride = GST_ROUND_UP_4 (width * bpp); \
\
red = CLAMP (1.164 * (colY - 16) + 1.596 * (colV - 128), 0, 255); \
green = \
CLAMP (1.164 * (colY - 16) - 0.813 * (colV - 128) - 0.391 * (colU - 128), \
0, 255); \
blue = CLAMP (1.164 * (colY - 16) + 2.018 * (colU - 128), 0, 255); \
red = YUV_TO_R (colY, colU, colV); \
green = YUV_TO_G (colY, colU, colV); \
blue = YUV_TO_B (colY, colU, colV); \
\
for (i = 0; i < height; i++) { \
MEMSET_RGB (dest, red, green, blue, width); \
@ -508,6 +506,15 @@ RGB_FILL_COLOR (bgrx_c, 4, _memset_bgrx_c);
/* MMX Implementations */
#ifdef BUILD_X86_ASM
/* Generates _memset_<name>_mmx (dest, red, green, blue, width).
 *
 * The generated function packs the three colour components into one 32-bit
 * value, shifting red by r, green by g and blue by b bits, and hands that
 * value together with width to _memset_u32_mmx() to fill dest.
 *
 * The components are converted to guint32 before shifting: shifting a signed
 * gint such as 255 left by 24 does not fit into int and is undefined
 * behaviour.
 */
#define MEMSET_xRGB_MMX(name, r, g, b) \
static inline void \
_memset_##name##_mmx (guint8* dest, gint red, gint green, gint blue, gint width) { \
  guint32 val = ((guint32) red << (r)) | ((guint32) green << (g)) | ((guint32) blue << (b)); \
  \
  _memset_u32_mmx ((guint32 *) dest, val, width); \
}
#define A32
#define NAME_BLEND _blend_loop_argb_mmx
#define NAME_FILL_COLOR _fill_color_loop_argb_mmx
@ -544,6 +551,25 @@ BLEND_A32 (bgra_mmx, _blend_loop_bgra_mmx);
A32_COLOR (argb_mmx, TRUE, _fill_color_loop_argb_mmx);
A32_COLOR (bgra_mmx, TRUE, _fill_color_loop_bgra_mmx);
A32_COLOR (ayuv_mmx, FALSE, _fill_color_loop_argb_mmx);
/* Instantiate the MMX variants of the planar I420 and packed RGB helpers by
 * feeding the MMX copy/fill/blend primitives into the generic template
 * macros. The init function assigns blend_i420_mmx, fill_checker_i420_mmx,
 * fill_color_i420_mmx, blend_rgb_mmx, blend_xrgb_mmx and fill_color_*_mmx,
 * so these instantiations are expected to define those symbols. */
I420_BLEND (mmx, _memcpy_u8_mmx, _blend_u8_mmx);
I420_FILL_CHECKER (mmx, _memset_u8_mmx);
I420_FILL_COLOR (mmx, _memset_u8_mmx);
/* Second argument is presumably the number of bytes per pixel (as it is for
 * RGB_FILL_COLOR) -- TODO confirm against the RGB_BLEND definition. */
RGB_BLEND (rgb_mmx, 3, _memcpy_u8_mmx, _blend_u8_mmx);
RGB_BLEND (xrgb_mmx, 4, _memcpy_u8_mmx, _blend_u8_mmx);
/* MEMSET_xRGB_MMX (name, r, g, b): r/g/b are the left-shift amounts used to
 * place each component inside the 32-bit value that is stored per pixel.
 *
 * NOTE(review): the value is written with a little-endian 32-bit store, so a
 * shift of 0 ends up in the first byte in memory. If "xrgb" means the bytes
 * x,R,G,B in memory order, the shifts would have to be 8/16/24 rather than
 * 16/8/0 (and correspondingly for the other three formats) -- verify against
 * the _memset_*_c implementations. */
MEMSET_xRGB_MMX (xrgb, 16, 8, 0);
RGB_FILL_COLOR (xrgb_mmx, 4, _memset_xrgb_mmx);
MEMSET_xRGB_MMX (xbgr, 0, 8, 16);
RGB_FILL_COLOR (xbgr_mmx, 4, _memset_xbgr_mmx);
MEMSET_xRGB_MMX (rgbx, 24, 16, 8);
RGB_FILL_COLOR (rgbx_mmx, 4, _memset_rgbx_mmx);
MEMSET_xRGB_MMX (bgrx, 8, 16, 24);
RGB_FILL_COLOR (bgrx_mmx, 4, _memset_bgrx_mmx);
#endif
/* Init function */
@ -612,10 +638,20 @@ gst_video_mixer_init_blend (void)
if (cpu_flags & OIL_IMPL_FLAG_MMX) {
gst_video_mixer_blend_argb = blend_argb_mmx;
gst_video_mixer_blend_bgra = blend_bgra_mmx;
gst_video_mixer_blend_i420 = blend_i420_mmx;
gst_video_mixer_blend_rgb = blend_rgb_mmx;
gst_video_mixer_blend_xrgb = blend_xrgb_mmx;
gst_video_mixer_fill_checker_i420 = fill_checker_i420_mmx;
gst_video_mixer_fill_color_argb = fill_color_argb_mmx;
gst_video_mixer_fill_color_bgra = fill_color_bgra_mmx;
gst_video_mixer_fill_color_ayuv = fill_color_ayuv_mmx;
gst_video_mixer_fill_color_i420 = fill_color_i420_mmx;
gst_video_mixer_fill_color_xrgb = fill_color_xrgb_mmx;
gst_video_mixer_fill_color_xbgr = fill_color_xbgr_mmx;
gst_video_mixer_fill_color_rgbx = fill_color_rgbx_mmx;
gst_video_mixer_fill_color_bgrx = fill_color_bgrx_mmx;
}
#endif
}

View file

@ -8,8 +8,6 @@ NAME_BLEND (guint8 * dest, const guint8 * src, gint src_height, gint src_width,
gint dest_add = dest_stride - (4 * src_width);
for (i = 0; i < src_height; i++) {
gulong old_ebx;
/* (P1 * (256 - A) + (P2 * A)) / 256
* => (P1 * 256 - P1 * A + P2 * A) / 256
* => (P1 * 256 + A * (P2 - P1)) / 256
@ -17,8 +15,6 @@ NAME_BLEND (guint8 * dest, const guint8 * src, gint src_height, gint src_width,
*/
/* *INDENT-OFF* */
__asm__ __volatile__ (
" movl %%ebx , %6 \n\t"
" pcmpeqd %%mm5 , %%mm5 \n\t" /* mm5 = 0xffff... */
#if A_OFF == 0
" psrld $24 , %%mm5 \n\t" /* mm5 = 00 00 00 ff 00 00 00 ff, selector for alpha */
@ -29,8 +25,8 @@ NAME_BLEND (guint8 * dest, const guint8 * src, gint src_height, gint src_width,
" movd %%eax , %%mm6 \n\t" /* mm6 = s_alpha */
" punpckldq %%mm6 , %%mm6 \n\t" /* mm6 = 00 00 00 aa 00 00 00 aa, alpha scale factor */
" movl %5 , %%ebx \n\t" /* ebx = src_width */
" test $1 , %%ebx \n\t" /* check odd pixel */
" movl %5 , %%ecx \n\t" /* ecx = src_width */
" test $1 , %%ecx \n\t" /* check odd pixel */
" je 1f \n\t"
/* do odd pixel */
@ -59,8 +55,8 @@ NAME_BLEND (guint8 * dest, const guint8 * src, gint src_height, gint src_width,
" add $4 , %0 \n\t"
"1: \n\t"
" sar $1 , %%ebx \n\t" /* prepare for 2 pixel per loop */
" cmp $0 , %%ebx \n\t"
" sar $1 , %%ecx \n\t" /* prepare for 2 pixel per loop */
" cmp $0 , %%ecx \n\t"
" je 3f \n\t"
"2: \n\t"
@ -106,16 +102,15 @@ NAME_BLEND (guint8 * dest, const guint8 * src, gint src_height, gint src_width,
" add $8 , %1 \n\t"
" add $8 , %0 \n\t"
" dec %%ebx \n\t"
" dec %%ecx \n\t"
" jne 2b \n\t"
"3: \n\t"
" movl %6 , %%ebx \n\t"
:"=r" (src), "=r" (dest)
:"0" (src), "1" (dest), "m" (s_alpha), "m" (src_width), "m" (old_ebx)
:"%eax", "memory"
:"0" (src), "1" (dest), "m" (s_alpha), "m" (src_width)
:"%eax", "%ecx", "memory"
#ifdef __MMX__
, "mm0", "mm1", "mm2", "mm5", "mm6", "mm7"
, "mm0", "mm1", "mm2", "mm4", "mm3", "mm5", "mm6", "mm7"
#endif
);
/* *INDENT-ON* */
@ -166,3 +161,225 @@ NAME_FILL_COLOR (guint8 * dest, gint height, gint width, gint c1, gint c2,
}
#endif
#ifdef GENERIC
/* Copies count bytes from src to dest.
 *
 * The first (count % 8) bytes are copied one at a time so that the remaining
 * length is a multiple of 8; the rest is then moved 8 bytes per iteration
 * through %mm0. emms is executed before returning.
 *
 * The byte scratch register is %al (not %ah): a high-byte register cannot be
 * encoded in an instruction that needs a REX prefix, which happens on x86-64
 * whenever the compiler places a pointer operand in r8-r15.
 */
static inline void
_memcpy_u8_mmx (guint8 * dest, const guint8 * src, guint count)
{
  /* *INDENT-OFF* */
  __asm__ __volatile__ (
    "1:                    \n\t"
    "test $7, %0           \n\t"   /* length already a multiple of 8? */
    "je 3f                 \n\t"
    "2:                    \n\t"   /* byte-wise head loop */
    "movb (%2), %%al       \n\t"
    "movb %%al, (%1)       \n\t"
    "inc %2                \n\t"
    "inc %1                \n\t"
    "dec %0                \n\t"
    "test $7, %0           \n\t"
    "jne 2b                \n\t"
    "3:                    \n\t"
    "shr $3, %0            \n\t"   /* count is unsigned: logical shift, 8 bytes per loop */
    "cmp $0, %0            \n\t"
    "je 5f                 \n\t"
    "4:                    \n\t"   /* 8 bytes per iteration */
    "movq (%2), %%mm0      \n\t"
    "movq %%mm0, (%1)      \n\t"
    "add $8, %2            \n\t"
    "add $8, %1            \n\t"
    "dec %0                \n\t"
    "jne 4b                \n\t"
    "5:                    \n\t"
    "emms                  \n\t"
    : "=r" (count), "=r" (dest), "=r" (src)
    : "0" (count), "1" (dest), "2" (src)
    : "memory", "%eax"
#ifdef __MMX__
      , "mm0"
#endif
  );
  /* *INDENT-ON* */
}
/* Fills count bytes at dest with the low 8 bits of val.
 *
 * The first (count % 8) bytes are stored one at a time; the rest is written
 * 8 bytes per iteration from %mm0, which holds the byte replicated into all
 * eight lanes. emms is executed before returning.
 */
static inline void
_memset_u8_mmx (guint8 * dest, guint val, guint count)
{
  guint8 val8 = val;
  guint64 val64;

  /* Build the 8-byte pattern from the truncated byte so that the head loop
   * and the MMX loop always store the same value, even if val > 255. */
  val64 = val8;
  val64 |= val64 << 8;
  val64 |= val64 << 16;
  val64 |= val64 << 32;

  /* *INDENT-OFF* */
  __asm__ __volatile__ (
    "1:                    \n\t"
    "test $7, %0           \n\t"   /* length already a multiple of 8? */
    "je 3f                 \n\t"
    "2:                    \n\t"   /* byte-wise head loop */
    "movb %4, (%1)         \n\t"
    "inc %1                \n\t"
    "dec %0                \n\t"
    "test $7, %0           \n\t"
    "jne 2b                \n\t"
    "3:                    \n\t"
    "shr $3, %0            \n\t"   /* count is unsigned: logical shift, 8 bytes per loop */
    "cmp $0, %0            \n\t"
    "je 5f                 \n\t"
    "movq %5, %%mm0        \n\t"
    "4:                    \n\t"   /* 8 bytes per iteration */
    "movq %%mm0, (%1)      \n\t"
    "add $8, %1            \n\t"
    "dec %0                \n\t"
    "jne 4b                \n\t"
    "5:                    \n\t"
    "emms                  \n\t"
    /* Outputs are earlyclobber: they are modified while %4 is still read.
     * "q" restricts val8 to a register that has an 8-bit form on i386. */
    : "=&r" (count), "=&r" (dest)
    : "0" (count), "1" (dest), "q" (val8), "m" (val64)
    : "memory"
#ifdef __MMX__
      , "mm0"
#endif
  );
  /* *INDENT-ON* */
}
/* Stores the 32-bit value val count times starting at dest.
 *
 * If count is odd, one value is stored with a 32-bit move first; the rest is
 * written two values (8 bytes) per iteration from %mm0, which holds val in
 * both halves. emms is executed before returning.
 */
static inline void
_memset_u32_mmx (guint32 * dest, guint32 val, guint count)
{
  guint64 val64 = val;

  val64 |= (val64 << 32);

  /* *INDENT-OFF* */
  __asm__ __volatile__ (
    "1:                    \n\t"
    "test $1, %0           \n\t"   /* odd number of values? */
    "je 3f                 \n\t"
    "2:                    \n\t"   /* store the single odd value */
    "movl %4, (%1)         \n\t"
    "add $4, %1            \n\t"
    "dec %0                \n\t"
    "test $1, %0           \n\t"
    "jne 2b                \n\t"
    "3:                    \n\t"
    "shr $1, %0            \n\t"   /* count is unsigned: logical shift, 2 values per loop */
    "cmp $0, %0            \n\t"
    "je 5f                 \n\t"
    "movq %5, %%mm0        \n\t"
    "4:                    \n\t"   /* two values per iteration */
    "movq %%mm0, (%1)      \n\t"
    "add $8, %1            \n\t"
    "dec %0                \n\t"
    "jne 4b                \n\t"
    "5:                    \n\t"
    "emms                  \n\t"
    /* Outputs are earlyclobber: they are modified while %4 is still read,
     * so they must never share a register with it. */
    : "=&r" (count), "=&r" (dest)
    : "0" (count), "1" (dest), "r" (val), "m" (val64)
    : "memory"
#ifdef __MMX__
      , "mm0"
#endif
  );
  /* *INDENT-ON* */
}
/* Blends a src_width x src_height block of bytes from src onto dest:
 *
 *   dest = (dest * (256 - s_alpha) + src * s_alpha) / 256
 *
 * src_stride and dest_stride are the distances in bytes between the starts
 * of consecutive rows. dest_width is not used.
 *
 * For every row the (width % 4) leading bytes are blended in C with BLEND();
 * the MMX code then handles one group of 4 bytes if the remainder is not a
 * multiple of 8, followed by 8 bytes per iteration. emms is executed once
 * after the last row.
 *
 * The per-row byte count lives in a local copy: consuming the leading bytes
 * must not shrink src_width itself, otherwise every row after the first
 * would skip its leftover bytes and the pointers would no longer advance by
 * a full stride.
 */
static inline void
_blend_u8_mmx (guint8 * dest, const guint8 * src,
    gint src_stride, gint dest_stride, gint src_width, gint src_height,
    gint dest_width, gint s_alpha)
{
  gint i;
  gint src_add = src_stride - src_width;
  gint dest_add = dest_stride - src_width;

  for (i = 0; i < src_height; i++) {
    /* Bytes still to be blended in this row */
    gint width = src_width;

    /* Blend up to 3 leading bytes so the rest is a multiple of 4 */
    while ((width & 0x03)) {
      *dest = BLEND (*dest, *src, s_alpha);
      dest++;
      src++;
      width--;
    }

    /* (P1 * (256 - A) + (P2 * A)) / 256
     * => (P1 * 256 - P1 * A + P2 * A) / 256
     * => (P1 * 256 + A * (P2 - P1)) / 256
     * => P1 + (A * (P2 - P1)) / 256
     */
    /* *INDENT-OFF* */
    __asm__ __volatile__ (
      "        mov       %4    , %%eax  \n\t"   /* eax = s_alpha */
      "        movd      %%eax , %%mm6  \n\t"   /* mm6 = s_alpha */
      "        punpcklwd %%mm6 , %%mm6  \n\t"   /* mm6 = 00 00 00 00 00 aa 00 aa, alpha scale factor */
      "        punpckldq %%mm6 , %%mm6  \n\t"   /* mm6 = 00 aa 00 aa 00 aa 00 aa */

      "        pxor      %%mm7 , %%mm7  \n\t"   /* mm7 = 00 00 00 00 00 00 00 00 */

      "        movl      %5    , %%ecx  \n\t"   /* ecx = remaining bytes in this row */
      "1:                               \n\t"
      "        test      $7    , %%ecx  \n\t"
      "        je        2f             \n\t"

      /* remaining length is not a multiple of 8: blend one group of 4 bytes */
      "        movd      (%2)  , %%mm2  \n\t"   /* mm2 = 4 source bytes */
      "        movd      (%3)  , %%mm1  \n\t"   /* mm1 = 4 destination bytes */
      "        punpcklbw %%mm7 , %%mm2  \n\t"   /* widen to 16 bit */
      "        punpcklbw %%mm7 , %%mm1  \n\t"

      "        psubw     %%mm1 , %%mm2  \n\t"   /* mm2 = mm2 - mm1 */
      "        pmullw    %%mm6 , %%mm2  \n\t"   /* mm2 = a * mm2 */
      "        psllw     $8    , %%mm1  \n\t"   /* scale up */
      "        paddw     %%mm1 , %%mm2  \n\t"   /* mm2 = mm2 + mm1 */
      "        psrlw     $8    , %%mm2  \n\t"   /* scale down */
      "        packuswb  %%mm2 , %%mm2  \n\t"
      "        movd      %%mm2 , (%3)   \n\t"   /* dest = mm2 */
      "        add       $4    , %1     \n\t"
      "        add       $4    , %0     \n\t"

      "2:                               \n\t"
      "        sar       $3    , %%ecx  \n\t"   /* prepare for 8 bytes per loop */
      "        cmp       $0    , %%ecx  \n\t"
      "        je        4f             \n\t"

      "3:                               \n\t"
      /* blend 8 bytes per iteration */
      "        movq      (%2)  , %%mm2  \n\t"   /* mm2 = 8 source bytes */
      "        movq      (%3)  , %%mm1  \n\t"   /* mm1 = 8 destination bytes */
      "        movq      %%mm2 , %%mm4  \n\t"
      "        movq      %%mm1 , %%mm3  \n\t"
      "        punpcklbw %%mm7 , %%mm2  \n\t"   /* low halves widened to 16 bit */
      "        punpckhbw %%mm7 , %%mm4  \n\t"   /* high halves widened to 16 bit */
      "        punpcklbw %%mm7 , %%mm1  \n\t"
      "        punpckhbw %%mm7 , %%mm3  \n\t"

      "        psubw     %%mm1 , %%mm2  \n\t"   /* mm2 = mm2 - mm1 */
      "        psubw     %%mm3 , %%mm4  \n\t"   /* mm4 = mm4 - mm3 */
      "        pmullw    %%mm6 , %%mm2  \n\t"   /* mm2 = a * mm2 */
      "        pmullw    %%mm6 , %%mm4  \n\t"   /* mm4 = a * mm4 */
      "        psllw     $8    , %%mm1  \n\t"   /* scale up */
      "        psllw     $8    , %%mm3  \n\t"   /* scale up */
      "        paddw     %%mm1 , %%mm2  \n\t"   /* mm2 = mm2 + mm1 */
      "        paddw     %%mm3 , %%mm4  \n\t"   /* mm4 = mm4 + mm3 */
      "        psrlw     $8    , %%mm2  \n\t"   /* scale down */
      "        psrlw     $8    , %%mm4  \n\t"   /* scale down */
      "        packuswb  %%mm4 , %%mm2  \n\t"
      "        movq      %%mm2 , (%3)   \n\t"
      "        add       $8    , %0     \n\t"
      "        add       $8    , %1     \n\t"
      "        dec       %%ecx          \n\t"
      "        jne       3b             \n\t"

      "4:                               \n\t"
      :"=r" (src), "=r" (dest)
      :"0" (src), "1" (dest), "m" (s_alpha), "m" (width)
      :"%eax", "%ecx", "memory"
#ifdef __MMX__
        , "mm1", "mm2", "mm3", "mm4", "mm6", "mm7"
#endif
    );
    /* *INDENT-ON* */

    /* Skip the padding between the end of this row and the start of the next */
    src += src_add;
    dest += dest_add;
  }
  __asm__ __volatile__ ("emms");
}
#endif