diff --git a/gst/videomixer/blend.c b/gst/videomixer/blend.c
index 5beaf48549..909fa5ec3e 100644
--- a/gst/videomixer/blend.c
+++ b/gst/videomixer/blend.c
@@ -34,6 +34,8 @@
 
 #include
 
+#define BLEND(D,S,alpha) (((D) * (256 - (alpha)) + (S) * (alpha)) >> 8)
+
 #ifdef HAVE_GCC_ASM
 #if defined(HAVE_CPU_I386) || defined(HAVE_CPU_X86_64)
 #define BUILD_X86_ASM
@@ -46,8 +48,6 @@
 
 /* Below are the implementations of everything */
 
-#define BLEND(D,S,alpha) (((D) * (255 - (alpha)) + (S) * (alpha)) >> 8)
-
 inline static void
 _blend_u8_c (guint8 * dest, const guint8 * src, gint src_stride,
     gint dest_stride, gint src_width, gint src_height,
@@ -459,11 +459,9 @@ fill_color_##name (guint8 * dest, gint width, gint height, \
   gint i; \
   gint dest_stride = GST_ROUND_UP_4 (width * bpp); \
   \
-  red = CLAMP (1.164 * (colY - 16) + 1.596 * (colV - 128), 0, 255); \
-  green = \
-      CLAMP (1.164 * (colY - 16) - 0.813 * (colV - 128) - 0.391 * (colU - 128), \
-      0, 255); \
-  blue = CLAMP (1.164 * (colY - 16) + 2.018 * (colU - 128), 0, 255); \
+  red = YUV_TO_R (colY, colU, colV); \
+  green = YUV_TO_G (colY, colU, colV); \
+  blue = YUV_TO_B (colY, colU, colV); \
   \
   for (i = 0; i < height; i++) { \
     MEMSET_RGB (dest, red, green, blue, width); \
@@ -508,6 +506,15 @@ RGB_FILL_COLOR (bgrx_c, 4, _memset_bgrx_c);
 
 /* MMX Implementations */
 #ifdef BUILD_X86_ASM
+
+#define MEMSET_xRGB_MMX(name, r, g, b) \
+static inline void \
+_memset_##name##_mmx (guint8* dest, gint red, gint green, gint blue, gint width) { \
+  guint32 val = (red << r) | (green << g) | (blue << b); \
+  \
+  _memset_u32_mmx ((guint32 *) dest, val, width); \
+}
+
 #define A32
 #define NAME_BLEND _blend_loop_argb_mmx
 #define NAME_FILL_COLOR _fill_color_loop_argb_mmx
@@ -544,6 +551,25 @@ BLEND_A32 (bgra_mmx, _blend_loop_bgra_mmx);
 A32_COLOR (argb_mmx, TRUE, _fill_color_loop_argb_mmx);
 A32_COLOR (bgra_mmx, TRUE, _fill_color_loop_bgra_mmx);
 A32_COLOR (ayuv_mmx, FALSE, _fill_color_loop_argb_mmx);
+
+I420_BLEND (mmx, _memcpy_u8_mmx, _blend_u8_mmx);
+I420_FILL_CHECKER (mmx, _memset_u8_mmx);
+I420_FILL_COLOR (mmx, _memset_u8_mmx);
+
+RGB_BLEND (rgb_mmx, 3, _memcpy_u8_mmx, _blend_u8_mmx);
+
+RGB_BLEND (xrgb_mmx, 4, _memcpy_u8_mmx, _blend_u8_mmx);
+MEMSET_xRGB_MMX (xrgb, 16, 8, 0);
+RGB_FILL_COLOR (xrgb_mmx, 4, _memset_xrgb_mmx);
+
+MEMSET_xRGB_MMX (xbgr, 0, 8, 16);
+RGB_FILL_COLOR (xbgr_mmx, 4, _memset_xbgr_mmx);
+
+MEMSET_xRGB_MMX (rgbx, 24, 16, 8);
+RGB_FILL_COLOR (rgbx_mmx, 4, _memset_rgbx_mmx);
+
+MEMSET_xRGB_MMX (bgrx, 8, 16, 24);
+RGB_FILL_COLOR (bgrx_mmx, 4, _memset_bgrx_mmx);
 #endif
 
 /* Init function */
@@ -612,10 +638,20 @@ gst_video_mixer_init_blend (void)
   if (cpu_flags & OIL_IMPL_FLAG_MMX) {
     gst_video_mixer_blend_argb = blend_argb_mmx;
     gst_video_mixer_blend_bgra = blend_bgra_mmx;
+    gst_video_mixer_blend_i420 = blend_i420_mmx;
+    gst_video_mixer_blend_rgb = blend_rgb_mmx;
+    gst_video_mixer_blend_xrgb = blend_xrgb_mmx;
+
+    gst_video_mixer_fill_checker_i420 = fill_checker_i420_mmx;
 
     gst_video_mixer_fill_color_argb = fill_color_argb_mmx;
     gst_video_mixer_fill_color_bgra = fill_color_bgra_mmx;
     gst_video_mixer_fill_color_ayuv = fill_color_ayuv_mmx;
+    gst_video_mixer_fill_color_i420 = fill_color_i420_mmx;
+    gst_video_mixer_fill_color_xrgb = fill_color_xrgb_mmx;
+    gst_video_mixer_fill_color_xbgr = fill_color_xbgr_mmx;
+    gst_video_mixer_fill_color_rgbx = fill_color_rgbx_mmx;
+    gst_video_mixer_fill_color_bgrx = fill_color_bgrx_mmx;
   }
 #endif
 }
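
The blend.c half of the patch hoists BLEND above the BUILD_X86_ASM block and switches it from 255-based to 256-based weighting, so the C fallback computes exactly what the MMX loops compute via the P1 + (A * (P2 - P1)) / 256 rearrangement. A standalone sketch (not part of the patch; assumes glib.h for g_assert) of why 256 is the right scale:

#include <glib.h>

#define BLEND(D,S,alpha) (((D) * (256 - (alpha)) + (S) * (alpha)) >> 8)

/* With the old 255-based macro, alpha = 0 still changed the destination
 * (BLEND (255, s, 0) gave (255 * 255) >> 8 = 254) and alpha = 255 could
 * not fully select the source.  The 256-based macro is exact at both
 * ends and is algebraically identical to P1 + (A * (P2 - P1)) / 256. */
static void
check_blend_macro (void)
{
  gint d, s, a;

  for (d = 0; d < 256; d++)
    for (s = 0; s < 256; s++)
      for (a = 0; a <= 256; a++)
        g_assert (BLEND (d, s, a) == (((d << 8) + a * (s - d)) >> 8));

  g_assert (BLEND (10, 200, 0) == 10);        /* alpha 0: dest unchanged */
  g_assert (BLEND (10, 200, 256) == 200);     /* alpha 256: source wins */
}
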
diff --git a/gst/videomixer/blend_mmx.h b/gst/videomixer/blend_mmx.h
index 141f712277..29840926c6 100644
--- a/gst/videomixer/blend_mmx.h
+++ b/gst/videomixer/blend_mmx.h
@@ -8,8 +8,6 @@ NAME_BLEND (guint8 * dest, const guint8 * src, gint src_height, gint src_width,
   gint dest_add = dest_stride - (4 * src_width);
 
   for (i = 0; i < src_height; i++) {
-    gulong old_ebx;
-
     /* (P1 * (256 - A) + (P2 * A)) / 256
      * => (P1 * 256 - P1 * A + P2 * A) / 256
      * => (P1 * 256 + A * (P2 - P1) / 256
@@ -17,8 +15,6 @@ NAME_BLEND (guint8 * dest, const guint8 * src, gint src_height, gint src_width,
      */
     /* *INDENT-OFF* */
     __asm__ __volatile__ (
-        " movl %%ebx , %6 \n\t"
-
         " pcmpeqd %%mm5 , %%mm5 \n\t" /* mm5 = 0xffff... */
 #if A_OFF == 0
         " psrld $24 , %%mm5 \n\t" /* mm5 = 00 00 00 ff 00 00 00 ff, selector for alpha */
@@ -29,8 +25,8 @@ NAME_BLEND (guint8 * dest, const guint8 * src, gint src_height, gint src_width,
         " movd %%eax , %%mm6 \n\t" /* mm6 = s_alpha */
         " punpckldq %%mm6 , %%mm6 \n\t" /* mm6 = 00 00 00 aa 00 00 00 aa, alpha scale factor */
 
-        " movl %5 , %%ebx \n\t" /* ebx = src_width */
-        " test $1 , %%ebx \n\t" /* check odd pixel */
+        " movl %5 , %%ecx \n\t" /* ecx = src_width */
+        " test $1 , %%ecx \n\t" /* check odd pixel */
         " je 1f \n\t"
 
         /* do odd pixel */
@@ -59,8 +55,8 @@ NAME_BLEND (guint8 * dest, const guint8 * src, gint src_height, gint src_width,
         " add $4 , %0 \n\t"
 
         "1: \n\t"
-        " sar $1 , %%ebx \n\t" /* prepare for 2 pixel per loop */
-        " cmp $0 , %%ebx \n\t"
+        " sar $1 , %%ecx \n\t" /* prepare for 2 pixel per loop */
+        " cmp $0 , %%ecx \n\t"
         " je 3f \n\t"
 
         "2: \n\t"
@@ -106,16 +102,15 @@ NAME_BLEND (guint8 * dest, const guint8 * src, gint src_height, gint src_width,
         " add $8 , %1 \n\t"
         " add $8 , %0 \n\t"
 
-        " dec %%ebx \n\t"
+        " dec %%ecx \n\t"
         " jne 2b \n\t"
 
         "3: \n\t"
-        " movl %6 , %%ebx \n\t"
         :"=r" (src), "=r" (dest)
-        :"0" (src), "1" (dest), "m" (s_alpha), "m" (src_width), "m" (old_ebx)
-        :"%eax", "memory"
+        :"0" (src), "1" (dest), "m" (s_alpha), "m" (src_width)
+        :"%eax", "%ecx", "memory"
 #ifdef __MMX__
-        , "mm0", "mm1", "mm2", "mm5", "mm6", "mm7"
+        , "mm0", "mm1", "mm2", "mm4", "mm3", "mm5", "mm6", "mm7"
 #endif
         );
     /* *INDENT-ON* */
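
The blend_mmx.h changes above drop the manual spill of %ebx through old_ebx and use %ecx instead, simply declaring it in the clobber list; %ebx is presumably avoided because it holds the GOT pointer in position-independent i386 code, where GCC can reject inline asm that clobbers it. A minimal standalone sketch of the clobber-list pattern (hypothetical function, not from the patch):

#include <glib.h>

/* The compiler keeps inputs and outputs out of any register named in
 * the clobber list, so %ecx is free scratch without an explicit spill. */
static inline guint32
scale_u32 (guint32 value, guint32 alpha)
{
  guint32 result;

  __asm__ (
      " movl %1 , %%ecx \n\t"   /* ecx = value */
      " imull %2 , %%ecx \n\t"  /* ecx = value * alpha */
      " shrl $8 , %%ecx \n\t"   /* ecx = (value * alpha) / 256 */
      " movl %%ecx , %0 \n\t"
      : "=r" (result)
      : "r" (value), "r" (alpha)
      : "%ecx");

  return result;
}
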
"m" (val64) + : "memory" +#ifdef __MMX__ + , "mm0" +#endif + ); + /* *INDENT-ON* */ +} + +static inline void +_memset_u32_mmx (guint32 * dest, guint32 val, guint count) +{ + guint64 val64 = val; + + val64 |= (val64 << 32); + + /* *INDENT-OFF* */ + __asm__ __volatile__ ( + "1: \n\t" + "test $1, %0 \n\t" + "je 3f \n\t" + "2: \n\t" + "movl %4, (%1) \n\t" + "add $4, %1 \n\t" + "dec %0 \n\t" + "test $1, %0 \n\t" + "jne 2b \n\t" + "3: \n\t" + "sar $1, %0 \n\t" + "cmp $0, %0 \n\t" + "je 5f \n\t" + "movq %5, %%mm0 \n\t" + "4: \n\t" + "movq %%mm0, (%1) \n\t" + "add $8, %1 \n\t" + "dec %0 \n\t" + "jne 4b \n\t" + "5: \n\t" + "emms \n\t" + : "=r" (count), "=r" (dest) + : "0" (count), "1" (dest), "r" (val), "m" (val64) + : "memory" +#ifdef __MMX__ + , "mm0" +#endif + ); + /* *INDENT-ON* */ +} + +static inline void +_blend_u8_mmx (guint8 * dest, const guint8 * src, + gint src_stride, gint dest_stride, gint src_width, gint src_height, + gint dest_width, gint s_alpha) +{ + gint i; + gint src_add = src_stride - src_width; + gint dest_add = dest_stride - src_width; + + for (i = 0; i < src_height; i++) { + /* Do first 3 "odd" pixels */ + while ((src_width & 0x03)) { + *dest = BLEND (*dest, *src, s_alpha); + dest++; + src++; + src_width--; + } + + /* (P1 * (256 - A) + (P2 * A)) / 256 + * => (P1 * 256 - P1 * A + P2 * A) / 256 + * => (P1 * 256 + A * (P2 - P1) / 256 + * => P1 + (A * (P2 - P1)) / 256 + */ + /* *INDENT-OFF* */ + __asm__ __volatile__ ( + " mov %4 , %%eax \n\t" /* eax = s_alpha */ + " movd %%eax , %%mm6 \n\t" /* mm6 = s_alpha */ + " punpcklwd %%mm6 , %%mm6 \n\t" /* mm6 = 00 00 00 00 00 aa 00 aa, alpha scale factor */ + " punpckldq %%mm6 , %%mm6 \n\t" /* mm6 = 00 aa 00 aa 00 aa 00 aa */ + + " pxor %%mm7 , %%mm7 \n\t" /* mm7 = 00 00 00 00 00 00 00 00 */ + + " movl %5 , %%ecx \n\t" /* ecx = src_width */ + + "1: \n\t" + " test $7 , %%ecx \n\t" + " je 2f \n\t" + + /* do first 4 "odd" bytes */ + " movd (%2) , %%mm2 \n\t" /* mm2 = src, 00 00 00 00 sv su sy sa */ + " movd (%3) , %%mm1 \n\t" /* mm1 = dest, 00 00 00 00 dv du dy da */ + " punpcklbw %%mm7 , %%mm2 \n\t" + " punpcklbw %%mm7 , %%mm1 \n\t" + " psubw %%mm1 , %%mm2 \n\t" /* mm2 = mm2 - mm1 */ + " pmullw %%mm6 , %%mm2 \n\t" /* mm2 = a * mm2 */ + " psllw $8 , %%mm1 \n\t" /* scale up */ + " paddw %%mm1 , %%mm2 \n\t" /* mm2 = mm2 + mm1 */ + " psrlw $8 , %%mm2 \n\t" /* scale down */ + " packuswb %%mm2 , %%mm2 \n\t" + " movd %%mm2 , (%3) \n\t" /* dest = mm1 */ + " add $4 , %1 \n\t" + " add $4 , %0 \n\t" + + "2: \n\t" + " sar $3 , %%ecx \n\t" /* prepare for 8 bytes per loop */ + " cmp $0 , %%ecx \n\t" + " je 4f \n\t" + + "3: \n\t" + /* do even pixels */ + " movq (%2) , %%mm2 \n\t" /* mm2 = src, sv1 su1 sy1 sa1 sv0 su0 sy0 sa0 */ + " movq (%3) , %%mm1 \n\t" /* mm1 = dest, dv1 du1 dy1 da1 dv0 du0 dy0 da0 */ + " movq %%mm2 , %%mm4 \n\t" + " movq %%mm1 , %%mm3 \n\t" + " punpcklbw %%mm7 , %%mm2 \n\t" + " punpckhbw %%mm7 , %%mm4 \n\t" + " punpcklbw %%mm7 , %%mm1 \n\t" + " punpckhbw %%mm7 , %%mm3 \n\t" + " psubw %%mm1 , %%mm2 \n\t" /* mm2 = mm2 - mm1 */ + " psubw %%mm3 , %%mm4 \n\t" /* mm4 = mm4 - mm3 */ + " pmullw %%mm6 , %%mm2 \n\t" /* mm2 = a * mm2 */ + " pmullw %%mm6 , %%mm4 \n\t" /* mm2 = a * mm2 */ + " psllw $8 , %%mm1 \n\t" /* scale up */ + " psllw $8 , %%mm3 \n\t" /* scale up */ + " paddw %%mm1 , %%mm2 \n\t" /* mm2 = mm2 + mm1 */ + " paddw %%mm3 , %%mm4 \n\t" /* mm4 = mm4 + mm3 */ + " psrlw $8 , %%mm2 \n\t" /* scale down */ + " psrlw $8 , %%mm4 \n\t" /* scale down */ + " packuswb %%mm4 , %%mm2 \n\t" + " movq %%mm2 , (%3) \n\t" + " add $8 , %0 \n\t" + " add 
+
+static inline void
+_blend_u8_mmx (guint8 * dest, const guint8 * src,
+    gint src_stride, gint dest_stride, gint src_width, gint src_height,
+    gint dest_width, gint s_alpha)
+{
+  gint i;
+  gint src_add = src_stride - src_width;
+  gint dest_add = dest_stride - src_width;
+
+  for (i = 0; i < src_height; i++) {
+    /* Do first 3 "odd" pixels */
+    while ((src_width & 0x03)) {
+      *dest = BLEND (*dest, *src, s_alpha);
+      dest++;
+      src++;
+      src_width--;
+    }
+
+    /* (P1 * (256 - A) + (P2 * A)) / 256
+     * => (P1 * 256 - P1 * A + P2 * A) / 256
+     * => (P1 * 256 + A * (P2 - P1)) / 256
+     * => P1 + (A * (P2 - P1)) / 256
+     */
+    /* *INDENT-OFF* */
+    __asm__ __volatile__ (
+        " mov %4 , %%eax \n\t" /* eax = s_alpha */
+        " movd %%eax , %%mm6 \n\t" /* mm6 = s_alpha */
+        " punpcklwd %%mm6 , %%mm6 \n\t" /* mm6 = 00 00 00 00 00 aa 00 aa, alpha scale factor */
+        " punpckldq %%mm6 , %%mm6 \n\t" /* mm6 = 00 aa 00 aa 00 aa 00 aa */
+
+        " pxor %%mm7 , %%mm7 \n\t" /* mm7 = 00 00 00 00 00 00 00 00 */
+
+        " movl %5 , %%ecx \n\t" /* ecx = src_width */
+
+        "1: \n\t"
+        " test $7 , %%ecx \n\t"
+        " je 2f \n\t"
+
+        /* do first 4 "odd" bytes */
+        " movd (%2) , %%mm2 \n\t" /* mm2 = src, 00 00 00 00 sv su sy sa */
+        " movd (%3) , %%mm1 \n\t" /* mm1 = dest, 00 00 00 00 dv du dy da */
+        " punpcklbw %%mm7 , %%mm2 \n\t"
+        " punpcklbw %%mm7 , %%mm1 \n\t"
+        " psubw %%mm1 , %%mm2 \n\t" /* mm2 = mm2 - mm1 */
+        " pmullw %%mm6 , %%mm2 \n\t" /* mm2 = a * mm2 */
+        " psllw $8 , %%mm1 \n\t" /* scale up */
+        " paddw %%mm1 , %%mm2 \n\t" /* mm2 = mm2 + mm1 */
+        " psrlw $8 , %%mm2 \n\t" /* scale down */
+        " packuswb %%mm2 , %%mm2 \n\t"
+        " movd %%mm2 , (%3) \n\t" /* dest = mm2 */
+        " add $4 , %1 \n\t"
+        " add $4 , %0 \n\t"
+
+        "2: \n\t"
+        " sar $3 , %%ecx \n\t" /* prepare for 8 bytes per loop */
+        " cmp $0 , %%ecx \n\t"
+        " je 4f \n\t"
+
+        "3: \n\t"
+        /* do even pixels */
+        " movq (%2) , %%mm2 \n\t" /* mm2 = src, sv1 su1 sy1 sa1 sv0 su0 sy0 sa0 */
+        " movq (%3) , %%mm1 \n\t" /* mm1 = dest, dv1 du1 dy1 da1 dv0 du0 dy0 da0 */
+        " movq %%mm2 , %%mm4 \n\t"
+        " movq %%mm1 , %%mm3 \n\t"
+        " punpcklbw %%mm7 , %%mm2 \n\t"
+        " punpckhbw %%mm7 , %%mm4 \n\t"
+        " punpcklbw %%mm7 , %%mm1 \n\t"
+        " punpckhbw %%mm7 , %%mm3 \n\t"
+        " psubw %%mm1 , %%mm2 \n\t" /* mm2 = mm2 - mm1 */
+        " psubw %%mm3 , %%mm4 \n\t" /* mm4 = mm4 - mm3 */
+        " pmullw %%mm6 , %%mm2 \n\t" /* mm2 = a * mm2 */
+        " pmullw %%mm6 , %%mm4 \n\t" /* mm4 = a * mm4 */
+        " psllw $8 , %%mm1 \n\t" /* scale up */
+        " psllw $8 , %%mm3 \n\t" /* scale up */
+        " paddw %%mm1 , %%mm2 \n\t" /* mm2 = mm2 + mm1 */
+        " paddw %%mm3 , %%mm4 \n\t" /* mm4 = mm4 + mm3 */
+        " psrlw $8 , %%mm2 \n\t" /* scale down */
+        " psrlw $8 , %%mm4 \n\t" /* scale down */
+        " packuswb %%mm4 , %%mm2 \n\t"
+        " movq %%mm2 , (%3) \n\t"
+        " add $8 , %0 \n\t"
+        " add $8 , %1 \n\t"
+        " dec %%ecx \n\t"
+        " jne 3b \n\t"
+
+        "4: \n\t"
+        :"=r" (src), "=r" (dest)
+        :"0" (src), "1" (dest), "m" (s_alpha), "m" (src_width)
+        :"%eax", "%ecx", "memory"
+#ifdef __MMX__
+        , "mm1", "mm2", "mm3", "mm4", "mm6", "mm7"
+#endif
+        );
+    /* *INDENT-ON* */
+    src += src_add;
+    dest += dest_add;
+  }
+  __asm__ __volatile__ ("emms");
+}
+#endif
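
Both the 4-byte and the 8-byte paths of _blend_u8_mmx implement the rearranged formula from the comment block, one 16-bit lane per byte. An illustrative scalar equivalent (hypothetical helper, not part of the patch):

#include <glib.h>

static inline guint8
_blend_u8_ref (guint8 d, guint8 s, gint alpha)
{
  /* P1 + (A * (P2 - P1)) / 256: psubw/pmullw compute A * (P2 - P1) in
   * 16-bit lanes, psllw $8 scales P1 up, paddw adds, psrlw $8 scales
   * back down.  For alpha in [0, 256] the sum below is non-negative,
   * so the right shift is well defined. */
  return ((d << 8) + alpha * (s - d)) >> 8;
}
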