From 362785df8843acc0e4276a0c611add3bf01d5a1f Mon Sep 17 00:00:00 2001
From: Wim Taymans
Date: Fri, 25 Dec 2009 12:38:35 +0100
Subject: [PATCH] videomixer: optimize blend code some more

Use a more efficient formula that uses fewer multiplies.
Reduce the amount of scalar code and use MMX to calculate the desired
alpha value.
Unroll and handle 2 pixels in one iteration for improved pairing.
---
 gst/videomixer/blend_ayuv.c | 274 +++++++++++++++++++++++-------------
 1 file changed, 173 insertions(+), 101 deletions(-)

diff --git a/gst/videomixer/blend_ayuv.c b/gst/videomixer/blend_ayuv.c
index af5f174301..43704a23cd 100644
--- a/gst/videomixer/blend_ayuv.c
+++ b/gst/videomixer/blend_ayuv.c
@@ -185,11 +185,86 @@ gst_videomixer_blend_ayuv_ayuv (guint8 * src, gint xpos, gint ypos,
     gint src_width, gint src_height, gdouble src_alpha,
     guint8 * dest, gint dest_width, gint dest_height)
 {
-  gint alpha, s_alpha;
+  guint s_alpha, alpha;
   gint i, j;
   gint src_stride, dest_stride;
   gint src_add, dest_add;
-  gint Y, U, V;
+
+  src_stride = src_width * 4;
+  dest_stride = dest_width * 4;
+
+  s_alpha = CLAMP ((gint) (src_alpha * 256), 0, 256);
+  //g_print ("%f %d\n", src_alpha, s_alpha);
+
+  /* adjust src pointers for negative sizes */
+  if (xpos < 0) {
+    src += -xpos * 4;
+    src_width -= -xpos;
+    xpos = 0;
+  }
+  if (ypos < 0) {
+    src += -ypos * src_stride;
+    src_height -= -ypos;
+    ypos = 0;
+  }
+  /* adjust width/height if the src is bigger than dest */
+  if (xpos + src_width > dest_width) {
+    src_width = dest_width - xpos;
+  }
+  if (ypos + src_height > dest_height) {
+    src_height = dest_height - ypos;
+  }
+
+  src_add = src_stride - (4 * src_width);
+  dest_add = dest_stride - (4 * src_width);
+
+  dest = dest + 4 * xpos + (ypos * dest_stride);
+
+  for (i = 0; i < src_height; i++) {
+    for (j = 0; j < src_width; j++) {
+#if 0
+      gint Y, U, V;
+
+      alpha = (src[0] * s_alpha) >> 8;
+      Y = dest[1];
+      U = dest[2];
+      V = dest[3];
+      dest[0] = 0xff;
+      dest[1] = (((src[1] - Y) * alpha) >> 8) + Y;
+      dest[2] = (((src[2] - U) * alpha) >> 8) + U;
+      dest[3] = (((src[3] - V) * alpha) >> 8) + V;
+#else
+      gint Y, U, V;
+
+      alpha = (src[0] * s_alpha) >> 8;
+      BLEND_MODE (dest[1], dest[2], dest[3], src[1], src[2], src[3],
+          alpha, Y, U, V);
+      dest[0] = 0xff;
+      dest[1] = Y;
+      dest[2] = U;
+      dest[3] = V;
+#endif
+
+      src += 4;
+      dest += 4;
+    }
+    src += src_add;
+    dest += dest_add;
+  }
+}
+
+#undef BLEND_MODE
+
+#ifdef BUILD_X86_ASM
+void
+gst_videomixer_blend_ayuv_ayuv_mmx (guint8 * src, gint xpos, gint ypos,
+    gint src_width, gint src_height, gdouble src_alpha,
+    guint8 * dest, gint dest_width, gint dest_height)
+{
+  gint s_alpha;
+  gint i;
+  gint src_stride, dest_stride;
+  gint src_add, dest_add;
 
   src_stride = src_width * 4;
   dest_stride = dest_width * 4;
@@ -220,110 +295,107 @@ gst_videomixer_blend_ayuv_ayuv (guint8 * src, gint xpos, gint ypos,
 
   dest = dest + 4 * xpos + (ypos * dest_stride);
 
-  for (i = 0; i < src_height; i++) {
-    for (j = 0; j < src_width; j++) {
-      alpha = (src[0] * s_alpha) >> 8;
-      BLEND_MODE (dest[1], dest[2], dest[3], src[1], src[2], src[3],
-          alpha, Y, U, V);
-      dest[0] = 0xff;
-      dest[1] = Y;
-      dest[2] = U;
-      dest[3] = V;
-
-      src += 4;
-      dest += 4;
-    }
-    src += src_add;
-    dest += dest_add;
-  }
-}
-
-#undef BLEND_MODE
-
-#ifdef BUILD_X86_ASM
-void
-gst_videomixer_blend_ayuv_ayuv_mmx (guint8 * src, gint xpos, gint ypos,
-    gint src_width, gint src_height, gdouble src_alpha,
-    guint8 * dest, gint dest_width, gint dest_height)
-{
-  gint b_alpha;
-  gint i;
-  gint src_stride, dest_stride;
-  gint src_add, dest_add;
-
-  src_stride = src_width * 4;
-  dest_stride = dest_width * 4;
-
-  b_alpha = CLAMP ((gint) (src_alpha * 255), 0, 255);
-
-  /* adjust src pointers for negative sizes */
-  if (xpos < 0) {
-    src += -xpos * 4;
-    src_width -= -xpos;
-    xpos = 0;
-  }
-  if (ypos < 0) {
-    src += -ypos * src_stride;
-    src_height -= -ypos;
-    ypos = 0;
-  }
-  /* adjust width/height if the src is bigger than dest */
-  if (xpos + src_width > dest_width) {
-    src_width = dest_width - xpos;
-  }
-  if (ypos + src_height > dest_height) {
-    src_height = dest_height - ypos;
-  }
-
-  src_add = src_stride - (4 * src_width);
-  dest_add = dest_stride - (4 * src_width);
-
-  dest = dest + 4 * xpos + (ypos * dest_stride);
-
   for (i = 0; i < src_height; i++) {
     gulong old_ebx;
-    /* *INDENT-OFF* */
-    __asm__ __volatile__ (
-        "movl %%ebx , %6 \n\t"
-        "pxor %%mm7 , %%mm7 \n\t"       /* mm7 = 0 */
-        "pcmpeqd %%mm6 , %%mm6 \n\t"    /* mm6 = 0xffff... */
-        "punpcklbw %%mm7 , %%mm6 \n\t"  /* mm6 = 0x00ff00ff00ff... */
-        "pcmpeqd %%mm5 , %%mm5 \n\t"    /* mm5 = 0xffff... */
-        "psrlq $56 , %%mm5 \n\t"        /* mm5 = 0x0...0ff */
-        "xor %%ebx , %%ebx \n\t"        /* ebx = 0 */
-        "1: \n\t"
-        "movzbl (%2) , %%eax \n\t"      /* eax == source alpha */
-        "imul %4 , %%eax \n\t"          /* eax = source alpha * alpha */
-        "sar $8 , %%eax \n\t"           /* eax = (source alpha * alpha) / 256 */
-        "movd %%eax , %%mm0 \n\t"       /* mm0 = apply alpha */
-        "movd (%2) , %%mm2 \n\t"        /* mm2 = src */
-        "movd (%3) , %%mm1 \n\t"        /* mm1 = dest */
-        "punpcklwd %%mm0 , %%mm0 \n\t"
-        "punpckldq %%mm0 , %%mm0 \n\t"  /* mm0 == 0a 0a 0a 0a */
-        "punpcklbw %%mm7 , %%mm1 \n\t"  /* mm1 == dv du dy da */
-        "punpcklbw %%mm7 , %%mm2 \n\t"  /* mm2 == sv su sy sa */
-        "pmullw %%mm0 , %%mm2 \n\t"     /* mm2 == a * s */
-        "pandn %%mm6 , %%mm0 \n\t"      /* mm0 == 255 - a */
-        "pmullw %%mm0 , %%mm1 \n\t"     /* mm1 == (255 - a) * d */
-        "paddusw %%mm2 , %%mm1 \n\t"    /* mm1 == s + d */
-        "psrlw $8 , %%mm1 \n\t"
-        "packuswb %%mm7 , %%mm1 \n\t"
-        "por %%mm5 , %%mm1 \n\t"        /* mm1 = 0x.....ff */
-        "movd %%mm1 , (%3) \n\t"        /* dest = mm1 */
-        "add $4 , %1 \n\t"
-        "add $4 , %0 \n\t"
-        "add $1 , %%ebx \n\t"
-        "cmp %%ebx , %5 \n\t"
-        "jne 1b \n\t"
-        "movl %6 , %%ebx \n\t"
-        :"=r" (src), "=r" (dest)
-        :"0" (src), "1" (dest), "r" (b_alpha), "r" (src_width), "m" (old_ebx)
-        :"%eax", "memory"
+    /* (P1 * (256 - A) + (P2 * A)) / 256
+     * => (P1 * 256 - P1 * A + P2 * A) / 256
+     * => (P1 * 256 + A * (P2 - P1)) / 256
+     * => P1 + (A * (P2 - P1)) / 256
+     */
+    /* *INDENT-OFF* */
+    __asm__ __volatile__ (
+        "  movl %%ebx , %6       \n\t"
+        "  pxor %%mm7 , %%mm7    \n\t"  /* mm7 = 0, needed by the punpcklbw below */
+
+        "  pcmpeqd %%mm5 , %%mm5 \n\t"  /* mm5 = 0xffff... */
+        "  psrld $24 , %%mm5     \n\t"  /* mm5 = 00 00 00 ff 00 00 00 ff, selector for alpha */
+        "  mov %4 , %%eax        \n\t"  /* eax = s_alpha */
+        "  movd %%eax , %%mm6    \n\t"  /* mm6 = s_alpha */
+        "  punpckldq %%mm6 , %%mm6 \n\t" /* mm6 = 00 00 00 aa 00 00 00 aa, alpha scale factor */
+
+        "  movl %5 , %%ebx       \n\t"  /* ebx = src_width */
+        "  test $1 , %%ebx       \n\t"  /* check odd pixel */
+        "  je 1f                 \n\t"
+
+        /* do odd pixel */
+        "  movd (%2) , %%mm2     \n\t"  /* mm2 = src, 00 00 00 00 sv su sy sa */
+        "  movd (%3) , %%mm1     \n\t"  /* mm1 = dest, 00 00 00 00 dv du dy da */
+        "  movq %%mm2 , %%mm0    \n\t"  /* copy for doing the alpha */
+        "  punpcklbw %%mm7 , %%mm2 \n\t" /* mm2 = 00 sv 00 su 00 sy 00 sa */
+        "  pand %%mm5 , %%mm0    \n\t"  /* mm0 = 00 00 00 00 00 00 00 sa, get alpha component */
+        "  punpcklbw %%mm7 , %%mm1 \n\t" /* mm1 = 00 dv 00 du 00 dy 00 da */
+        "  pmullw %%mm6 , %%mm0  \n\t"  /* mult with scale */
+        "  psubw %%mm1 , %%mm2   \n\t"  /* mm2 = mm2 - mm1 */
+        "  punpcklwd %%mm0 , %%mm0 \n\t"
+        "  punpckldq %%mm0 , %%mm0 \n\t" /* mm0 == 00 aa 00 aa 00 aa 00 aa */
+        "  psrlw $8 , %%mm0      \n\t"
+        "  pmullw %%mm0 , %%mm2  \n\t"  /* mm2 == a * mm2 */
+        "  psllw $8 , %%mm1      \n\t"  /* scale up */
+        "  paddw %%mm1 , %%mm2   \n\t"  /* mm2 == mm2 + mm1 */
+        "  psrlw $8 , %%mm2      \n\t"  /* scale down */
+        "  packuswb %%mm2 , %%mm2 \n\t"
+        "  por %%mm5 , %%mm2     \n\t"  /* set alpha to ff */
+        "  movd %%mm2 , (%3)     \n\t"  /* dest = mm2 */
+        "  add $4 , %1           \n\t"
+        "  add $4 , %0           \n\t"
+
+        "1:                      \n\t"
+        "  sar $1 , %%ebx        \n\t"  /* prepare for 2 pixels per loop */
+        "  cmp $0 , %%ebx        \n\t"
+        "  je 3f                 \n\t"
+        "2:                      \n\t"
+
+        /* do even pixels */
+        "  movq (%2) , %%mm2     \n\t"  /* mm2 = src, sv1 su1 sy1 sa1 sv0 su0 sy0 sa0 */
+        "  movq (%3) , %%mm1     \n\t"  /* mm1 = dest, dv1 du1 dy1 da1 dv0 du0 dy0 da0 */
+        "  movq %%mm2 , %%mm4    \n\t"
+        "  movq %%mm1 , %%mm3    \n\t"
+        "  movq %%mm2 , %%mm0    \n\t"  /* copy for doing the alpha */
+
+        "  pxor %%mm7 , %%mm7    \n\t"
+        "  punpcklbw %%mm7 , %%mm2 \n\t" /* mm2 = 00 sv0 00 su0 00 sy0 00 sa0 */
+        "  punpckhbw %%mm7 , %%mm4 \n\t" /* mm4 = 00 sv1 00 su1 00 sy1 00 sa1 */
+        "  punpcklbw %%mm7 , %%mm1 \n\t" /* mm1 = 00 dv0 00 du0 00 dy0 00 da0 */
+        "  punpckhbw %%mm7 , %%mm3 \n\t" /* mm3 = 00 dv1 00 du1 00 dy1 00 da1 */
+
+        "  pand %%mm5 , %%mm0    \n\t"  /* mm0 = 00 00 00 sa1 00 00 00 sa0 */
+        "  psubw %%mm1 , %%mm2   \n\t"  /* mm2 = mm2 - mm1 */
+        "  pmullw %%mm6 , %%mm0  \n\t"  /* mult with scale */
+        "  psubw %%mm3 , %%mm4   \n\t"  /* mm4 = mm4 - mm3 */
+        "  psrlw $8 , %%mm0      \n\t"  /* scale back */
+        "  movq %%mm0 , %%mm7    \n\t"  /* save copy */
+        "  punpcklwd %%mm0 , %%mm0 \n\t" /* mm0 = 00 00 00 00 00 aa0 00 aa0 */
+        "  punpckhwd %%mm7 , %%mm7 \n\t" /* mm7 = 00 00 00 00 00 aa1 00 aa1 */
+        "  punpckldq %%mm0 , %%mm0 \n\t" /* mm0 = 00 aa0 00 aa0 00 aa0 00 aa0 */
+        "  punpckldq %%mm7 , %%mm7 \n\t" /* mm7 = 00 aa1 00 aa1 00 aa1 00 aa1 */
+
+        "  pmullw %%mm0 , %%mm2  \n\t"  /* mm2 == aa0 * mm2 */
+        "  pmullw %%mm7 , %%mm4  \n\t"  /* mm4 == aa1 * mm4 */
+        "  psllw $8 , %%mm1      \n\t"
+        "  psllw $8 , %%mm3      \n\t"
+        "  paddw %%mm1 , %%mm2   \n\t"  /* mm2 == mm2 + mm1 */
+        "  paddw %%mm3 , %%mm4   \n\t"  /* mm4 == mm4 + mm3 */
+
+        "  psrlw $8 , %%mm2      \n\t"
+        "  psrlw $8 , %%mm4      \n\t"
+        "  packuswb %%mm4 , %%mm2 \n\t"
+        "  por %%mm5 , %%mm2     \n\t"  /* set alpha to ff */
+        "  movq %%mm2 , (%3)     \n\t"
+
+        "  add $8 , %1           \n\t"
+        "  add $8 , %0           \n\t"
+        "  dec %%ebx             \n\t"
+        "  jne 2b                \n\t"
+
+        "3:                      \n\t"
+        "  movl %6 , %%ebx       \n\t"
+        :"=r" (src), "=r" (dest)
+        :"0" (src), "1" (dest), "m" (s_alpha), "m" (src_width), "m" (old_ebx)
+        :"%eax", "memory"
 #ifdef __MMX__
-        , "mm0", "mm1", "mm2", "mm5", "mm6", "mm7"
+        , "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7"
 #endif
-          );
+        );
     /* *INDENT-ON* */
     src += src_add;
     dest += dest_add;
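
Note (not part of the patch): the rewrite relies on the identity
(P1 * (256 - A) + P2 * A) / 256 == P1 + (A * (P2 - P1)) / 256, which needs a
single multiply per blended component instead of two. A minimal scalar C
sketch of the same per-pixel blend follows; the helper name blend_component
is illustrative only and does not exist in the file.

/* Illustrative sketch of the reduced-multiply blend that the MMX code
 * performs on several components at once.  'a' is the per-pixel alpha
 * already scaled by s_alpha, i.e. a = (src[0] * s_alpha) >> 8, 0..255. */
static inline guint8
blend_component (gint d, gint s, gint a)
{
  /* d + (a * (s - d)) / 256  ==  (d * (256 - a) + s * a) / 256 */
  return (guint8) (d + ((a * (s - d)) >> 8));
}

Per AYUV pixel this would be used as dest[1] = blend_component (dest[1],
src[1], alpha), and likewise for U and V, with dest[0] forced to 0xff; it
matches what the scalar #if 0 branch in gst_videomixer_blend_ayuv_ayuv
computes.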