From 6da14d0c4187f94b2742b228da7f1370dcb92853 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastian=20Dr=C3=B6ge?=
Date: Mon, 23 Aug 2010 15:44:50 +0200
Subject: [PATCH] videomixer: Optimize ARGB blending and implement BGRA
 blending with orc

This means that there is no handwritten assembly left in videomixer,
and blending is also faster now when using SSE.

The orc kernels load, OR the opaque-alpha constant into, and store each
pixel as one 32-bit value, so which kernel matches the ARGB or the BGRA
byte layout depends on the host byte order. blend.c selects the pairing
by comparing G_BYTE_ORDER against G_LITTLE_ENDIAN.
---
Notes (not part of the commit message):
 - Line structure was restored from a whitespace-collapsed copy of this
   patch. Blank context lines were placed so that every hunk matches its
   @@ line counts; their exact position, and the column alignment inside
   the deleted blend_mmx.h, could not be recovered. Verify against the
   target tree before applying.
 - The three "#include" context lines that close the videomixer.c hunk
   arrived without their header names and are reproduced as received.
 - Changed relative to the original submission: the byte-order test in
   blend.c used LITTLE_ENDIAN, which is not a GLib macro; where no system
   header happens to define it, it evaluates to 0 and the big-endian
   pairing is selected on little-endian hosts. It now uses
   G_LITTLE_ENDIAN.

 configure.ac                |   2 +-
 gst/videomixer/Makefile.am  |   2 +-
 gst/videomixer/blend.c      |  99 ++++++----------------
 gst/videomixer/blend_mmx.h  | 124 ------------------------------------
 gst/videomixer/blendorc.orc |  44 +++++++++++--
 gst/videomixer/videomixer.c |   6 --
 6 files changed, 59 insertions(+), 218 deletions(-)
 delete mode 100644 gst/videomixer/blend_mmx.h

diff --git a/configure.ac b/configure.ac
index d338662ea1..b262b20476 100644
--- a/configure.ac
+++ b/configure.ac
@@ -208,7 +208,7 @@ dnl GLib is required
 AG_GST_GLIB_CHECK([2.18])
 
 dnl Orc
-ORC_CHECK([0.4.5])
+ORC_CHECK([0.4.7])
 
 dnl checks for gstreamer
 
diff --git a/gst/videomixer/Makefile.am b/gst/videomixer/Makefile.am
index fee1c06a56..193d8a7457 100644
--- a/gst/videomixer/Makefile.am
+++ b/gst/videomixer/Makefile.am
@@ -13,4 +13,4 @@ libgstvideomixer_la_LDFLAGS = $(GST_PLUGIN_LDFLAGS)
 libgstvideomixer_la_LIBTOOLFLAGS = --tag=disable-static
 
 # headers we need but don't want installed
-noinst_HEADERS = videomixer.h videomixerpad.h blend.h blend_mmx.h
+noinst_HEADERS = videomixer.h videomixerpad.h blend.h
diff --git a/gst/videomixer/blend.c b/gst/videomixer/blend.c
index 13c775484b..bb55a85bd6 100644
--- a/gst/videomixer/blend.c
+++ b/gst/videomixer/blend.c
@@ -38,12 +38,6 @@
 
 #define BLEND(D,S,alpha) (((D) * (256 - (alpha)) + (S) * (alpha)) >> 8)
 
-#ifdef HAVE_GCC_ASM
-#if defined(HAVE_ORC) && (defined(HAVE_CPU_I386) || defined(HAVE_CPU_X86_64))
-#define BUILD_X86_ASM
-#endif
-#endif
-
 GST_DEBUG_CATEGORY_STATIC (gst_videomixer_blend_debug);
 #define GST_CAT_DEFAULT gst_videomixer_blend_debug
 
@@ -92,34 +86,26 @@ blend_##name (const guint8 * src, gint xpos, gint ypos, \
   LOOP (dest, src, src_height, src_width, src_stride, dest_stride, s_alpha); \
 }
 
-#define BLEND_A32_LOOP_C(name, A, C1, C2, C3) \
+#define BLEND_A32_LOOP(name) \
 static inline void \
-_blend_loop_##name##_c (guint8 *dest, const guint8 *src, gint src_height, gint src_width, gint src_stride, gint dest_stride, guint s_alpha) { \
-  gint i, j; \
-  gint alpha; \
-  gint src_add = src_stride - (4 * src_width); \
-  gint dest_add = dest_stride - (4 * src_width); \
-  \
-  for (i = 0; i < src_height; i++) { \
-    for (j = 0; j < src_width; j++) { \
-      alpha = (src[A] * s_alpha) >> 8; \
-      dest[A] = 0xff; \
-      dest[C1] = BLEND(dest[C1], src[C1], alpha); \
-      dest[C2] = BLEND(dest[C2], src[C2], alpha); \
-      dest[C3] = BLEND(dest[C3], src[C3], alpha); \
-      \
-      src += 4; \
-      dest += 4; \
-    } \
-    src += src_add; \
-    dest += dest_add; \
-  } \
+_blend_loop_##name (guint8 * dest, const guint8 * src, gint src_height, \
+    gint src_width, gint src_stride, gint dest_stride, guint s_alpha) \
+{ \
+  s_alpha = MIN (255, s_alpha); \
+  orc_blend_##name (dest, dest_stride, src, src_stride, \
+      s_alpha, src_width, src_height); \
 }
 
-BLEND_A32_LOOP_C (argb, 0, 1, 2, 3);
-BLEND_A32_LOOP_C (bgra, 3, 2, 1, 0);
-BLEND_A32 (argb_c, _blend_loop_argb_c);
-BLEND_A32 (bgra_c, _blend_loop_bgra_c);
+BLEND_A32_LOOP (argb);
+BLEND_A32_LOOP (bgra);
+
+#if G_BYTE_ORDER == G_LITTLE_ENDIAN
+BLEND_A32 (argb, _blend_loop_argb);
+BLEND_A32 (bgra, _blend_loop_bgra);
+#else
+BLEND_A32 (argb, _blend_loop_bgra);
+BLEND_A32 (bgra, _blend_loop_argb);
+#endif
 
 #define A32_CHECKER_C(name, RGB, A, C1, C2, C3) \
 static void \
@@ -680,39 +666,6 @@ PACKED_422_FILL_COLOR (yuy2, 24, 16, 8, 0);
 PACKED_422_FILL_COLOR (yvyu, 24, 0, 8, 16);
 PACKED_422_FILL_COLOR (uyvy, 16, 24, 0, 8);
 
-/* MMX Implementations */
-#ifdef BUILD_X86_ASM
-
-#define A32
-#define NAME_BLEND _blend_loop_argb_mmx
-#define A_OFF 0
-#include "blend_mmx.h"
-#undef NAME_BLEND
-#undef A_OFF
-
-#define NAME_BLEND _blend_loop_bgra_mmx
-#define A_OFF 24
-#include "blend_mmx.h"
-#undef NAME_BLEND
-#undef A_OFF
-#undef A32
-
-BLEND_A32 (argb_mmx, _blend_loop_argb_mmx);
-BLEND_A32 (bgra_mmx, _blend_loop_bgra_mmx);
-#endif
-
-static void
-_blend_loop_argb_orc (guint8 * dest, const guint8 * src, gint src_height,
-    gint src_width, gint src_stride, gint dest_stride, guint s_alpha)
-{
-  s_alpha = MIN (255, s_alpha);
-  gst_videomixer_orc_blend_ayuv (dest, dest_stride, src, src_stride,
-      s_alpha, src_width, src_height);
-}
-
-BLEND_A32 (argb_orc, _blend_loop_argb_orc);
-
-
 /* Init function */
 BlendFunction gst_video_mixer_blend_argb;
 BlendFunction gst_video_mixer_blend_bgra;
@@ -769,18 +722,13 @@ FillColorFunction gst_video_mixer_fill_color_uyvy;
 void
 gst_video_mixer_init_blend (void)
 {
-#ifdef BUILD_X86_ASM
-  guint cpu_flags;
 
-  orc_init ();
-  cpu_flags = orc_target_get_default_flags (orc_target_get_by_name ("mmx"));
-#endif
 
   GST_DEBUG_CATEGORY_INIT (gst_videomixer_blend_debug, "videomixer_blend", 0,
       "video mixer blending functions");
 
-  gst_video_mixer_blend_argb = blend_argb_c;
-  gst_video_mixer_blend_bgra = blend_bgra_c;
+  gst_video_mixer_blend_argb = blend_argb;
+  gst_video_mixer_blend_bgra = blend_bgra;
   gst_video_mixer_blend_i420 = blend_i420;
   gst_video_mixer_blend_y444 = blend_y444;
   gst_video_mixer_blend_y42b = blend_y42b;
@@ -820,13 +768,4 @@ gst_video_mixer_init_blend (void)
   gst_video_mixer_fill_color_yuy2 = fill_color_yuy2;
   gst_video_mixer_fill_color_yvyu = fill_color_yvyu;
   gst_video_mixer_fill_color_uyvy = fill_color_uyvy;
-
-#ifdef BUILD_X86_ASM
-  if (cpu_flags & ORC_TARGET_MMX_MMX) {
-    gst_video_mixer_blend_argb = blend_argb_mmx;
-    gst_video_mixer_blend_bgra = blend_bgra_mmx;
-  }
-#endif
-
-  gst_video_mixer_blend_argb = blend_argb_orc;
 }
diff --git a/gst/videomixer/blend_mmx.h b/gst/videomixer/blend_mmx.h
deleted file mode 100644
index 9c0f250aa6..0000000000
--- a/gst/videomixer/blend_mmx.h
+++ /dev/null
@@ -1,124 +0,0 @@
-#ifdef A32
-static inline void
-NAME_BLEND (guint8 * dest, const guint8 * src, gint src_height, gint src_width,
-    gint src_stride, gint dest_stride, guint s_alpha)
-{
-  gint i;
-  gint src_add = src_stride - (4 * src_width);
-  gint dest_add = dest_stride - (4 * src_width);
-
-  for (i = 0; i < src_height; i++) {
-    /* (P1 * (256 - A) + (P2 * A)) / 256
-     * => (P1 * 256 - P1 * A + P2 * A) / 256
-     * => (P1 * 256 + A * (P2 - P1) / 256
-     * => P1 + (A * (P2 - P1)) / 256
-     */
-    /* *INDENT-OFF* */
-    __asm__ __volatile__ (
-        " pcmpeqd %%mm5 , %%mm5 \n\t" /* mm5 = 0xffff... */
-#if A_OFF == 0
-        " psrld $24 , %%mm5 \n\t" /* mm5 = 00 00 00 ff 00 00 00 ff, selector for alpha */
-#else
-        " pslld $24 , %%mm5 \n\t" /* mm5 = ff 00 00 00 ff 00 00 00, selector for alpha */
-#endif
-        " mov %4 , %%eax \n\t" /* eax = s_alpha */
-        " movd %%eax , %%mm6 \n\t" /* mm6 = s_alpha */
-        " punpckldq %%mm6 , %%mm6 \n\t" /* mm6 = 00 00 00 aa 00 00 00 aa, alpha scale factor */
-
-        " movl %5 , %%ecx \n\t" /* ecx = src_width */
-        " test $1 , %%ecx \n\t" /* check odd pixel */
-        " je 1f \n\t"
-
-        /* do odd pixel */
-        " movd (%2) , %%mm2 \n\t" /* mm2 = src, 00 00 00 00 sv su sy sa */
-        " movd (%3) , %%mm1 \n\t" /* mm1 = dest, 00 00 00 00 dv du dy da */
-        " movq %%mm2 , %%mm0 \n\t"
-        " punpcklbw %%mm7 , %%mm2 \n\t" /* mm2 = 00 sv 00 su 00 sy 00 sa */
-        " pand %%mm5 , %%mm0 \n\t" /* mm0 = 00 00 00 00 00 00 00 sa, get alpha component */
-#if A_OFF != 0
-        " psrld $24 , %%mm0 \n\t"
-#endif
-        " punpcklbw %%mm7 , %%mm1 \n\t" /* mm1 = 00 dv 00 du 00 dy 00 da */
-        " pmullw %%mm6 , %%mm0 \n\t" /* mult with scale */
-        " psubw %%mm1 , %%mm2 \n\t" /* mm2 = mm2 - mm1 */
-        " punpcklwd %%mm0 , %%mm0 \n\t"
-        " punpckldq %%mm0 , %%mm0 \n\t" /* mm0 == 00 aa 00 aa 00 aa 00 aa */
-        " psrlw $8 , %%mm0 \n\t"
-        " pmullw %%mm0 , %%mm2 \n\t" /* mm2 == a * mm2 */
-        " psllw $8 , %%mm1 \n\t" /* scale up */
-        " paddw %%mm1 , %%mm2 \n\t" /* mm2 == mm2 + mm1 */
-        " psrlw $8 , %%mm2 \n\t" /* scale down */
-        " por %%mm5 , %%mm2 \n\t" /* set alpha to ff */
-        " packuswb %%mm2 , %%mm2 \n\t"
-        " movd %%mm2 , (%3) \n\t" /* dest = mm1 */
-        " add $4 , %1 \n\t"
-        " add $4 , %0 \n\t"
-
-        "1: \n\t"
-        " sar $1 , %%ecx \n\t" /* prepare for 2 pixel per loop */
-        " cmp $0 , %%ecx \n\t"
-        " je 3f \n\t"
-        "2: \n\t"
-
-        /* do even pixels */
-        " movq (%2) , %%mm2 \n\t" /* mm2 = src, sv1 su1 sy1 sa1 sv0 su0 sy0 sa0 */
-        " movq (%3) , %%mm1 \n\t" /* mm1 = dest, dv1 du1 dy1 da1 dv0 du0 dy0 da0 */
-        " movq %%mm2 , %%mm4 \n\t"
-        " movq %%mm1 , %%mm3 \n\t"
-        " movq %%mm2 , %%mm0 \n\t" /* copy for doing the alpha */
-
-        " pxor %%mm7 , %%mm7 \n\t"
-        " punpcklbw %%mm7 , %%mm2 \n\t" /* mm2 = 00 sv0 00 su0 00 sy0 00 sa0 */
-        " punpckhbw %%mm7 , %%mm4 \n\t" /* mm4 = 00 sv1 00 su1 00 sy1 00 sa1 */
-        " punpcklbw %%mm7 , %%mm1 \n\t" /* mm1 = 00 dv0 00 du0 00 dy0 00 da0 */
-        " punpckhbw %%mm7 , %%mm3 \n\t" /* mm2 = 00 dv1 00 du1 00 dy1 00 da1 */
-
-        " pand %%mm5 , %%mm0 \n\t" /* mm0 = 00 00 00 sa1 00 00 00 sa0 */
-#if A_OFF != 0
-        " psrld $24 , %%mm0 \n\t"
-#endif
-        " psubw %%mm1 , %%mm2 \n\t" /* mm2 = mm2 - mm1 */
-        " pmullw %%mm6 , %%mm0 \n\t" /* mult with scale */
-        " psubw %%mm3 , %%mm4 \n\t" /* mm4 = mm4 - mm3 */
-        " psrlw $8 , %%mm0 \n\t" /* scale back */
-        " movq %%mm0 , %%mm7 \n\t" /* save copy */
-        " punpcklwd %%mm0 , %%mm0 \n\t" /* mm0 = 00 00 00 00 00 aa0 00 aa0 */
-        " punpckhwd %%mm7 , %%mm7 \n\t" /* mm7 = 00 00 00 00 00 aa1 00 aa1 */
-        " punpckldq %%mm0 , %%mm0 \n\t" /* mm0 = 00 aa0 00 aa0 00 aa0 00 aa0 */
-        " punpckldq %%mm7 , %%mm7 \n\t" /* mm7 = 00 aa1 00 aa1 00 aa1 00 aa1 */
-
-        " pmullw %%mm0 , %%mm2 \n\t" /* mm2 == aa * mm2 */
-        " pmullw %%mm7 , %%mm4 \n\t" /* mm2 == aa * mm2 */
-        " psllw $8 , %%mm1 \n\t"
-        " psllw $8 , %%mm3 \n\t"
-        " paddw %%mm1 , %%mm2 \n\t" /* mm2 == mm2 + mm1 */
-        " paddw %%mm3 , %%mm4 \n\t" /* mm2 == mm2 + mm1 */
-
-        " psrlw $8 , %%mm2 \n\t"
-        " psrlw $8 , %%mm4 \n\t"
-        " packuswb %%mm4 , %%mm2 \n\t"
-        " por %%mm5 , %%mm2 \n\t" /* set alpha to ff */
-        " movq %%mm2 , (%3) \n\t"
-
-        " add $8 , %1 \n\t"
-        " add $8 , %0 \n\t"
-        " dec %%ecx \n\t"
-        " jne 2b \n\t"
-
-        "3: \n\t"
-        :"=r" (src), "=r" (dest)
-        :"0" (src), "1" (dest), "m" (s_alpha), "m" (src_width)
-        :"%eax", "%ecx", "memory",
-         "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
-#ifdef __MMX__
-         , "mm0", "mm1", "mm2", "mm4", "mm3", "mm5", "mm6", "mm7"
-#endif
-    );
-    /* *INDENT-ON* */
-    src += src_add;
-    dest += dest_add;
-  }
-  __asm__ __volatile__ ("emms");
-}
-#endif
-
diff --git a/gst/videomixer/blendorc.orc b/gst/videomixer/blendorc.orc
index 1114aae042..bb4601c166 100644
--- a/gst/videomixer/blendorc.orc
+++ b/gst/videomixer/blendorc.orc
@@ -29,7 +29,7 @@ shruw t2, t2, c1
 convsuswb d1, t2
 
 
-.function gst_videomixer_orc_blend_ayuv
+.function orc_blend_argb
 .flags 2d
 .dest 4 d guint8
 .source 4 s guint8
@@ -41,8 +41,7 @@ convsuswb d1, t2
 .temp 8 d_wide
 .temp 8 s_wide
 .temp 8 a_wide
-.const 4 c_alpha 0xffffff00
-
+.const 4 a_alpha 0x000000ff
 
 loadl t, s
 convlw tw, t
@@ -51,15 +50,48 @@ splatbl a, tb
 x4 convubw a_wide, a
 x4 mullw a_wide, a_wide, alpha
 x4 shruw a_wide, a_wide, 8
-andl t, t, c_alpha
 x4 convubw s_wide, t
-andl t, d, c_alpha
+loadl t, d
 x4 convubw d_wide, t
 x4 subw s_wide, s_wide, d_wide
 x4 mullw s_wide, s_wide, a_wide
 x4 div255w s_wide, s_wide
 x4 addw d_wide, d_wide, s_wide
-x4 convwb d, d_wide
+x4 convwb t, d_wide
+orl t, t, a_alpha
+storel d, t
 
+.function orc_blend_bgra
+.flags 2d
+.dest 4 d guint8
+.source 4 s guint8
+.param 2 alpha
+.temp 4 t
+.temp 4 t2
+.temp 2 tw
+.temp 1 tb
+.temp 4 a
+.temp 8 d_wide
+.temp 8 s_wide
+.temp 8 a_wide
+.const 4 a_alpha 0xff000000
 
+loadl t, s
+shrul t2, t, 24
+convlw tw, t2
+convwb tb, tw
+splatbl a, tb
+x4 convubw a_wide, a
+x4 mullw a_wide, a_wide, alpha
+x4 shruw a_wide, a_wide, 8
+x4 convubw s_wide, t
+loadl t, d
+x4 convubw d_wide, t
+x4 subw s_wide, s_wide, d_wide
+x4 mullw s_wide, s_wide, a_wide
+x4 div255w s_wide, s_wide
+x4 addw d_wide, d_wide, s_wide
+x4 convwb t, d_wide
+orl t, t, a_alpha
+storel d, t
 
diff --git a/gst/videomixer/videomixer.c b/gst/videomixer/videomixer.c
index 497c15119e..c96aec6bb4 100644
--- a/gst/videomixer/videomixer.c
+++ b/gst/videomixer/videomixer.c
@@ -72,12 +72,6 @@
 #include "config.h"
 #endif
 
-#ifdef HAVE_GCC_ASM
-#if defined(HAVE_CPU_I386) || defined(HAVE_CPU_X86_64)
-#define BUILD_X86_ASM
-#endif
-#endif
-
 #include
 #include
 #include