videomixer: Optimize ARGB blending and implement BGRA blending with orc

This means that videomixer no longer contains any handwritten assembly,
and it is also faster now when using SSE.
This commit is contained in:
Sebastian Dröge 2010-08-23 15:44:50 +02:00
parent 7cfa519547
commit 6da14d0c41
6 changed files with 59 additions and 218 deletions

View file

@ -208,7 +208,7 @@ dnl GLib is required
AG_GST_GLIB_CHECK([2.18]) AG_GST_GLIB_CHECK([2.18])
dnl Orc dnl Orc
ORC_CHECK([0.4.5]) ORC_CHECK([0.4.7])
dnl checks for gstreamer dnl checks for gstreamer

View file

@ -13,4 +13,4 @@ libgstvideomixer_la_LDFLAGS = $(GST_PLUGIN_LDFLAGS)
libgstvideomixer_la_LIBTOOLFLAGS = --tag=disable-static libgstvideomixer_la_LIBTOOLFLAGS = --tag=disable-static
# headers we need but don't want installed # headers we need but don't want installed
noinst_HEADERS = videomixer.h videomixerpad.h blend.h blend_mmx.h noinst_HEADERS = videomixer.h videomixerpad.h blend.h

View file

@ -38,12 +38,6 @@
#define BLEND(D,S,alpha) (((D) * (256 - (alpha)) + (S) * (alpha)) >> 8) #define BLEND(D,S,alpha) (((D) * (256 - (alpha)) + (S) * (alpha)) >> 8)
#ifdef HAVE_GCC_ASM
#if defined(HAVE_ORC) && (defined(HAVE_CPU_I386) || defined(HAVE_CPU_X86_64))
#define BUILD_X86_ASM
#endif
#endif
GST_DEBUG_CATEGORY_STATIC (gst_videomixer_blend_debug); GST_DEBUG_CATEGORY_STATIC (gst_videomixer_blend_debug);
#define GST_CAT_DEFAULT gst_videomixer_blend_debug #define GST_CAT_DEFAULT gst_videomixer_blend_debug
@ -92,34 +86,26 @@ blend_##name (const guint8 * src, gint xpos, gint ypos, \
LOOP (dest, src, src_height, src_width, src_stride, dest_stride, s_alpha); \ LOOP (dest, src, src_height, src_width, src_stride, dest_stride, s_alpha); \
} }
#define BLEND_A32_LOOP_C(name, A, C1, C2, C3) \ #define BLEND_A32_LOOP(name) \
static inline void \ static inline void \
_blend_loop_##name##_c (guint8 *dest, const guint8 *src, gint src_height, gint src_width, gint src_stride, gint dest_stride, guint s_alpha) { \ _blend_loop_##name (guint8 * dest, const guint8 * src, gint src_height, \
gint i, j; \ gint src_width, gint src_stride, gint dest_stride, guint s_alpha) \
gint alpha; \ { \
gint src_add = src_stride - (4 * src_width); \ s_alpha = MIN (255, s_alpha); \
gint dest_add = dest_stride - (4 * src_width); \ orc_blend_##name (dest, dest_stride, src, src_stride, \
\ s_alpha, src_width, src_height); \
for (i = 0; i < src_height; i++) { \
for (j = 0; j < src_width; j++) { \
alpha = (src[A] * s_alpha) >> 8; \
dest[A] = 0xff; \
dest[C1] = BLEND(dest[C1], src[C1], alpha); \
dest[C2] = BLEND(dest[C2], src[C2], alpha); \
dest[C3] = BLEND(dest[C3], src[C3], alpha); \
\
src += 4; \
dest += 4; \
} \
src += src_add; \
dest += dest_add; \
} \
} }
BLEND_A32_LOOP_C (argb, 0, 1, 2, 3); BLEND_A32_LOOP (argb);
BLEND_A32_LOOP_C (bgra, 3, 2, 1, 0); BLEND_A32_LOOP (bgra);
BLEND_A32 (argb_c, _blend_loop_argb_c);
BLEND_A32 (bgra_c, _blend_loop_bgra_c); #if G_BYTE_ORDER == LITTLE_ENDIAN
BLEND_A32 (argb, _blend_loop_argb);
BLEND_A32 (bgra, _blend_loop_bgra);
#else
BLEND_A32 (argb, _blend_loop_bgra);
BLEND_A32 (bgra, _blend_loop_argb);
#endif
#define A32_CHECKER_C(name, RGB, A, C1, C2, C3) \ #define A32_CHECKER_C(name, RGB, A, C1, C2, C3) \
static void \ static void \
@ -680,39 +666,6 @@ PACKED_422_FILL_COLOR (yuy2, 24, 16, 8, 0);
PACKED_422_FILL_COLOR (yvyu, 24, 0, 8, 16); PACKED_422_FILL_COLOR (yvyu, 24, 0, 8, 16);
PACKED_422_FILL_COLOR (uyvy, 16, 24, 0, 8); PACKED_422_FILL_COLOR (uyvy, 16, 24, 0, 8);
/* MMX Implementations */
#ifdef BUILD_X86_ASM
#define A32
#define NAME_BLEND _blend_loop_argb_mmx
#define A_OFF 0
#include "blend_mmx.h"
#undef NAME_BLEND
#undef A_OFF
#define NAME_BLEND _blend_loop_bgra_mmx
#define A_OFF 24
#include "blend_mmx.h"
#undef NAME_BLEND
#undef A_OFF
#undef A32
BLEND_A32 (argb_mmx, _blend_loop_argb_mmx);
BLEND_A32 (bgra_mmx, _blend_loop_bgra_mmx);
#endif
static void
_blend_loop_argb_orc (guint8 * dest, const guint8 * src, gint src_height,
gint src_width, gint src_stride, gint dest_stride, guint s_alpha)
{
s_alpha = MIN (255, s_alpha);
gst_videomixer_orc_blend_ayuv (dest, dest_stride, src, src_stride,
s_alpha, src_width, src_height);
}
BLEND_A32 (argb_orc, _blend_loop_argb_orc);
/* Init function */ /* Init function */
BlendFunction gst_video_mixer_blend_argb; BlendFunction gst_video_mixer_blend_argb;
BlendFunction gst_video_mixer_blend_bgra; BlendFunction gst_video_mixer_blend_bgra;
@ -769,18 +722,13 @@ FillColorFunction gst_video_mixer_fill_color_uyvy;
void void
gst_video_mixer_init_blend (void) gst_video_mixer_init_blend (void)
{ {
#ifdef BUILD_X86_ASM
guint cpu_flags;
orc_init (); orc_init ();
cpu_flags = orc_target_get_default_flags (orc_target_get_by_name ("mmx"));
#endif
GST_DEBUG_CATEGORY_INIT (gst_videomixer_blend_debug, "videomixer_blend", 0, GST_DEBUG_CATEGORY_INIT (gst_videomixer_blend_debug, "videomixer_blend", 0,
"video mixer blending functions"); "video mixer blending functions");
gst_video_mixer_blend_argb = blend_argb_c; gst_video_mixer_blend_argb = blend_argb;
gst_video_mixer_blend_bgra = blend_bgra_c; gst_video_mixer_blend_bgra = blend_bgra;
gst_video_mixer_blend_i420 = blend_i420; gst_video_mixer_blend_i420 = blend_i420;
gst_video_mixer_blend_y444 = blend_y444; gst_video_mixer_blend_y444 = blend_y444;
gst_video_mixer_blend_y42b = blend_y42b; gst_video_mixer_blend_y42b = blend_y42b;
@ -820,13 +768,4 @@ gst_video_mixer_init_blend (void)
gst_video_mixer_fill_color_yuy2 = fill_color_yuy2; gst_video_mixer_fill_color_yuy2 = fill_color_yuy2;
gst_video_mixer_fill_color_yvyu = fill_color_yvyu; gst_video_mixer_fill_color_yvyu = fill_color_yvyu;
gst_video_mixer_fill_color_uyvy = fill_color_uyvy; gst_video_mixer_fill_color_uyvy = fill_color_uyvy;
#ifdef BUILD_X86_ASM
if (cpu_flags & ORC_TARGET_MMX_MMX) {
gst_video_mixer_blend_argb = blend_argb_mmx;
gst_video_mixer_blend_bgra = blend_bgra_mmx;
}
#endif
gst_video_mixer_blend_argb = blend_argb_orc;
} }

View file

@ -1,124 +0,0 @@
#ifdef A32
/*
 * NAME_BLEND:
 * @dest: destination pixels, blended in place
 * @src: source pixels
 * @src_height: number of rows to blend
 * @src_width: number of 4-byte pixels per row
 * @src_stride: distance in bytes between the starts of two source rows
 * @dest_stride: distance in bytes between the starts of two dest rows
 * @s_alpha: global alpha; each source alpha is scaled by it and shifted
 *           right by 8
 *
 * MMX blend of packed 32-bit pixels. The including file must define
 * NAME_BLEND (the function name) and A_OFF (bit offset of the alpha byte
 * inside the 32-bit pixel as loaded by movd/movq: 0 or 24). The alpha byte
 * of every written destination pixel is forced to 0xff.
 *
 * Each row is processed as an optional leading single pixel (when
 * src_width is odd) followed by a loop handling two pixels per iteration.
 */
static inline void
NAME_BLEND (guint8 * dest, const guint8 * src, gint src_height, gint src_width,
    gint src_stride, gint dest_stride, guint s_alpha)
{
  gint i;
  gint src_add = src_stride - (4 * src_width);
  gint dest_add = dest_stride - (4 * src_width);

  for (i = 0; i < src_height; i++) {
    /* (P1 * (256 - A) + (P2 * A)) / 256
     * => (P1 * 256 - P1 * A + P2 * A) / 256
     * => (P1 * 256 + A * (P2 - P1)) / 256
     * => P1 + (A * (P2 - P1)) / 256
     */
    /* *INDENT-OFF* */
    __asm__ __volatile__ (
        " pcmpeqd %%mm5 , %%mm5 \n\t" /* mm5 = 0xffff... */
#if A_OFF == 0
        " psrld $24 , %%mm5 \n\t" /* mm5 = 00 00 00 ff 00 00 00 ff, selector for alpha */
#else
        " pslld $24 , %%mm5 \n\t" /* mm5 = ff 00 00 00 ff 00 00 00, selector for alpha */
#endif
        " mov %4 , %%eax \n\t" /* eax = s_alpha */
        " movd %%eax , %%mm6 \n\t" /* mm6 = s_alpha */
        " punpckldq %%mm6 , %%mm6 \n\t" /* mm6 = 00 00 00 aa 00 00 00 aa, alpha scale factor */

        " movl %5 , %%ecx \n\t" /* ecx = src_width */
        " test $1 , %%ecx \n\t" /* check odd pixel */
        " je 1f \n\t"

        /* do odd pixel */
        /* mm7 is the zero register for the unpacks below. It is undefined on
         * the first row and holds alpha words left over from the two-pixel
         * loop on later rows, so it has to be cleared here. */
        " pxor %%mm7 , %%mm7 \n\t" /* mm7 = 0 */
        " movd (%2) , %%mm2 \n\t" /* mm2 = src, 00 00 00 00 sv su sy sa */
        " movd (%3) , %%mm1 \n\t" /* mm1 = dest, 00 00 00 00 dv du dy da */
        " movq %%mm2 , %%mm0 \n\t"
        " punpcklbw %%mm7 , %%mm2 \n\t" /* mm2 = 00 sv 00 su 00 sy 00 sa */
        " pand %%mm5 , %%mm0 \n\t" /* mm0 = 00 00 00 00 00 00 00 sa, get alpha component */
#if A_OFF != 0
        " psrld $24 , %%mm0 \n\t" /* move alpha down to the low byte */
#endif
        " punpcklbw %%mm7 , %%mm1 \n\t" /* mm1 = 00 dv 00 du 00 dy 00 da */
        " pmullw %%mm6 , %%mm0 \n\t" /* mult with scale */
        " psubw %%mm1 , %%mm2 \n\t" /* mm2 = mm2 - mm1 */
        " punpcklwd %%mm0 , %%mm0 \n\t"
        " punpckldq %%mm0 , %%mm0 \n\t" /* mm0 == 00 aa 00 aa 00 aa 00 aa */
        " psrlw $8 , %%mm0 \n\t"
        " pmullw %%mm0 , %%mm2 \n\t" /* mm2 == a * mm2 */
        " psllw $8 , %%mm1 \n\t" /* scale up */
        " paddw %%mm1 , %%mm2 \n\t" /* mm2 == mm2 + mm1 */
        " psrlw $8 , %%mm2 \n\t" /* scale down */
        /* Pack back to bytes before applying the alpha selector: mm5 is a
         * per-32-bit-pixel byte mask and would hit the wrong lanes if it
         * were ORed into the 16-bit-per-component layout. This matches the
         * order used by the two-pixel loop below. */
        " packuswb %%mm2 , %%mm2 \n\t"
        " por %%mm5 , %%mm2 \n\t" /* set alpha to ff */
        " movd %%mm2 , (%3) \n\t" /* dest = mm2 */
        " add $4 , %1 \n\t"
        " add $4 , %0 \n\t"

        "1: \n\t"
        " sar $1 , %%ecx \n\t" /* prepare for 2 pixel per loop */
        " cmp $0 , %%ecx \n\t"
        " je 3f \n\t"
        "2: \n\t"

        /* do even pixels */
        " movq (%2) , %%mm2 \n\t" /* mm2 = src, sv1 su1 sy1 sa1 sv0 su0 sy0 sa0 */
        " movq (%3) , %%mm1 \n\t" /* mm1 = dest, dv1 du1 dy1 da1 dv0 du0 dy0 da0 */
        " movq %%mm2 , %%mm4 \n\t"
        " movq %%mm1 , %%mm3 \n\t"
        " movq %%mm2 , %%mm0 \n\t" /* copy for doing the alpha */

        " pxor %%mm7 , %%mm7 \n\t" /* mm7 = 0, reused below as scratch */
        " punpcklbw %%mm7 , %%mm2 \n\t" /* mm2 = 00 sv0 00 su0 00 sy0 00 sa0 */
        " punpckhbw %%mm7 , %%mm4 \n\t" /* mm4 = 00 sv1 00 su1 00 sy1 00 sa1 */
        " punpcklbw %%mm7 , %%mm1 \n\t" /* mm1 = 00 dv0 00 du0 00 dy0 00 da0 */
        " punpckhbw %%mm7 , %%mm3 \n\t" /* mm3 = 00 dv1 00 du1 00 dy1 00 da1 */

        " pand %%mm5 , %%mm0 \n\t" /* mm0 = 00 00 00 sa1 00 00 00 sa0 */
#if A_OFF != 0
        " psrld $24 , %%mm0 \n\t" /* move alpha down to the low byte */
#endif
        " psubw %%mm1 , %%mm2 \n\t" /* mm2 = mm2 - mm1 */
        " pmullw %%mm6 , %%mm0 \n\t" /* mult with scale */
        " psubw %%mm3 , %%mm4 \n\t" /* mm4 = mm4 - mm3 */
        " psrlw $8 , %%mm0 \n\t" /* scale back */
        " movq %%mm0 , %%mm7 \n\t" /* save copy */
        " punpcklwd %%mm0 , %%mm0 \n\t" /* mm0 = 00 00 00 00 00 aa0 00 aa0 */
        " punpckhwd %%mm7 , %%mm7 \n\t" /* mm7 = 00 00 00 00 00 aa1 00 aa1 */
        " punpckldq %%mm0 , %%mm0 \n\t" /* mm0 = 00 aa0 00 aa0 00 aa0 00 aa0 */
        " punpckldq %%mm7 , %%mm7 \n\t" /* mm7 = 00 aa1 00 aa1 00 aa1 00 aa1 */

        " pmullw %%mm0 , %%mm2 \n\t" /* mm2 == aa0 * mm2 */
        " pmullw %%mm7 , %%mm4 \n\t" /* mm4 == aa1 * mm4 */
        " psllw $8 , %%mm1 \n\t" /* scale up */
        " psllw $8 , %%mm3 \n\t" /* scale up */
        " paddw %%mm1 , %%mm2 \n\t" /* mm2 == mm2 + mm1 */
        " paddw %%mm3 , %%mm4 \n\t" /* mm4 == mm4 + mm3 */

        " psrlw $8 , %%mm2 \n\t" /* scale down */
        " psrlw $8 , %%mm4 \n\t" /* scale down */
        " packuswb %%mm4 , %%mm2 \n\t"
        " por %%mm5 , %%mm2 \n\t" /* set alpha to ff */
        " movq %%mm2 , (%3) \n\t" /* dest = mm2 */

        " add $8 , %1 \n\t"
        " add $8 , %0 \n\t"
        " dec %%ecx \n\t"
        " jne 2b \n\t"

        "3: \n\t"
        /* %0/%2 and %1/%3 are tied by the matching constraints, so the
         * pointer increments above are visible in src and dest afterwards. */
        :"=r" (src), "=r" (dest)
        :"0" (src), "1" (dest), "m" (s_alpha), "m" (src_width)
        :"%eax", "%ecx", "memory",
        "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
#ifdef __MMX__
        , "mm0", "mm1", "mm2", "mm4", "mm3", "mm5", "mm6", "mm7"
#endif
    );
    /* *INDENT-ON* */

    /* skip the padding between the end of this row and the next one */
    src += src_add;
    dest += dest_add;
  }
  /* leave MMX state so that x87 floating point code works again */
  __asm__ __volatile__ ("emms");
}
#endif

View file

@ -29,7 +29,7 @@ shruw t2, t2, c1
convsuswb d1, t2 convsuswb d1, t2
.function gst_videomixer_orc_blend_ayuv .function orc_blend_argb
.flags 2d .flags 2d
.dest 4 d guint8 .dest 4 d guint8
.source 4 s guint8 .source 4 s guint8
@ -41,8 +41,7 @@ convsuswb d1, t2
.temp 8 d_wide .temp 8 d_wide
.temp 8 s_wide .temp 8 s_wide
.temp 8 a_wide .temp 8 a_wide
.const 4 c_alpha 0xffffff00 .const 4 a_alpha 0x000000ff
loadl t, s loadl t, s
convlw tw, t convlw tw, t
@ -51,15 +50,48 @@ splatbl a, tb
x4 convubw a_wide, a x4 convubw a_wide, a
x4 mullw a_wide, a_wide, alpha x4 mullw a_wide, a_wide, alpha
x4 shruw a_wide, a_wide, 8 x4 shruw a_wide, a_wide, 8
andl t, t, c_alpha
x4 convubw s_wide, t x4 convubw s_wide, t
andl t, d, c_alpha loadl t, d
x4 convubw d_wide, t x4 convubw d_wide, t
x4 subw s_wide, s_wide, d_wide x4 subw s_wide, s_wide, d_wide
x4 mullw s_wide, s_wide, a_wide x4 mullw s_wide, s_wide, a_wide
x4 div255w s_wide, s_wide x4 div255w s_wide, s_wide
x4 addw d_wide, d_wide, s_wide x4 addw d_wide, d_wide, s_wide
x4 convwb d, d_wide x4 convwb t, d_wide
orl t, t, a_alpha
storel d, t
# orc_blend_bgra: blend a 2-D array of 4-byte source pixels onto the
# destination in place, for pixels whose alpha sits in the top byte of the
# 32-bit value (see the shift by 24 and the 0xff000000 constant below).
# "alpha" is the global alpha parameter applied on top of the per-pixel alpha.
.function orc_blend_bgra
.flags 2d
.dest 4 d guint8
.source 4 s guint8
.param 2 alpha
.temp 4 t
.temp 4 t2
.temp 2 tw
.temp 1 tb
.temp 4 a
.temp 8 d_wide
.temp 8 s_wide
.temp 8 a_wide
# mask ORed into every result pixel to force its alpha byte to 0xff
.const 4 a_alpha 0xff000000
# fetch the source pixel and isolate its alpha byte (bits 24..31)
loadl t, s
shrul t2, t, 24
convlw tw, t2
convwb tb, tw
# replicate the alpha byte into all four lanes and widen to 16 bit
splatbl a, tb
x4 convubw a_wide, a
# scale the per-pixel alpha by the global alpha: (a * alpha) >> 8
x4 mullw a_wide, a_wide, alpha
x4 shruw a_wide, a_wide, 8
# widen the source and destination components to 16 bit
x4 convubw s_wide, t
loadl t, d
x4 convubw d_wide, t
# d = d + ((s - d) * a) / 255, per component
x4 subw s_wide, s_wide, d_wide
x4 mullw s_wide, s_wide, a_wide
x4 div255w s_wide, s_wide
x4 addw d_wide, d_wide, s_wide
# narrow back to bytes, force the alpha byte to 0xff and write the pixel
x4 convwb t, d_wide
orl t, t, a_alpha
storel d, t

View file

@ -72,12 +72,6 @@
#include "config.h" #include "config.h"
#endif #endif
#ifdef HAVE_GCC_ASM
#if defined(HAVE_CPU_I386) || defined(HAVE_CPU_X86_64)
#define BUILD_X86_ASM
#endif
#endif
#include <gst/gst.h> #include <gst/gst.h>
#include <gst/base/gstcollectpads.h> #include <gst/base/gstcollectpads.h>
#include <gst/controller/gstcontroller.h> #include <gst/controller/gstcontroller.h>