From 6da14d0c4187f94b2742b228da7f1370dcb92853 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastian=20Dr=C3=B6ge?= <sebastian.droege@collabora.co.uk>
Date: Mon, 23 Aug 2010 15:44:50 +0200
Subject: [PATCH] videomixer: Optimize ARGB blending and implement BGRA
 blending with orc

This means that we no longer have any handwritten assembly in
videomixer, and it is also faster now when using SSE.
---
 configure.ac                |   2 +-
 gst/videomixer/Makefile.am  |   2 +-
 gst/videomixer/blend.c      |  99 ++++++----------------------
 gst/videomixer/blend_mmx.h  | 124 ------------------------------------
 gst/videomixer/blendorc.orc |  44 +++++++++++--
 gst/videomixer/videomixer.c |   6 --
 6 files changed, 59 insertions(+), 218 deletions(-)
 delete mode 100644 gst/videomixer/blend_mmx.h

diff --git a/configure.ac b/configure.ac
index d338662ea1..b262b20476 100644
--- a/configure.ac
+++ b/configure.ac
@@ -208,7 +208,7 @@ dnl GLib is required
 AG_GST_GLIB_CHECK([2.18])
 
 dnl Orc
-ORC_CHECK([0.4.5])
+ORC_CHECK([0.4.7])
 
 
 dnl checks for gstreamer
diff --git a/gst/videomixer/Makefile.am b/gst/videomixer/Makefile.am
index fee1c06a56..193d8a7457 100644
--- a/gst/videomixer/Makefile.am
+++ b/gst/videomixer/Makefile.am
@@ -13,4 +13,4 @@ libgstvideomixer_la_LDFLAGS = $(GST_PLUGIN_LDFLAGS)
 libgstvideomixer_la_LIBTOOLFLAGS = --tag=disable-static
 
 # headers we need but don't want installed
-noinst_HEADERS = videomixer.h videomixerpad.h blend.h blend_mmx.h
+noinst_HEADERS = videomixer.h videomixerpad.h blend.h
diff --git a/gst/videomixer/blend.c b/gst/videomixer/blend.c
index 13c775484b..bb55a85bd6 100644
--- a/gst/videomixer/blend.c
+++ b/gst/videomixer/blend.c
@@ -38,12 +38,6 @@
 
 #define BLEND(D,S,alpha) (((D) * (256 - (alpha)) + (S) * (alpha)) >> 8)
 
-#ifdef HAVE_GCC_ASM
-#if defined(HAVE_ORC) && (defined(HAVE_CPU_I386) || defined(HAVE_CPU_X86_64))
-#define BUILD_X86_ASM
-#endif
-#endif
-
 GST_DEBUG_CATEGORY_STATIC (gst_videomixer_blend_debug);
 #define GST_CAT_DEFAULT gst_videomixer_blend_debug
 
@@ -92,34 +86,26 @@ blend_##name (const guint8 * src, gint xpos, gint ypos, \
   LOOP (dest, src, src_height, src_width, src_stride, dest_stride, s_alpha); \
 }
 
-#define BLEND_A32_LOOP_C(name, A, C1, C2, C3) \
+#define BLEND_A32_LOOP(name) \
 static inline void \
-_blend_loop_##name##_c (guint8 *dest, const guint8 *src, gint src_height, gint src_width, gint src_stride, gint dest_stride, guint s_alpha) { \
-  gint i, j; \
-  gint alpha; \
-  gint src_add = src_stride - (4 * src_width); \
-  gint dest_add = dest_stride - (4 * src_width); \
-  \
-  for (i = 0; i < src_height; i++) { \
-    for (j = 0; j < src_width; j++) { \
-      alpha = (src[A] * s_alpha) >> 8; \
-      dest[A] = 0xff; \
-      dest[C1] = BLEND(dest[C1], src[C1], alpha); \
-      dest[C2] = BLEND(dest[C2], src[C2], alpha); \
-      dest[C3] = BLEND(dest[C3], src[C3], alpha); \
-      \
-      src += 4; \
-      dest += 4; \
-    } \
-    src += src_add; \
-    dest += dest_add; \
-  } \
+_blend_loop_##name (guint8 * dest, const guint8 * src, gint src_height, \
+    gint src_width, gint src_stride, gint dest_stride, guint s_alpha) \
+{ \
+  s_alpha = MIN (255, s_alpha); \
+  orc_blend_##name (dest, dest_stride, src, src_stride, \
+      s_alpha, src_width, src_height); \
 }
 
-BLEND_A32_LOOP_C (argb, 0, 1, 2, 3);
-BLEND_A32_LOOP_C (bgra, 3, 2, 1, 0);
-BLEND_A32 (argb_c, _blend_loop_argb_c);
-BLEND_A32 (bgra_c, _blend_loop_bgra_c);
+BLEND_A32_LOOP (argb);
+BLEND_A32_LOOP (bgra);
+
+#if G_BYTE_ORDER == G_LITTLE_ENDIAN
+BLEND_A32 (argb, _blend_loop_argb);
+BLEND_A32 (bgra, _blend_loop_bgra);
+#else /* orc kernels mask alpha as a 32-bit word, so the loops trade places */
+BLEND_A32 (argb, _blend_loop_bgra);
+BLEND_A32 (bgra, _blend_loop_argb);
+#endif
 
 #define A32_CHECKER_C(name, RGB, A, C1, C2, C3) \
 static void \
@@ -680,39 +666,6 @@ PACKED_422_FILL_COLOR (yuy2, 24, 16, 8, 0);
 PACKED_422_FILL_COLOR (yvyu, 24, 0, 8, 16);
 PACKED_422_FILL_COLOR (uyvy, 16, 24, 0, 8);
 
-/* MMX Implementations */
-#ifdef BUILD_X86_ASM
-
-#define A32
-#define NAME_BLEND _blend_loop_argb_mmx
-#define A_OFF 0
-#include "blend_mmx.h"
-#undef NAME_BLEND
-#undef A_OFF
-
-#define NAME_BLEND _blend_loop_bgra_mmx
-#define A_OFF 24
-#include "blend_mmx.h"
-#undef NAME_BLEND
-#undef A_OFF
-#undef A32
-
-BLEND_A32 (argb_mmx, _blend_loop_argb_mmx);
-BLEND_A32 (bgra_mmx, _blend_loop_bgra_mmx);
-#endif
-
-static void
-_blend_loop_argb_orc (guint8 * dest, const guint8 * src, gint src_height,
-    gint src_width, gint src_stride, gint dest_stride, guint s_alpha)
-{
-  s_alpha = MIN (255, s_alpha);
-  gst_videomixer_orc_blend_ayuv (dest, dest_stride, src, src_stride,
-      s_alpha, src_width, src_height);
-}
-
-BLEND_A32 (argb_orc, _blend_loop_argb_orc);
-
-
 /* Init function */
 BlendFunction gst_video_mixer_blend_argb;
 BlendFunction gst_video_mixer_blend_bgra;
@@ -769,18 +722,13 @@ FillColorFunction gst_video_mixer_fill_color_uyvy;
 void
 gst_video_mixer_init_blend (void)
 {
-#ifdef BUILD_X86_ASM
-  guint cpu_flags;
-
   orc_init ();
-  cpu_flags = orc_target_get_default_flags (orc_target_get_by_name ("mmx"));
-#endif
 
   GST_DEBUG_CATEGORY_INIT (gst_videomixer_blend_debug, "videomixer_blend", 0,
       "video mixer blending functions");
 
-  gst_video_mixer_blend_argb = blend_argb_c;
-  gst_video_mixer_blend_bgra = blend_bgra_c;
+  gst_video_mixer_blend_argb = blend_argb;
+  gst_video_mixer_blend_bgra = blend_bgra;
   gst_video_mixer_blend_i420 = blend_i420;
   gst_video_mixer_blend_y444 = blend_y444;
   gst_video_mixer_blend_y42b = blend_y42b;
@@ -820,13 +768,4 @@ gst_video_mixer_init_blend (void)
   gst_video_mixer_fill_color_yuy2 = fill_color_yuy2;
   gst_video_mixer_fill_color_yvyu = fill_color_yvyu;
   gst_video_mixer_fill_color_uyvy = fill_color_uyvy;
-
-#ifdef BUILD_X86_ASM
-  if (cpu_flags & ORC_TARGET_MMX_MMX) {
-    gst_video_mixer_blend_argb = blend_argb_mmx;
-    gst_video_mixer_blend_bgra = blend_bgra_mmx;
-  }
-#endif
-
-  gst_video_mixer_blend_argb = blend_argb_orc;
 }
diff --git a/gst/videomixer/blend_mmx.h b/gst/videomixer/blend_mmx.h
deleted file mode 100644
index 9c0f250aa6..0000000000
--- a/gst/videomixer/blend_mmx.h
+++ /dev/null
@@ -1,124 +0,0 @@
-#ifdef A32
-static inline void
-NAME_BLEND (guint8 * dest, const guint8 * src, gint src_height, gint src_width,
-    gint src_stride, gint dest_stride, guint s_alpha)
-{
-  gint i;
-  gint src_add = src_stride - (4 * src_width);
-  gint dest_add = dest_stride - (4 * src_width);
-
-  for (i = 0; i < src_height; i++) {
-    /*      (P1 * (256 - A) + (P2 * A)) / 256
-     * =>   (P1 * 256 - P1 * A + P2 * A) / 256
-     * =>   (P1 * 256 + A * (P2 - P1) / 256
-     * =>   P1 + (A * (P2 - P1)) / 256
-     */
-    /* *INDENT-OFF* */
-    __asm__ __volatile__ (
-        " pcmpeqd    %%mm5 ,   %%mm5   \n\t"   /* mm5 = 0xffff... */
-#if A_OFF == 0
-        " psrld        $24 ,   %%mm5   \n\t"   /* mm5 = 00 00 00 ff 00 00 00 ff, selector for alpha */
-#else
-        " pslld        $24 ,   %%mm5   \n\t"   /* mm5 = ff 00 00 00 ff 00 00 00, selector for alpha */
-#endif
-        " mov           %4 ,   %%eax   \n\t"   /* eax = s_alpha */
-        " movd       %%eax ,   %%mm6   \n\t"   /* mm6 = s_alpha */
-        " punpckldq  %%mm6 ,   %%mm6   \n\t"   /* mm6 = 00 00 00 aa 00 00 00 aa, alpha scale factor */
-
-        " movl          %5 ,   %%ecx   \n\t"   /* ecx = src_width */
-        " test          $1 ,   %%ecx   \n\t"   /* check odd pixel */
-        " je                      1f   \n\t"
-
-        /* do odd pixel */
-        " movd        (%2) ,   %%mm2   \n\t"   /* mm2 = src,  00 00 00 00 sv su sy sa */
-        " movd        (%3) ,   %%mm1   \n\t"   /* mm1 = dest, 00 00 00 00 dv du dy da */
-        " movq       %%mm2 ,   %%mm0   \n\t"
-        " punpcklbw  %%mm7 ,   %%mm2   \n\t"   /* mm2 = 00 sv 00 su 00 sy 00 sa */
-        " pand       %%mm5 ,   %%mm0   \n\t"   /* mm0 = 00 00 00 00 00 00 00 sa, get alpha component  */
-#if A_OFF != 0
-        " psrld        $24 ,   %%mm0   \n\t"
-#endif
-        " punpcklbw  %%mm7 ,   %%mm1   \n\t"   /* mm1 = 00 dv 00 du 00 dy 00 da */
-        " pmullw     %%mm6 ,   %%mm0   \n\t"   /* mult with scale */
-        " psubw      %%mm1 ,   %%mm2   \n\t"   /* mm2 = mm2 - mm1 */
-        " punpcklwd  %%mm0 ,   %%mm0   \n\t"
-        " punpckldq  %%mm0 ,   %%mm0   \n\t"   /* mm0 == 00 aa 00 aa 00 aa 00 aa */
-        " psrlw         $8 ,   %%mm0   \n\t"
-        " pmullw     %%mm0 ,   %%mm2   \n\t"   /* mm2 == a * mm2 */
-        " psllw         $8 ,   %%mm1   \n\t"   /* scale up */
-        " paddw      %%mm1 ,   %%mm2   \n\t"   /* mm2 == mm2 + mm1 */
-        " psrlw         $8 ,   %%mm2   \n\t"   /* scale down */
-        " por        %%mm5 ,   %%mm2   \n\t"   /* set alpha to ff */
-        " packuswb   %%mm2 ,   %%mm2   \n\t" 
-        " movd       %%mm2 ,    (%3)   \n\t"   /* dest = mm1 */
-        " add           $4 ,     %1    \n\t"
-        " add           $4 ,     %0    \n\t"
-
-        "1:                            \n\t"
-        " sar           $1 ,   %%ecx   \n\t"   /* prepare for 2 pixel per loop */
-        " cmp           $0 ,   %%ecx   \n\t"
-        " je                      3f   \n\t"
-        "2:                            \n\t"
-
-        /* do even pixels */
-        " movq        (%2) ,   %%mm2   \n\t"   /* mm2 = src,  sv1 su1 sy1 sa1  sv0 su0 sy0 sa0 */
-        " movq        (%3) ,   %%mm1   \n\t"   /* mm1 = dest, dv1 du1 dy1 da1  dv0 du0 dy0 da0 */
-        " movq       %%mm2 ,   %%mm4   \n\t"
-        " movq       %%mm1 ,   %%mm3   \n\t"
-        " movq       %%mm2 ,   %%mm0   \n\t"   /* copy for doing the alpha */
-
-        " pxor       %%mm7 ,   %%mm7   \n\t"  
-        " punpcklbw  %%mm7 ,   %%mm2   \n\t"   /* mm2 = 00 sv0  00 su0  00 sy0  00 sa0 */
-        " punpckhbw  %%mm7 ,   %%mm4   \n\t"   /* mm4 = 00 sv1  00 su1  00 sy1  00 sa1 */
-        " punpcklbw  %%mm7 ,   %%mm1   \n\t"   /* mm1 = 00 dv0  00 du0  00 dy0  00 da0 */
-        " punpckhbw  %%mm7 ,   %%mm3   \n\t"   /* mm2 = 00 dv1  00 du1  00 dy1  00 da1 */
-
-        " pand       %%mm5 ,   %%mm0   \n\t"   /* mm0 = 00 00 00 sa1  00 00 00 sa0 */
-#if A_OFF != 0
-        " psrld        $24 ,   %%mm0   \n\t"
-#endif
-        " psubw      %%mm1 ,   %%mm2   \n\t"   /* mm2 = mm2 - mm1 */
-        " pmullw     %%mm6 ,   %%mm0   \n\t"   /* mult with scale */
-        " psubw      %%mm3 ,   %%mm4   \n\t"   /* mm4 = mm4 - mm3 */
-        " psrlw         $8 ,   %%mm0   \n\t"   /* scale back */
-        " movq       %%mm0 ,   %%mm7   \n\t"   /* save copy */
-        " punpcklwd  %%mm0 ,   %%mm0   \n\t"   /* mm0 = 00 00   00 00   00 aa0  00 aa0 */
-        " punpckhwd  %%mm7 ,   %%mm7   \n\t"   /* mm7 = 00 00   00 00   00 aa1  00 aa1 */
-        " punpckldq  %%mm0 ,   %%mm0   \n\t"   /* mm0 = 00 aa0  00 aa0  00 aa0  00 aa0 */
-        " punpckldq  %%mm7 ,   %%mm7   \n\t"   /* mm7 = 00 aa1  00 aa1  00 aa1  00 aa1 */
-
-        " pmullw     %%mm0 ,   %%mm2   \n\t"   /* mm2 == aa * mm2 */
-        " pmullw     %%mm7 ,   %%mm4   \n\t"   /* mm2 == aa * mm2 */
-        " psllw         $8 ,   %%mm1   \n\t"
-        " psllw         $8 ,   %%mm3   \n\t"
-        " paddw      %%mm1 ,   %%mm2   \n\t"   /* mm2 == mm2 + mm1 */
-        " paddw      %%mm3 ,   %%mm4   \n\t"   /* mm2 == mm2 + mm1 */
-
-        " psrlw         $8 ,   %%mm2   \n\t"
-        " psrlw         $8 ,   %%mm4   \n\t"
-        " packuswb   %%mm4 ,   %%mm2   \n\t"
-        " por        %%mm5 ,   %%mm2   \n\t"   /* set alpha to ff */
-        " movq       %%mm2 ,    (%3)   \n\t"
-
-        " add           $8 ,     %1    \n\t"
-        " add           $8 ,     %0    \n\t"
-        " dec           %%ecx          \n\t"
-        " jne                     2b   \n\t"
-
-        "3:                            \n\t"
-        :"=r" (src), "=r" (dest)
-        :"0" (src), "1" (dest), "m" (s_alpha), "m" (src_width)
-        :"%eax", "%ecx", "memory",
-         "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
-#ifdef __MMX__
-        , "mm0", "mm1", "mm2", "mm4", "mm3", "mm5", "mm6", "mm7"
-#endif
-    );
-      /* *INDENT-ON* */
-    src += src_add;
-    dest += dest_add;
-  }
-  __asm__ __volatile__ ("emms");
-}
-#endif
-
diff --git a/gst/videomixer/blendorc.orc b/gst/videomixer/blendorc.orc
index 1114aae042..bb4601c166 100644
--- a/gst/videomixer/blendorc.orc
+++ b/gst/videomixer/blendorc.orc
@@ -29,7 +29,7 @@ shruw t2, t2, c1
 convsuswb d1, t2
 
 
-.function gst_videomixer_orc_blend_ayuv
+.function orc_blend_argb
 .flags 2d
 .dest 4 d guint8
 .source 4 s guint8
@@ -41,8 +41,7 @@ convsuswb d1, t2
 .temp 8 d_wide
 .temp 8 s_wide
 .temp 8 a_wide
-.const 4 c_alpha 0xffffff00
-
+.const 4 a_alpha 0x000000ff
 
 loadl t, s
 convlw tw, t
@@ -51,15 +50,48 @@ splatbl a, tb
 x4 convubw a_wide, a
 x4 mullw a_wide, a_wide, alpha
 x4 shruw a_wide, a_wide, 8
-andl t, t, c_alpha
 x4 convubw s_wide, t
-andl t, d, c_alpha
+loadl t, d
 x4 convubw d_wide, t
 x4 subw s_wide, s_wide, d_wide
 x4 mullw s_wide, s_wide, a_wide
 x4 div255w s_wide, s_wide
 x4 addw d_wide, d_wide, s_wide
-x4 convwb d, d_wide
+x4 convwb t, d_wide
+orl t, t, a_alpha
+storel d, t
 
+.function orc_blend_bgra
+.flags 2d
+.dest 4 d guint8
+.source 4 s guint8
+.param 2 alpha
+.temp 4 t
+.temp 4 t2
+.temp 2 tw
+.temp 1 tb
+.temp 4 a
+.temp 8 d_wide
+.temp 8 s_wide
+.temp 8 a_wide
+.const 4 a_alpha 0xff000000
 
+loadl t, s
+shrul t2, t, 24
+convlw tw, t2
+convwb tb, tw
+splatbl a, tb
+x4 convubw a_wide, a
+x4 mullw a_wide, a_wide, alpha
+x4 shruw a_wide, a_wide, 8
+x4 convubw s_wide, t
+loadl t, d
+x4 convubw d_wide, t
+x4 subw s_wide, s_wide, d_wide
+x4 mullw s_wide, s_wide, a_wide
+x4 div255w s_wide, s_wide
+x4 addw d_wide, d_wide, s_wide
+x4 convwb t, d_wide
+orl t, t, a_alpha
+storel d, t
 
diff --git a/gst/videomixer/videomixer.c b/gst/videomixer/videomixer.c
index 497c15119e..c96aec6bb4 100644
--- a/gst/videomixer/videomixer.c
+++ b/gst/videomixer/videomixer.c
@@ -72,12 +72,6 @@
 #include "config.h"
 #endif
 
-#ifdef HAVE_GCC_ASM
-#if defined(HAVE_CPU_I386) || defined(HAVE_CPU_X86_64)
-#define BUILD_X86_ASM
-#endif
-#endif
-
 #include <gst/gst.h>
 #include <gst/base/gstcollectpads.h>
 #include <gst/controller/gstcontroller.h>