videomixer: Optimize ARGB blending and implement BGRA blending with orc

This means that videomixer no longer contains any handwritten assembly, and the blending is also faster when using SSE.
2025-02-03 04:52:28 +00:00 · 2010-08-23 15:44:50 +02:00 · 2010-08-23 15:44:50 +02:00 · 6da14d0c41
commit 6da14d0c41
parent 7cfa519547
6 changed files with 59 additions and 218 deletions
--- a/configure.ac
+++ b/configure.ac
@ -208,7 +208,7 @@ dnl GLib is required
 AG_GST_GLIB_CHECK([2.18])
 dnl Orc
-ORC_CHECK([0.4.5])
+ORC_CHECK([0.4.7])
 dnl checks for gstreamer
--- a/gst/videomixer/Makefile.am
+++ b/gst/videomixer/Makefile.am
@ -13,4 +13,4 @@ libgstvideomixer_la_LDFLAGS = $(GST_PLUGIN_LDFLAGS)
 libgstvideomixer_la_LIBTOOLFLAGS = --tag=disable-static
 # headers we need but don't want installed
-noinst_HEADERS = videomixer.h videomixerpad.h blend.h blend_mmx.h
+noinst_HEADERS = videomixer.h videomixerpad.h blend.h
--- a/gst/videomixer/blend.c
+++ b/gst/videomixer/blend.c
@ -38,12 +38,6 @@
 #define BLEND(D,S,alpha) (((D) * (256 - (alpha)) + (S) * (alpha)) >> 8)
 #ifdef HAVE_GCC_ASM
 #if defined(HAVE_ORC) && (defined(HAVE_CPU_I386) || defined(HAVE_CPU_X86_64))
 #define BUILD_X86_ASM
 #endif
 #endif
 GST_DEBUG_CATEGORY_STATIC (gst_videomixer_blend_debug);
 #define GST_CAT_DEFAULT gst_videomixer_blend_debug
@ -92,34 +86,26 @@ blend_##name (const guint8 * src, gint xpos, gint ypos, \
  LOOP (dest, src, src_height, src_width, src_stride, dest_stride, s_alpha); \
 }
-#define BLEND_A32_LOOP_C(name, A, C1, C2, C3) \
+#define BLEND_A32_LOOP(name) \
 static inline void \
-_blend_loop_##name##_c (guint8 *dest, const guint8 *src, gint src_height, gint src_width, gint src_stride, gint dest_stride, guint s_alpha) { \
+_blend_loop_##name (guint8 * dest, const guint8 * src, gint src_height, \
-  gint i, j; \
+    gint src_width, gint src_stride, gint dest_stride, guint s_alpha) \
-  gint alpha; \
+{ \
-  gint src_add = src_stride - (4 * src_width); \
+  s_alpha = MIN (255, s_alpha); \
-  gint dest_add = dest_stride - (4 * src_width); \
+  orc_blend_##name (dest, dest_stride, src, src_stride, \
-  \
+      s_alpha, src_width, src_height); \
  for (i = 0; i < src_height; i++) { \
    for (j = 0; j < src_width; j++) { \
      alpha = (src[A] * s_alpha) >> 8; \
      dest[A] = 0xff; \
      dest[C1] = BLEND(dest[C1], src[C1], alpha); \
      dest[C2] = BLEND(dest[C2], src[C2], alpha); \
      dest[C3] = BLEND(dest[C3], src[C3], alpha); \
      \
      src += 4; \
      dest += 4; \
    } \
    src += src_add; \
    dest += dest_add; \
  } \
 }
-BLEND_A32_LOOP_C (argb, 0, 1, 2, 3);
+BLEND_A32_LOOP (argb);
-BLEND_A32_LOOP_C (bgra, 3, 2, 1, 0);
+BLEND_A32_LOOP (bgra);
-BLEND_A32 (argb_c, _blend_loop_argb_c);
+
-BLEND_A32 (bgra_c, _blend_loop_bgra_c);
+#if G_BYTE_ORDER == LITTLE_ENDIAN
 BLEND_A32 (argb, _blend_loop_argb);
 BLEND_A32 (bgra, _blend_loop_bgra);
 #else
 BLEND_A32 (argb, _blend_loop_bgra);
 BLEND_A32 (bgra, _blend_loop_argb);
 #endif
 #define A32_CHECKER_C(name, RGB, A, C1, C2, C3) \
 static void \
@ -680,39 +666,6 @@ PACKED_422_FILL_COLOR (yuy2, 24, 16, 8, 0);
 PACKED_422_FILL_COLOR (yvyu, 24, 0, 8, 16);
 PACKED_422_FILL_COLOR (uyvy, 16, 24, 0, 8);
 /* MMX Implementations */
 #ifdef BUILD_X86_ASM
 #define A32
 #define NAME_BLEND _blend_loop_argb_mmx
 #define A_OFF 0
 #include "blend_mmx.h"
 #undef NAME_BLEND
 #undef A_OFF
 #define NAME_BLEND _blend_loop_bgra_mmx
 #define A_OFF 24
 #include "blend_mmx.h"
 #undef NAME_BLEND
 #undef A_OFF
 #undef A32
 BLEND_A32 (argb_mmx, _blend_loop_argb_mmx);
 BLEND_A32 (bgra_mmx, _blend_loop_bgra_mmx);
 #endif
 static void
 _blend_loop_argb_orc (guint8 * dest, const guint8 * src, gint src_height,
    gint src_width, gint src_stride, gint dest_stride, guint s_alpha)
 {
  s_alpha = MIN (255, s_alpha);
  gst_videomixer_orc_blend_ayuv (dest, dest_stride, src, src_stride,
      s_alpha, src_width, src_height);
 }
 BLEND_A32 (argb_orc, _blend_loop_argb_orc);
 /* Init function */
 BlendFunction gst_video_mixer_blend_argb;
 BlendFunction gst_video_mixer_blend_bgra;
@ -769,18 +722,13 @@ FillColorFunction gst_video_mixer_fill_color_uyvy;
 void
 gst_video_mixer_init_blend (void)
 {
 #ifdef BUILD_X86_ASM
  guint cpu_flags;
  orc_init ();
  cpu_flags = orc_target_get_default_flags (orc_target_get_by_name ("mmx"));
 #endif
  GST_DEBUG_CATEGORY_INIT (gst_videomixer_blend_debug, "videomixer_blend", 0,
      "video mixer blending functions");
-  gst_video_mixer_blend_argb = blend_argb_c;
+  gst_video_mixer_blend_argb = blend_argb;
-  gst_video_mixer_blend_bgra = blend_bgra_c;
+  gst_video_mixer_blend_bgra = blend_bgra;
  gst_video_mixer_blend_i420 = blend_i420;
  gst_video_mixer_blend_y444 = blend_y444;
  gst_video_mixer_blend_y42b = blend_y42b;
@ -820,13 +768,4 @@ gst_video_mixer_init_blend (void)
  gst_video_mixer_fill_color_yuy2 = fill_color_yuy2;
  gst_video_mixer_fill_color_yvyu = fill_color_yvyu;
  gst_video_mixer_fill_color_uyvy = fill_color_uyvy;
 #ifdef BUILD_X86_ASM
  if (cpu_flags & ORC_TARGET_MMX_MMX) {
    gst_video_mixer_blend_argb = blend_argb_mmx;
    gst_video_mixer_blend_bgra = blend_bgra_mmx;
  }
 #endif
  gst_video_mixer_blend_argb = blend_argb_orc;
 }
--- a/gst/videomixer/blend_mmx.h
+++ b/gst/videomixer/blend_mmx.h
@ -1,124 +0,0 @@
 #ifdef A32
 static inline void
 NAME_BLEND (guint8 * dest, const guint8 * src, gint src_height, gint src_width,
    gint src_stride, gint dest_stride, guint s_alpha)
 {
  gint i;
  gint src_add = src_stride - (4 * src_width);
  gint dest_add = dest_stride - (4 * src_width);
  for (i = 0; i < src_height; i++) {
    /*      (P1 * (256 - A) + (P2 * A)) / 256
     * =>   (P1 * 256 - P1 * A + P2 * A) / 256
     * =>   (P1 * 256 + A * (P2 - P1) / 256
     * =>   P1 + (A * (P2 - P1)) / 256
     */
    /* *INDENT-OFF* */
    __asm__ __volatile__ (
        " pcmpeqd    %%mm5 ,   %%mm5   \n\t"   /* mm5 = 0xffff... */
 #if A_OFF == 0
        " psrld        $24 ,   %%mm5   \n\t"   /* mm5 = 00 00 00 ff 00 00 00 ff, selector for alpha */
 #else
        " pslld        $24 ,   %%mm5   \n\t"   /* mm5 = ff 00 00 00 ff 00 00 00, selector for alpha */
 #endif
        " mov           %4 ,   %%eax   \n\t"   /* eax = s_alpha */
        " movd       %%eax ,   %%mm6   \n\t"   /* mm6 = s_alpha */
        " punpckldq  %%mm6 ,   %%mm6   \n\t"   /* mm6 = 00 00 00 aa 00 00 00 aa, alpha scale factor */
        " movl          %5 ,   %%ecx   \n\t"   /* ecx = src_width */
        " test          $1 ,   %%ecx   \n\t"   /* check odd pixel */
        " je                      1f   \n\t"
        /* do odd pixel */
        " movd        (%2) ,   %%mm2   \n\t"   /* mm2 = src,  00 00 00 00 sv su sy sa */
        " movd        (%3) ,   %%mm1   \n\t"   /* mm1 = dest, 00 00 00 00 dv du dy da */
        " movq       %%mm2 ,   %%mm0   \n\t"
        " punpcklbw  %%mm7 ,   %%mm2   \n\t"   /* mm2 = 00 sv 00 su 00 sy 00 sa */
        " pand       %%mm5 ,   %%mm0   \n\t"   /* mm0 = 00 00 00 00 00 00 00 sa, get alpha component  */
 #if A_OFF != 0
        " psrld        $24 ,   %%mm0   \n\t"
 #endif
        " punpcklbw  %%mm7 ,   %%mm1   \n\t"   /* mm1 = 00 dv 00 du 00 dy 00 da */
        " pmullw     %%mm6 ,   %%mm0   \n\t"   /* mult with scale */
        " psubw      %%mm1 ,   %%mm2   \n\t"   /* mm2 = mm2 - mm1 */
        " punpcklwd  %%mm0 ,   %%mm0   \n\t"
        " punpckldq  %%mm0 ,   %%mm0   \n\t"   /* mm0 == 00 aa 00 aa 00 aa 00 aa */
        " psrlw         $8 ,   %%mm0   \n\t"
        " pmullw     %%mm0 ,   %%mm2   \n\t"   /* mm2 == a * mm2 */
        " psllw         $8 ,   %%mm1   \n\t"   /* scale up */
        " paddw      %%mm1 ,   %%mm2   \n\t"   /* mm2 == mm2 + mm1 */
        " psrlw         $8 ,   %%mm2   \n\t"   /* scale down */
        " por        %%mm5 ,   %%mm2   \n\t"   /* set alpha to ff */
        " packuswb   %%mm2 ,   %%mm2   \n\t" 
        " movd       %%mm2 ,    (%3)   \n\t"   /* dest = mm1 */
        " add           $4 ,     %1    \n\t"
        " add           $4 ,     %0    \n\t"
        "1:                            \n\t"
        " sar           $1 ,   %%ecx   \n\t"   /* prepare for 2 pixel per loop */
        " cmp           $0 ,   %%ecx   \n\t"
        " je                      3f   \n\t"
        "2:                            \n\t"
        /* do even pixels */
        " movq        (%2) ,   %%mm2   \n\t"   /* mm2 = src,  sv1 su1 sy1 sa1  sv0 su0 sy0 sa0 */
        " movq        (%3) ,   %%mm1   \n\t"   /* mm1 = dest, dv1 du1 dy1 da1  dv0 du0 dy0 da0 */
        " movq       %%mm2 ,   %%mm4   \n\t"
        " movq       %%mm1 ,   %%mm3   \n\t"
        " movq       %%mm2 ,   %%mm0   \n\t"   /* copy for doing the alpha */
        " pxor       %%mm7 ,   %%mm7   \n\t"  
        " punpcklbw  %%mm7 ,   %%mm2   \n\t"   /* mm2 = 00 sv0  00 su0  00 sy0  00 sa0 */
        " punpckhbw  %%mm7 ,   %%mm4   \n\t"   /* mm4 = 00 sv1  00 su1  00 sy1  00 sa1 */
        " punpcklbw  %%mm7 ,   %%mm1   \n\t"   /* mm1 = 00 dv0  00 du0  00 dy0  00 da0 */
        " punpckhbw  %%mm7 ,   %%mm3   \n\t"   /* mm2 = 00 dv1  00 du1  00 dy1  00 da1 */
        " pand       %%mm5 ,   %%mm0   \n\t"   /* mm0 = 00 00 00 sa1  00 00 00 sa0 */
 #if A_OFF != 0
        " psrld        $24 ,   %%mm0   \n\t"
 #endif
        " psubw      %%mm1 ,   %%mm2   \n\t"   /* mm2 = mm2 - mm1 */
        " pmullw     %%mm6 ,   %%mm0   \n\t"   /* mult with scale */
        " psubw      %%mm3 ,   %%mm4   \n\t"   /* mm4 = mm4 - mm3 */
        " psrlw         $8 ,   %%mm0   \n\t"   /* scale back */
        " movq       %%mm0 ,   %%mm7   \n\t"   /* save copy */
        " punpcklwd  %%mm0 ,   %%mm0   \n\t"   /* mm0 = 00 00   00 00   00 aa0  00 aa0 */
        " punpckhwd  %%mm7 ,   %%mm7   \n\t"   /* mm7 = 00 00   00 00   00 aa1  00 aa1 */
        " punpckldq  %%mm0 ,   %%mm0   \n\t"   /* mm0 = 00 aa0  00 aa0  00 aa0  00 aa0 */
        " punpckldq  %%mm7 ,   %%mm7   \n\t"   /* mm7 = 00 aa1  00 aa1  00 aa1  00 aa1 */
        " pmullw     %%mm0 ,   %%mm2   \n\t"   /* mm2 == aa * mm2 */
        " pmullw     %%mm7 ,   %%mm4   \n\t"   /* mm2 == aa * mm2 */
        " psllw         $8 ,   %%mm1   \n\t"
        " psllw         $8 ,   %%mm3   \n\t"
        " paddw      %%mm1 ,   %%mm2   \n\t"   /* mm2 == mm2 + mm1 */
        " paddw      %%mm3 ,   %%mm4   \n\t"   /* mm2 == mm2 + mm1 */
        " psrlw         $8 ,   %%mm2   \n\t"
        " psrlw         $8 ,   %%mm4   \n\t"
        " packuswb   %%mm4 ,   %%mm2   \n\t"
        " por        %%mm5 ,   %%mm2   \n\t"   /* set alpha to ff */
        " movq       %%mm2 ,    (%3)   \n\t"
        " add           $8 ,     %1    \n\t"
        " add           $8 ,     %0    \n\t"
        " dec           %%ecx          \n\t"
        " jne                     2b   \n\t"
        "3:                            \n\t"
        :"=r" (src), "=r" (dest)
        :"0" (src), "1" (dest), "m" (s_alpha), "m" (src_width)
        :"%eax", "%ecx", "memory",
         "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
 #ifdef __MMX__
        , "mm0", "mm1", "mm2", "mm4", "mm3", "mm5", "mm6", "mm7"
 #endif
    );
      /* *INDENT-ON* */
    src += src_add;
    dest += dest_add;
  }
  __asm__ __volatile__ ("emms");
 }
 #endif
--- a/gst/videomixer/blendorc.orc
+++ b/gst/videomixer/blendorc.orc
@ -29,7 +29,7 @@ shruw t2, t2, c1
 convsuswb d1, t2
-.function gst_videomixer_orc_blend_ayuv
+.function orc_blend_argb
 .flags 2d
 .dest 4 d guint8
 .source 4 s guint8
@ -41,8 +41,7 @@ convsuswb d1, t2
 .temp 8 d_wide
 .temp 8 s_wide
 .temp 8 a_wide
-.const 4 c_alpha 0xffffff00
+.const 4 a_alpha 0x000000ff
 loadl t, s
 convlw tw, t
@ -51,15 +50,48 @@ splatbl a, tb
 x4 convubw a_wide, a
 x4 mullw a_wide, a_wide, alpha
 x4 shruw a_wide, a_wide, 8
 andl t, t, c_alpha
 x4 convubw s_wide, t
-andl t, d, c_alpha
+loadl t, d
 x4 convubw d_wide, t
 x4 subw s_wide, s_wide, d_wide
 x4 mullw s_wide, s_wide, a_wide
 x4 div255w s_wide, s_wide
 x4 addw d_wide, d_wide, s_wide
-x4 convwb d, d_wide
+x4 convwb t, d_wide
 orl t, t, a_alpha
 storel d, t
 .function orc_blend_bgra
 .flags 2d
 .dest 4 d guint8
 .source 4 s guint8
 .param 2 alpha
 .temp 4 t
 .temp 4 t2
 .temp 2 tw
 .temp 1 tb
 .temp 4 a
 .temp 8 d_wide
 .temp 8 s_wide
 .temp 8 a_wide
 .const 4 a_alpha 0xff000000
 loadl t, s
 shrul t2, t, 24
 convlw tw, t2
 convwb tb, tw
 splatbl a, tb
 x4 convubw a_wide, a
 x4 mullw a_wide, a_wide, alpha
 x4 shruw a_wide, a_wide, 8
 x4 convubw s_wide, t
 loadl t, d
 x4 convubw d_wide, t
 x4 subw s_wide, s_wide, d_wide
 x4 mullw s_wide, s_wide, a_wide
 x4 div255w s_wide, s_wide
 x4 addw d_wide, d_wide, s_wide
 x4 convwb t, d_wide
 orl t, t, a_alpha
 storel d, t
--- a/gst/videomixer/videomixer.c
+++ b/gst/videomixer/videomixer.c
@ -72,12 +72,6 @@
 #include "config.h"
 #endif
 #ifdef HAVE_GCC_ASM
 #if defined(HAVE_CPU_I386) || defined(HAVE_CPU_X86_64)
 #define BUILD_X86_ASM
 #endif
 #endif
 #include <gst/gst.h>
 #include <gst/base/gstcollectpads.h>
 #include <gst/controller/gstcontroller.h>