videomixer: Add MMX implementations of the AYUV blending and color filling functions

This provides a 20% speedup for blending and 100% for color filling.

The blending can probably be optimized even more.
This commit is contained in:
Sebastian Dröge 2009-12-14 20:08:06 +01:00
parent d3a9f07669
commit 061ededa36
3 changed files with 172 additions and 3 deletions

View file

@ -1,8 +1,8 @@
plugin_LTLIBRARIES = libgstvideomixer.la plugin_LTLIBRARIES = libgstvideomixer.la
libgstvideomixer_la_SOURCES = videomixer.c blend_ayuv.c blend_bgra.c blend_i420.c blend_rgb.c libgstvideomixer_la_SOURCES = videomixer.c blend_ayuv.c blend_bgra.c blend_i420.c blend_rgb.c
libgstvideomixer_la_CFLAGS = $(GST_CFLAGS) $(GST_BASE_CFLAGS) $(GST_CONTROLLER_CFLAGS) $(GST_PLUGINS_BASE_CFLAGS) libgstvideomixer_la_CFLAGS = $(GST_CFLAGS) $(GST_BASE_CFLAGS) $(GST_CONTROLLER_CFLAGS) $(GST_PLUGINS_BASE_CFLAGS) $(LIBOIL_CFLAGS)
libgstvideomixer_la_LIBADD = $(GST_LIBS) $(GST_BASE_LIBS) $(GST_CONTROLLER_LIBS) $(GST_PLUGINS_BASE_LIBS) -lgstvideo-@GST_MAJORMINOR@ libgstvideomixer_la_LIBADD = $(GST_LIBS) $(GST_BASE_LIBS) $(GST_CONTROLLER_LIBS) $(GST_PLUGINS_BASE_LIBS) -lgstvideo-@GST_MAJORMINOR@ $(LIBOIL_LIBS)
libgstvideomixer_la_LDFLAGS = $(GST_PLUGIN_LDFLAGS) libgstvideomixer_la_LDFLAGS = $(GST_PLUGIN_LDFLAGS)
libgstvideomixer_la_LIBTOOLFLAGS = --tag=disable-static libgstvideomixer_la_LIBTOOLFLAGS = --tag=disable-static

View file

@ -1,5 +1,6 @@
/* /*
* Copyright (C) 2004 Wim Taymans <wim@fluendo.com> * Copyright (C) 2004 Wim Taymans <wim@fluendo.com>
* Copyright (C) 2009 Sebastian Dröge <sebastian.droege@collabora.co.uk>
* *
* This library is free software; you can redistribute it and/or * This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public * modify it under the terms of the GNU Library General Public
@ -17,9 +18,18 @@
* Boston, MA 02111-1307, USA. * Boston, MA 02111-1307, USA.
*/ */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <gst/gst.h> #include <gst/gst.h>
#ifdef HAVE_GCC_ASM
#if defined(HAVE_CPU_I386) || defined(HAVE_CPU_X86_64)
#define BUILD_X86_ASM
#endif
#endif
#define BLEND_NORMAL(Y1,U1,V1,Y2,U2,V2,alpha,Y,U,V) \ #define BLEND_NORMAL(Y1,U1,V1,Y2,U2,V2,alpha,Y,U,V) \
Y = ((Y1*(255-alpha))+(Y2*alpha))>>8; \ Y = ((Y1*(255-alpha))+(Y2*alpha))>>8; \
U = ((U1*(255-alpha))+(U2*alpha))>>8; \ U = ((U1*(255-alpha))+(U2*alpha))>>8; \
@ -230,6 +240,93 @@ gst_videomixer_blend_ayuv_ayuv (guint8 * src, gint xpos, gint ypos,
#undef BLEND_MODE #undef BLEND_MODE
#ifdef BUILD_X86_ASM
/* Blend an AYUV source frame onto an AYUV destination frame using MMX.
 *
 * src:         source pixels, 4 bytes per pixel, alpha in byte 0
 * xpos, ypos:  position of the source's top-left corner inside dest;
 *              negative values clip the source on the left/top
 * src_width,
 * src_height:  source dimensions in pixels
 * src_alpha:   global alpha applied on top of the per-pixel source alpha;
 *              it is scaled by 255 and the result is used as an 8 bit factor
 * dest:        destination pixels, same layout as src, modified in place
 * dest_width,
 * dest_height: destination dimensions in pixels
 *
 * The alpha byte of every destination pixel that is touched is set to 0xff.
 */
void
gst_videomixer_blend_ayuv_ayuv_mmx (guint8 * src, gint xpos, gint ypos,
    gint src_width, gint src_height, gdouble src_alpha,
    guint8 * dest, gint dest_width, gint dest_height)
{
  gint b_alpha;
  gint i;
  gint src_stride, dest_stride;
  gint src_add, dest_add;

  /* strides are computed from the unclipped widths */
  src_stride = src_width * 4;
  dest_stride = dest_width * 4;

  b_alpha = (gint) (src_alpha * 255);

  /* adjust src pointers for negative positions */
  if (xpos < 0) {
    src += -xpos * 4;
    src_width -= -xpos;
    xpos = 0;
  }
  if (ypos < 0) {
    src += -ypos * src_stride;
    src_height -= -ypos;
    ypos = 0;
  }
  /* adjust width/height if the src is bigger than dest */
  if (xpos + src_width > dest_width) {
    src_width = dest_width - xpos;
  }
  if (ypos + src_height > dest_height) {
    src_height = dest_height - ypos;
  }

  /* The pixel loop below always processes at least one pixel before it
   * checks its counter, so an empty clipped region must never reach it:
   * with src_width <= 0 the counter would only match again after wrapping
   * around, overrunning both buffers. */
  if (src_width <= 0 || src_height <= 0)
    return;

  /* bytes to skip at the end of each row to reach the start of the next */
  src_add = src_stride - (4 * src_width);
  dest_add = dest_stride - (4 * src_width);

  dest = dest + 4 * xpos + (ypos * dest_stride);

  for (i = 0; i < src_height; i++) {
    /* src and dest are advanced by 4 * src_width bytes inside the asm and
     * are therefore read-write ("+r") operands; the row adjustment after
     * the asm relies on the updated values. */
    /* *INDENT-OFF* */
    __asm__ __volatile__ (
      "pxor      %%mm7 , %%mm7   \n\t"   /* mm7 = 0 */
      "pcmpeqd   %%mm6 , %%mm6   \n\t"   /* mm6 = 0xffff... */
      "punpcklbw %%mm7 , %%mm6   \n\t"   /* mm6 = 0x00ff00ff00ff... */
      "pcmpeqd   %%mm5 , %%mm5   \n\t"   /* mm5 = 0xffff... */
      "psrlq     $56   , %%mm5   \n\t"   /* mm5 = 0x0...0ff */
      "xor       %%ecx , %%ecx   \n\t"   /* ecx = 0, pixel counter */
      "1:                        \n\t"
      "movzxb    (%0)  , %%eax   \n\t"   /* eax = source pixel alpha */
      "imul      %2    , %%eax   \n\t"   /* eax = source alpha * b_alpha */
      "sar       $8    , %%eax   \n\t"   /* eax = (source alpha * b_alpha) / 256 */
      "movd      %%eax , %%mm0   \n\t"   /* mm0 = effective alpha a */
      "movd      (%0)  , %%mm2   \n\t"   /* mm2 = src pixel */
      "movd      (%1)  , %%mm1   \n\t"   /* mm1 = dest pixel */
      "punpcklwd %%mm0 , %%mm0   \n\t"
      "punpckldq %%mm0 , %%mm0   \n\t"   /* mm0 = 0a 0a 0a 0a */
      "punpcklbw %%mm7 , %%mm1   \n\t"   /* mm1 = dv du dy da (16 bit each) */
      "punpcklbw %%mm7 , %%mm2   \n\t"   /* mm2 = sv su sy sa (16 bit each) */
      "pmullw    %%mm0 , %%mm2   \n\t"   /* mm2 = a * s */
      "pandn     %%mm6 , %%mm0   \n\t"   /* mm0 = 255 - a */
      "pmullw    %%mm0 , %%mm1   \n\t"   /* mm1 = (255 - a) * d */
      "paddusw   %%mm2 , %%mm1   \n\t"   /* mm1 = a * s + (255 - a) * d */
      "psrlw     $8    , %%mm1   \n\t"   /* mm1 /= 256 */
      "packuswb  %%mm7 , %%mm1   \n\t"   /* back to 8 bit per component */
      "por       %%mm5 , %%mm1   \n\t"   /* force alpha byte to 0xff */
      "movd      %%mm1 , (%1)    \n\t"   /* dest pixel = mm1 */
      "add       $4    , %1      \n\t"   /* next dest pixel */
      "add       $4    , %0      \n\t"   /* next src pixel */
      "add       $1    , %%ecx   \n\t"
      "cmp       %%ecx , %3      \n\t"   /* done after src_width pixels */
      "jne       1b"
      : "+r" (src), "+r" (dest)
      : "r" (b_alpha), "r" (src_width)
      : "%eax", "%ecx", "memory", "cc"
#ifdef __MMX__
        , "mm0", "mm1", "mm2", "mm5", "mm6", "mm7"
#endif
    );
    /* *INDENT-ON* */
    src += src_add;
    dest += dest_add;
  }
  /* leave MMX state so that later x87 floating point code works */
  __asm__ __volatile__ ("emms");
}
#endif
/* fill a buffer with a checkerboard pattern */ /* fill a buffer with a checkerboard pattern */
void void
@ -263,3 +360,42 @@ gst_videomixer_fill_ayuv_color (guint8 * dest, gint width, gint height,
} }
} }
} }
#ifdef BUILD_X86_ASM
/* Fill an AYUV buffer with a single opaque colour using MMX.
 *
 * dest:          buffer of width * height pixels, 4 bytes per pixel
 * width, height: buffer dimensions in pixels
 * colY, colU,
 * colV:          colour components; only the low 8 bits of each are used
 *
 * Every pixel is written as the byte sequence 0xff, Y, U, V. Pixels are
 * stored two at a time with 64 bit MMX stores; if the pixel count is odd the
 * last pixel is written with a plain 32 bit store.
 */
void
gst_videomixer_fill_ayuv_color_mmx (guint8 * dest, gint width, gint height,
    gint colY, gint colU, gint colV)
{
  guint64 val;
  guint nvals = width * height;

  /* One pixel as a little-endian 32 bit word. The components are masked so
   * that an out-of-range value cannot spill into the neighbouring bytes. */
  val = (((guint64) 0xff)) | (((guint64) (colY & 0xff)) << 8) |
      (((guint64) (colU & 0xff)) << 16) | (((guint64) (colV & 0xff)) << 24);
  /* duplicate it so that one 64 bit store writes two pixels */
  val = (val << 32) | val;

  /* val is passed as a memory operand: a 64 bit value can't be moved from a
   * general purpose register into an MMX register with movq on 32 bit x86. */
  /* *INDENT-OFF* */
  __asm__ __volatile__ (
    "cmp         $2      , %2    \n\t"   /* fewer than two pixels? */
    "jb          2f              \n\t"   /* then skip the MMX loop */
    "movq        %4      , %%mm0 \n\t"   /* mm0 = two pixels */
    "1:                          \n\t"
    "movq        %%mm0   , (%1)  \n\t"   /* store two pixels */
    "sub         $2      , %0    \n\t"   /* nvals -= 2 */
    "add         $8      , %1    \n\t"   /* dest += 8 bytes */
    "cmp         $2      , %2    \n\t"   /* %2 is the same register as %0 */
    "jae         1b              \n\t"   /* loop while nvals >= 2 */
    "emms                        \n\t"
    "2:                          \n\t"
    : "=r" (nvals), "=r" (dest)
    : "0" (nvals), "1" (dest), "m" (val)
    : "memory", "cc"
#ifdef __MMX__
      , "mm0"
#endif
  );
  /* *INDENT-ON* */

  /* For an odd pixel count exactly one pixel is left, and dest already
   * points at it because the loop advanced dest past everything it wrote. */
  if (nvals)
    GST_WRITE_UINT32_LE (dest, (guint32) (val & 0xffffffff));
}
#endif

View file

@ -57,11 +57,21 @@
#include "config.h" #include "config.h"
#endif #endif
#ifdef HAVE_GCC_ASM
#if defined(HAVE_CPU_I386) || defined(HAVE_CPU_X86_64)
#define BUILD_X86_ASM
#endif
#endif
#include <gst/gst.h> #include <gst/gst.h>
#include <gst/base/gstcollectpads.h> #include <gst/base/gstcollectpads.h>
#include <gst/controller/gstcontroller.h> #include <gst/controller/gstcontroller.h>
#include <gst/video/video.h> #include <gst/video/video.h>
#include <liboil/liboil.h>
#include <liboil/liboilcpu.h>
#include <liboil/liboilfunction.h>
#ifdef HAVE_STDLIB_H #ifdef HAVE_STDLIB_H
#include <stdlib.h> #include <stdlib.h>
#endif #endif
@ -160,6 +170,14 @@ void gst_videomixer_fill_i420_checker (guint8 * dest, gint width, gint height);
void gst_videomixer_fill_i420_color (guint8 * dest, gint width, gint height, void gst_videomixer_fill_i420_color (guint8 * dest, gint width, gint height,
gint colY, gint colU, gint colV); gint colY, gint colU, gint colV);
#ifdef BUILD_X86_ASM
void gst_videomixer_blend_ayuv_ayuv_mmx (guint8 * src, gint xpos, gint ypos,
gint src_width, gint src_height, gdouble src_alpha,
guint8 * dest, gint dest_width, gint dest_height);
void gst_videomixer_fill_ayuv_color_mmx (guint8 * dest, gint width, gint height,
gint colY, gint colU, gint colV);
#endif
#define DEFAULT_PAD_ZORDER 0 #define DEFAULT_PAD_ZORDER 0
#define DEFAULT_PAD_XPOS 0 #define DEFAULT_PAD_XPOS 0
#define DEFAULT_PAD_YPOS 0 #define DEFAULT_PAD_YPOS 0
@ -920,12 +938,25 @@ gst_videomixer_setcaps (GstPad * pad, GstCaps * caps)
goto done; goto done;
switch (mixer->fmt) { switch (mixer->fmt) {
case GST_VIDEO_FORMAT_AYUV: case GST_VIDEO_FORMAT_AYUV:{
#ifdef BUILD_X86_ASM
guint cpu_flags = oil_cpu_get_flags ();
mixer->blend =
(cpu_flags & OIL_IMPL_FLAG_MMX) ? gst_videomixer_blend_ayuv_ayuv_mmx :
gst_videomixer_blend_ayuv_ayuv;
mixer->fill_checker = gst_videomixer_fill_ayuv_checker;
mixer->fill_color =
(cpu_flags & OIL_IMPL_FLAG_MMX) ? gst_videomixer_fill_ayuv_color_mmx :
gst_videomixer_fill_ayuv_color;
#else
mixer->blend = gst_videomixer_blend_ayuv_ayuv; mixer->blend = gst_videomixer_blend_ayuv_ayuv;
mixer->fill_checker = gst_videomixer_fill_ayuv_checker; mixer->fill_checker = gst_videomixer_fill_ayuv_checker;
mixer->fill_color = gst_videomixer_fill_ayuv_color; mixer->fill_color = gst_videomixer_fill_ayuv_color;
#endif
ret = TRUE; ret = TRUE;
break; break;
}
case GST_VIDEO_FORMAT_I420: case GST_VIDEO_FORMAT_I420:
mixer->blend = gst_videomixer_blend_i420_i420; mixer->blend = gst_videomixer_blend_i420_i420;
mixer->fill_checker = gst_videomixer_fill_i420_checker; mixer->fill_checker = gst_videomixer_fill_i420_checker;
@ -1623,6 +1654,8 @@ plugin_init (GstPlugin * plugin)
GST_DEBUG_CATEGORY_INIT (gst_videomixer_debug, "videomixer", 0, GST_DEBUG_CATEGORY_INIT (gst_videomixer_debug, "videomixer", 0,
"video mixer"); "video mixer");
oil_init ();
return gst_element_register (plugin, "videomixer", GST_RANK_PRIMARY, return gst_element_register (plugin, "videomixer", GST_RANK_PRIMARY,
GST_TYPE_VIDEO_MIXER); GST_TYPE_VIDEO_MIXER);
} }