gstreamer/gst/videomixer/blendorc.orc

# orc_splat_u32: fill the destination array d1 with the 32-bit parameter p1.
.function orc_splat_u32
.dest 4 d1 guint32
.param 4 p1 guint32

# one 32-bit copy per element; the source operand is the parameter
copyl d1, p1

# orc_memcpy_u32: copy the source array s1 into the destination array d1,
# one 32-bit element at a time.
.function orc_memcpy_u32
.dest 4 d1 guint32
.source 4 s1 guint32

# one 32-bit copy per element
copyl d1, s1

# orc_blend_u8: 2-D blend of an 8-bit source plane onto an 8-bit destination.
# Per element: d1 = ((d1 << 8) + (s1 - d1) * p1) >> 8, computed in 16-bit
# temporaries and narrowed back to a byte.
.function orc_blend_u8
.flags 2d
.dest 1 d1 guint8
.source 1 s1 guint8
.param 2 p1
.temp 2 dst_w
.temp 2 mix_w
.const 1 shift8 8

# widen both samples to 16 bits
convubw mix_w, s1
convubw dst_w, d1

# mix_w = (src - dst) * p1
subw mix_w, mix_w, dst_w
# dst_w = dst << 8 (must happen after the subtraction has read dst_w)
shlw dst_w, dst_w, shift8
mullw mix_w, mix_w, p1

# add, scale back down by 8 bits and narrow to a byte
addw mix_w, mix_w, dst_w
shruw mix_w, mix_w, shift8
convsuswb d1, mix_w


# orc_blend_argb: 2-D blend of 32-bit source pixels onto 32-bit destination
# pixels whose alpha is the lowest byte of the pixel word (mask 0x000000ff).
# Each channel becomes d + div255((s - d) * a), where
# a = (source alpha * alpha) >> 8; the destination alpha byte is then forced
# to 0xff.
.function orc_blend_argb
.flags 2d
.dest 4 d guint8
.source 4 s guint8
.param 2 alpha
.temp 4 px
.temp 2 lo16
.temp 1 a8
.temp 4 a_x4
.temp 8 dst16
.temp 8 src16
.temp 8 a16
.const 4 opaque_mask 0x000000ff

# fetch the source pixel and widen its four channels to 16 bits
loadl px, s
x4 convubw src16, px

# take the low byte (alpha), replicate it to four lanes, scale by the
# global alpha parameter
convlw lo16, px
convwb a8, lo16
splatbl a_x4, a8
x4 convubw a16, a_x4
x4 mullw a16, a16, alpha
x4 shruw a16, a16, 8

# fetch and widen the destination pixel (px is reused)
loadl px, d
x4 convubw dst16, px

# dst += div255((src - dst) * a)
x4 subw src16, src16, dst16
x4 mullw src16, src16, a16
x4 div255w src16, src16
x4 addw dst16, dst16, src16

# narrow, set the alpha byte to 0xff and store
x4 convwb px, dst16
orl px, px, opaque_mask
storel d, px

# orc_blend_bgra: 2-D blend of 32-bit source pixels onto 32-bit destination
# pixels whose alpha is the highest byte of the pixel word (mask 0xff000000).
# Each channel becomes d + div255((s - d) * a), where
# a = (source alpha * alpha) >> 8; the destination alpha byte is then forced
# to 0xff.
.function orc_blend_bgra
.flags 2d
.dest 4 d guint8
.source 4 s guint8
.param 2 alpha
.temp 4 px
.temp 4 hi_shifted
.temp 2 lo16
.temp 1 a8
.temp 4 a_x4
.temp 8 dst16
.temp 8 src16
.temp 8 a16
.const 4 opaque_mask 0xff000000

# fetch the source pixel and widen its four channels to 16 bits
loadl px, s
x4 convubw src16, px

# bring the top byte (alpha) down to the low byte, replicate it to four
# lanes, scale by the global alpha parameter
shrul hi_shifted, px, 24
convlw lo16, hi_shifted
convwb a8, lo16
splatbl a_x4, a8
x4 convubw a16, a_x4
x4 mullw a16, a16, alpha
x4 shruw a16, a16, 8

# fetch and widen the destination pixel (px is reused)
loadl px, d
x4 convubw dst16, px

# dst += div255((src - dst) * a)
x4 subw src16, src16, dst16
x4 mullw src16, src16, a16
x4 div255w src16, src16
x4 addw dst16, dst16, src16

# narrow, set the alpha byte to 0xff and store
x4 convwb px, dst16
orl px, px, opaque_mask
storel d, px


# orc_overlay_argb: 2-D overlay of 32-bit source pixels onto 32-bit
# destination pixels, keeping a meaningful alpha in the destination.
# Alpha is the lowest byte of the pixel word (mask 0x000000ff).
#   alpha_s  = (source alpha * alpha) >> 8
#   alpha_d' = div255((255 - alpha_s) * destination alpha)
#   colour   = (s * alpha_s + d * alpha_d') / (alpha_s + alpha_d')
#   alpha    = alpha_s + alpha_d'
.function orc_overlay_argb
.flags 2d
.dest 4 d guint8
.source 4 s guint8
.param 2 alpha
.temp 4 t
.temp 2 tw
.temp 1 tb
.temp 8 alpha_s
.temp 8 alpha_s_inv
.temp 8 alpha_d
.temp 4 a
.temp 8 d_wide
.temp 8 s_wide
.const 4 xfs 0xffffffff
.const 4 a_alpha 0x000000ff
.const 4 a_alpha_inv 0xffffff00

# calc source alpha as alpha_s = alpha_s * alpha / 256
# (the alpha byte is the low byte, so narrowing t twice isolates it)
loadl t, s
convlw tw, t
convwb tb, tw
splatbl a, tb
x4 convubw alpha_s, a
x4 mullw alpha_s, alpha_s, alpha
x4 shruw alpha_s, alpha_s, 8
# premultiply the widened source channels by alpha_s
x4 convubw s_wide, t
x4 mullw s_wide, s_wide, alpha_s

# calc destination alpha as alpha_d = (255-alpha_s) * alpha_d / 255
# (xfs widens to 255 in each of the four lanes)
loadpl a, xfs
x4 convubw alpha_s_inv, a
x4 subw alpha_s_inv, alpha_s_inv, alpha_s
loadl t, d
convlw tw, t
convwb tb, tw
splatbl a, tb
x4 convubw alpha_d, a
x4 mullw alpha_d, alpha_d, alpha_s_inv
x4 div255w alpha_d, alpha_d
# premultiply the widened destination channels by alpha_d
x4 convubw d_wide, t
x4 mullw d_wide, d_wide, alpha_d

# calc final pixel as pix_d = pix_s*alpha_s + pix_d*alpha_d*(255-alpha_s)/255
x4 addw d_wide, d_wide, s_wide

# calc the final destination alpha_d = alpha_s + alpha_d * (255-alpha_s)/255
x4 addw alpha_d, alpha_d, alpha_s

# now normalize the pix_d by the final alpha to make it associative
x4 divluw d_wide, d_wide, alpha_d

# pack the new alpha into the correct spot:
# clear the alpha byte of the colour result, keep only the alpha byte of
# the narrowed alpha_d, then merge the two
x4 convwb t, d_wide
andl t, t, a_alpha_inv
x4 convwb a, alpha_d
andl a, a, a_alpha
orl  t, t, a
storel d, t

# orc_overlay_bgra: 2-D overlay of 32-bit source pixels onto 32-bit
# destination pixels, keeping a meaningful alpha in the destination.
# Alpha is the highest byte of the pixel word (mask 0xff000000).
#   alpha_s  = (source alpha * alpha) >> 8
#   alpha_d' = div255((255 - alpha_s) * destination alpha)
#   colour   = (s * alpha_s + d * alpha_d') / (alpha_s + alpha_d')
#   alpha    = alpha_s + alpha_d'
.function orc_overlay_bgra
.flags 2d
.dest 4 d guint8
.source 4 s guint8
.param 2 alpha
.temp 4 t
.temp 4 t2
.temp 2 tw
.temp 1 tb
.temp 8 alpha_s
.temp 8 alpha_s_inv
.temp 8 alpha_d
.temp 4 a
.temp 8 d_wide
.temp 8 s_wide
.const 4 xfs 0xffffffff
.const 4 a_alpha 0xff000000
.const 4 a_alpha_inv 0x00ffffff

# calc source alpha as alpha_s = alpha_s * alpha / 256
# (the alpha byte is the top byte: shift it down into t2 and narrow t2,
# not t, otherwise the low colour byte would be used as alpha)
loadl t, s
shrul t2, t, 24
convlw tw, t2
convwb tb, tw
splatbl a, tb
x4 convubw alpha_s, a
x4 mullw alpha_s, alpha_s, alpha
x4 shruw alpha_s, alpha_s, 8
# premultiply the widened source channels by alpha_s
x4 convubw s_wide, t
x4 mullw s_wide, s_wide, alpha_s

# calc destination alpha as alpha_d = (255-alpha_s) * alpha_d / 255
# (xfs widens to 255 in each of the four lanes)
loadpl a, xfs
x4 convubw alpha_s_inv, a
x4 subw alpha_s_inv, alpha_s_inv, alpha_s
loadl t, d
shrul t2, t, 24
convlw tw, t2
convwb tb, tw
splatbl a, tb
x4 convubw alpha_d, a
x4 mullw alpha_d, alpha_d, alpha_s_inv
x4 div255w alpha_d, alpha_d
# premultiply the widened destination channels by alpha_d
x4 convubw d_wide, t
x4 mullw d_wide, d_wide, alpha_d

# calc final pixel as pix_d = pix_s*alpha_s + pix_d*alpha_d*(255-alpha_s)/255
x4 addw d_wide, d_wide, s_wide

# calc the final destination alpha_d = alpha_s + alpha_d * (255-alpha_s)/255
x4 addw alpha_d, alpha_d, alpha_s

# now normalize the pix_d by the final alpha to make it associative
x4 divluw d_wide, d_wide, alpha_d

# pack the new alpha into the correct spot:
# clear the alpha byte of the colour result, keep only the alpha byte of
# the narrowed alpha_d, then merge the two
x4 convwb t, d_wide
andl t, t, a_alpha_inv
x4 convwb a, alpha_d
andl a, a, a_alpha
orl  t, t, a
storel d, t
videomixer: Port most blending related functions to orc Only remaining MMX implementation is the ARGB/BGRA/AYUV blending for which we first need the orc compositing opcodes. 2010-06-10 20:45:13 +00:00			`.function orc_splat_u32`
			`.dest 4 d1 guint32`
			`.param 4 p1 guint32`

			`copyl d1, p1`

			`.function orc_memcpy_u32`
			`.dest 4 d1 guint32`
			`.source 4 s1 guint32`

			`copyl d1, s1`

			`.function orc_blend_u8`
			`.flags 2d`
			`.dest 1 d1 guint8`
			`.source 1 s1 guint8`
			`.param 2 p1`
			`.temp 2 t1`
			`.temp 2 t2`
			`.const 1 c1 8`

			`convubw t1, d1`
			`convubw t2, s1`
			`subw t2, t2, t1`
			`mullw t2, t2, p1`
			`shlw t1, t1, c1`
			`addw t2, t1, t2`
			`shruw t2, t2, c1`
			`convsuswb d1, t2`

videomixer: Add orc implementation for blending videomixer: Add orc implementation for blending 2010-08-22 08:58:05 +00:00
videomixer: Optimize ARGB blending and implement BGRA blending with orc This now means, that we have absolutely no handwritten assembly anymore in videomixer and it's also faster now when using SSE. 2010-08-23 13:44:50 +00:00			`.function orc_blend_argb`
videomixer: Add orc implementation for blending videomixer: Add orc implementation for blending 2010-08-22 08:58:05 +00:00			`.flags 2d`
			`.dest 4 d guint8`
			`.source 4 s guint8`
			`.param 2 alpha`
			`.temp 4 t`
			`.temp 2 tw`
			`.temp 1 tb`
			`.temp 4 a`
			`.temp 8 d_wide`
			`.temp 8 s_wide`
			`.temp 8 a_wide`
videomixer: Optimize ARGB blending and implement BGRA blending with orc This now means, that we have absolutely no handwritten assembly anymore in videomixer and it's also faster now when using SSE. 2010-08-23 13:44:50 +00:00			`.const 4 a_alpha 0x000000ff`
videomixer: Add orc implementation for blending videomixer: Add orc implementation for blending 2010-08-22 08:58:05 +00:00
			`loadl t, s`
			`convlw tw, t`
			`convwb tb, tw`
			`splatbl a, tb`
			`x4 convubw a_wide, a`
			`x4 mullw a_wide, a_wide, alpha`
			`x4 shruw a_wide, a_wide, 8`
			`x4 convubw s_wide, t`
videomixer: Optimize ARGB blending and implement BGRA blending with orc This now means, that we have absolutely no handwritten assembly anymore in videomixer and it's also faster now when using SSE. 2010-08-23 13:44:50 +00:00			`loadl t, d`
videomixer: Add orc implementation for blending videomixer: Add orc implementation for blending 2010-08-22 08:58:05 +00:00			`x4 convubw d_wide, t`
			`x4 subw s_wide, s_wide, d_wide`
			`x4 mullw s_wide, s_wide, a_wide`
			`x4 div255w s_wide, s_wide`
			`x4 addw d_wide, d_wide, s_wide`
videomixer: Optimize ARGB blending and implement BGRA blending with orc This now means, that we have absolutely no handwritten assembly anymore in videomixer and it's also faster now when using SSE. 2010-08-23 13:44:50 +00:00			`x4 convwb t, d_wide`
			`orl t, t, a_alpha`
			`storel d, t`
videomixer: Add orc implementation for blending videomixer: Add orc implementation for blending 2010-08-22 08:58:05 +00:00
videomixer: Optimize ARGB blending and implement BGRA blending with orc This now means, that we have absolutely no handwritten assembly anymore in videomixer and it's also faster now when using SSE. 2010-08-23 13:44:50 +00:00			`.function orc_blend_bgra`
			`.flags 2d`
			`.dest 4 d guint8`
			`.source 4 s guint8`
			`.param 2 alpha`
			`.temp 4 t`
			`.temp 4 t2`
			`.temp 2 tw`
			`.temp 1 tb`
			`.temp 4 a`
			`.temp 8 d_wide`
			`.temp 8 s_wide`
			`.temp 8 a_wide`
			`.const 4 a_alpha 0xff000000`
videomixer: Add orc implementation for blending videomixer: Add orc implementation for blending 2010-08-22 08:58:05 +00:00
videomixer: Optimize ARGB blending and implement BGRA blending with orc This now means, that we have absolutely no handwritten assembly anymore in videomixer and it's also faster now when using SSE. 2010-08-23 13:44:50 +00:00			`loadl t, s`
			`shrul t2, t, 24`
			`convlw tw, t2`
			`convwb tb, tw`
			`splatbl a, tb`
			`x4 convubw a_wide, a`
			`x4 mullw a_wide, a_wide, alpha`
			`x4 shruw a_wide, a_wide, 8`
			`x4 convubw s_wide, t`
			`loadl t, d`
			`x4 convubw d_wide, t`
			`x4 subw s_wide, s_wide, d_wide`
			`x4 mullw s_wide, s_wide, a_wide`
			`x4 div255w s_wide, s_wide`
			`x4 addw d_wide, d_wide, s_wide`
			`x4 convwb t, d_wide`
			`orl t, t, a_alpha`
			`storel d, t`
videomixer: Add orc implementation for blending videomixer: Add orc implementation for blending 2010-08-22 08:58:05 +00:00
videomixer2: Add transparent background option for alpha channel formats This option allows the videomixer2 element to output a valid alpha channel when the inputs contain a valid alpha channel. This allows mixing to occur in multiple stages serially. The following pipeline shows an example of such a pipeline: gst-launch videotestsrc background-color=0x000000 pattern=ball ! video/x-raw-yuv,format=\(fourcc\)AYUV ! videomixer2 background=transparent name=mix1 ! videomixer2 name=mix2 ! ffmpegcolorspace ! autovideosink videotestsrc ! video/x-raw-yuv,format=\(fourcc\)AYUV ! mix2. The first videotestsrc in this pipeline creates a moving ball on a transparent background. It is then passed to the first videomixer2. Previously, this videomixer2 would have forced the alpha channel to 1.0 and given a background of checker, black, or white to the stream. With this patch, however, you can now specify the background as transparent, and the alpha channel of the input will be preserved. This allows for further mixing downstream, as is shown in the above pipeline where the a second videomixer2 is used to mix in a background of an smpte videotestsrc. So the result is a ball hovering over the smpte test source. This could, of course, have been accomplished with a single mixer element, but staged mixing is useful when it is not convenient to mix all video at once (e.g. a pipeline where a foreground and background bin exist and are mixed at the final output, but the foreground bin needs an internal mixer to create transitions between clips). Fixes bug #639994. 2011-01-19 19:07:17 +00:00
			`.function orc_overlay_argb`
			`.flags 2d`
			`.dest 4 d guint8`
			`.source 4 s guint8`
			`.param 2 alpha`
			`.temp 4 t`
			`.temp 2 tw`
			`.temp 1 tb`
			`.temp 8 alpha_s`
			`.temp 8 alpha_s_inv`
			`.temp 8 alpha_d`
			`.temp 4 a`
			`.temp 8 d_wide`
			`.temp 8 s_wide`
			`.const 4 xfs 0xffffffff`
			`.const 4 a_alpha 0x000000ff`
			`.const 4 a_alpha_inv 0xffffff00`

			`# calc source alpha as alpha_s = alpha_s * alpha / 256`
			`loadl t, s`
			`convlw tw, t`
			`convwb tb, tw`
			`splatbl a, tb`
			`x4 convubw alpha_s, a`
			`x4 mullw alpha_s, alpha_s, alpha`
			`x4 shruw alpha_s, alpha_s, 8`
			`x4 convubw s_wide, t`
			`x4 mullw s_wide, s_wide, alpha_s`

			`# calc destination alpha as alpha_d = (255-alpha_s) * alpha_d / 255`
			`loadpl a, xfs`
			`x4 convubw alpha_s_inv, a`
			`x4 subw alpha_s_inv, alpha_s_inv, alpha_s`
			`loadl t, d`
			`convlw tw, t`
			`convwb tb, tw`
			`convubw tw, tb`
			`splatbl a, tb`
			`x4 convubw alpha_d, a`
			`x4 mullw alpha_d, alpha_d, alpha_s_inv`
			`x4 div255w alpha_d, alpha_d`
			`x4 convubw d_wide, t`
			`x4 mullw d_wide, d_wide, alpha_d`

			`# calc final pixel as pix_d = pix_s*alpha_s + pix_d*alpha_d*(255-alpha_s)/255`
			`x4 addw d_wide, d_wide, s_wide`

			`# calc the final destination alpha_d = alpha_s + alpha_d * (255-alpha_s)/255`
			`x4 addw alpha_d, alpha_d, alpha_s`

			`# now normalize the pix_d by the final alpha to make it associative`
			`x4 divluw, d_wide, d_wide, alpha_d`

			`# pack the new alpha into the correct spot`
			`x4 convwb t, d_wide`
			`andl t, t, a_alpha_inv`
			`x4 convwb a, alpha_d`
			`andl a, a, a_alpha`
			`orl t, t, a`
			`storel d, t`

			`.function orc_overlay_bgra`
			`.flags 2d`
			`.dest 4 d guint8`
			`.source 4 s guint8`
			`.param 2 alpha`
			`.temp 4 t`
			`.temp 4 t2`
			`.temp 2 tw`
			`.temp 1 tb`
			`.temp 8 alpha_s`
			`.temp 8 alpha_s_inv`
			`.temp 8 alpha_d`
			`.temp 4 a`
			`.temp 8 d_wide`
			`.temp 8 s_wide`
			`.const 4 xfs 0xffffffff`
			`.const 4 a_alpha 0xff000000`
			`.const 4 a_alpha_inv 0x00ffffff`

			`# calc source alpha as alpha_s = alpha_s * alpha / 256`
			`loadl t, s`
			`shrul t2, t, 24`
			`convlw tw, t`
			`convwb tb, tw`
			`splatbl a, tb`
			`x4 convubw alpha_s, a`
			`x4 mullw alpha_s, alpha_s, alpha`
			`x4 shruw alpha_s, alpha_s, 8`
			`x4 convubw s_wide, t`
			`x4 mullw s_wide, s_wide, alpha_s`

			`# calc destination alpha as alpha_d = (255-alpha_s) * alpha_d / 255`
			`loadpl a, xfs`
			`x4 convubw alpha_s_inv, a`
			`x4 subw alpha_s_inv, alpha_s_inv, alpha_s`
			`loadl t, d`
			`shrul t2, t, 24`
			`convlw tw, t`
			`convwb tb, tw`
			`convubw tw, tb`
			`splatbl a, tb`
			`x4 convubw alpha_d, a`
			`x4 mullw alpha_d, alpha_d, alpha_s_inv`
			`x4 div255w alpha_d, alpha_d`
			`x4 convubw d_wide, t`
			`x4 mullw d_wide, d_wide, alpha_d`

			`# calc final pixel as pix_d = pix_s*alpha_s + pix_d*alpha_d*(255-alpha_s)/255`
			`x4 addw d_wide, d_wide, s_wide`

			`# calc the final destination alpha_d = alpha_s + alpha_d * (255-alpha_s)/255`
			`x4 addw alpha_d, alpha_d, alpha_s`

			`# now normalize the pix_d by the final alpha to make it associative`
			`x4 divluw, d_wide, d_wide, alpha_d`

			`# pack the new alpha into the correct spot`
			`x4 convwb t, d_wide`
			`andl t, t, a_alpha_inv`
			`x4 convwb a, alpha_d`
			`andl a, a, a_alpha`
			`orl t, t, a`
			`storel d, t`