gstreamer/gst-libs/gst/video/video-orc.orc
Wim Taymans 903ba19593 video-orc: make RGB pack/unpack faster
Avoid all the merging and splitting and use a pair of shifts and or
2014-12-10 16:43:34 +01:00

2050 lines
31 KiB
Text

# video_orc_blend_little: alpha-blend 4x8-bit source pixels over d.
# The blend factor a is the lowest byte of each 32-bit source pixel.
# Per byte lane: d = (s * a + d * (255 - a)) / 255, then the lowest byte of
# the result is forced to 0xff.
#
# The previous code shifted the zero-extended alpha right by 8 (always 0),
# so nothing was ever blended; it also fed the signed product (s - d) * a
# to the unsigned div255w.  Both products below fit an unsigned 16-bit lane.
.function video_orc_blend_little
.flags 1d
.dest 4 d guint8
.source 4 s guint8
.temp 4 t
.temp 2 tw
.temp 1 tb
.temp 4 a
.temp 8 d_wide
.temp 8 s_wide
.temp 8 a_wide
.const 4 a_alpha 0x000000ff

# take the lowest byte of the source pixel and replicate it to all lanes
loadl t, s
convlw tw, t
convwb tb, tw
splatbl a, tb
x4 convubw a_wide, a
# s * a
x4 convubw s_wide, t
x4 mullw s_wide, s_wide, a_wide
# a ^ 0xff equals 255 - a because a is in 0..255
x4 xorw a_wide, a_wide, 255
# d * (255 - a)
loadl t, d
x4 convubw d_wide, t
x4 mullw d_wide, d_wide, a_wide
# the sum is at most 255 * 255, so div255w sees a valid unsigned value
x4 addw d_wide, d_wide, s_wide
x4 div255w d_wide, d_wide
x4 convwb t, d_wide
orl t, t, a_alpha
storel d, t
# video_orc_blend_big: alpha-blend 4x8-bit source pixels over d.
# The blend factor a is the highest byte of each 32-bit source pixel
# (obtained with a logical shift right by 24).
# Per byte lane: d = (s * a + d * (255 - a)) / 255, then the highest byte of
# the result is forced to 0xff.
#
# The previous code shifted the zero-extended alpha right by 8 (always 0),
# so nothing was ever blended; it also fed the signed product (s - d) * a
# to the unsigned div255w.  Both products below fit an unsigned 16-bit lane.
.function video_orc_blend_big
.flags 1d
.dest 4 d guint8
.source 4 s guint8
.temp 4 t
.temp 4 t2
.temp 2 tw
.temp 1 tb
.temp 4 a
.temp 8 d_wide
.temp 8 s_wide
.temp 8 a_wide
.const 4 a_alpha 0xff000000

# take the highest byte of the source pixel and replicate it to all lanes
loadl t, s
shrul t2, t, 24
convlw tw, t2
convwb tb, tw
splatbl a, tb
x4 convubw a_wide, a
# s * a
x4 convubw s_wide, t
x4 mullw s_wide, s_wide, a_wide
# a ^ 0xff equals 255 - a because a is in 0..255
x4 xorw a_wide, a_wide, 255
# d * (255 - a)
loadl t, d
x4 convubw d_wide, t
x4 mullw d_wide, d_wide, a_wide
# the sum is at most 255 * 255, so div255w sees a valid unsigned value
x4 addw d_wide, d_wide, s_wide
x4 div255w d_wide, d_wide
x4 convwb t, d_wide
orl t, t, a_alpha
storel d, t
# video_orc_unpack_I420: emit one 4-byte pixel (0xff, y, u, v) per y sample.
# u and v are read with loadupdb, so each chroma sample feeds two outputs.
.function video_orc_unpack_I420
.dest 4 d guint8
.source 1 y guint8
.source 1 u guint8
.source 1 v guint8
.const 1 c255 255
.temp 1 cb
.temp 1 cr
.temp 2 alpha_luma
.temp 2 chroma

mergebw alpha_luma, c255, y
loadupdb cb, u
loadupdb cr, v
mergebw chroma, cb, cr
mergewl d, alpha_luma, chroma
# video_orc_pack_I420: split two 4-byte pixels into planar samples.
# y receives byte 1 of both pixels; u and v receive bytes 2 and 3 of the
# first pixel only (the second pixel's chroma is dropped).
# The unused temporaries t1/t2 of the previous version were removed.
.function video_orc_pack_I420
.dest 2 y guint8
.dest 1 u guint8
.dest 1 v guint8
.source 8 ayuv guint8
.temp 4 ay
.temp 4 uv
.temp 2 uu
.temp 2 vv

# uv = upper 16 bits of each pixel, ay = lower 16 bits
x2 splitlw uv, ay, ayuv
x2 select1wb y, ay
# vv = upper bytes (v of both pixels), uu = lower bytes (u of both pixels)
x2 splitwb vv, uu, uv
select0wb u, uu
select0wb v, vv
# video_orc_pack_Y: keep only the second byte of every 4-byte pixel.
.function video_orc_pack_Y
.dest 1 y guint8
.source 4 ayuv guint8
.temp 2 low_pair

# lower 16 bits of the pixel, then the upper byte of that pair
select0lw low_pair, ayuv
select1wb y, low_pair
# video_orc_unpack_YUY2: expand 4 input bytes (y0 u y1 v) into two 4-byte
# pixels (0xff y0 u v) (0xff y1 u v); both pixels share the chroma pair.
.function video_orc_unpack_YUY2
.dest 8 ayuv guint8
.source 4 yuy2 guint8
.const 2 c255 0xff
.temp 2 luma
.temp 2 chroma
.temp 4 alpha_luma
.temp 4 chroma2

# odd bytes -> chroma, even bytes -> luma
x2 splitwb chroma, luma, yuy2
mergewl chroma2, chroma, chroma
x2 mergebw alpha_luma, c255, luma
x2 mergewl ayuv, alpha_luma, chroma2
# video_orc_pack_YUY2: pack two 4-byte pixels into 4 bytes (y0 u y1 v).
# The chroma pair is taken from the first pixel only.
.function video_orc_pack_YUY2
.dest 4 yuy2 guint8
.source 8 ayuv guint8
.temp 4 alpha_luma
.temp 4 chroma2
.temp 2 luma
.temp 2 chroma

x2 splitlw chroma2, alpha_luma, ayuv
x2 select1wb luma, alpha_luma
select0lw chroma, chroma2
x2 mergebw yuy2, luma, chroma
# video_orc_pack_UYVY: pack two 4-byte pixels into 4 bytes (u y0 v y1).
# The chroma pair is taken from the first pixel only.
.function video_orc_pack_UYVY
.dest 4 yuy2 guint8
.source 8 ayuv guint8
.temp 4 alpha_luma
.temp 4 chroma2
.temp 2 luma
.temp 2 chroma

x2 splitlw chroma2, alpha_luma, ayuv
x2 select1wb luma, alpha_luma
select0lw chroma, chroma2
x2 mergebw yuy2, chroma, luma
# video_orc_unpack_UYVY: expand 4 input bytes (u y0 v y1) into two 4-byte
# pixels (0xff y0 u v) (0xff y1 u v); both pixels share the chroma pair.
.function video_orc_unpack_UYVY
.dest 8 ayuv guint8
.source 4 uyvy guint8
.const 2 c255 0xff
.temp 2 luma
.temp 2 chroma
.temp 4 alpha_luma
.temp 4 chroma2

# odd bytes -> luma, even bytes -> chroma
x2 splitwb luma, chroma, uyvy
mergewl chroma2, chroma, chroma
x2 mergebw alpha_luma, c255, luma
x2 mergewl ayuv, alpha_luma, chroma2
# video_orc_unpack_YVYU: expand 4 input bytes (y0 v y1 u) into two 4-byte
# pixels (0xff y0 u v) (0xff y1 u v).  The chroma pair is byte-swapped
# before use so that u precedes v in the output.
.function video_orc_unpack_YVYU
.dest 8 ayuv guint8
.source 4 uyvy guint8
.const 2 c255 0xff
.temp 2 luma
.temp 2 chroma
.temp 4 alpha_luma
.temp 4 chroma2

x2 splitwb chroma, luma, uyvy
x2 mergebw alpha_luma, c255, luma
swapw chroma, chroma
mergewl chroma2, chroma, chroma
x2 mergewl ayuv, alpha_luma, chroma2
# video_orc_pack_YVYU: pack two 4-byte pixels into 4 bytes (y0 v y1 u).
# The chroma pair comes from the first pixel and is byte-swapped.
.function video_orc_pack_YVYU
.dest 4 yuy2 guint8
.source 8 ayuv guint8
.temp 4 alpha_luma
.temp 4 chroma2
.temp 2 luma
.temp 2 chroma

x2 splitlw chroma2, alpha_luma, ayuv
x2 select1wb luma, alpha_luma
select0lw chroma, chroma2
swapw chroma, chroma
x2 mergebw yuy2, luma, chroma
# video_orc_unpack_YUV9: per iteration, turn two y samples into two 4-byte
# pixels (0xff y u v).  u and v are read with loadupdb, so one chroma sample
# serves two iterations (four output pixels).
.function video_orc_unpack_YUV9
.dest 8 d guint8
.source 2 y guint8
.source 1 u guint8
.source 1 v guint8
.const 1 c255 255
.temp 1 cb
.temp 1 cr
.temp 2 chroma
.temp 4 chroma2
.temp 4 alpha_luma

x2 mergebw alpha_luma, c255, y
loadupdb cb, u
loadupdb cr, v
mergebw chroma, cb, cr
mergewl chroma2, chroma, chroma
x2 mergewl d, alpha_luma, chroma2
# video_orc_unpack_Y42B: combine two y samples with one u and one v sample
# into two 4-byte pixels (0xff y0 u v) (0xff y1 u v).
# The unused temporary "ay" of the previous version was removed.
.function video_orc_unpack_Y42B
.dest 8 ayuv guint8
.source 2 yy guint8
.source 1 u guint8
.source 1 v guint8
.const 1 c255 255
.temp 2 uv
.temp 4 uvuv
.temp 4 ayay

mergebw uv, u, v
x2 mergebw ayay, c255, yy
# duplicate the chroma pair so both pixels get the same u/v
mergewl uvuv, uv, uv
x2 mergewl ayuv, ayay, uvuv
# video_orc_pack_Y42B: split two 4-byte pixels into two y samples plus the
# u and v bytes of the first pixel.
.function video_orc_pack_Y42B
.dest 2 y guint8
.dest 1 u guint8
.dest 1 v guint8
.source 8 ayuv guint8
.temp 4 alpha_luma
.temp 4 chroma2
.temp 2 chroma

x2 splitlw chroma2, alpha_luma, ayuv
x2 select1wb y, alpha_luma
select0lw chroma, chroma2
splitwb v, u, chroma
# video_orc_unpack_Y444: interleave one y, u and v sample into a 4-byte
# pixel (0xff y u v).
.function video_orc_unpack_Y444
.dest 4 ayuv guint8
.source 1 y guint8
.source 1 u guint8
.source 1 v guint8
.const 1 c255 255
.temp 2 alpha_luma
.temp 2 chroma

mergebw alpha_luma, c255, y
mergebw chroma, u, v
mergewl ayuv, alpha_luma, chroma
# video_orc_pack_Y444: split a 4-byte pixel into y (byte 1), u (byte 2) and
# v (byte 3); byte 0 is discarded.
.function video_orc_pack_Y444
.dest 1 y guint8
.dest 1 u guint8
.dest 1 v guint8
.source 4 ayuv guint8
.temp 2 alpha_luma
.temp 2 chroma

splitlw chroma, alpha_luma, ayuv
select1wb y, alpha_luma
splitwb v, u, chroma
# video_orc_unpack_GRAY8: expand one y sample into a 4-byte pixel
# (0xff, y, 0x80, 0x80).
.function video_orc_unpack_GRAY8
.dest 4 ayuv guint8
.source 1 y guint8
.const 2 c0x8080 0x8080
.const 1 c255 255
.temp 2 alpha_luma

mergebw alpha_luma, c255, y
mergewl ayuv, alpha_luma, c0x8080
# video_orc_pack_GRAY8: keep only the second byte of every 4-byte pixel.
.function video_orc_pack_GRAY8
.dest 1 y guint8
.source 4 ayuv guint8
.temp 2 low_pair

# lower 16 bits of the pixel, then the upper byte of that pair
select0lw low_pair, ayuv
select1wb y, low_pair
# video_orc_unpack_BGRA: reverse the byte order of every 4-byte pixel
# (b g r a -> a r g b).
.function video_orc_unpack_BGRA
.dest 4 argb guint8
.source 4 bgra guint8
swapl argb, bgra
# video_orc_pack_BGRA: reverse the byte order of every 4-byte pixel
# (a r g b -> b g r a).
.function video_orc_pack_BGRA
.dest 4 bgra guint8
.source 4 argb guint8
swapl bgra, argb
# video_orc_pack_RGBA: rotate each 32-bit pixel right by 8 bits using a
# pair of shifts and an or.  With little-endian loads this moves the first
# byte in memory to the last position (a r g b -> r g b a).
.function video_orc_pack_RGBA
.dest 4 rgba guint8
.source 4 argb guint8
.temp 4 px
.temp 4 hi
.temp 4 lo

loadl px, argb
shll hi, px, 24
shrul lo, px, 8
orl rgba, hi, lo
# video_orc_unpack_RGBA: rotate each 32-bit pixel left by 8 bits using a
# pair of shifts and an or.  With little-endian loads this moves the last
# byte in memory to the first position (r g b a -> a r g b).
.function video_orc_unpack_RGBA
.dest 4 argb guint8
.source 4 rgba guint8
.temp 4 px
.temp 4 hi
.temp 4 lo

loadl px, rgba
shrul lo, px, 24
shll hi, px, 8
orl argb, lo, hi
# video_orc_unpack_ABGR: byte-reverse each 32-bit pixel, then rotate it left
# by 8 bits.  With little-endian loads: a b g r -> a r g b.
.function video_orc_unpack_ABGR
.dest 4 argb guint8
.source 4 abgr guint8
.temp 4 rev
.temp 4 hi
.temp 4 lo

swapl rev, abgr
shrul lo, rev, 24
shll hi, rev, 8
orl argb, lo, hi
# video_orc_pack_ABGR: byte-reverse each 32-bit pixel, then rotate it left
# by 8 bits.  With little-endian loads: a r g b -> a b g r.
.function video_orc_pack_ABGR
.dest 4 abgr guint8
.source 4 argb guint8
.temp 4 rev
.temp 4 hi
.temp 4 lo

swapl rev, argb
shrul lo, rev, 24
shll hi, rev, 8
orl abgr, lo, hi
# video_orc_unpack_NV12: combine two y samples with one interleaved (u, v)
# pair into two 4-byte pixels (0xff y0 u v) (0xff y1 u v).
.function video_orc_unpack_NV12
.dest 8 d guint8
.source 2 y guint8
.source 2 uv guint8
.const 1 c255 255
.temp 4 alpha_luma
.temp 4 chroma2

x2 mergebw alpha_luma, c255, y
mergewl chroma2, uv, uv
x2 mergewl d, alpha_luma, chroma2
# video_orc_pack_NV12: split two 4-byte pixels into two y samples and the
# (u, v) byte pair of the first pixel.
.function video_orc_pack_NV12
.dest 2 y guint8
.dest 2 uv guint8
.source 8 ayuv guint8
.temp 4 alpha_luma
.temp 4 chroma2

x2 splitlw chroma2, alpha_luma, ayuv
select0lw uv, chroma2
x2 select1wb y, alpha_luma
# video_orc_unpack_NV21: combine two y samples with one interleaved (v, u)
# pair into two 4-byte pixels (0xff y0 u v) (0xff y1 u v).  The chroma pair
# is byte-swapped first.
.function video_orc_unpack_NV21
.dest 8 d guint8
.source 2 y guint8
.source 2 vu guint8
.const 1 c255 255
.temp 2 chroma
.temp 4 chroma2
.temp 4 alpha_luma

x2 mergebw alpha_luma, c255, y
swapw chroma, vu
mergewl chroma2, chroma, chroma
x2 mergewl d, alpha_luma, chroma2
# video_orc_pack_NV21: split two 4-byte pixels into two y samples and the
# byte-swapped chroma pair (v, u) of the first pixel.
.function video_orc_pack_NV21
.dest 2 y guint8
.dest 2 vu guint8
.source 8 ayuv guint8
.temp 4 alpha_luma
.temp 4 chroma2
.temp 2 chroma

x2 splitlw chroma2, alpha_luma, ayuv
select0lw chroma, chroma2
swapw vu, chroma
x2 select1wb y, alpha_luma
# video_orc_unpack_NV24: combine one y sample with one interleaved (u, v)
# pair into a 4-byte pixel (0xff y u v).
.function video_orc_unpack_NV24
.dest 4 d guint8
.source 1 y guint8
.source 2 uv guint8
.const 1 c255 255
.temp 2 alpha_luma

mergebw alpha_luma, c255, y
mergewl d, alpha_luma, uv
# video_orc_pack_NV24: split a 4-byte pixel into y (byte 1) and the
# interleaved pair made of bytes 2 and 3.
.function video_orc_pack_NV24
.dest 1 y guint8
.dest 2 uv guint8
.source 4 ayuv guint8
.temp 2 alpha_luma

splitlw uv, alpha_luma, ayuv
select1wb y, alpha_luma
# video_orc_unpack_A420: emit one 4-byte pixel (a, y, u, v) per y sample.
# u and v are read with loadupdb, so each chroma sample feeds two outputs.
.function video_orc_unpack_A420
.dest 4 d guint8
.source 1 y guint8
.source 1 u guint8
.source 1 v guint8
.source 1 a guint8
.temp 1 cb
.temp 1 cr
.temp 2 alpha_luma
.temp 2 chroma

mergebw alpha_luma, a, y
loadupdb cb, u
loadupdb cr, v
mergebw chroma, cb, cr
mergewl d, alpha_luma, chroma
# video_orc_pack_A420: split two 4-byte pixels into planar samples.
# y and a receive bytes 1 and 0 of both pixels; u and v receive bytes 2 and
# 3 of the first pixel only.
.function video_orc_pack_A420
.dest 2 y guint8
.dest 1 u guint8
.dest 1 v guint8
.dest 2 a guint8
.source 8 ayuv guint8
.temp 4 alpha_luma
.temp 4 chroma2
.temp 2 u_pair
.temp 2 v_pair

x2 splitlw chroma2, alpha_luma, ayuv
x2 select0wb a, alpha_luma
x2 select1wb y, alpha_luma
x2 splitwb v_pair, u_pair, chroma2
select0wb u, u_pair
select0wb v, v_pair
# video_orc_pack_AY: extract byte 0 (to a) and byte 1 (to y) of every
# 4-byte pixel.
.function video_orc_pack_AY
.dest 1 y guint8
.dest 1 a guint8
.source 4 ayuv guint8
.temp 2 low_pair

select0lw low_pair, ayuv
select0wb a, low_pair
select1wb y, low_pair
# video_orc_resample_bilinear_u32: horizontally resample 4-byte pixels with
# ORC's ldreslinl (linear-interpolating resample load).
# NOTE(review): per the ORC opcode documentation p1 is the start position
# and p2 the per-output increment, both in 16.16 fixed point - confirm
# against the caller.
.function video_orc_resample_bilinear_u32
.dest 4 d1 guint8
.source 4 s1 guint8
.param 4 p1
.param 4 p2
ldreslinl d1, s1, p1, p2
# video_orc_merge_linear_u8: d1 = s1 + (((s2 - s1) * p1 + 128) >> 8),
# computed in 16-bit lanes; the final add wraps at 8 bits.
.function video_orc_merge_linear_u8
.dest 1 d1
.source 1 s1
.source 1 s2
.param 1 p1
.temp 1 first
.temp 1 step
.temp 2 first_w
.temp 2 delta

loadb first, s1
convubw delta, s2
convubw first_w, s1
subw delta, delta, first_w
mullw delta, delta, p1
addw delta, delta, 128
# keep the upper byte of the 16-bit product
convhwb step, delta
addb d1, step, first
# video_orc_memcpy_2d: byte-wise copy of a 2-D region from s1 to d1.
.function video_orc_memcpy_2d
.flags 2d
.dest 1 d1 guint8
.source 1 s1 guint8
copyb d1, s1
# video_orc_convert_u16_to_u8: keep the upper byte of every 16-bit sample.
.function video_orc_convert_u16_to_u8
.source 2 s guint16
.dest 1 d guint8
convhwb d, s
# video_orc_convert_u8_to_u16: widen every byte to 16 bits by placing it in
# both the lower and the upper byte (0xab -> 0xabab).
.function video_orc_convert_u8_to_u16
.source 1 s guint8
.dest 2 d guint16
mergebw d, s, s
# video_orc_splat_u32: fill d1 with the 32-bit value p1.
.function video_orc_splat_u32
.dest 4 d1 guint8
.param 4 p1
storel d1, p1
# video_orc_splat_u64: fill d1 with the 32-bit value p1 expanded to 64 bits
# by duplicating each of its four bytes (0xab -> 0xabab per component).
#
# The parameter is first loaded into a 4-byte temporary, as the
# dither_*_mask kernels in this file do before their x4 operations.
# NOTE(review): when a parameter is used directly as the operand of an x4
# byte opcode, ORC appears to materialise it with a per-lane parameter load,
# which would replicate only its low byte - confirm against the ORC
# compiler.  The explicit load is correct under either interpretation.
.function video_orc_splat_u64
.dest 8 d1 guint8
.param 4 p1
.temp 4 px

loadpl px, p1
x4 mergebw d1, px, px
# video_orc_convert_I420_UYVY: build two output rows (u y0 v y1) from two
# luma rows that share one u and one v sample.
.function video_orc_convert_I420_UYVY
.dest 4 d1 guint8
.dest 4 d2 guint8
.source 2 y1 guint8
.source 2 y2 guint8
.source 1 u guint8
.source 1 v guint8
.temp 2 chroma

mergebw chroma, u, v
x2 mergebw d2, chroma, y2
x2 mergebw d1, chroma, y1
# video_orc_convert_I420_YUY2: build two output rows (y0 u y1 v) from two
# luma rows that share one u and one v sample.
.function video_orc_convert_I420_YUY2
.dest 4 d1 guint8
.dest 4 d2 guint8
.source 2 y1 guint8
.source 2 y2 guint8
.source 1 u guint8
.source 1 v guint8
.temp 2 chroma

mergebw chroma, u, v
x2 mergebw d2, y2, chroma
x2 mergebw d1, y1, chroma
# video_orc_convert_I420_AYUV: build two rows of 4-byte pixels (0xff y u v)
# from two luma rows.  u and v are read with loadupdb, so each chroma sample
# feeds two consecutive pixels, and both rows use the same chroma.
.function video_orc_convert_I420_AYUV
.dest 4 d1 guint8
.dest 4 d2 guint8
.source 1 y1 guint8
.source 1 y2 guint8
.source 1 u guint8
.source 1 v guint8
.const 1 c255 255
.temp 1 cb
.temp 1 cr
.temp 2 chroma
.temp 2 alpha_luma1
.temp 2 alpha_luma2

loadupdb cb, u
loadupdb cr, v
mergebw chroma, cb, cr
mergebw alpha_luma1, c255, y1
mergebw alpha_luma2, c255, y2
mergewl d1, alpha_luma1, chroma
mergewl d2, alpha_luma2, chroma
# video_orc_convert_YUY2_I420: from two rows of (y0 u y1 v) data, write the
# luma of each row to y1/y2 and the rounded average (avgub) of the two rows'
# chroma to u and v.
.function video_orc_convert_YUY2_I420
.dest 2 y1 guint8
.dest 2 y2 guint8
.dest 1 u guint8
.dest 1 v guint8
.source 4 yuv1 guint8
.source 4 yuv2 guint8
.temp 2 chroma_top
.temp 2 chroma_bot
.temp 2 chroma_avg
.temp 2 luma

x2 splitwb chroma_top, luma, yuv1
storew y1, luma
x2 splitwb chroma_bot, luma, yuv2
storew y2, luma
x2 avgub chroma_avg, chroma_top, chroma_bot
splitwb v, u, chroma_avg
# video_orc_convert_UYVY_YUY2: swap the two bytes of every 16-bit pair in a
# 2-D region (u y v y -> y u y v).
.function video_orc_convert_UYVY_YUY2
.flags 2d
.dest 4 yuy2 guint8
.source 4 uyvy guint8
x2 swapw yuy2, uyvy
# video_orc_planar_chroma_420_422: copy every source byte to both
# destinations d1 and d2 (2-D kernel).
.function video_orc_planar_chroma_420_422
.flags 2d
.dest 1 d1 guint8
.dest 1 d2 guint8
.source 1 s guint8
copyb d1, s
copyb d2, s
# video_orc_planar_chroma_420_444: duplicate every source byte horizontally
# (one byte -> two bytes) and write the pair to both d1 and d2 (2-D kernel).
.function video_orc_planar_chroma_420_444
.flags 2d
.dest 2 d1 guint8
.dest 2 d2 guint8
.source 1 s guint8
.temp 2 doubled

splatbw doubled, s
storew d2, doubled
storew d1, doubled
# video_orc_planar_chroma_422_444: duplicate every source byte horizontally
# (one byte -> two bytes) in a 2-D region.
.function video_orc_planar_chroma_422_444
.flags 2d
.dest 2 d1 guint8
.source 1 s guint8
.temp 2 doubled

splatbw doubled, s
storew d1, doubled
# video_orc_planar_chroma_444_422: replace every pair of adjacent source
# bytes by their rounded average (avgub) in a 2-D region.
.function video_orc_planar_chroma_444_422
.flags 2d
.dest 1 d guint8
.source 2 s guint8
.temp 1 right
.temp 1 left

splitwb right, left, s
avgub d, right, left
# video_orc_planar_chroma_444_420: average a 2x2 block of bytes: first the
# two rows (s1, s2) byte by byte, then the two resulting neighbours.
# Both steps use the rounding avgub.
.function video_orc_planar_chroma_444_420
.flags 2d
.dest 1 d guint8
.source 2 s1 guint8
.source 2 s2 guint8
.temp 2 rows_avg
.temp 1 right
.temp 1 left

x2 avgub rows_avg, s1, s2
splitwb right, left, rows_avg
avgub d, right, left
# video_orc_planar_chroma_422_420: write the rounded average (avgub) of the
# bytes of two source rows s1 and s2 (2-D kernel).
.function video_orc_planar_chroma_422_420
.flags 2d
.dest 1 d guint8
.source 1 s1 guint8
.source 1 s2 guint8
avgub d, s1, s2
# video_orc_convert_YUY2_AYUV: 2-D kernel expanding 4 input bytes
# (y0 u y1 v) into two 4-byte pixels (0xff y0 u v) (0xff y1 u v).
.function video_orc_convert_YUY2_AYUV
.flags 2d
.dest 8 ayuv guint8
.source 4 yuy2 guint8
.const 2 c255 0xff
.temp 2 luma
.temp 2 chroma
.temp 4 alpha_luma
.temp 4 chroma2

x2 splitwb chroma, luma, yuy2
mergewl chroma2, chroma, chroma
x2 mergebw alpha_luma, c255, luma
x2 mergewl ayuv, alpha_luma, chroma2
# video_orc_convert_UYVY_AYUV: 2-D kernel expanding 4 input bytes
# (u y0 v y1) into two 4-byte pixels (0xff y0 u v) (0xff y1 u v).
.function video_orc_convert_UYVY_AYUV
.flags 2d
.dest 8 ayuv guint8
.source 4 uyvy guint8
.const 2 c255 0xff
.temp 2 luma
.temp 2 chroma
.temp 4 alpha_luma
.temp 4 chroma2

x2 splitwb luma, chroma, uyvy
mergewl chroma2, chroma, chroma
x2 mergebw alpha_luma, c255, luma
x2 mergewl ayuv, alpha_luma, chroma2
# video_orc_convert_YUY2_Y42B: 2-D kernel de-interleaving (y0 u y1 v) into
# two y bytes, one u byte and one v byte.
.function video_orc_convert_YUY2_Y42B
.flags 2d
.dest 2 y guint8
.dest 1 u guint8
.dest 1 v guint8
.source 4 yuy2 guint8
.temp 2 chroma

# odd bytes -> chroma, even bytes -> y
x2 splitwb chroma, y, yuy2
splitwb v, u, chroma
# video_orc_convert_UYVY_Y42B: 2-D kernel de-interleaving (u y0 v y1) into
# two y bytes, one u byte and one v byte.
.function video_orc_convert_UYVY_Y42B
.flags 2d
.dest 2 y guint8
.dest 1 u guint8
.dest 1 v guint8
.source 4 uyvy guint8
.temp 2 chroma

# odd bytes -> y, even bytes -> chroma
x2 splitwb y, chroma, uyvy
splitwb v, u, chroma
# video_orc_convert_YUY2_Y444: 2-D kernel de-interleaving (y0 u y1 v) into
# two y bytes and horizontally duplicated u and v bytes (u u) (v v).
.function video_orc_convert_YUY2_Y444
.flags 2d
.dest 2 y guint8
.dest 2 uu guint8
.dest 2 vv guint8
.source 4 yuy2 guint8
.temp 2 chroma
.temp 1 cb
.temp 1 cr

x2 splitwb chroma, y, yuy2
splitwb cr, cb, chroma
splatbw vv, cr
splatbw uu, cb
# video_orc_convert_UYVY_Y444: 2-D kernel de-interleaving (u y0 v y1) into
# two y bytes and horizontally duplicated u and v bytes (u u) (v v).
.function video_orc_convert_UYVY_Y444
.flags 2d
.dest 2 y guint8
.dest 2 uu guint8
.dest 2 vv guint8
.source 4 uyvy guint8
.temp 2 chroma
.temp 1 cb
.temp 1 cr

x2 splitwb y, chroma, uyvy
splitwb cr, cb, chroma
splatbw vv, cr
splatbw uu, cb
# video_orc_convert_UYVY_I420: from two rows of (u y0 v y1) data, write the
# luma of each row to y1/y2 and the rounded average (avgub) of the two rows'
# chroma to u and v.
.function video_orc_convert_UYVY_I420
.dest 2 y1 guint8
.dest 2 y2 guint8
.dest 1 u guint8
.dest 1 v guint8
.source 4 yuv1 guint8
.source 4 yuv2 guint8
.temp 2 chroma_top
.temp 2 chroma_bot
.temp 2 chroma_avg
.temp 2 luma

x2 splitwb luma, chroma_top, yuv1
storew y1, luma
x2 splitwb luma, chroma_bot, yuv2
storew y2, luma
x2 avgub chroma_avg, chroma_top, chroma_bot
splitwb v, u, chroma_avg
# video_orc_convert_AYUV_I420: 2-D kernel reducing a 2x2 block of 4-byte
# pixels (two per row, two rows) to four y bytes, one u and one v byte.
# Chroma is averaged with avgub first between rows, then between columns.
.function video_orc_convert_AYUV_I420
.flags 2d
.dest 2 y1 guint8
.dest 2 y2 guint8
.dest 1 u guint8
.dest 1 v guint8
.source 8 ayuv1 guint8
.source 8 ayuv2 guint8
.temp 4 alpha_luma
.temp 4 chroma_top
.temp 4 chroma_bot
.temp 4 chroma_rows
.temp 2 u_pair
.temp 2 v_pair
.temp 1 right
.temp 1 left

x2 splitlw chroma_top, alpha_luma, ayuv1
x2 select1wb y1, alpha_luma
x2 splitlw chroma_bot, alpha_luma, ayuv2
x2 select1wb y2, alpha_luma
# vertical average of the chroma of both rows
x4 avgub chroma_rows, chroma_top, chroma_bot
x2 splitwb v_pair, u_pair, chroma_rows
# horizontal average of the two columns
splitwb right, left, u_pair
avgub u, right, left
splitwb right, left, v_pair
avgub v, right, left
# video_orc_convert_AYUV_YUY2: 2-D kernel packing two 4-byte pixels into
# (y0 u y1 v), where u and v are the avgub of both pixels' chroma.
.function video_orc_convert_AYUV_YUY2
.flags 2d
.dest 4 yuy2 guint8
.source 8 ayuv guint8
.temp 4 alpha_luma
.temp 4 chroma2
.temp 2 chroma_b
.temp 2 chroma_a
.temp 2 chroma_avg
.temp 2 luma

x2 splitlw chroma2, alpha_luma, ayuv
x2 select1wb luma, alpha_luma
splitlw chroma_b, chroma_a, chroma2
x2 avgub chroma_avg, chroma_b, chroma_a
x2 mergebw yuy2, luma, chroma_avg
# video_orc_convert_AYUV_UYVY: 2-D kernel packing two 4-byte pixels into
# (u y0 v y1), where u and v are the avgub of both pixels' chroma.
.function video_orc_convert_AYUV_UYVY
.flags 2d
.dest 4 yuy2 guint8
.source 8 ayuv guint8
.temp 4 alpha_luma
.temp 4 chroma2
.temp 2 chroma_b
.temp 2 chroma_a
.temp 2 chroma_avg
.temp 2 luma

x2 splitlw chroma2, alpha_luma, ayuv
x2 select1wb luma, alpha_luma
splitlw chroma_b, chroma_a, chroma2
x2 avgub chroma_avg, chroma_b, chroma_a
x2 mergebw yuy2, chroma_avg, luma
# video_orc_convert_AYUV_Y42B: 2-D kernel splitting two 4-byte pixels into
# two y bytes plus one u and one v byte, each the avgub of both pixels.
.function video_orc_convert_AYUV_Y42B
.flags 2d
.dest 2 y guint8
.dest 1 u guint8
.dest 1 v guint8
.source 8 ayuv guint8
.temp 4 alpha_luma
.temp 4 chroma2
.temp 2 chroma_b
.temp 2 chroma_a
.temp 2 chroma_avg

x2 splitlw chroma2, alpha_luma, ayuv
x2 select1wb y, alpha_luma
splitlw chroma_b, chroma_a, chroma2
x2 avgub chroma_avg, chroma_b, chroma_a
splitwb v, u, chroma_avg
# video_orc_convert_AYUV_Y444: 2-D kernel splitting a 4-byte pixel into
# y (byte 1), u (byte 2) and v (byte 3); byte 0 is discarded.
.function video_orc_convert_AYUV_Y444
.flags 2d
.dest 1 y guint8
.dest 1 u guint8
.dest 1 v guint8
.source 4 ayuv guint8
.temp 2 alpha_luma
.temp 2 chroma

splitlw chroma, alpha_luma, ayuv
select1wb y, alpha_luma
splitwb v, u, chroma
# video_orc_convert_Y42B_YUY2: 2-D kernel interleaving two y bytes with one
# u and one v byte into (y0 u y1 v).
.function video_orc_convert_Y42B_YUY2
.flags 2d
.dest 4 yuy2 guint8
.source 2 y guint8
.source 1 u guint8
.source 1 v guint8
.temp 2 chroma

mergebw chroma, u, v
x2 mergebw yuy2, y, chroma
# video_orc_convert_Y42B_UYVY: 2-D kernel interleaving two y bytes with one
# u and one v byte into (u y0 v y1).
.function video_orc_convert_Y42B_UYVY
.flags 2d
.dest 4 uyvy guint8
.source 2 y guint8
.source 1 u guint8
.source 1 v guint8
.temp 2 chroma

mergebw chroma, u, v
x2 mergebw uyvy, chroma, y
# video_orc_convert_Y42B_AYUV: 2-D kernel combining two y bytes with one u
# and one v byte into two 4-byte pixels (0xff y0 u v) (0xff y1 u v).
# The unused temporary "ay" of the previous version was removed.
.function video_orc_convert_Y42B_AYUV
.flags 2d
.dest 8 ayuv guint8
.source 2 yy guint8
.source 1 u guint8
.source 1 v guint8
.const 1 c255 255
.temp 2 uv
.temp 4 uvuv
.temp 4 ayay

mergebw uv, u, v
x2 mergebw ayay, c255, yy
# duplicate the chroma pair so both pixels get the same u/v
mergewl uvuv, uv, uv
x2 mergewl ayuv, ayay, uvuv
# video_orc_convert_Y444_YUY2: 2-D kernel packing two y, two u and two v
# bytes into (y0 u y1 v); u and v are the avgub of their two inputs.
.function video_orc_convert_Y444_YUY2
.flags 2d
.dest 4 yuy2 guint8
.source 2 y guint8
.source 2 u guint8
.source 2 v guint8
.temp 4 chroma2
.temp 2 chroma_b
.temp 2 chroma_a
.temp 2 chroma_avg

# interleave to (u0 v0 u1 v1), then average the two (u, v) pairs
x2 mergebw chroma2, u, v
splitlw chroma_b, chroma_a, chroma2
x2 avgub chroma_avg, chroma_b, chroma_a
x2 mergebw yuy2, y, chroma_avg
# video_orc_convert_Y444_UYVY: 2-D kernel packing two y, two u and two v
# bytes into (u y0 v y1); u and v are the avgub of their two inputs.
.function video_orc_convert_Y444_UYVY
.flags 2d
.dest 4 uyvy guint8
.source 2 y guint8
.source 2 u guint8
.source 2 v guint8
.temp 4 chroma2
.temp 2 chroma_b
.temp 2 chroma_a
.temp 2 chroma_avg

# interleave to (u0 v0 u1 v1), then average the two (u, v) pairs
x2 mergebw chroma2, u, v
splitlw chroma_b, chroma_a, chroma2
x2 avgub chroma_avg, chroma_b, chroma_a
x2 mergebw uyvy, chroma_avg, y
# video_orc_convert_Y444_AYUV: 2-D kernel interleaving one y, u and v byte
# into a 4-byte pixel (0xff y u v).
.function video_orc_convert_Y444_AYUV
.flags 2d
.dest 4 ayuv guint8
.source 1 yy guint8
.source 1 u guint8
.source 1 v guint8
.const 1 c255 255
.temp 2 alpha_luma
.temp 2 chroma

mergebw alpha_luma, c255, yy
mergebw chroma, u, v
mergewl ayuv, alpha_luma, chroma
# video_orc_convert_AYUV_ARGB: 2-D kernel converting 4-byte pixels
# (a y u v) to (a r g b) with five 16-bit coefficients p1..p5.
# All four input bytes are biased by -128; y, u and v are duplicated into
# both bytes of a 16-bit word (splatbw) and scaled with mulhsw:
#   Y = mulhsw(y, p1)
#   r = Y + mulhsw(v, p2)
#   b = Y + mulhsw(u, p3)
#   g = Y + mulhsw(u, p4) + mulhsw(v, p5)
# r, g and b are saturated to signed bytes and every output byte is biased
# back by +128, so the a byte passes through unchanged.
# NOTE: wy is reused as scratch for the p5 product after its last use as Y,
# and wr/wb are reused for the merged byte pairs; statement order matters.
.function video_orc_convert_AYUV_ARGB
.flags 2d
.dest 4 argb guint8
.source 4 ayuv guint8
.param 2 p1
.param 2 p2
.param 2 p3
.param 2 p4
.param 2 p5
.temp 1 a
.temp 1 y
.temp 1 u
.temp 1 v
.temp 2 wy
.temp 2 wu
.temp 2 wv
.temp 2 wr
.temp 2 wg
.temp 2 wb
.temp 1 r
.temp 1 g
.temp 1 b
.temp 4 x
.const 1 c128 128
x4 subb x, ayuv, c128
splitlw wv, wy, x
splitwb y, a, wy
splitwb v, u, wv
splatbw wy, y
splatbw wu, u
splatbw wv, v
mulhsw wy, wy, p1
mulhsw wr, wv, p2
addw wr, wy, wr
convssswb r, wr
# wr now holds the byte pair (a, r)
mergebw wr, a, r
mulhsw wb, wu, p3
addw wb, wy, wb
convssswb b, wb
mulhsw wg, wu, p4
addw wg, wy, wg
# wy is free from here on and reused for the v * p5 term
mulhsw wy, wv, p5
addw wg, wg, wy
convssswb g, wg
# wb now holds the byte pair (g, b)
mergebw wb, g, b
mergewl x, wr, wb
x4 addb argb, x, c128
# video_orc_convert_AYUV_BGRA: 2-D kernel converting 4-byte pixels
# (a y u v) to (b g r a) with five 16-bit coefficients p1..p5.
# All four input bytes are biased by -128; y, u and v are duplicated into
# both bytes of a 16-bit word (splatbw) and scaled with mulhsw:
#   Y = mulhsw(y, p1)
#   r = Y + mulhsw(v, p2)
#   b = Y + mulhsw(u, p3)
#   g = Y + mulhsw(u, p4) + mulhsw(v, p5)
# r, g and b are saturated to signed bytes and every output byte is biased
# back by +128, so the a byte passes through unchanged.
# NOTE: wy is reused as scratch for the p5 product after its last use as Y,
# and wr/wb are reused for the merged byte pairs; statement order matters.
.function video_orc_convert_AYUV_BGRA
.flags 2d
.dest 4 bgra guint8
.source 4 ayuv guint8
.param 2 p1
.param 2 p2
.param 2 p3
.param 2 p4
.param 2 p5
.temp 1 a
.temp 1 y
.temp 1 u
.temp 1 v
.temp 2 wy
.temp 2 wu
.temp 2 wv
.temp 2 wr
.temp 2 wg
.temp 2 wb
.temp 1 r
.temp 1 g
.temp 1 b
.temp 4 x
.const 1 c128 128
x4 subb x, ayuv, c128
splitlw wv, wy, x
splitwb y, a, wy
splitwb v, u, wv
splatbw wy, y
splatbw wu, u
splatbw wv, v
mulhsw wy, wy, p1
mulhsw wr, wv, p2
addw wr, wy, wr
convssswb r, wr
# wr now holds the byte pair (r, a)
mergebw wr, r, a
mulhsw wb, wu, p3
addw wb, wy, wb
convssswb b, wb
mulhsw wg, wu, p4
addw wg, wy, wg
# wy is free from here on and reused for the v * p5 term
mulhsw wy, wv, p5
addw wg, wg, wy
convssswb g, wg
# wb now holds the byte pair (b, g)
mergebw wb, b, g
mergewl x, wb, wr
x4 addb bgra, x, c128
# video_orc_convert_AYUV_ABGR: 2-D kernel converting 4-byte pixels
# (a y u v) to (a b g r) with five 16-bit coefficients p1..p5.
# (The destination variable is named "argb", but the bytes are merged in
# a, b, g, r order.)
# All four input bytes are biased by -128; y, u and v are duplicated into
# both bytes of a 16-bit word (splatbw) and scaled with mulhsw:
#   Y = mulhsw(y, p1)
#   r = Y + mulhsw(v, p2)
#   b = Y + mulhsw(u, p3)
#   g = Y + mulhsw(u, p4) + mulhsw(v, p5)
# r, g and b are saturated to signed bytes and every output byte is biased
# back by +128, so the a byte passes through unchanged.
# NOTE: wy is reused as scratch for the p5 product after its last use as Y,
# and wr/wb are reused for the merged byte pairs; statement order matters.
.function video_orc_convert_AYUV_ABGR
.flags 2d
.dest 4 argb guint8
.source 4 ayuv guint8
.param 2 p1
.param 2 p2
.param 2 p3
.param 2 p4
.param 2 p5
.temp 1 a
.temp 1 y
.temp 1 u
.temp 1 v
.temp 2 wy
.temp 2 wu
.temp 2 wv
.temp 2 wr
.temp 2 wg
.temp 2 wb
.temp 1 r
.temp 1 g
.temp 1 b
.temp 4 x
.const 1 c128 128
x4 subb x, ayuv, c128
splitlw wv, wy, x
splitwb y, a, wy
splitwb v, u, wv
splatbw wy, y
splatbw wu, u
splatbw wv, v
mulhsw wy, wy, p1
mulhsw wr, wv, p2
addw wr, wy, wr
convssswb r, wr
mulhsw wb, wu, p3
addw wb, wy, wb
convssswb b, wb
# wb now holds the byte pair (a, b)
mergebw wb, a, b
mulhsw wg, wu, p4
addw wg, wy, wg
# wy is free from here on and reused for the v * p5 term
mulhsw wy, wv, p5
addw wg, wg, wy
convssswb g, wg
# wr now holds the byte pair (g, r)
mergebw wr, g, r
mergewl x, wb, wr
x4 addb argb, x, c128
# video_orc_convert_AYUV_RGBA: 2-D kernel converting 4-byte pixels
# (a y u v) to (r g b a) with five 16-bit coefficients p1..p5.
# (The destination variable is named "argb", but the bytes are merged in
# r, g, b, a order.)
# All four input bytes are biased by -128; y, u and v are duplicated into
# both bytes of a 16-bit word (splatbw) and scaled with mulhsw:
#   Y = mulhsw(y, p1)
#   r = Y + mulhsw(v, p2)
#   b = Y + mulhsw(u, p3)
#   g = Y + mulhsw(u, p4) + mulhsw(v, p5)
# r, g and b are saturated to signed bytes and every output byte is biased
# back by +128, so the a byte passes through unchanged.
# NOTE: wy is reused as scratch for the p5 product after its last use as Y,
# and wr/wb are reused for the merged byte pairs; statement order matters.
.function video_orc_convert_AYUV_RGBA
.flags 2d
.dest 4 argb guint8
.source 4 ayuv guint8
.param 2 p1
.param 2 p2
.param 2 p3
.param 2 p4
.param 2 p5
.temp 1 a
.temp 1 y
.temp 1 u
.temp 1 v
.temp 2 wy
.temp 2 wu
.temp 2 wv
.temp 2 wr
.temp 2 wg
.temp 2 wb
.temp 1 r
.temp 1 g
.temp 1 b
.temp 4 x
.const 1 c128 128
x4 subb x, ayuv, c128
splitlw wv, wy, x
splitwb y, a, wy
splitwb v, u, wv
splatbw wy, y
splatbw wu, u
splatbw wv, v
mulhsw wy, wy, p1
mulhsw wr, wv, p2
addw wr, wy, wr
convssswb r, wr
mulhsw wb, wu, p3
addw wb, wy, wb
convssswb b, wb
# wb now holds the byte pair (b, a)
mergebw wb, b, a
mulhsw wg, wu, p4
addw wg, wy, wg
# wy is free from here on and reused for the v * p5 term
mulhsw wy, wv, p5
addw wg, wg, wy
convssswb g, wg
# wr now holds the byte pair (r, g)
mergebw wr, r, g
mergewl x, wr, wb
x4 addb argb, x, c128
# video_orc_convert_I420_BGRA: convert planar y/u/v samples to 4-byte
# pixels (b g r 0xff) with five 16-bit coefficients p1..p5.
# (The destination variable is named "argb", but the bytes are merged in
# b, g, r, a order.)
# u and v are read with loadupdb, so each chroma sample feeds two outputs.
# Each sample is biased by -128, duplicated into both bytes of a 16-bit word
# (splatbw) and scaled with mulhsw:
#   Y = mulhsw(y, p1)
#   r = Y + mulhsw(v, p2)
#   b = Y + mulhsw(u, p3)
#   g = Y + mulhsw(u, p4) + mulhsw(v, p5)
# r, g and b are saturated to signed bytes and every output byte is biased
# back by +128.
# NOTE: the 1-byte temp r is used as scratch while loading y, u and v, and
# wy is reused for the p5 product; statement order matters.
.function video_orc_convert_I420_BGRA
.dest 4 argb guint8
.source 1 y guint8
.source 1 u guint8
.source 1 v guint8
.param 2 p1
.param 2 p2
.param 2 p3
.param 2 p4
.param 2 p5
.temp 2 wy
.temp 2 wu
.temp 2 wv
.temp 2 wr
.temp 2 wg
.temp 2 wb
.temp 1 r
.temp 1 g
.temp 1 b
.temp 4 x
.const 1 c128 128
subb r, y, c128
splatbw wy, r
loadupdb r, u
subb r, r, c128
splatbw wu, r
loadupdb r, v
subb r, r, c128
splatbw wv, r
mulhsw wy, wy, p1
mulhsw wr, wv, p2
addw wr, wy, wr
convssswb r, wr
# alpha is the constant 127, which becomes 255 after the final +128 bias
mergebw wr, r, 127
mulhsw wb, wu, p3
addw wb, wy, wb
convssswb b, wb
mulhsw wg, wu, p4
addw wg, wy, wg
# wy is free from here on and reused for the v * p5 term
mulhsw wy, wv, p5
addw wg, wg, wy
convssswb g, wg
mergebw wb, b, g
mergewl x, wb, wr
x4 addb argb, x, c128
# video_orc_matrix8: per-pixel matrix transform of 4x8-bit pixels.
# p1, p2 and p3 are each loaded as four 16-bit lanes and multiplied (mulhsw)
# with input bytes 1, 2 and 3 respectively; the products are accumulated per
# lane, saturated to signed bytes and biased back by +128.
# p4 is declared but never read by this kernel.
# _custom_video_orc_matrix8 is registered as the backup implementation.
# NOTE(review): byte 0 of the input only enters accumulator lane 0 (as the
# byte duplicated into both halves of a 16-bit word) - presumably this is
# the alpha pass-through; confirm against the backup C implementation.
# NOTE: w1, l1, b1, b2 and q1 are reused several times; statement order
# matters (b2 must be selected from w1 before w1 is overwritten).
.function video_orc_matrix8
.backup _custom_video_orc_matrix8
.source 4 argb guint8
.dest 4 ayuv guint8
.longparam 8 p1
.longparam 8 p2
.longparam 8 p3
.longparam 8 p4
.const 1 c128 128
.temp 2 w1
.temp 2 w2
.temp 1 b1
.temp 1 b2
.temp 4 l1
.temp 4 ayuv2
.temp 8 aq
.temp 8 q1
.temp 8 pr1
.temp 8 pr2
.temp 8 pr3
loadpq pr1, p1
loadpq pr2, p2
loadpq pr3, p3
x4 subb l1, argb, c128
# w1 = bytes 0 and 1, w2 = bytes 2 and 3 of the biased pixel
select0lw w1, l1
select1lw w2, l1
select0wb b1, w1
select1wb b2, w1
# seed the accumulator: lane 0 = byte 0 in both halves, other lanes 0
convubw w1, b1
convuwl l1, w1
x4 mergebw aq, l1, l1
# byte 1, replicated to all eight bytes, times pr1
splatbl l1, b2
mergelq q1, l1, l1
x4 mulhsw q1, q1, pr1
x4 addw aq, aq, q1
# byte 2, replicated to all eight bytes, times pr2
select0wb b1, w2
splatbl l1,b1
mergelq q1, l1, l1
x4 mulhsw q1, q1, pr2
x4 addw aq, aq, q1
# byte 3, replicated to all eight bytes, times pr3
select1wb b2, w2
splatbl l1, b2
mergelq q1, l1, l1
x4 mulhsw q1, q1, pr3
x4 addw aq, aq, q1
x4 convssswb ayuv2, aq
x4 addb ayuv, ayuv2, c128
#.function video_orc_resample_h_near_u32
#.source 4 src guint32
#.source 4 idx
#.dest 4 dest guint32
#.temp 4 t
#
#loadidxl t, src, idx
#storel dest, t
# video_orc_resample_h_near_u32_lq: horizontally resample 4-byte pixels
# with ORC's ldresnearl (nearest-neighbour resample load).
# NOTE(review): per the ORC opcode documentation p1 is the start position
# and p2 the per-output increment, both in 16.16 fixed point - confirm
# against the caller.
.function video_orc_resample_h_near_u32_lq
.dest 4 d1 guint32
.source 4 s1 guint32
.param 4 p1
.param 4 p2
ldresnearl d1, s1, p1, p2
# video_orc_resample_h_2tap_4u8_lq: horizontally resample 4-byte pixels
# with ORC's ldreslinl (linear-interpolating resample load).
# NOTE(review): per the ORC opcode documentation p1 is the start position
# and p2 the per-output increment, both in 16.16 fixed point - confirm
# against the caller.
.function video_orc_resample_h_2tap_4u8_lq
.dest 4 d1 guint32
.source 4 s1 guint32
.param 4 p1
.param 4 p2
ldreslinl d1, s1, p1, p2
# video_orc_resample_v_2tap_u8_lq:
#   dest = src1 + (((src2 - src1) * p1 + 128) >> 8)
# computed in 16-bit lanes; the final add wraps at 8 bits.
.function video_orc_resample_v_2tap_u8_lq
.source 1 src1 guint32
.source 1 src2 guint32
.dest 1 dest guint32
.param 2 p1 gint16
.temp 2 first
.temp 2 delta
.temp 1 step

convubw delta, src2
convubw first, src1
subw delta, delta, first
mullw delta, delta, p1
addw delta, delta, 128
# keep the upper byte of the 16-bit product
convhwb step, delta
addb dest, step, src1
# video_orc_resample_v_2tap_u16:
#   dest = sat_u16(src1 + (((src2 - src1) * p1 + 4095) >> 12))
# computed in 32-bit lanes; p1 is widened as an unsigned 16-bit value.
#
# The rounding term used to be 4096, which made the shifted product at
# least 1 even for p1 == 0 or src1 == src2 (the output was src1 + 1).
# It is now 4095, the same constant the other >>12 kernels in this file
# use (resample_v_2tap_u8, resample_scaletaps_u16, ...).
.function video_orc_resample_v_2tap_u16
.source 2 src1 guint64
.source 2 src2 guint64
.dest 2 dest guint64
.param 2 p1 gint16
.temp 4 l1
.temp 4 l2
.temp 4 l3

convuwl l1, src1
convuwl l2, src2
subl l2, l2, l1
convuwl l3, p1
mulll l2, l2, l3
addl l2, l2, 4095
# arithmetic shift: the difference may be negative
shrsl l2, l2, 12
addl l1, l1, l2
convsuslw dest, l1
# video_orc_resample_v_2tap_u8:
#   d1 = sat_u8(s1 + (((s2 - s1) * p1 + 4095) >> 12))
# The product is formed in 32 bits (mulswl) and narrowed back to 16 bits
# before the final add.
# The unused temporaries t and t1 of the previous version were removed.
.function video_orc_resample_v_2tap_u8
.source 1 s1 guint32
.source 1 s2 guint32
.dest 1 d1 guint32
.param 2 p1 gint16
.temp 2 w1
.temp 2 w2
.temp 4 t2

convubw w1, s1
convubw w2, s2
subw w2, w2, w1
mulswl t2, w2, p1
addl t2, t2, 4095
# arithmetic shift: the difference may be negative
shrsl t2, t2, 12
convlw w2, t2
addw w2, w2, w1
convsuswb d1, w2
# video_orc_resample_v_4tap_u8_lq:
#   d1 = sat_u8((s1*p1 + s2*p2 + s3*p3 + s4*p4 + 32) >> 6)
# All arithmetic is done in 16-bit lanes (mullw keeps the low 16 bits).
.function video_orc_resample_v_4tap_u8_lq
.source 1 s1 guint32
.source 1 s2 guint32
.source 1 s3 guint32
.source 1 s4 guint32
.dest 1 d1 guint32
.param 2 p1 gint16
.param 2 p2 gint16
.param 2 p3 gint16
.param 2 p4 gint16
.temp 2 acc
.temp 2 term

convubw acc, s1
mullw acc, acc, p1

convubw term, s2
mullw term, term, p2
addw acc, acc, term

convubw term, s3
mullw term, term, p3
addw acc, acc, term

convubw term, s4
mullw term, term, p4
addw acc, acc, term

addw acc, acc, 32
shrsw acc, acc, 6
convsuswb d1, acc
# video_orc_resample_v_4tap_u8:
#   d1 = sat_u8(sat_s16((s1*p1 + s2*p2 + s3*p3 + s4*p4 + 4095) >> 12))
# Products and the running sum are kept in 32 bits.
.function video_orc_resample_v_4tap_u8
.source 1 s1 guint32
.source 1 s2 guint32
.source 1 s3 guint32
.source 1 s4 guint32
.dest 1 d1 guint32
.param 2 p1 gint16
.param 2 p2 gint16
.param 2 p3 gint16
.param 2 p4 gint16
.temp 2 sample
.temp 2 narrowed
.temp 4 acc
.temp 4 term

convubw sample, s1
mulswl acc, sample, p1

convubw sample, s2
mulswl term, sample, p2
addl acc, acc, term

convubw sample, s3
mulswl term, sample, p3
addl acc, acc, term

convubw sample, s4
mulswl term, sample, p4
addl acc, acc, term

addl acc, acc, 4095
shrsl acc, acc, 12
convsuslw narrowed, acc
convsuswb d1, narrowed
# crashes ORC for now but is potentially faster
#.function video_orc_resample_h_4tap_u8
#.source 1 s1 guint32
#.source 1 s2 guint32
#.source 1 s3 guint32
#.source 1 s4 guint32
#.source 2 t1 gint16
#.source 2 t2 gint16
#.source 2 t3 gint16
#.source 2 t4 gint16
#.dest 1 d1 guint8
#.temp 2 w1
#.temp 2 w2
#.temp 4 l1
#.temp 4 l2
#
#convubw w1, s1
#mulswl l1, w1, t1
#convubw w2, s2
#mulswl l2, w2, t2
#addl l1, l1, l2
#convubw w2, s3
#mulswl l2, w2, t3
#addl l1, l1, l2
#convubw w2, s4
#mulswl l2, w2, t4
#addl l1, l1, l2
#addl l1, l1, 4095
#shrsl l1, l1, 12
#convsuslw w1, l1
#convsuswb d1, w1
# video_orc_resample_h_multaps_u8: d = s * t, with s an unsigned byte and
# t a signed 16-bit value read per element; the result is 32-bit.
.function video_orc_resample_h_multaps_u8
.source 1 s guint32
.source 2 t gint16
.dest 4 d gint32
.temp 2 sample

convubw sample, s
mulswl d, sample, t
# video_orc_resample_h_muladdtaps_u8: 2-D kernel accumulating d += s * t,
# with s an unsigned byte, t a signed 16-bit value and d 32-bit.
.function video_orc_resample_h_muladdtaps_u8
.flags 2d
.source 1 s guint32
.source 2 t gint16
.dest 4 d gint32
.temp 2 sample
.temp 4 product

convubw sample, s
mulswl product, sample, t
addl d, d, product
# video_orc_resample_scaletaps_u8:
#   d = sat_u8(sat_s16((s + 4095) >> 12)) for 32-bit accumulators s.
.function video_orc_resample_scaletaps_u8
.source 4 s gint32
.dest 1 d guint32
.temp 4 scaled
.temp 2 narrowed

addl scaled, s, 4095
shrsl scaled, scaled, 12
convsuslw narrowed, scaled
convsuswb d, narrowed
# video_orc_resample_h_multaps_u8_lq: d = low 16 bits of (s * t), with s an
# unsigned byte and t a 16-bit value read per element.
.function video_orc_resample_h_multaps_u8_lq
.source 1 s guint32
.source 2 t gint16
.dest 2 d gint32
.temp 2 sample

convubw sample, s
mullw d, sample, t
# video_orc_resample_h_muladdtaps_u8_lq: 2-D kernel accumulating
# d += low 16 bits of (s * t) in 16-bit lanes.
.function video_orc_resample_h_muladdtaps_u8_lq
.flags 2d
.source 1 s guint32
.source 2 t gint16
.dest 2 d gint32
.temp 2 product

convubw product, s
mullw product, product, t
addw d, d, product
# video_orc_resample_h_multaps3_u8_lq: d = s1*t1 + s2*t2 + s3*t3, with all
# products and sums kept in 16-bit lanes.
.function video_orc_resample_h_multaps3_u8_lq
.source 1 s1 guint32
.source 1 s2 guint32
.source 1 s3 guint32
.source 2 t1 gint16
.source 2 t2 gint16
.source 2 t3 gint16
.dest 2 d gint32
.temp 2 acc
.temp 2 term

convubw acc, s1
mullw acc, acc, t1

convubw term, s2
mullw term, term, t2
addw acc, acc, term

convubw term, s3
mullw term, term, t3
addw d, acc, term
# video_orc_resample_h_muladdtaps3_u8_lq: d += s1*t1 + s2*t2 + s3*t3, with
# all products and sums kept in 16-bit lanes.
.function video_orc_resample_h_muladdtaps3_u8_lq
.source 1 s1 guint32
.source 1 s2 guint32
.source 1 s3 guint32
.source 2 t1 gint16
.source 2 t2 gint16
.source 2 t3 gint16
.dest 2 d gint32
.temp 2 acc
.temp 2 term

convubw acc, s1
mullw acc, acc, t1

convubw term, s2
mullw term, term, t2
addw acc, acc, term

convubw term, s3
mullw term, term, t3
addw acc, acc, term

addw d, d, acc
# video_orc_resample_h_muladdscaletaps3_u8_lq:
#   d = sat_u8((s1*t1 + s2*t2 + s3*t3 + temp + 32) >> 6)
# with all products and sums kept in 16-bit lanes.
.function video_orc_resample_h_muladdscaletaps3_u8_lq
.source 1 s1 guint32
.source 1 s2 guint32
.source 1 s3 guint32
.source 2 t1 gint16
.source 2 t2 gint16
.source 2 t3 gint16
.source 2 temp gint32
.dest 1 d guint32
.temp 2 acc
.temp 2 term

convubw acc, s1
mullw acc, acc, t1

convubw term, s2
mullw term, term, t2
addw acc, acc, term

convubw term, s3
mullw term, term, t3
addw acc, acc, term

# add the previously accumulated partial sum, round and scale
addw acc, acc, temp
addw acc, acc, 32
shrsw acc, acc, 6
convsuswb d, acc
# video_orc_resample_scaletaps_u8_lq: d = sat_u8((s + 32) >> 6) for 16-bit
# accumulators s.
.function video_orc_resample_scaletaps_u8_lq
.source 2 s gint32
.dest 1 d guint32
.temp 2 scaled

addw scaled, s, 32
shrsw scaled, scaled, 6
convsuswb d, scaled
# video_orc_resample_h_multaps_u16: d = s * t, with s zero-extended and t
# sign-extended from 16 to 32 bits.
.function video_orc_resample_h_multaps_u16
.source 2 s guint64
.source 2 t gint16
.dest 4 d gint32
.temp 4 sample
.temp 4 tap

convswl tap, t
convuwl sample, s
mulll d, sample, tap
# video_orc_resample_h_muladdtaps_u16: 2-D kernel accumulating d += s * t,
# with s zero-extended and t sign-extended from 16 to 32 bits.
.function video_orc_resample_h_muladdtaps_u16
.flags 2d
.source 2 s guint64
.source 2 t gint16
.dest 4 d gint32
.temp 4 sample
.temp 4 tap

convswl tap, t
convuwl sample, s
mulll sample, sample, tap
addl d, d, sample
# video_orc_resample_scaletaps_u16: d = sat_u16((s + 4095) >> 12) for
# 32-bit accumulators s.
.function video_orc_resample_scaletaps_u16
.source 4 s gint32
.dest 2 d guint64
.temp 4 scaled

addl scaled, s, 4095
shrsl scaled, scaled, 12
convsuslw d, scaled
# video_orc_resample_v_multaps_u8: d = s * t, with s an unsigned byte and
# t one signed 16-bit parameter shared by all elements; d is 32-bit.
.function video_orc_resample_v_multaps_u8
.source 1 s guint32
.param 2 t gint16
.dest 4 d gint32
.temp 2 sample

convubw sample, s
mulswl d, sample, t
# video_orc_resample_v_muladdtaps_u8: d += s * t, with s an unsigned byte
# and t one signed 16-bit parameter shared by all elements; d is 32-bit.
.function video_orc_resample_v_muladdtaps_u8
.source 1 s guint32
.param 2 t gint16
.dest 4 d gint32
.temp 2 sample
.temp 4 product

convubw sample, s
mulswl product, sample, t
addl d, d, product
# video_orc_resample_v_multaps_u16: d = s * t in 32 bits, with s
# zero-extended from 16 bits and t a parameter shared by all elements.
.function video_orc_resample_v_multaps_u16
.source 2 s guint64
.param 2 t gint16
.dest 4 d gint32
.temp 4 sample

convuwl sample, s
mulll d, sample, t
# video_orc_resample_v_muladdtaps_u16: d += s * t in 32 bits, with s
# zero-extended and the shared parameter t sign-extended from 16 bits.
.function video_orc_resample_v_muladdtaps_u16
.source 2 s guint64
.param 2 t gint16
.dest 4 d gint32
.temp 4 sample
.temp 4 tap

convswl tap, t
convuwl sample, s
mulll sample, sample, tap
addl d, d, sample
# video_orc_resample_v_multaps_u8_lq: d = low 16 bits of (s * t), with s an
# unsigned byte and t a parameter shared by all elements.
.function video_orc_resample_v_multaps_u8_lq
.source 1 s guint32
.param 2 t gint16
.dest 2 d gint32
.temp 2 sample

convubw sample, s
mullw d, sample, t
# video_orc_resample_v_multaps4_u8_lq: d = s1*t1 + s2*t2 + s3*t3 + s4*t4,
# with all products and sums kept in 16-bit lanes.
.function video_orc_resample_v_multaps4_u8_lq
.source 1 s1 guint32
.source 1 s2 guint32
.source 1 s3 guint32
.source 1 s4 guint32
.param 2 t1 gint16
.param 2 t2 gint16
.param 2 t3 gint16
.param 2 t4 gint16
.dest 2 d gint32
.temp 2 acc
.temp 2 term

convubw acc, s1
mullw acc, acc, t1

convubw term, s2
mullw term, term, t2
addw acc, acc, term

convubw term, s3
mullw term, term, t3
addw acc, acc, term

convubw term, s4
mullw term, term, t4
addw d, acc, term
# video_orc_resample_v_muladdtaps_u8_lq: d += low 16 bits of (s * t), with
# s an unsigned byte and t a parameter shared by all elements.
.function video_orc_resample_v_muladdtaps_u8_lq
.source 1 s guint32
.param 2 t gint16
.dest 2 d gint32
.temp 2 product

convubw product, s
mullw product, product, t
addw d, d, product
# video_orc_resample_v_muladdtaps4_u8_lq:
#   d += s1*t1 + s2*t2 + s3*t3 + s4*t4
# with all products and sums kept in 16-bit lanes.
.function video_orc_resample_v_muladdtaps4_u8_lq
.source 1 s1 guint32
.source 1 s2 guint32
.source 1 s3 guint32
.source 1 s4 guint32
.param 2 t1 gint16
.param 2 t2 gint16
.param 2 t3 gint16
.param 2 t4 gint16
.dest 2 d gint32
.temp 2 acc
.temp 2 term

convubw acc, s1
mullw acc, acc, t1

convubw term, s2
mullw term, term, t2
addw acc, acc, term

convubw term, s3
mullw term, term, t3
addw acc, acc, term

convubw term, s4
mullw term, term, t4
addw acc, acc, term

addw d, d, acc
# video_orc_resample_v_muladdscaletaps4_u8_lq:
#   d = sat_u8((s1*t1 + s2*t2 + s3*t3 + s4*t4 + temp + 32) >> 6)
# with all products and sums kept in 16-bit lanes.
.function video_orc_resample_v_muladdscaletaps4_u8_lq
.source 1 s1 guint32
.source 1 s2 guint32
.source 1 s3 guint32
.source 1 s4 guint32
.source 2 temp gint32
.param 2 t1 gint16
.param 2 t2 gint16
.param 2 t3 gint16
.param 2 t4 gint16
.dest 1 d guint32
.temp 2 acc
.temp 2 term

convubw acc, s1
mullw acc, acc, t1

convubw term, s2
mullw term, term, t2
addw acc, acc, term

convubw term, s3
mullw term, term, t3
addw acc, acc, term

convubw term, s4
mullw term, term, t4
addw acc, acc, term

# add the previously accumulated partial sum, round and scale
addw acc, acc, temp
addw acc, acc, 32
shrsw acc, acc, 6
convsuswb d, acc
# video_orc_chroma_down_h2_u8: for each pair of 4-byte pixels, replace
# bytes 2-3 of the first pixel by the avgub of bytes 2-3 of both pixels.
# The second pixel and bytes 0-1 of the first are copied unchanged.
.function video_orc_chroma_down_h2_u8
.source 8 s guint8
.dest 8 d guint8
.temp 4 first
.temp 4 second
.temp 2 first_low
.temp 2 first_chroma
.temp 2 second_chroma

splitql second, first, s
select1lw second_chroma, second
splitlw first_chroma, first_low, first
x2 avgub first_chroma, first_chroma, second_chroma
mergewl first, first_low, first_chroma
mergelq d, first, second
#.function video_orc_chroma_up_h2_cs_u8
#.source 8 s guint8
#.source 4 s1 guint8
#.dest 8 d guint8
#.temp 4 ayuv1
#.temp 4 ayuv2
#.temp 4 ayuv3
#.temp 2 ay2
#.temp 2 uv2
#.temp 2 uv3
#
#splitql ayuv2, ayuv1, s
#ldresnearl ayuv3, s1, 0x20000, 0x20000
#splitlw uv2, ay2, ayuv2
#select1lw uv3, ayuv3
#x2 avgub uv2, uv2, uv3
#mergewl ayuv2, ay2, uv2
#mergelq d, ayuv1, ayuv2
# video_orc_chroma_down_v2_u8: output bytes 0-1 of s1 unchanged and, for
# bytes 2-3, the avgub of the corresponding bytes of s1 and s2.
.function video_orc_chroma_down_v2_u8
.source 4 s1 guint8
.source 4 s2 guint8
.dest 4 d guint8
.temp 2 low_pair
.temp 2 chroma_a
.temp 2 chroma_b

select1lw chroma_b, s2
splitlw chroma_a, low_pair, s1
x2 avgub chroma_a, chroma_a, chroma_b
mergewl d, low_pair, chroma_a
# video_orc_chroma_up_v2_u8: bytes 0-1 of s1/s2 are copied to d1/d2.
# For bytes 2-3 (c1 from s1, c2 from s2):
#   d1 = (3 * c1 + c2 + 2) >> 2
#   d2 = (3 * c2 + c1 + 2) >> 2
# computed in 16-bit lanes and saturated back to bytes.
.function video_orc_chroma_up_v2_u8
.source 4 s1 guint8
.source 4 s2 guint8
.dest 4 d1 guint8
.dest 4 d2 guint8
.temp 2 low1
.temp 2 low2
.temp 2 chroma1
.temp 2 chroma2
.temp 4 wide1
.temp 4 wide2
.temp 4 mix

splitlw chroma1, low1, s1
splitlw chroma2, low2, s2
x2 convubw wide1, chroma1
x2 convubw wide2, chroma2

x2 mullw mix, wide1, 3
x2 addw mix, mix, wide2
x2 addw mix, mix, 2
x2 shruw mix, mix, 2
x2 convsuswb chroma1, mix
mergewl d1, low1, chroma1

x2 mullw mix, wide2, 3
x2 addw mix, mix, wide1
x2 addw mix, mix, 2
x2 shruw mix, mix, 2
x2 convsuswb chroma2, mix
mergewl d2, low2, chroma2
# video_orc_chroma_up_v2_u16: 16-bit-per-component variant.  The lower
# 32 bits of s1/s2 are copied to d1/d2.  For the upper two 16-bit
# components (c1 from s1, c2 from s2):
#   d1 = (3 * c1 + c2 + 2) >> 2
#   d2 = (3 * c2 + c1 + 2) >> 2
# computed in 32-bit lanes and saturated back to 16 bits.
.function video_orc_chroma_up_v2_u16
.source 8 s1 guint16
.source 8 s2 guint16
.dest 8 d1 guint16
.dest 8 d2 guint16
.temp 4 low1
.temp 4 low2
.temp 4 chroma1
.temp 4 chroma2
.temp 8 wide1
.temp 8 wide2
.temp 8 mix

splitql chroma1, low1, s1
splitql chroma2, low2, s2
x2 convuwl wide1, chroma1
x2 convuwl wide2, chroma2

x2 mulll mix, wide1, 3
x2 addl mix, mix, wide2
x2 addl mix, mix, 2
x2 shrul mix, mix, 2
x2 convsuslw chroma1, mix
mergelq d1, low1, chroma1

x2 mulll mix, wide2, 3
x2 addl mix, mix, wide1
x2 addl mix, mix, 2
x2 shrul mix, mix, 2
x2 convsuslw chroma2, mix
mergelq d2, low2, chroma2
# video_orc_chroma_down_v2_u16: output the lower 32 bits of s1 unchanged
# and, for the upper two 16-bit components, the avguw of s1 and s2.
.function video_orc_chroma_down_v2_u16
.source 8 s1 guint16
.source 8 s2 guint16
.dest 8 d guint16
.temp 4 low_pair
.temp 4 chroma_a
.temp 4 chroma_b

select1ql chroma_b, s2
splitql chroma_a, low_pair, s1
x2 avguw chroma_a, chroma_a, chroma_b
mergelq d, low_pair, chroma_a
# video_orc_chroma_down_v4_u8: output bytes 0-1 of s1 unchanged.  For
# bytes 2-3 (c1..c4 taken from s1..s4):
#   d = (c1 + 3 * (c2 + c3) + c4 + 4) >> 3
# computed in 16-bit lanes and saturated back to bytes.
.function video_orc_chroma_down_v4_u8
.source 4 s1 guint8
.source 4 s2 guint8
.source 4 s3 guint8
.source 4 s4 guint8
.dest 4 d guint8
.temp 2 low_pair
.temp 2 chroma
.temp 4 wide_a
.temp 4 wide_b
.temp 4 total

# outer taps: c1 + c4
splitlw chroma, low_pair, s1
x2 convubw wide_a, chroma
select1lw chroma, s4
x2 convubw wide_b, chroma
x2 addw total, wide_a, wide_b

# inner taps: (c2 + c3) * 3, formed as x + (x << 1)
select1lw chroma, s2
x2 convubw wide_a, chroma
select1lw chroma, s3
x2 convubw wide_b, chroma
x2 addw wide_a, wide_a, wide_b
x2 shlw wide_b, wide_a, 1
x2 addw wide_a, wide_a, wide_b

x2 addw total, total, wide_a
x2 addw total, total, 4
x2 shruw total, total, 3
x2 convsuswb chroma, total
mergewl d, low_pair, chroma
# video_orc_chroma_down_v4_u16: 16-bit-per-component variant.  The lower
# 32 bits of s1 are copied unchanged.  For the upper two 16-bit components
# (c1..c4 taken from s1..s4):
#   d = (c1 + 3 * (c2 + c3) + c4 + 4) >> 3
# computed in 32-bit lanes and saturated back to 16 bits.
.function video_orc_chroma_down_v4_u16
.source 8 s1 guint16
.source 8 s2 guint16
.source 8 s3 guint16
.source 8 s4 guint16
.dest 8 d guint16
.temp 4 low_pair
.temp 4 chroma
.temp 8 wide_a
.temp 8 wide_b
.temp 8 total

# outer taps: c1 + c4
splitql chroma, low_pair, s1
x2 convuwl wide_a, chroma
select1ql chroma, s4
x2 convuwl wide_b, chroma
x2 addl total, wide_a, wide_b

# inner taps: (c2 + c3) * 3, formed as x + (x << 1)
select1ql chroma, s2
x2 convuwl wide_a, chroma
select1ql chroma, s3
x2 convuwl wide_b, chroma
x2 addl wide_a, wide_a, wide_b
x2 shll wide_b, wide_a, 1
x2 addl wide_a, wide_a, wide_b

x2 addl total, total, wide_a
x2 addl total, total, 4
x2 shrul total, total, 3
x2 convsuslw chroma, total
mergelq d, low_pair, chroma
# video_orc_dither_none_4u8_mask: clear, in place, the bits of every 4-byte
# pixel that are set in masks (p = ~masks & p, byte by byte).
.function video_orc_dither_none_4u8_mask
.dest 4 p guint8
.param 4 masks
.temp 4 mask_bits

# load the whole 32-bit parameter so each byte lane gets its own mask
loadpl mask_bits, masks
x4 andnb p, mask_bits, p
# video_orc_dither_none_4u16_mask: clear, in place, the bits of every
# 4x16-bit pixel that are set in masks (p = ~masks & p, word by word).
.function video_orc_dither_none_4u16_mask
.dest 8 p guint16
.longparam 8 masks
.temp 8 mask_bits

# load the whole 64-bit parameter so each 16-bit lane gets its own mask
loadpq mask_bits, masks
x4 andnw p, mask_bits, p
# video_orc_dither_verterr_4u8_mask: per component, add the 16-bit error e
# to the widened pixel byte, store the masked-off bits back into e and
# write the remaining bits, saturated to a byte, back into p.
#   sum = p + e;  e = sum & masks;  p = sat_u8(sum & ~masks)
.function video_orc_dither_verterr_4u8_mask
.dest 4 p guint8
.dest 8 e guint16
.longparam 8 masks
.temp 8 mask_bits
.temp 8 sum

loadpq mask_bits, masks
x4 convubw sum, p
x4 addw sum, sum, e
x4 andw e, sum, mask_bits
x4 andnw sum, mask_bits, sum
x4 convsuswb p, sum
# video_orc_dither_fs_muladd_u8: in place on 16-bit values,
#   e[i] = e[i] + 5 * e[i + 4] + 3 * e[i + 8]
# (the operands ahead of i are fetched with loadoffw).
.function video_orc_dither_fs_muladd_u8
.dest 2 e guint16
.temp 2 acc
.temp 2 ahead

loadoffw ahead, e, 4
mullw ahead, ahead, 5
addw acc, ahead, e

loadoffw ahead, e, 8
mullw ahead, ahead, 3
addw e, acc, ahead
# due to error propagation we should disable
# loop_shift for this function and only work on
# 4 pixels at a time.
#.function video_orc_dither_fs_add_4u8_mask
#.flags no-unroll
#.dest 4 d guint8
#.dest 8 e1 guint16
#.dest 8 e2 guint16
#.longparam 8 masks
#.temp 8 p
#.temp 8 t1
#.temp 8 t2
#
#x4 mullw t1, e1, 7
#x4 addw t1, t1, e2
#x4 shruw t1, t1, 4
#x4 convubw p, d
#x4 addw t1, t1, p
#x4 andnw p, masks, t1
#x4 convsuswb d, p
#x4 andw e2, t1, masks
# video_orc_dither_ordered_u8: add e to d in place with unsigned byte
# saturation (addusb).
.function video_orc_dither_ordered_u8
.source 1 e guint8
.dest 1 d guint8
addusb d, d, e
# video_orc_dither_ordered_4u8_mask: per component, add the 16-bit value e1
# to the widened pixel byte, clear the bits set in masks and write the
# result, saturated to a byte, back into d.
.function video_orc_dither_ordered_4u8_mask
.source 8 e1 guint16
.dest 4 d guint8
.longparam 8 masks
.temp 8 mask_bits
.temp 8 sum

x4 convubw sum, d
x4 addw sum, sum, e1
loadpq mask_bits, masks
x4 andnw sum, mask_bits, sum
x4 convsuswb d, sum
# video_orc_dither_ordered_4u16_mask: per 16-bit component, add e1 to d
# (wrapping addw) and clear the bits set in masks, in place.
.function video_orc_dither_ordered_4u16_mask
.source 8 e1 guint16
.dest 8 d guint16
.longparam 8 masks
.temp 8 mask_bits
.temp 8 sum

x4 addw sum, d, e1
loadpq mask_bits, masks
x4 andnw d, mask_bits, sum