video-scaler: add ORC optimized versions

Add ORC-optimized versions of 2-tap and 4-tap vertical scaling. Provide
a high-quality 12-bit version and a low-quality 6-bit version.
This commit is contained in:
Wim Taymans 2014-10-29 16:20:56 +01:00
parent 84176843c1
commit 21ba193dd6
4 changed files with 1336 additions and 3 deletions

File diff suppressed because it is too large Load diff

View file

@ -149,6 +149,10 @@ void video_orc_convert_AYUV_ABGR (guint8 * ORC_RESTRICT d1, int d1_stride, const
void video_orc_convert_AYUV_RGBA (guint8 * ORC_RESTRICT d1, int d1_stride, const guint8 * ORC_RESTRICT s1, int s1_stride, int p1, int p2, int p3, int p4, int p5, int n, int m); void video_orc_convert_AYUV_RGBA (guint8 * ORC_RESTRICT d1, int d1_stride, const guint8 * ORC_RESTRICT s1, int s1_stride, int p1, int p2, int p3, int p4, int p5, int n, int m);
void video_orc_convert_I420_BGRA (guint8 * ORC_RESTRICT d1, const guint8 * ORC_RESTRICT s1, const guint8 * ORC_RESTRICT s2, const guint8 * ORC_RESTRICT s3, int p1, int p2, int p3, int p4, int p5, int n); void video_orc_convert_I420_BGRA (guint8 * ORC_RESTRICT d1, const guint8 * ORC_RESTRICT s1, const guint8 * ORC_RESTRICT s2, const guint8 * ORC_RESTRICT s3, int p1, int p2, int p3, int p4, int p5, int n);
void video_orc_matrix8 (guint8 * ORC_RESTRICT d1, const guint8 * ORC_RESTRICT s1, orc_int64 p1, orc_int64 p2, orc_int64 p3, int n); void video_orc_matrix8 (guint8 * ORC_RESTRICT d1, const guint8 * ORC_RESTRICT s1, orc_int64 p1, orc_int64 p2, orc_int64 p3, int n);
void video_orc_resample_v_2tap_8_lq (guint32 * ORC_RESTRICT d1, const guint32 * ORC_RESTRICT s1, const guint32 * ORC_RESTRICT s2, int p1, int n);
void video_orc_resample_v_2tap_8 (guint32 * ORC_RESTRICT d1, const guint32 * ORC_RESTRICT s1, const guint32 * ORC_RESTRICT s2, int p1, int n);
void video_orc_resample_v_4tap_8_lq (guint32 * ORC_RESTRICT d1, const guint32 * ORC_RESTRICT s1, const guint32 * ORC_RESTRICT s2, const guint32 * ORC_RESTRICT s3, const guint32 * ORC_RESTRICT s4, int p1, int p2, int p3, int p4, int n);
void video_orc_resample_v_4tap_8 (guint32 * ORC_RESTRICT d1, const guint32 * ORC_RESTRICT s1, const guint32 * ORC_RESTRICT s2, const guint32 * ORC_RESTRICT s3, const guint32 * ORC_RESTRICT s4, int p1, int p2, int p3, int p4, int n);
#ifdef __cplusplus #ifdef __cplusplus
} }

View file

@ -1284,3 +1284,156 @@ x4 addssw aq, aq, q1
x4 convssswb ayuv2, aq x4 convssswb ayuv2, aq
x4 addb ayuv, ayuv2, c128 x4 addb ayuv, ayuv2, c128
#.function video_orc_resample_h_near_8888
#.source 4 src guint32
#.source 4 idx
#.dest 4 dest guint32
#.temp 4 t
#
#loadidxl t, src, idx
#storel dest, t
#.function video_orc_resample_h_2tap_8888_16
#.source 4 src1 guint32
#.source 4 src2 guint32
#.source 8 coef1 guint64
#.source 8 coef2 guint64
#.source 4 idx
#.dest 4 dest guint32
#.temp 4 t1
#.temp 4 t2
#.temp 8 q1
#.temp 8 q2
#
#loadidxl t1, src1, idx
#x4 convubw q1, t1
#x4 mulhuw q1, q1, coef1
#
#loadidxl t2, src2, idx
#x4 convubw q2, t2
#x4 mulhuw q2, q2, coef2
#
#x4 addw q2, q2, q1
#x4 convuuswb dest, q2
#
#.function video_orc_resample_h_2tap_8888_lq
#.source 4 src1 guint32
#.source 4 src2 guint32
#.source 8 coef1 guint64
#.source 4 idx
#.dest 4 dest guint32
#.temp 4 t1
#.temp 4 t2
#.temp 8 q1
#.temp 8 q2
#
#loadidxl t1, src1, idx
#x4 convubw q1, t1
#loadidxl t2, src2, idx
#x4 convubw q2, t2
#x4 subw q2, q2, q1
#
#x4 mullw q2, q2, coef1
#x4 addw q2, q2, 128
#x4 convhwb t2, q2
#x4 addb dest, t2, t1
# Low-quality 2-tap vertical blend, processed one byte at a time:
#   dest = src1 + (((src2 - src1) * p1 + 128) >> 8)
# p1 is the weight of src2; only the high byte of the 16-bit product is kept,
# so p1 acts as a fixed-point fraction with 8 fractional bits.
# NOTE(review): mullw and addb are the non-saturating opcodes, so this
# presumably relies on 16-bit/8-bit wrap-around cancelling out for p1 in
# 0..256 -- confirm against the ORC opcode reference.
.function video_orc_resample_v_2tap_8_lq
.source 1 src1 guint32
.source 1 src2 guint32
.dest 1 dest guint32
.param 2 p1
.temp 1 t
.temp 2 w1
.temp 2 w2
# widen both input bytes to 16-bit words
convubw w1, src1
convubw w2, src2
# w2 = (src2 - src1) * p1, keeping the low 16 bits of the product
subw w2, w2, w1
mullw w2, w2, p1
# add half of 256 so the following ">> 8" rounds
addw w2, w2, 128
# take the high byte of the word (i.e. >> 8) and add it back onto src1
convhwb t, w2
addb dest, t, src1
# High-quality 2-tap vertical blend, processed one byte at a time:
#   d1 = sat_u8 (s1 + (((s2 - s1) * p1 + 4095) >> 12))
# p1 is the weight of s2 as a fixed-point fraction with 12 fractional bits.
# The product is computed in a 32-bit temporary so it cannot overflow a word.
# NOTE(review): the bias added before the shift is 4095, not 1 << 11 as used
# by S16_SCALE_ROUND in the C code -- confirm this is intended.
.function video_orc_resample_v_2tap_8
.source 1 s1 guint32
.source 1 s2 guint32
.dest 1 d1 guint32
.param 2 p1
.temp 2 w1
.temp 2 w2
.temp 4 t2
# widen both input bytes to 16-bit words
convubw w1, s1
convubw w2, s2
# t2 = (s2 - s1) * p1 as a signed 32-bit product
subw w2, w2, w1
mulswl t2, w2, p1
# bias, then drop the 12 fractional bits
addl t2, t2, 4095
shrsl t2, t2, 12
# back to 16 bits, add the base sample and saturate to an unsigned byte
convlw w2, t2
addw w2, w2, w1
convsuswb d1, w2
# Low-quality 4-tap vertical filter, processed one byte at a time:
#   d1 = sat_u8 ((s1*p1 + s2*p2 + s3*p3 + s4*p4 + 32) >> 6)
# p1..p4 are fixed-point weights with 6 fractional bits. All products and the
# running sum are held in 16-bit words (w1 accumulates, w2 is scratch).
# NOTE(review): mullw keeps only the low 16 bits of each product; presumably
# the 6-bit tap range keeps the sum within a signed word -- confirm.
.function video_orc_resample_v_4tap_8_lq
.source 1 s1 guint32
.source 1 s2 guint32
.source 1 s3 guint32
.source 1 s4 guint32
.dest 1 d1 guint32
.param 2 p1
.param 2 p2
.param 2 p3
.param 2 p4
.temp 2 w1
.temp 2 w2
# w1 = s1 * p1
convubw w1, s1
mullw w1, w1, p1
# w1 += s2 * p2
convubw w2, s2
mullw w2, w2, p2
addw w1, w1, w2
# w1 += s3 * p3
convubw w2, s3
mullw w2, w2, p3
addw w1, w1, w2
# w1 += s4 * p4
convubw w2, s4
mullw w2, w2, p4
addw w1, w1, w2
# add half of 64 so the arithmetic shift by 6 rounds
addw w1, w1, 32
shrsw w1, w1, 6
# saturate the signed word to an unsigned byte
convsuswb d1, w1
# High-quality 4-tap vertical filter, processed one byte at a time:
#   d1 = sat_u8 ((s1*p1 + s2*p2 + s3*p3 + s4*p4 + 4095) >> 12)
# p1..p4 are fixed-point weights with 12 fractional bits. Products and the
# running sum are held in 32-bit temporaries (t1 accumulates, t2 is scratch).
# NOTE(review): the bias added before the shift is 4095, not 1 << 11 as used
# by S16_SCALE_ROUND in the C code -- confirm this is intended.
.function video_orc_resample_v_4tap_8
.source 1 s1 guint32
.source 1 s2 guint32
.source 1 s3 guint32
.source 1 s4 guint32
.dest 1 d1 guint32
.param 2 p1
.param 2 p2
.param 2 p3
.param 2 p4
.temp 2 w1
.temp 2 w2
.temp 4 t1
.temp 4 t2
# t1 = s1 * p1 (signed 16x16 -> 32-bit multiply)
convubw w1, s1
mulswl t1, w1, p1
# t1 += s2 * p2
convubw w2, s2
mulswl t2, w2, p2
addl t1, t1, t2
# t1 += s3 * p3
convubw w2, s3
mulswl t2, w2, p3
addl t1, t1, t2
# t1 += s4 * p4
convubw w2, s4
mulswl t2, w2, p4
addl t1, t1, t2
# bias, then drop the 12 fractional bits
addl t1, t1, 4095
shrsl t1, t1, 12
# narrow to a word and saturate to an unsigned byte
convlw w1, t1
convsuswb d1, w1

View file

@ -26,11 +26,15 @@
#include <math.h> #include <math.h>
#include "resampler.h" #include "resampler.h"
#include <orc/orcfunctions.h>
#include "video-orc.h"
#include "video-scaler.h" #include "video-scaler.h"
#define S16_SCALE 12 #define S16_SCALE 12
#define S16_SCALE_ROUND (1 << (S16_SCALE -1)) #define S16_SCALE_ROUND (1 << (S16_SCALE -1))
#define LQ
typedef void (*GstVideoScalerHFunc) (GstVideoScaler * scale, typedef void (*GstVideoScalerHFunc) (GstVideoScaler * scale,
gpointer src, gpointer dest, guint dest_offset, guint width); gpointer src, gpointer dest, guint dest_offset, guint width);
typedef void (*GstVideoScalerVFunc) (GstVideoScaler * scale, typedef void (*GstVideoScalerVFunc) (GstVideoScaler * scale,
@ -282,11 +286,42 @@ video_scale_h_near_8888 (GstVideoScaler * scale,
d[i] = s[offset[i]]; d[i] = s[offset[i]];
} }
/* Linear blend of two samples with a fixed-point weight:
 *   a + (((b - a) * p + S16_SCALE_ROUND) >> S16_SCALE)
 * p is the weight of b, expressed with S16_SCALE fractional bits.
 * Every argument is parenthesized in the expansion so compound expressions
 * (e.g. "x + y") can be passed safely; note that a is evaluated twice. */
#define BLEND_2TAP(a,b,p) (((((b)-(guint16)(a)) * (p) + S16_SCALE_ROUND) >> S16_SCALE) + (a))
/* Horizontal 2-tap scaler for pixels made of four 8-bit components.
 *
 * For each of the width output pixels starting at dest_offset, the source
 * pixel at the resampler offset and its right-hand neighbour are blended
 * per component with BLEND_2TAP, using the second tap of the pixel's phase
 * as the weight, and the result is clamped to 0..255.
 * The 16-bit tap table is created on first use with S16_SCALE precision. */
static void
video_scale_h_2tap_8888 (GstVideoScaler * scale,
    gpointer src, gpointer dest, guint dest_offset, guint width)
{
  guint32 *offsets, *phases;
  gint16 *tap_table;
  guint8 *out;
  gint n_taps;
  guint x, c;

  if (scale->taps_s16 == NULL)
    make_s16_taps (scale, S16_SCALE);

  n_taps = scale->resampler.max_taps;
  offsets = scale->resampler.offset + dest_offset;
  phases = scale->resampler.phase + dest_offset;
  tap_table = scale->taps_s16;
  out = (guint8 *) dest + 4 * dest_offset;

  for (x = 0; x < width; x++) {
    guint8 *left = (guint8 *) src + 4 * offsets[x];
    guint8 *right = left + 4;
    gint16 *coefs = tap_table + (phases[x] * n_taps);
    gint blended[4];

    /* blend all four components before storing any of them */
    for (c = 0; c < 4; c++)
      blended[c] = BLEND_2TAP (left[c], right[c], coefs[1]);

    for (c = 0; c < 4; c++)
      out[x * 4 + c] = CLAMP (blended[c], 0, 255);
  }
}
static void static void
@ -330,6 +365,79 @@ video_scale_h_ntap_8888 (GstVideoScaler * scale,
} }
} }
/* Vertical "nearest" scaler: the output line is a plain copy of the first
 * source line. width is multiplied by 4 to obtain the number of bytes to
 * copy; scale and dest_offset are not used. */
static void
video_scale_v_near_8888 (GstVideoScaler * scale,
    gpointer srcs[], gpointer dest, guint dest_offset, guint width)
{
  gpointer line = srcs[0];
  guint n_bytes = 4 * width;

  orc_memcpy (dest, line, n_bytes);
}
/* Vertical 2-tap scaler: blends the two source lines srcs[0] and srcs[1]
 * into dest using the ORC kernels.
 *
 * dest_offset selects which set of taps to use; the second tap of that set
 * is the weight handed to the kernel. The kernels work on single bytes, so
 * the element count passed is width * 4.
 * The 16-bit tap table is created on first use: with 8 bits of precision
 * when LQ is defined, S16_SCALE bits otherwise. */
static void
video_scale_v_2tap_8888 (GstVideoScaler * scale,
    gpointer srcs[], gpointer dest, guint dest_offset, guint width)
{
  gint max_taps;
  guint32 *s1, *s2, *d;
  /* same type as the tap table, so a negative tap is not turned into a
   * huge unsigned value before reaching the kernel's int parameter */
  gint16 p1;

  if (scale->taps_s16 == NULL) {
#ifdef LQ
    make_s16_taps (scale, 8);
#else
    make_s16_taps (scale, S16_SCALE);
#endif
  }

  max_taps = scale->resampler.max_taps;

  d = (guint32 *) dest;
  s1 = (guint32 *) srcs[0];
  s2 = (guint32 *) srcs[1];
  p1 = scale->taps_s16[dest_offset * max_taps + 1];

#ifdef LQ
  video_orc_resample_v_2tap_8_lq (d, s1, s2, p1, width * 4);
#else
  video_orc_resample_v_2tap_8 (d, s1, s2, p1, width * 4);
#endif
}
/* Vertical 4-tap scaler: combines the four source lines srcs[0..3] into
 * dest using the ORC kernels.
 *
 * dest_offset selects which set of four taps to use as the weights. The
 * kernels work on single bytes, so the element count passed is width * 4.
 * The 16-bit tap table is created on first use: with 6 bits of precision
 * when LQ is defined, S16_SCALE bits otherwise. */
static void
video_scale_v_4tap_8888 (GstVideoScaler * scale,
    gpointer srcs[], gpointer dest, guint dest_offset, guint width)
{
#ifdef LQ
  const gint precision = 6;
#else
  const gint precision = S16_SCALE;
#endif
  guint32 *line0, *line1, *line2, *line3, *out;
  gint16 *coefs;
  gint n_taps;

  if (scale->taps_s16 == NULL) {
    make_s16_taps (scale, precision);
  }

  n_taps = scale->resampler.max_taps;
  coefs = scale->taps_s16 + dest_offset * n_taps;

  out = (guint32 *) dest;
  line0 = (guint32 *) srcs[0];
  line1 = (guint32 *) srcs[1];
  line2 = (guint32 *) srcs[2];
  line3 = (guint32 *) srcs[3];

#ifdef LQ
  video_orc_resample_v_4tap_8_lq (out, line0, line1, line2, line3,
      coefs[0], coefs[1], coefs[2], coefs[3], width * 4);
#else
  video_orc_resample_v_4tap_8 (out, line0, line1, line2, line3,
      coefs[0], coefs[1], coefs[2], coefs[3], width * 4);
#endif
}
static void static void
video_scale_v_ntap_8888 (GstVideoScaler * scale, video_scale_v_ntap_8888 (GstVideoScaler * scale,
gpointer srcs[], gpointer dest, guint dest_offset, guint width) gpointer srcs[], gpointer dest, guint dest_offset, guint width)
@ -400,6 +508,9 @@ gst_video_scaler_horizontal (GstVideoScaler * scale, GstVideoFormat format,
case 1: case 1:
func = video_scale_h_near_8888; func = video_scale_h_near_8888;
break; break;
case 2:
func = video_scale_h_2tap_8888;
break;
default: default:
func = video_scale_h_ntap_8888; func = video_scale_h_ntap_8888;
break; break;
@ -437,6 +548,12 @@ gst_video_scaler_vertical (GstVideoScaler * scale, GstVideoFormat format,
case 1: case 1:
func = video_scale_v_near_8888; func = video_scale_v_near_8888;
break; break;
case 2:
func = video_scale_v_2tap_8888;
break;
case 4:
func = video_scale_v_4tap_8888;
break;
default: default:
func = video_scale_v_ntap_8888; func = video_scale_v_ntap_8888;
break; break;