video-scaler: combine scaling operations

Combine the add and scale steps for multiple lines/pixels to reduce the
number of reads and writes to temporary memory.
This commit is contained in:
Wim Taymans 2014-11-25 17:25:02 +01:00
parent 6306cce41e
commit 74e63fa660
4 changed files with 1937 additions and 13 deletions

File diff suppressed because it is too large Load diff

View file

@ -167,6 +167,9 @@ void video_orc_resample_h_muladdtaps_u8 (gint32 * ORC_RESTRICT d1, int d1_stride
void video_orc_resample_scaletaps_u8 (guint32 * ORC_RESTRICT d1, const gint32 * ORC_RESTRICT s1, int n);
void video_orc_resample_h_multaps_u8_lq (gint32 * ORC_RESTRICT d1, const guint32 * ORC_RESTRICT s1, const gint16 * ORC_RESTRICT s2, int n);
void video_orc_resample_h_muladdtaps_u8_lq (gint32 * ORC_RESTRICT d1, int d1_stride, const guint32 * ORC_RESTRICT s1, int s1_stride, const gint16 * ORC_RESTRICT s2, int s2_stride, int n, int m);
void video_orc_resample_h_multaps3_u8_lq (gint32 * ORC_RESTRICT d1, const guint32 * ORC_RESTRICT s1, const guint32 * ORC_RESTRICT s2, const guint32 * ORC_RESTRICT s3, const gint16 * ORC_RESTRICT s4, const gint16 * ORC_RESTRICT s5, const gint16 * ORC_RESTRICT s6, int n);
void video_orc_resample_h_muladdtaps3_u8_lq (gint32 * ORC_RESTRICT d1, const guint32 * ORC_RESTRICT s1, const guint32 * ORC_RESTRICT s2, const guint32 * ORC_RESTRICT s3, const gint16 * ORC_RESTRICT s4, const gint16 * ORC_RESTRICT s5, const gint16 * ORC_RESTRICT s6, int n);
void video_orc_resample_h_muladdscaletaps3_u8_lq (guint32 * ORC_RESTRICT d1, const guint32 * ORC_RESTRICT s1, const guint32 * ORC_RESTRICT s2, const guint32 * ORC_RESTRICT s3, const gint16 * ORC_RESTRICT s4, const gint16 * ORC_RESTRICT s5, const gint16 * ORC_RESTRICT s6, const gint32 * ORC_RESTRICT s7, int n);
void video_orc_resample_scaletaps_u8_lq (guint32 * ORC_RESTRICT d1, const gint32 * ORC_RESTRICT s1, int n);
void video_orc_resample_h_multaps_u16 (gint32 * ORC_RESTRICT d1, const guint64 * ORC_RESTRICT s1, const gint16 * ORC_RESTRICT s2, int n);
void video_orc_resample_h_muladdtaps_u16 (gint32 * ORC_RESTRICT d1, int d1_stride, const guint64 * ORC_RESTRICT s1, int s1_stride, const gint16 * ORC_RESTRICT s2, int s2_stride, int n, int m);
@ -176,7 +179,10 @@ void video_orc_resample_v_muladdtaps_u8 (gint32 * ORC_RESTRICT d1, const guint32
void video_orc_resample_v_multaps_u16 (gint32 * ORC_RESTRICT d1, const guint64 * ORC_RESTRICT s1, int p1, int n);
void video_orc_resample_v_muladdtaps_u16 (gint32 * ORC_RESTRICT d1, const guint64 * ORC_RESTRICT s1, int p1, int n);
void video_orc_resample_v_multaps_u8_lq (gint32 * ORC_RESTRICT d1, const guint32 * ORC_RESTRICT s1, int p1, int n);
void video_orc_resample_v_multaps4_u8_lq (gint32 * ORC_RESTRICT d1, const guint32 * ORC_RESTRICT s1, const guint32 * ORC_RESTRICT s2, const guint32 * ORC_RESTRICT s3, const guint32 * ORC_RESTRICT s4, int p1, int p2, int p3, int p4, int n);
void video_orc_resample_v_muladdtaps_u8_lq (gint32 * ORC_RESTRICT d1, const guint32 * ORC_RESTRICT s1, int p1, int n);
void video_orc_resample_v_muladdtaps4_u8_lq (gint32 * ORC_RESTRICT d1, const guint32 * ORC_RESTRICT s1, const guint32 * ORC_RESTRICT s2, const guint32 * ORC_RESTRICT s3, const guint32 * ORC_RESTRICT s4, int p1, int p2, int p3, int p4, int n);
void video_orc_resample_v_muladdscaletaps4_u8_lq (guint32 * ORC_RESTRICT d1, const guint32 * ORC_RESTRICT s1, const guint32 * ORC_RESTRICT s2, const guint32 * ORC_RESTRICT s3, const guint32 * ORC_RESTRICT s4, const gint32 * ORC_RESTRICT s5, int p1, int p2, int p3, int p4, int n);
void video_orc_chroma_down_h2_u8 (guint8 * ORC_RESTRICT d1, const guint8 * ORC_RESTRICT s1, int n);
void video_orc_chroma_down_v2_u8 (guint8 * ORC_RESTRICT d1, const guint8 * ORC_RESTRICT s1, const guint8 * ORC_RESTRICT s2, int n);
void video_orc_chroma_up_v2_u8 (guint8 * ORC_RESTRICT d1, guint8 * ORC_RESTRICT d2, const guint8 * ORC_RESTRICT s1, const guint8 * ORC_RESTRICT s2, int n);

View file

@ -1500,7 +1500,6 @@ convsuswb d1, w1
#convsuslw w1, l1
#convsuswb d1, w1
.function video_orc_resample_h_multaps_u8
.source 1 s guint32
.source 2 t gint16
@ -1553,6 +1552,72 @@ convubw w1, s
mullw w1, w1, t
addw d, d, w1
.function video_orc_resample_h_multaps3_u8_lq
# Three-tap multiply: d = s1 * t1 + s2 * t2 + s3 * t3.
# The destination is overwritten, not accumulated into.
# Sources are 1-byte samples widened to 16 bits; taps, products and the
# running sum are all 16-bit words.
.source 1 s1 guint32
.source 1 s2 guint32
.source 1 s3 guint32
.source 2 t1 gint16
.source 2 t2 gint16
.source 2 t3 gint16
.dest 2 d gint32
.temp 2 sum
.temp 2 prod
# first tap starts the running sum
convubw sum, s1
mullw sum, sum, t1
# second tap
convubw prod, s2
mullw prod, prod, t2
addw sum, sum, prod
# third tap; the final add writes straight to the destination
convubw prod, s3
mullw prod, prod, t3
addw d, sum, prod
.function video_orc_resample_h_muladdtaps3_u8_lq
# Three-tap multiply-accumulate: d += s1 * t1 + s2 * t2 + s3 * t3.
# Sources are 1-byte samples widened to 16 bits; taps, products and sums
# are 16-bit words. The destination is read and updated in place.
.source 1 s1 guint32
.source 1 s2 guint32
.source 1 s3 guint32
.source 2 t1 gint16
.source 2 t2 gint16
.source 2 t3 gint16
.dest 2 d gint32
.temp 2 sum
.temp 2 prod
# first tap starts the partial sum
convubw sum, s1
mullw sum, sum, t1
# second tap
convubw prod, s2
mullw prod, prod, t2
addw sum, sum, prod
# third tap
convubw prod, s3
mullw prod, prod, t3
addw sum, sum, prod
# fold the three-tap partial sum into the existing destination value
addw d, d, sum
.function video_orc_resample_h_muladdscaletaps3_u8_lq
# Fused final step: adds three more taps to a previously accumulated sum
# and converts the result to bytes in one pass:
#   d = sat_u8 ((s1 * t1 + s2 * t2 + s3 * t3 + temp + 32) >> 6)
# "temp" is the 16-bit partial sum from earlier passes and is only read.
.source 1 s1 guint32
.source 1 s2 guint32
.source 1 s3 guint32
.source 2 t1 gint16
.source 2 t2 gint16
.source 2 t3 gint16
.source 2 temp gint32
.dest 1 d guint32
.temp 2 sum
.temp 2 prod
# first tap starts the partial sum
convubw sum, s1
mullw sum, sum, t1
# second tap
convubw prod, s2
mullw prod, prod, t2
addw sum, sum, prod
# third tap
convubw prod, s3
mullw prod, prod, t3
addw sum, sum, prod
# add the partial sum accumulated by the earlier passes
addw sum, sum, temp
# add half of 64, then arithmetic shift right by 6 (rounded divide by 64)
addw sum, sum, 32
shrsw sum, sum, 6
# narrow the signed word to an unsigned byte with saturation
convsuswb d, sum
.function video_orc_resample_scaletaps_u8_lq
.source 2 s gint32
.dest 1 d guint32
@ -1645,6 +1710,31 @@ addl d, d, t1
convubw w1, s
mullw d, w1, t
.function video_orc_resample_v_multaps4_u8_lq
# Four-tap multiply: d = s1 * t1 + s2 * t2 + s3 * t3 + s4 * t4.
# The destination is overwritten, not accumulated into.
# Each tap is a single 16-bit parameter applied to every element of its
# source; sources are 1-byte samples widened to 16 bits.
.source 1 s1 guint32
.source 1 s2 guint32
.source 1 s3 guint32
.source 1 s4 guint32
.param 2 t1 gint16
.param 2 t2 gint16
.param 2 t3 gint16
.param 2 t4 gint16
.dest 2 d gint32
.temp 2 sum
.temp 2 prod
# first source starts the running sum
convubw sum, s1
mullw sum, sum, t1
# second source
convubw prod, s2
mullw prod, prod, t2
addw sum, sum, prod
# third source
convubw prod, s3
mullw prod, prod, t3
addw sum, sum, prod
# fourth source; the final add writes straight to the destination
convubw prod, s4
mullw prod, prod, t4
addw d, sum, prod
.function video_orc_resample_v_muladdtaps_u8_lq
.source 1 s guint32
.param 2 t gint16
@ -1655,6 +1745,62 @@ convubw w1, s
mullw w1, w1, t
addw d, d, w1
.function video_orc_resample_v_muladdtaps4_u8_lq
# Four-tap multiply-accumulate:
#   d += s1 * t1 + s2 * t2 + s3 * t3 + s4 * t4
# Each tap is a single 16-bit parameter applied to every element of its
# source. The destination is read and updated in place.
.source 1 s1 guint32
.source 1 s2 guint32
.source 1 s3 guint32
.source 1 s4 guint32
.param 2 t1 gint16
.param 2 t2 gint16
.param 2 t3 gint16
.param 2 t4 gint16
.dest 2 d gint32
.temp 2 sum
.temp 2 prod
# first source starts the partial sum
convubw sum, s1
mullw sum, sum, t1
# second source
convubw prod, s2
mullw prod, prod, t2
addw sum, sum, prod
# third source
convubw prod, s3
mullw prod, prod, t3
addw sum, sum, prod
# fourth source
convubw prod, s4
mullw prod, prod, t4
addw sum, sum, prod
# fold the four-tap partial sum into the existing destination value
addw d, d, sum
.function video_orc_resample_v_muladdscaletaps4_u8_lq
# Fused final step: adds four more taps to a previously accumulated sum
# and converts the result to bytes in one pass:
#   d = sat_u8 ((s1*t1 + s2*t2 + s3*t3 + s4*t4 + temp + 32) >> 6)
# "temp" is the 16-bit partial sum from earlier passes and is only read.
# Each tap is a single 16-bit parameter applied to every element.
.source 1 s1 guint32
.source 1 s2 guint32
.source 1 s3 guint32
.source 1 s4 guint32
.source 2 temp gint32
.param 2 t1 gint16
.param 2 t2 gint16
.param 2 t3 gint16
.param 2 t4 gint16
.dest 1 d guint32
.temp 2 sum
.temp 2 prod
# first source starts the partial sum
convubw sum, s1
mullw sum, sum, t1
# second source
convubw prod, s2
mullw prod, prod, t2
addw sum, sum, prod
# third source
convubw prod, s3
mullw prod, prod, t3
addw sum, sum, prod
# fourth source
convubw prod, s4
mullw prod, prod, t4
addw sum, sum, prod
# add the partial sum accumulated by the earlier passes
addw sum, sum, temp
# add half of 64, then arithmetic shift right by 6 (rounded divide by 64)
addw sum, sum, 32
shrsw sum, sum, 6
# narrow the signed word to an unsigned byte with saturation
convsuswb d, sum
.function video_orc_chroma_down_h2_u8
.source 8 s guint8
.dest 8 d guint8

View file

@ -459,13 +459,50 @@ video_scale_h_ntap_4u8 (GstVideoScaler * scale,
count = width * 4;
#ifdef LQ
/* first pixels with first tap to t4 */
video_orc_resample_h_multaps_u8_lq (temp, pixels, taps, count);
/* add other pixels with other taps to t4 */
video_orc_resample_h_muladdtaps_u8_lq (temp, 0, pixels + width, count,
taps + count, count * 2, count, max_taps - 1);
/* scale and write final result */
video_orc_resample_scaletaps_u8_lq (d, temp, count);
/* first pixels with first tap to temp */
/* Seed temp with the leading taps.  The test must be >= 3, not > 3: with
 * exactly 3 taps the fallback branch computes first == 0, so it writes
 * tap 0 into temp without advancing pixels/taps or reducing max_taps, and
 * the 3-tap add+scale step that follows would then add tap 0 a second
 * time.  Taking the fused 3-tap multiply here leaves max_taps == 0, so
 * only the plain scale step remains. */
if (max_taps >= 3) {
  video_orc_resample_h_multaps3_u8_lq (temp, pixels, pixels + width,
      pixels + width * 2, taps, taps + count, taps + count * 2, count);
  max_taps -= 3;
  pixels += width * 3;
  taps += count * 3;
} else {
  /* fewer than 3 taps on this path, so first equals max_taps */
  gint first = max_taps % 3;

  /* the first tap initializes temp, the remaining first - 1 taps are added */
  video_orc_resample_h_multaps_u8_lq (temp, pixels, taps, count);
  video_orc_resample_h_muladdtaps_u8_lq (temp, 0, pixels + width, count,
      taps + count, count * 2, count, first - 1);
  max_taps -= first;
  pixels += width * first;
  taps += count * first;
}
while (max_taps > 3) {
if (max_taps >= 6) {
video_orc_resample_h_muladdtaps3_u8_lq (temp, pixels, pixels + width,
pixels + width * 2, taps, taps + count, taps + count * 2, count);
max_taps -= 3;
pixels += width * 3;
taps += count * 3;
} else {
video_orc_resample_h_muladdtaps_u8_lq (temp, 0, pixels, count,
taps, count * 2, count, max_taps - 3);
pixels += width * (max_taps - 3);
taps += count * (max_taps - 3);
max_taps = 3;
}
}
if (max_taps == 3) {
video_orc_resample_h_muladdscaletaps3_u8_lq (d, pixels, pixels + width,
pixels + width * 2, taps, taps + count, taps + count * 2, temp, count);
} else {
if (max_taps) {
/* add other pixels with other taps to t4 */
video_orc_resample_h_muladdtaps_u8_lq (temp, 0, pixels, count,
taps, count * 2, count, max_taps);
}
/* scale and write final result */
video_orc_resample_scaletaps_u8_lq (d, temp, count);
}
#else
/* first pixels with first tap to t4 */
video_orc_resample_h_multaps_u8 (temp, pixels, taps, count);
@ -697,12 +734,53 @@ video_scale_v_ntap_4u8 (GstVideoScaler * scale,
count = width * 4;
#ifdef LQ
video_orc_resample_v_multaps_u8_lq (temp, srcs[0], taps[0], count);
for (i = 1; i < max_taps; i++) {
video_orc_resample_v_muladdtaps_u8_lq (temp, srcs[i * src_inc], taps[i],
count);
/* Seed temp with the leading taps.  The test must be >= 4, not > 4: with
 * exactly 4 taps the fallback branch computes first == 0, so it writes
 * tap 0 into temp without advancing srcs/taps or reducing max_taps, and
 * the 4-tap add+scale step that follows would then add tap 0 a second
 * time.  Taking the fused 4-tap multiply here leaves max_taps == 0, so
 * only the plain scale step remains. */
if (max_taps >= 4) {
  video_orc_resample_v_multaps4_u8_lq (temp, srcs[0], srcs[1 * src_inc],
      srcs[2 * src_inc], srcs[3 * src_inc], taps[0], taps[1], taps[2],
      taps[3], count);
  max_taps -= 4;
  srcs += 4 * src_inc;
  taps += 4;
} else {
  /* fewer than 4 taps on this path, so first equals max_taps */
  gint first = (max_taps % 4);

  /* the first tap initializes temp, the remaining first - 1 taps are added */
  video_orc_resample_v_multaps_u8_lq (temp, srcs[0], taps[0], count);
  for (i = 1; i < first; i++) {
    video_orc_resample_v_muladdtaps_u8_lq (temp, srcs[i * src_inc], taps[i],
        count);
  }
  max_taps -= first;
  srcs += first * src_inc;
  taps += first;
}
video_orc_resample_scaletaps_u8_lq (d, temp, count);
while (max_taps > 4) {
if (max_taps >= 8) {
video_orc_resample_v_muladdtaps4_u8_lq (temp, srcs[0], srcs[1 * src_inc],
srcs[2 * src_inc], srcs[3 * src_inc], taps[0], taps[1], taps[2],
taps[3], count);
max_taps -= 4;
srcs += 4 * src_inc;
taps += 4;
} else {
for (i = 0; i < max_taps - 4; i++)
video_orc_resample_v_muladdtaps_u8_lq (temp, srcs[i * src_inc], taps[i],
count);
srcs += (max_taps - 4) * src_inc;
taps += (max_taps - 4);
max_taps = 4;
}
}
if (max_taps == 4) {
video_orc_resample_v_muladdscaletaps4_u8_lq (d, srcs[0], srcs[1 * src_inc],
srcs[2 * src_inc], srcs[3 * src_inc], temp, taps[0], taps[1], taps[2],
taps[3], count);
} else {
for (i = 0; i < max_taps; i++)
video_orc_resample_v_muladdtaps_u8_lq (temp, srcs[i * src_inc], taps[i],
count);
video_orc_resample_scaletaps_u8_lq (d, temp, count);
}
#else
video_orc_resample_v_multaps_u8 (temp, srcs[0], taps[0], count);
for (i = 1; i < max_taps; i++) {
@ -780,6 +858,9 @@ gst_video_scaler_horizontal (GstVideoScaler * scale, GstVideoFormat format,
if (scale->tmpwidth < width)
realloc_tmplines (scale, width);
GST_DEBUG ("format %d, pstride %d max_taps %d", format, pstride,
scale->resampler.max_taps);
switch (pstride) {
case 4:
switch (scale->resampler.max_taps) {
@ -853,6 +934,9 @@ gst_video_scaler_vertical (GstVideoScaler * scale, GstVideoFormat format,
if (scale->tmpwidth < width)
realloc_tmplines (scale, width);
GST_DEBUG ("format %d, pstride %d max_taps %d", format, pstride,
scale->resampler.max_taps);
switch (pstride) {
case 4:
switch (scale->resampler.max_taps) {