mirror of
https://gitlab.freedesktop.org/gstreamer/gstreamer.git
synced 2025-01-23 23:58:17 +00:00
video-scaler: combine scaling operations
Combine the add and scale steps for multiple lines/pixels to reduce the number of reads and writes to temporary memory.
This commit is contained in:
parent
6306cce41e
commit
74e63fa660
4 changed files with 1937 additions and 13 deletions
File diff suppressed because it is too large
Load diff
|
@ -167,6 +167,9 @@ void video_orc_resample_h_muladdtaps_u8 (gint32 * ORC_RESTRICT d1, int d1_stride
|
|||
void video_orc_resample_scaletaps_u8 (guint32 * ORC_RESTRICT d1, const gint32 * ORC_RESTRICT s1, int n);
|
||||
void video_orc_resample_h_multaps_u8_lq (gint32 * ORC_RESTRICT d1, const guint32 * ORC_RESTRICT s1, const gint16 * ORC_RESTRICT s2, int n);
|
||||
void video_orc_resample_h_muladdtaps_u8_lq (gint32 * ORC_RESTRICT d1, int d1_stride, const guint32 * ORC_RESTRICT s1, int s1_stride, const gint16 * ORC_RESTRICT s2, int s2_stride, int n, int m);
|
||||
void video_orc_resample_h_multaps3_u8_lq (gint32 * ORC_RESTRICT d1, const guint32 * ORC_RESTRICT s1, const guint32 * ORC_RESTRICT s2, const guint32 * ORC_RESTRICT s3, const gint16 * ORC_RESTRICT s4, const gint16 * ORC_RESTRICT s5, const gint16 * ORC_RESTRICT s6, int n);
|
||||
void video_orc_resample_h_muladdtaps3_u8_lq (gint32 * ORC_RESTRICT d1, const guint32 * ORC_RESTRICT s1, const guint32 * ORC_RESTRICT s2, const guint32 * ORC_RESTRICT s3, const gint16 * ORC_RESTRICT s4, const gint16 * ORC_RESTRICT s5, const gint16 * ORC_RESTRICT s6, int n);
|
||||
void video_orc_resample_h_muladdscaletaps3_u8_lq (guint32 * ORC_RESTRICT d1, const guint32 * ORC_RESTRICT s1, const guint32 * ORC_RESTRICT s2, const guint32 * ORC_RESTRICT s3, const gint16 * ORC_RESTRICT s4, const gint16 * ORC_RESTRICT s5, const gint16 * ORC_RESTRICT s6, const gint32 * ORC_RESTRICT s7, int n);
|
||||
void video_orc_resample_scaletaps_u8_lq (guint32 * ORC_RESTRICT d1, const gint32 * ORC_RESTRICT s1, int n);
|
||||
void video_orc_resample_h_multaps_u16 (gint32 * ORC_RESTRICT d1, const guint64 * ORC_RESTRICT s1, const gint16 * ORC_RESTRICT s2, int n);
|
||||
void video_orc_resample_h_muladdtaps_u16 (gint32 * ORC_RESTRICT d1, int d1_stride, const guint64 * ORC_RESTRICT s1, int s1_stride, const gint16 * ORC_RESTRICT s2, int s2_stride, int n, int m);
|
||||
|
@ -176,7 +179,10 @@ void video_orc_resample_v_muladdtaps_u8 (gint32 * ORC_RESTRICT d1, const guint32
|
|||
void video_orc_resample_v_multaps_u16 (gint32 * ORC_RESTRICT d1, const guint64 * ORC_RESTRICT s1, int p1, int n);
|
||||
void video_orc_resample_v_muladdtaps_u16 (gint32 * ORC_RESTRICT d1, const guint64 * ORC_RESTRICT s1, int p1, int n);
|
||||
void video_orc_resample_v_multaps_u8_lq (gint32 * ORC_RESTRICT d1, const guint32 * ORC_RESTRICT s1, int p1, int n);
|
||||
void video_orc_resample_v_multaps4_u8_lq (gint32 * ORC_RESTRICT d1, const guint32 * ORC_RESTRICT s1, const guint32 * ORC_RESTRICT s2, const guint32 * ORC_RESTRICT s3, const guint32 * ORC_RESTRICT s4, int p1, int p2, int p3, int p4, int n);
|
||||
void video_orc_resample_v_muladdtaps_u8_lq (gint32 * ORC_RESTRICT d1, const guint32 * ORC_RESTRICT s1, int p1, int n);
|
||||
void video_orc_resample_v_muladdtaps4_u8_lq (gint32 * ORC_RESTRICT d1, const guint32 * ORC_RESTRICT s1, const guint32 * ORC_RESTRICT s2, const guint32 * ORC_RESTRICT s3, const guint32 * ORC_RESTRICT s4, int p1, int p2, int p3, int p4, int n);
|
||||
void video_orc_resample_v_muladdscaletaps4_u8_lq (guint32 * ORC_RESTRICT d1, const guint32 * ORC_RESTRICT s1, const guint32 * ORC_RESTRICT s2, const guint32 * ORC_RESTRICT s3, const guint32 * ORC_RESTRICT s4, const gint32 * ORC_RESTRICT s5, int p1, int p2, int p3, int p4, int n);
|
||||
void video_orc_chroma_down_h2_u8 (guint8 * ORC_RESTRICT d1, const guint8 * ORC_RESTRICT s1, int n);
|
||||
void video_orc_chroma_down_v2_u8 (guint8 * ORC_RESTRICT d1, const guint8 * ORC_RESTRICT s1, const guint8 * ORC_RESTRICT s2, int n);
|
||||
void video_orc_chroma_up_v2_u8 (guint8 * ORC_RESTRICT d1, guint8 * ORC_RESTRICT d2, const guint8 * ORC_RESTRICT s1, const guint8 * ORC_RESTRICT s2, int n);
|
||||
|
|
|
@ -1500,7 +1500,6 @@ convsuswb d1, w1
|
|||
#convsuslw w1, l1
|
||||
#convsuswb d1, w1
|
||||
|
||||
|
||||
.function video_orc_resample_h_multaps_u8
|
||||
.source 1 s guint32
|
||||
.source 2 t gint16
|
||||
|
@ -1553,6 +1552,72 @@ convubw w1, s
|
|||
mullw w1, w1, t
|
||||
addw d, d, w1
|
||||
|
||||
# video_orc_resample_h_multaps3_u8_lq
# Multiplies three sources by their taps and writes the sum to d:
#   d = s1 * t1 + s2 * t2 + s3 * t3
# d is overwritten, not accumulated into.
# s1..s3 are declared as 1-byte sources; t1..t3 are 2-byte sources, so a tap
# value is read for every element. All arithmetic happens in the 2-byte
# temporaries w1/w2.
.function video_orc_resample_h_multaps3_u8_lq
|
||||
.source 1 s1 guint32
|
||||
.source 1 s2 guint32
|
||||
.source 1 s3 guint32
|
||||
.source 2 t1 gint16
|
||||
.source 2 t2 gint16
|
||||
.source 2 t3 gint16
|
||||
.dest 2 d gint32
|
||||
.temp 2 w1
|
||||
.temp 2 w2
|
||||
|
||||
# w1 = s1 widened from unsigned byte to word, times t1
convubw w1, s1
|
||||
mullw w1, w1, t1
|
||||
# w2 is reused as scratch for each further product, which is folded into w1
convubw w2, s2
|
||||
mullw w2, w2, t2
|
||||
addw w1, w1, w2
|
||||
convubw w2, s3
|
||||
mullw w2, w2, t3
|
||||
# the last add stores straight into d
addw d, w1, w2
|
||||
|
||||
# video_orc_resample_h_muladdtaps3_u8_lq
# Multiplies three sources by their taps and adds the sum to the value
# already in d:
#   d = d + (s1 * t1 + s2 * t2 + s3 * t3)
# s1..s3 are declared as 1-byte sources; t1..t3 are 2-byte sources, so a tap
# value is read for every element. All arithmetic happens in the 2-byte
# temporaries w1/w2.
.function video_orc_resample_h_muladdtaps3_u8_lq
|
||||
.source 1 s1 guint32
|
||||
.source 1 s2 guint32
|
||||
.source 1 s3 guint32
|
||||
.source 2 t1 gint16
|
||||
.source 2 t2 gint16
|
||||
.source 2 t3 gint16
|
||||
.dest 2 d gint32
|
||||
.temp 2 w1
|
||||
.temp 2 w2
|
||||
|
||||
# w1 = s1 widened from unsigned byte to word, times t1
convubw w1, s1
|
||||
mullw w1, w1, t1
|
||||
# w2 is reused as scratch for each further product, which is folded into w1
convubw w2, s2
|
||||
mullw w2, w2, t2
|
||||
addw w1, w1, w2
|
||||
convubw w2, s3
|
||||
mullw w2, w2, t3
|
||||
addw w1, w1, w2
|
||||
# d is both read and written: the three-tap sum is accumulated into it
addw d, d, w1
|
||||
|
||||
# video_orc_resample_h_muladdscaletaps3_u8_lq
# Multiplies three sources by their taps, adds the partial sum from temp,
# then rounds, shifts and converts to the byte output in one pass:
#   d = convsuswb ((s1 * t1 + s2 * t2 + s3 * t3 + temp + 32) >> 6)
# temp is a read-only 2-byte source here; the result goes to the 1-byte
# destination d, so no intermediate sum is written back.
# NOTE(review): the shift by 6 presumably matches the fixed-point precision
# of the LQ taps - confirm against the tap setup code.
.function video_orc_resample_h_muladdscaletaps3_u8_lq
|
||||
.source 1 s1 guint32
|
||||
.source 1 s2 guint32
|
||||
.source 1 s3 guint32
|
||||
.source 2 t1 gint16
|
||||
.source 2 t2 gint16
|
||||
.source 2 t3 gint16
|
||||
.source 2 temp gint32
|
||||
.dest 1 d guint32
|
||||
.temp 2 w1
|
||||
.temp 2 w2
|
||||
|
||||
# w1 = s1 widened from unsigned byte to word, times t1
convubw w1, s1
|
||||
mullw w1, w1, t1
|
||||
# w2 is reused as scratch for each further product, which is folded into w1
convubw w2, s2
|
||||
mullw w2, w2, t2
|
||||
addw w1, w1, w2
|
||||
convubw w2, s3
|
||||
mullw w2, w2, t3
|
||||
addw w1, w1, w2
|
||||
# add the previously accumulated partial sum
addw w1, w1, temp
|
||||
# 32 is half of 1 << 6: round before the arithmetic right shift by 6
addw w1, w1, 32
|
||||
shrsw w1, w1, 6
|
||||
# signed word -> unsigned byte conversion with saturation into d
convsuswb d, w1
|
||||
|
||||
.function video_orc_resample_scaletaps_u8_lq
|
||||
.source 2 s gint32
|
||||
.dest 1 d guint32
|
||||
|
@ -1645,6 +1710,31 @@ addl d, d, t1
|
|||
convubw w1, s
|
||||
mullw d, w1, t
|
||||
|
||||
# video_orc_resample_v_multaps4_u8_lq
# Multiplies four sources by four taps and writes the sum to d:
#   d = s1 * t1 + s2 * t2 + s3 * t3 + s4 * t4
# d is overwritten, not accumulated into.
# s1..s4 are declared as 1-byte sources. t1..t4 are declared with .param, so
# each tap is a single value for the whole call rather than a per-element
# array. All arithmetic happens in the 2-byte temporaries w1/w2.
.function video_orc_resample_v_multaps4_u8_lq
|
||||
.source 1 s1 guint32
|
||||
.source 1 s2 guint32
|
||||
.source 1 s3 guint32
|
||||
.source 1 s4 guint32
|
||||
.param 2 t1 gint16
|
||||
.param 2 t2 gint16
|
||||
.param 2 t3 gint16
|
||||
.param 2 t4 gint16
|
||||
.dest 2 d gint32
|
||||
.temp 2 w1
|
||||
.temp 2 w2
|
||||
|
||||
# w1 = s1 widened from unsigned byte to word, times t1
convubw w1, s1
|
||||
mullw w1, w1, t1
|
||||
# w2 is reused as scratch for each further product, which is folded into w1
convubw w2, s2
|
||||
mullw w2, w2, t2
|
||||
addw w1, w1, w2
|
||||
convubw w2, s3
|
||||
mullw w2, w2, t3
|
||||
addw w1, w1, w2
|
||||
convubw w2, s4
|
||||
mullw w2, w2, t4
|
||||
# the last add stores straight into d
addw d, w1, w2
|
||||
|
||||
.function video_orc_resample_v_muladdtaps_u8_lq
|
||||
.source 1 s guint32
|
||||
.param 2 t gint16
|
||||
|
@ -1655,6 +1745,62 @@ convubw w1, s
|
|||
mullw w1, w1, t
|
||||
addw d, d, w1
|
||||
|
||||
# video_orc_resample_v_muladdtaps4_u8_lq
# Multiplies four sources by four taps and adds the sum to the value already
# in d:
#   d = d + (s1 * t1 + s2 * t2 + s3 * t3 + s4 * t4)
# s1..s4 are declared as 1-byte sources. t1..t4 are declared with .param, so
# each tap is a single value for the whole call rather than a per-element
# array. All arithmetic happens in the 2-byte temporaries w1/w2.
.function video_orc_resample_v_muladdtaps4_u8_lq
|
||||
.source 1 s1 guint32
|
||||
.source 1 s2 guint32
|
||||
.source 1 s3 guint32
|
||||
.source 1 s4 guint32
|
||||
.param 2 t1 gint16
|
||||
.param 2 t2 gint16
|
||||
.param 2 t3 gint16
|
||||
.param 2 t4 gint16
|
||||
.dest 2 d gint32
|
||||
.temp 2 w1
|
||||
.temp 2 w2
|
||||
|
||||
# w1 = s1 widened from unsigned byte to word, times t1
convubw w1, s1
|
||||
mullw w1, w1, t1
|
||||
# w2 is reused as scratch for each further product, which is folded into w1
convubw w2, s2
|
||||
mullw w2, w2, t2
|
||||
addw w1, w1, w2
|
||||
convubw w2, s3
|
||||
mullw w2, w2, t3
|
||||
addw w1, w1, w2
|
||||
convubw w2, s4
|
||||
mullw w2, w2, t4
|
||||
addw w1, w1, w2
|
||||
# d is both read and written: the four-tap sum is accumulated into it
addw d, d, w1
|
||||
|
||||
# video_orc_resample_v_muladdscaletaps4_u8_lq
# Multiplies four sources by four taps, adds the partial sum from temp, then
# rounds, shifts and converts to the byte output in one pass:
#   d = convsuswb ((s1 * t1 + s2 * t2 + s3 * t3 + s4 * t4 + temp + 32) >> 6)
# t1..t4 are declared with .param, so each tap is a single value for the
# whole call. temp is a read-only 2-byte source here; the result goes to the
# 1-byte destination d, so no intermediate sum is written back.
# NOTE(review): the shift by 6 presumably matches the fixed-point precision
# of the LQ taps - confirm against the tap setup code.
.function video_orc_resample_v_muladdscaletaps4_u8_lq
|
||||
.source 1 s1 guint32
|
||||
.source 1 s2 guint32
|
||||
.source 1 s3 guint32
|
||||
.source 1 s4 guint32
|
||||
.source 2 temp gint32
|
||||
.param 2 t1 gint16
|
||||
.param 2 t2 gint16
|
||||
.param 2 t3 gint16
|
||||
.param 2 t4 gint16
|
||||
.dest 1 d guint32
|
||||
.temp 2 w1
|
||||
.temp 2 w2
|
||||
|
||||
# w1 = s1 widened from unsigned byte to word, times t1
convubw w1, s1
|
||||
mullw w1, w1, t1
|
||||
# w2 is reused as scratch for each further product, which is folded into w1
convubw w2, s2
|
||||
mullw w2, w2, t2
|
||||
addw w1, w1, w2
|
||||
convubw w2, s3
|
||||
mullw w2, w2, t3
|
||||
addw w1, w1, w2
|
||||
convubw w2, s4
|
||||
mullw w2, w2, t4
|
||||
addw w1, w1, w2
|
||||
# add the previously accumulated partial sum
addw w1, w1, temp
|
||||
# 32 is half of 1 << 6: round before the arithmetic right shift by 6
addw w1, w1, 32
|
||||
shrsw w1, w1, 6
|
||||
# signed word -> unsigned byte conversion with saturation into d
convsuswb d, w1
|
||||
|
||||
.function video_orc_chroma_down_h2_u8
|
||||
.source 8 s guint8
|
||||
.dest 8 d guint8
|
||||
|
|
|
@ -459,13 +459,50 @@ video_scale_h_ntap_4u8 (GstVideoScaler * scale,
|
|||
count = width * 4;
|
||||
|
||||
#ifdef LQ
|
||||
/* first pixels with first tap to t4 */
|
||||
video_orc_resample_h_multaps_u8_lq (temp, pixels, taps, count);
|
||||
/* add other pixels with other taps to t4 */
|
||||
video_orc_resample_h_muladdtaps_u8_lq (temp, 0, pixels + width, count,
|
||||
taps + count, count * 2, count, max_taps - 1);
|
||||
/* scale and write final result */
|
||||
video_orc_resample_scaletaps_u8_lq (d, temp, count);
|
||||
/* first pixels with first tap to temp */
|
||||
if (max_taps > 3) {
|
||||
video_orc_resample_h_multaps3_u8_lq (temp, pixels, pixels + width,
|
||||
pixels + width * 2, taps, taps + count, taps + count * 2, count);
|
||||
max_taps -= 3;
|
||||
pixels += width * 3;
|
||||
taps += count * 3;
|
||||
} else {
|
||||
gint first = max_taps % 3;
|
||||
|
||||
video_orc_resample_h_multaps_u8_lq (temp, pixels, taps, count);
|
||||
video_orc_resample_h_muladdtaps_u8_lq (temp, 0, pixels + width, count,
|
||||
taps + count, count * 2, count, first - 1);
|
||||
max_taps -= first;
|
||||
pixels += width * first;
|
||||
taps += count * first;
|
||||
}
|
||||
while (max_taps > 3) {
|
||||
if (max_taps >= 6) {
|
||||
video_orc_resample_h_muladdtaps3_u8_lq (temp, pixels, pixels + width,
|
||||
pixels + width * 2, taps, taps + count, taps + count * 2, count);
|
||||
max_taps -= 3;
|
||||
pixels += width * 3;
|
||||
taps += count * 3;
|
||||
} else {
|
||||
video_orc_resample_h_muladdtaps_u8_lq (temp, 0, pixels, count,
|
||||
taps, count * 2, count, max_taps - 3);
|
||||
pixels += width * (max_taps - 3);
|
||||
taps += count * (max_taps - 3);
|
||||
max_taps = 3;
|
||||
}
|
||||
}
|
||||
if (max_taps == 3) {
|
||||
video_orc_resample_h_muladdscaletaps3_u8_lq (d, pixels, pixels + width,
|
||||
pixels + width * 2, taps, taps + count, taps + count * 2, temp, count);
|
||||
} else {
|
||||
if (max_taps) {
|
||||
/* add other pixels with other taps to t4 */
|
||||
video_orc_resample_h_muladdtaps_u8_lq (temp, 0, pixels, count,
|
||||
taps, count * 2, count, max_taps);
|
||||
}
|
||||
/* scale and write final result */
|
||||
video_orc_resample_scaletaps_u8_lq (d, temp, count);
|
||||
}
|
||||
#else
|
||||
/* first pixels with first tap to t4 */
|
||||
video_orc_resample_h_multaps_u8 (temp, pixels, taps, count);
|
||||
|
@ -697,12 +734,53 @@ video_scale_v_ntap_4u8 (GstVideoScaler * scale,
|
|||
count = width * 4;
|
||||
|
||||
#ifdef LQ
|
||||
video_orc_resample_v_multaps_u8_lq (temp, srcs[0], taps[0], count);
|
||||
for (i = 1; i < max_taps; i++) {
|
||||
video_orc_resample_v_muladdtaps_u8_lq (temp, srcs[i * src_inc], taps[i],
|
||||
count);
|
||||
if (max_taps > 4) {
|
||||
video_orc_resample_v_multaps4_u8_lq (temp, srcs[0], srcs[1 * src_inc],
|
||||
srcs[2 * src_inc], srcs[3 * src_inc], taps[0], taps[1], taps[2],
|
||||
taps[3], count);
|
||||
max_taps -= 4;
|
||||
srcs += 4 * src_inc;
|
||||
taps += 4;
|
||||
} else {
|
||||
gint first = (max_taps % 4);
|
||||
|
||||
video_orc_resample_v_multaps_u8_lq (temp, srcs[0], taps[0], count);
|
||||
for (i = 1; i < first; i++) {
|
||||
video_orc_resample_v_muladdtaps_u8_lq (temp, srcs[i * src_inc], taps[i],
|
||||
count);
|
||||
}
|
||||
max_taps -= first;
|
||||
srcs += first * src_inc;
|
||||
taps += first;
|
||||
}
|
||||
video_orc_resample_scaletaps_u8_lq (d, temp, count);
|
||||
while (max_taps > 4) {
|
||||
if (max_taps >= 8) {
|
||||
video_orc_resample_v_muladdtaps4_u8_lq (temp, srcs[0], srcs[1 * src_inc],
|
||||
srcs[2 * src_inc], srcs[3 * src_inc], taps[0], taps[1], taps[2],
|
||||
taps[3], count);
|
||||
max_taps -= 4;
|
||||
srcs += 4 * src_inc;
|
||||
taps += 4;
|
||||
} else {
|
||||
for (i = 0; i < max_taps - 4; i++)
|
||||
video_orc_resample_v_muladdtaps_u8_lq (temp, srcs[i * src_inc], taps[i],
|
||||
count);
|
||||
srcs += (max_taps - 4) * src_inc;
|
||||
taps += (max_taps - 4);
|
||||
max_taps = 4;
|
||||
}
|
||||
}
|
||||
if (max_taps == 4) {
|
||||
video_orc_resample_v_muladdscaletaps4_u8_lq (d, srcs[0], srcs[1 * src_inc],
|
||||
srcs[2 * src_inc], srcs[3 * src_inc], temp, taps[0], taps[1], taps[2],
|
||||
taps[3], count);
|
||||
} else {
|
||||
for (i = 0; i < max_taps; i++)
|
||||
video_orc_resample_v_muladdtaps_u8_lq (temp, srcs[i * src_inc], taps[i],
|
||||
count);
|
||||
video_orc_resample_scaletaps_u8_lq (d, temp, count);
|
||||
}
|
||||
|
||||
#else
|
||||
video_orc_resample_v_multaps_u8 (temp, srcs[0], taps[0], count);
|
||||
for (i = 1; i < max_taps; i++) {
|
||||
|
@ -780,6 +858,9 @@ gst_video_scaler_horizontal (GstVideoScaler * scale, GstVideoFormat format,
|
|||
if (scale->tmpwidth < width)
|
||||
realloc_tmplines (scale, width);
|
||||
|
||||
GST_DEBUG ("format %d, pstride %d max_taps %d", format, pstride,
|
||||
scale->resampler.max_taps);
|
||||
|
||||
switch (pstride) {
|
||||
case 4:
|
||||
switch (scale->resampler.max_taps) {
|
||||
|
@ -853,6 +934,9 @@ gst_video_scaler_vertical (GstVideoScaler * scale, GstVideoFormat format,
|
|||
if (scale->tmpwidth < width)
|
||||
realloc_tmplines (scale, width);
|
||||
|
||||
GST_DEBUG ("format %d, pstride %d max_taps %d", format, pstride,
|
||||
scale->resampler.max_taps);
|
||||
|
||||
switch (pstride) {
|
||||
case 4:
|
||||
switch (scale->resampler.max_taps) {
|
||||
|
|
Loading…
Reference in a new issue