videoscale: refactor using more Orc code

Convert downsampling to Orc.  Convert horizontal linear scaling
to Orc.  Combine horizontal and vertical scaling into one pass.
This commit is contained in:
David Schleef 2010-06-14 15:42:09 -07:00
parent 1a75dede56
commit 0cceeb2035
5 changed files with 1740 additions and 272 deletions

File diff suppressed because it is too large Load diff

View file

@ -55,10 +55,20 @@ typedef union { orc_int16 i; orc_int8 x2[2]; } orc_union16;
typedef union { orc_int32 i; float f; orc_int16 x2[2]; orc_int8 x4[4]; } orc_union32;
typedef union { orc_int64 i; double f; orc_int32 x2[2]; orc_int16 x4[4]; } orc_union64;
#endif
void orc_merge_linear_u8 (orc_uint8 * d1, const orc_uint8 * s1, const orc_uint8 * s2, int p1, int p2, int n);
void orc_merge_linear_u8 (orc_uint8 * d1, const orc_uint8 * s1, const orc_uint8 * s2, int p1, int n);
void orc_merge_linear_u16 (orc_uint16 * d1, const orc_uint16 * s1, const orc_uint16 * s2, int p1, int p2, int n);
void orc_splat_u16 (orc_uint16 * d1, int p1, int n);
void orc_splat_u32 (orc_uint32 * d1, int p1, int n);
void orc_downsample_u8 (guint8 * d1, const guint8 * s1, int n);
void orc_downsample_u16 (guint16 * d1, const guint16 * s1, int n);
void gst_videoscale_orc_downsample_u32 (guint8 * d1, const guint8 * s1, int n);
void gst_videoscale_orc_downsample_yuyv (guint8 * d1, const guint8 * s1, int n);
void gst_videoscale_orc_resample_nearest_u8 (guint8 * d1, const guint8 * s1, int p1, int p2, int n);
void gst_videoscale_orc_resample_bilinear_u8 (guint8 * d1, const guint8 * s1, int p1, int p2, int n);
void gst_videoscale_orc_resample_nearest_u32 (guint8 * d1, const guint8 * s1, int p1, int p2, int n);
void gst_videoscale_orc_resample_bilinear_u32 (guint8 * d1, const guint8 * s1, int p1, int p2, int n);
void gst_videoscale_orc_resample_merge_bilinear_u32 (guint8 * d1, guint8 * d2, const guint8 * s1, const guint8 * s2, int p1, int p2, int p3, int n);
void gst_videoscale_orc_merge_bicubic_u8 (guint8 * d1, const guint8 * s1, const guint8 * s2, const guint8 * s3, const guint8 * s4, int p1, int p2, int p3, int p4, int n);
#ifdef __cplusplus
}

View file

@ -6,16 +6,21 @@
.source 1 s1
.source 1 s2
.param 1 p1
.param 1 p2
.temp 2 t1
.temp 2 t2
.temp 1 a
.temp 1 t
loadb a, s1
convubw t1, s1
convubw t2, s2
subw t2, t2, t1
mullw t2, t2, p1
addw t2, t2, 128
convhwb t, t2
addb d1, t, a
mulubw t1, s1, p1
mulubw t2, s2, p2
addw t1, t1, t2
addw t1, t1, 128
shruw t1, t1, 8
convwb d1, t1
.function orc_merge_linear_u16
.dest 2 d1
@ -26,21 +31,162 @@ convwb d1, t1
.temp 4 t1
.temp 4 t2
# This is slightly different than the u8 case, since muluwl
# tends to be much faster than mulll
muluwl t1, s1, p1
muluwl t2, s2, p2
addl t1, t1, t2
shrul t1, t1, 16
convlw d1, t1
.function orc_splat_u16
.dest 2 d1
.param 2 p1
copyw d1, p1
.function orc_splat_u32
.dest 4 d1
.param 4 p1
copyl d1, p1
.function orc_downsample_u8
.dest 1 d1 guint8
.source 2 s1 guint8
.temp 1 t1
.temp 1 t2
splitwb t1, t2, s1
avgub d1, t1, t2
.function orc_downsample_u16
.dest 2 d1 guint16
.source 4 s1 guint16
.temp 2 t1
.temp 2 t2
splitlw t1, t2, s1
avguw d1, t1, t2
.function gst_videoscale_orc_downsample_u32
.dest 4 d1 guint8
.source 8 s1 guint8
.temp 4 t1
.temp 4 t2
splitql t1, t2, s1
x4 avgub d1, t1, t2
.function gst_videoscale_orc_downsample_yuyv
.dest 4 d1 guint8
.source 8 s1 guint8
.temp 4 yyyy
.temp 4 uvuv
.temp 2 t1
.temp 2 t2
.temp 2 yy
.temp 2 uv
x4 splitwb yyyy, uvuv, s1
x2 splitwb t1, t2, yyyy
x2 avgub yy, t1, t2
splitlw t1, t2, uvuv
x2 avgub uv, t1, t2
x2 mergebw d1, yy, uv
.function gst_videoscale_orc_resample_nearest_u8
.dest 1 d1 guint8
.source 1 s1 guint8
.param 4 p1
.param 4 p2
ldresnearb d1, s1, p1, p2
.function gst_videoscale_orc_resample_bilinear_u8
.dest 1 d1 guint8
.source 1 s1 guint8
.param 4 p1
.param 4 p2
ldreslinb d1, s1, p1, p2
.function gst_videoscale_orc_resample_nearest_u32
.dest 4 d1 guint8
.source 4 s1 guint8
.param 4 p1
.param 4 p2
ldresnearl d1, s1, p1, p2
.function gst_videoscale_orc_resample_bilinear_u32
.dest 4 d1 guint8
.source 4 s1 guint8
.param 4 p1
.param 4 p2
ldreslinl d1, s1, p1, p2
.function gst_videoscale_orc_resample_merge_bilinear_u32
.dest 4 d1 guint8
.dest 4 d2 guint8
.source 4 s1 guint8
.source 4 s2 guint8
.temp 4 a
.temp 4 b
.temp 4 t
.temp 8 t1
.temp 8 t2
.param 4 p1
.param 4 p2
.param 4 p3
ldreslinl b, s2, p2, p3
storel d2, b
x4 loadb a, s1
x4 convubw t1, a
x4 convubw t2, b
x4 subw t2, t2, t1
x4 mullw t2, t2, p1
x4 convhwb t, t2
x4 addb d1, t, a
.function gst_videoscale_orc_merge_bicubic_u8
.dest 1 d1 guint8
.source 1 s1 guint8
.source 1 s2 guint8
.source 1 s3 guint8
.source 1 s4 guint8
.param 4 p1
.param 4 p2
.param 4 p3
.param 4 p4
.temp 2 t1
.temp 2 t2
mulubw t1, s2, p2
mulubw t2, s3, p3
addw t1, t1, t2
mulubw t2, s1, p1
subw t1, t1, t2
mulubw t2, s4, p4
subw t1, t1, t2
addw t1, t1, 32
shrsw t1, t1, 6
convsuswb d1, t1

View file

@ -30,6 +30,9 @@
#include "vs_scanline.h"
#include "vs_image.h"
#include "gstvideoscaleorc.h"
#include <gst/gst.h>
#define ROUND_UP_2(x) (((x)+1)&~1)
#define ROUND_UP_4(x) (((x)+3)&~3)
#define ROUND_UP_8(x) (((x)+7)&~7)
@ -43,7 +46,7 @@ vs_image_scale_nearest_RGBA (const VSImage * dest, const VSImage * src,
int x_increment;
int i;
int j;
int xacc;
int prev_j;
if (dest->height == 1)
y_increment = 0;
@ -57,14 +60,19 @@ vs_image_scale_nearest_RGBA (const VSImage * dest, const VSImage * src,
acc = 0;
prev_j = -1;
for (i = 0; i < dest->height; i++) {
j = acc >> 16;
xacc = 0;
vs_scanline_resample_nearest_RGBA (dest->pixels + i * dest->stride,
src->pixels + j * src->stride, src->width, dest->width, &xacc,
x_increment);
if (j == prev_j) {
memcpy (dest->pixels + i * dest->stride,
dest->pixels + (i - 1) * dest->stride, dest->width * 4);
} else {
gst_videoscale_orc_resample_nearest_u32 (dest->pixels + i * dest->stride,
src->pixels + j * src->stride, 0, x_increment, dest->width);
}
prev_j = j;
acc += y_increment;
}
}
@ -76,15 +84,12 @@ vs_image_scale_linear_RGBA (const VSImage * dest, const VSImage * src,
int acc;
int y_increment;
int x_increment;
uint8_t *tmp1;
uint8_t *tmp2;
int y1;
int y2;
int i;
int j;
int x;
int dest_size;
int xacc;
if (dest->height == 1)
y_increment = 0;
@ -98,64 +103,34 @@ vs_image_scale_linear_RGBA (const VSImage * dest, const VSImage * src,
dest_size = dest->width * 4;
tmp1 = tmpbuf;
tmp2 = tmpbuf + dest_size;
#define LINE(x) ((tmpbuf) + (dest_size)*((x)&1))
acc = 0;
xacc = 0;
y2 = -1;
vs_scanline_resample_linear_RGBA (tmp1, src->pixels, src->width, dest->width,
&xacc, x_increment);
gst_videoscale_orc_resample_bilinear_u32 (LINE (0), src->pixels,
0, x_increment, dest->width);
y1 = 0;
for (i = 0; i < dest->height; i++) {
j = acc >> 16;
x = acc & 0xffff;
if (x == 0) {
if (j == y1) {
memcpy (dest->pixels + i * dest->stride, tmp1, dest_size);
} else if (j == y2) {
memcpy (dest->pixels + i * dest->stride, tmp2, dest_size);
} else {
xacc = 0;
vs_scanline_resample_linear_RGBA (tmp1, src->pixels + j * src->stride,
src->width, dest->width, &xacc, x_increment);
y1 = j;
memcpy (dest->pixels + i * dest->stride, tmp1, dest_size);
}
memcpy (dest->pixels + i * dest->stride, LINE (j), dest_size);
} else {
if (j == y1) {
if (j + 1 != y2) {
xacc = 0;
vs_scanline_resample_linear_RGBA (tmp2,
src->pixels + (j + 1) * src->stride, src->width, dest->width,
&xacc, x_increment);
y2 = j + 1;
}
vs_scanline_merge_linear_RGBA (dest->pixels + i * dest->stride,
tmp1, tmp2, dest->width, x);
} else if (j == y2) {
if (j + 1 != y1) {
xacc = 0;
vs_scanline_resample_linear_RGBA (tmp1,
src->pixels + (j + 1) * src->stride, src->width, dest->width,
&xacc, x_increment);
y1 = j + 1;
}
vs_scanline_merge_linear_RGBA (dest->pixels + i * dest->stride,
tmp2, tmp1, dest->width, x);
if (j > y1) {
gst_videoscale_orc_resample_bilinear_u32 (LINE (j),
src->pixels + j * src->stride, 0, x_increment, dest->width);
y1++;
}
if (j >= y1) {
gst_videoscale_orc_resample_merge_bilinear_u32 (dest->pixels +
i * dest->stride, LINE (j + 1), LINE (j),
src->pixels + (j + 1) * src->stride, (x >> 8), 0, x_increment,
dest->width);
y1++;
} else {
xacc = 0;
vs_scanline_resample_linear_RGBA (tmp1, src->pixels + j * src->stride,
src->width, dest->width, &xacc, x_increment);
y1 = j;
xacc = 0;
vs_scanline_resample_linear_RGBA (tmp2,
src->pixels + (j + 1) * src->stride, src->width, dest->width, &xacc,
x_increment);
y2 = (j + 1);
vs_scanline_merge_linear_RGBA (dest->pixels + i * dest->stride,
tmp1, tmp2, dest->width, x);
orc_merge_linear_u8 (dest->pixels + i * dest->stride,
LINE (j), LINE (j + 1), (x >> 8), dest->width * 4);
}
}
@ -563,7 +538,6 @@ vs_image_scale_nearest_Y (const VSImage * dest, const VSImage * src,
int x_increment;
int i;
int j;
int xacc;
if (dest->height == 1)
y_increment = 0;
@ -579,11 +553,8 @@ vs_image_scale_nearest_Y (const VSImage * dest, const VSImage * src,
for (i = 0; i < dest->height; i++) {
j = acc >> 16;
xacc = 0;
vs_scanline_resample_nearest_Y (dest->pixels + i * dest->stride,
src->pixels + j * src->stride, src->width, dest->width, &xacc,
x_increment);
gst_videoscale_orc_resample_nearest_u8 (dest->pixels + i * dest->stride,
src->pixels + j * src->stride, 0, x_increment, dest->width);
acc += y_increment;
}
}
@ -623,8 +594,8 @@ vs_image_scale_linear_Y (const VSImage * dest, const VSImage * src,
acc = 0;
xacc = 0;
y2 = -1;
vs_scanline_resample_linear_Y (tmp1, src->pixels, src->width, dest->width,
&xacc, x_increment);
gst_videoscale_orc_resample_bilinear_u8 (tmp1, src->pixels,
0, x_increment, dest->width);
y1 = 0;
for (i = 0; i < dest->height; i++) {
j = acc >> 16;
@ -637,8 +608,8 @@ vs_image_scale_linear_Y (const VSImage * dest, const VSImage * src,
memcpy (dest->pixels + i * dest->stride, tmp2, dest_size);
} else {
xacc = 0;
vs_scanline_resample_linear_Y (tmp1, src->pixels + j * src->stride,
src->width, dest->width, &xacc, x_increment);
gst_videoscale_orc_resample_bilinear_u8 (tmp1,
src->pixels + j * src->stride, 0, x_increment, dest->width);
y1 = j;
memcpy (dest->pixels + i * dest->stride, tmp1, dest_size);
}
@ -646,35 +617,42 @@ vs_image_scale_linear_Y (const VSImage * dest, const VSImage * src,
if (j == y1) {
if (j + 1 != y2) {
xacc = 0;
vs_scanline_resample_linear_Y (tmp2,
src->pixels + (j + 1) * src->stride, src->width, dest->width,
&xacc, x_increment);
gst_videoscale_orc_resample_bilinear_u8 (tmp2,
src->pixels + (j + 1) * src->stride, 0, x_increment, dest->width);
y2 = j + 1;
}
vs_scanline_merge_linear_Y (dest->pixels + i * dest->stride,
tmp1, tmp2, dest->width, x);
if ((x >> 8) == 0) {
memcpy (dest->pixels + i * dest->stride, tmp1, dest->width);
} else {
orc_merge_linear_u8 (dest->pixels + i * dest->stride,
tmp1, tmp2, (x >> 8), dest->width);
}
} else if (j == y2) {
if (j + 1 != y1) {
xacc = 0;
vs_scanline_resample_linear_Y (tmp1,
src->pixels + (j + 1) * src->stride, src->width, dest->width,
&xacc, x_increment);
gst_videoscale_orc_resample_bilinear_u8 (tmp1,
src->pixels + (j + 1) * src->stride, 0, x_increment, dest->width);
y1 = j + 1;
}
vs_scanline_merge_linear_Y (dest->pixels + i * dest->stride,
tmp2, tmp1, dest->width, x);
if ((x >> 8) == 0) {
memcpy (dest->pixels + i * dest->stride, tmp2, dest->width);
} else {
orc_merge_linear_u8 (dest->pixels + i * dest->stride,
tmp2, tmp1, (x >> 8), dest->width);
}
} else {
xacc = 0;
vs_scanline_resample_linear_Y (tmp1, src->pixels + j * src->stride,
src->width, dest->width, &xacc, x_increment);
gst_videoscale_orc_resample_bilinear_u8 (tmp1,
src->pixels + j * src->stride, 0, x_increment, dest->width);
y1 = j;
xacc = 0;
vs_scanline_resample_linear_Y (tmp2,
src->pixels + (j + 1) * src->stride, src->width, dest->width, &xacc,
x_increment);
gst_videoscale_orc_resample_bilinear_u8 (tmp2,
src->pixels + (j + 1) * src->stride, 0, x_increment, dest->width);
y2 = (j + 1);
vs_scanline_merge_linear_Y (dest->pixels + i * dest->stride,
tmp1, tmp2, dest->width, x);
if ((x >> 8) == 0) {
memcpy (dest->pixels + i * dest->stride, tmp1, dest->width);
} else {
orc_merge_linear_u8 (dest->pixels + i * dest->stride,
tmp1, tmp2, (x >> 8), dest->width);
}
}
}

View file

@ -28,6 +28,7 @@
#include "vs_scanline.h"
#include "gstvideoscaleorc.h"
#include <gst/gst.h>
#include <string.h>
@ -36,31 +37,17 @@
/*
 * Halve a scanline of 8-bit samples: n bytes are written to dest from
 * 2 * n bytes read from src.
 *
 * As listed, a scalar loop stores the truncating mean of each adjacent
 * source pair and orc_downsample_u8() is then called on the same buffers.
 *
 * NOTE(review): this listing looks like a flattened diff in which the
 * scalar loop is the removed implementation and the Orc call is its
 * replacement -- confirm against the real file.
 */
void
vs_scanline_downsample_Y (uint8_t * dest, uint8_t * src, int n)
{
int i;
/* dest[i] = (src[2i] + src[2i + 1]) / 2, integer division */
for (i = 0; i < n; i++) {
dest[i] = (src[i * 2] + src[i * 2 + 1]) / 2;
}
/* Orc kernel invoked with the same dest, src and element count */
orc_downsample_u8 (dest, src, n);
}
/*
 * Nearest-neighbour horizontal resample of one 8-bit scanline.
 *
 * dest        - receives n output bytes
 * src         - source scanline
 * src_width   - number of source samples; used by the scalar loop to avoid
 *               reading src[j + 1] past the end
 * n           - number of output samples
 * accumulator - in/out source position; the code treats it as fixed point
 *               with the integer part in the bits above 16 (acc >> 16) and
 *               the fraction in the low 16 bits (acc & 0xffff)
 * increment   - amount added to the position per output sample
 *
 * NOTE(review): this listing looks like a flattened diff. The local
 * variables, the scalar loop and "*accumulator = acc" appear to be the
 * removed implementation; the Orc call and "*accumulator += n * increment"
 * appear to be the replacement. Read literally, the accumulator would be
 * advanced twice -- confirm against the real file.
 */
void
vs_scanline_resample_nearest_Y (uint8_t * dest, uint8_t * src, int src_width,
int n, int *accumulator, int increment)
{
int acc = *accumulator;
int i;
int j;
int x;
/* Orc kernel: start offset *accumulator, step increment, n outputs */
gst_videoscale_orc_resample_nearest_u8 (dest, src,
*accumulator, increment, n);
for (i = 0; i < n; i++) {
j = acc >> 16;
x = acc & 0xffff;
/* pick src[j] when the fraction is below one half or j is the last sample */
dest[i] = (x < 32768 || j + 1 >= src_width) ? src[j] : src[j + 1];
acc += increment;
}
*accumulator = acc;
*accumulator += n * increment;
}
#include <glib.h>
@ -68,24 +55,10 @@ void
vs_scanline_resample_linear_Y (uint8_t * dest, uint8_t * src, int src_width,
int n, int *accumulator, int increment)
{
int acc = *accumulator;
int i;
int j;
int x;
gst_videoscale_orc_resample_bilinear_u8 (dest, src,
*accumulator, increment, n);
for (i = 0; i < n; i++) {
j = acc >> 16;
x = acc & 0xffff;
if (j + 1 < src_width)
dest[i] = (src[j] * (65536 - x) + src[j + 1] * x) >> 16;
else
dest[i] = src[j];
acc += increment;
}
*accumulator = acc;
*accumulator += n * increment;
}
void
@ -97,19 +70,14 @@ vs_scanline_merge_linear_Y (uint8_t * dest, uint8_t * src1, uint8_t * src2,
if (value == 0) {
memcpy (dest, src1, n);
} else {
orc_merge_linear_u8 (dest, src1, src2, 256 - value, value, n);
orc_merge_linear_u8 (dest, src1, src2, value, n);
}
}
/*
 * Halve a scanline of 16-bit samples. dest and src are byte pointers that
 * are reinterpreted as uint16_t arrays; n 16-bit values are written from
 * 2 * n 16-bit values read.
 *
 * As listed, a scalar loop stores the truncating mean of each adjacent
 * pair and orc_downsample_u16() is then called on the same buffers.
 *
 * NOTE(review): this listing looks like a flattened diff in which the
 * scalar loop is the removed implementation and the Orc call is its
 * replacement -- confirm against the real file.
 */
void
vs_scanline_downsample_Y16 (uint8_t * dest, uint8_t * src, int n)
{
int i;
uint16_t *d = (uint16_t *) dest, *s = (uint16_t *) src;
/* d[i] = (s[2i] + s[2i + 1]) / 2, integer division */
for (i = 0; i < n; i++) {
d[i] = (s[i * 2] + s[i * 2 + 1]) / 2;
}
/* Orc kernel invoked on the same buffers, cast to 16-bit elements */
orc_downsample_u16 ((uint16_t *) dest, (uint16_t *) src, n);
}
void
@ -178,80 +146,27 @@ vs_scanline_merge_linear_Y16 (uint8_t * dest, uint8_t * src1, uint8_t * src2,
/*
 * Halve a scanline of 4-byte pixels: n pixels (4 * n bytes) are written to
 * dest from 2 * n pixels (8 * n bytes) read from src.
 *
 * As listed, a scalar loop averages each byte of a pixel with the byte at
 * the same position in the following pixel (offset + 4), using truncating
 * division, and gst_videoscale_orc_downsample_u32() is then called on the
 * same buffers.
 *
 * NOTE(review): this listing looks like a flattened diff in which the
 * scalar loop is the removed implementation and the Orc call is its
 * replacement -- confirm against the real file.
 */
void
vs_scanline_downsample_RGBA (uint8_t * dest, uint8_t * src, int n)
{
int i;
for (i = 0; i < n; i++) {
/* component k of output pixel i = mean of component k of source pixels 2i and 2i + 1 */
dest[i * 4 + 0] = (src[i * 8 + 0] + src[i * 8 + 4]) / 2;
dest[i * 4 + 1] = (src[i * 8 + 1] + src[i * 8 + 5]) / 2;
dest[i * 4 + 2] = (src[i * 8 + 2] + src[i * 8 + 6]) / 2;
dest[i * 4 + 3] = (src[i * 8 + 3] + src[i * 8 + 7]) / 2;
}
/* Orc kernel invoked with the same dest, src and pixel count */
gst_videoscale_orc_downsample_u32 (dest, src, n);
}
/*
 * Nearest-neighbour horizontal resample of one scanline of 4-byte pixels.
 *
 * dest        - receives n output pixels (4 bytes each)
 * src         - source scanline
 * src_width   - number of source pixels; used by the scalar loop to avoid
 *               reading pixel j + 1 past the end
 * n           - number of output pixels
 * accumulator - in/out source position; the code treats it as fixed point
 *               with the integer part in the bits above 16 (acc >> 16) and
 *               the fraction in the low 16 bits (acc & 0xffff)
 * increment   - amount added to the position per output pixel
 *
 * NOTE(review): this listing looks like a flattened diff. The local
 * variables, the scalar loop and "*accumulator = acc" appear to be the
 * removed implementation; the Orc call and "*accumulator += n * increment"
 * appear to be the replacement. Read literally, the accumulator would be
 * advanced twice -- confirm against the real file.
 */
void
vs_scanline_resample_nearest_RGBA (uint8_t * dest, uint8_t * src, int src_width,
int n, int *accumulator, int increment)
{
int acc = *accumulator;
int i;
int j;
int x;
/* Orc kernel: start offset *accumulator, step increment, n outputs */
gst_videoscale_orc_resample_nearest_u32 (dest, src,
*accumulator, increment, n);
for (i = 0; i < n; i++) {
j = acc >> 16;
x = acc & 0xffff;
if (j + 1 < src_width) {
/* a right-hand neighbour exists: take it when the fraction is one half or more */
dest[i * 4 + 0] = (x < 32768) ? src[j * 4 + 0] : src[j * 4 + 4];
dest[i * 4 + 1] = (x < 32768) ? src[j * 4 + 1] : src[j * 4 + 5];
dest[i * 4 + 2] = (x < 32768) ? src[j * 4 + 2] : src[j * 4 + 6];
dest[i * 4 + 3] = (x < 32768) ? src[j * 4 + 3] : src[j * 4 + 7];
} else {
/* no right-hand neighbour inside src_width: copy pixel j unchanged */
dest[i * 4 + 0] = src[j * 4 + 0];
dest[i * 4 + 1] = src[j * 4 + 1];
dest[i * 4 + 2] = src[j * 4 + 2];
dest[i * 4 + 3] = src[j * 4 + 3];
}
acc += increment;
}
*accumulator = acc;
*accumulator += n * increment;
}
/*
 * Linear-interpolating horizontal resample of one scanline of 4-byte pixels.
 *
 * dest        - receives n output pixels (4 bytes each)
 * src         - source scanline
 * src_width   - number of source pixels; used by the scalar loop to avoid
 *               reading pixel j + 1 past the end
 * n           - number of output pixels
 * accumulator - in/out source position; the code treats it as fixed point
 *               with the integer part in the bits above 16 (acc >> 16) and
 *               the fraction in the low 16 bits (acc & 0xffff)
 * increment   - amount added to the position per output pixel
 *
 * NOTE(review): this listing looks like a flattened diff. The local
 * variables, the scalar loop and "*accumulator = acc" appear to be the
 * removed implementation; the Orc call and "*accumulator += n * increment"
 * appear to be the replacement. Read literally, the accumulator would be
 * advanced twice -- confirm against the real file.
 */
void
vs_scanline_resample_linear_RGBA (uint8_t * dest, uint8_t * src, int src_width,
int n, int *accumulator, int increment)
{
int acc = *accumulator;
int i;
int j;
int x;
/* Orc kernel: start offset *accumulator, step increment, n outputs */
gst_videoscale_orc_resample_bilinear_u32 (dest, src,
*accumulator, increment, n);
for (i = 0; i < n; i++) {
j = acc >> 16;
x = acc & 0xffff;
if (j + 1 < src_width) {
/* per byte: weight pixel j by (65536 - x) and pixel j + 1 by x, then drop 16 fraction bits */
dest[i * 4 + 0] =
(src[j * 4 + 0] * (65536 - x) + src[j * 4 + 4] * x) >> 16;
dest[i * 4 + 1] =
(src[j * 4 + 1] * (65536 - x) + src[j * 4 + 5] * x) >> 16;
dest[i * 4 + 2] =
(src[j * 4 + 2] * (65536 - x) + src[j * 4 + 6] * x) >> 16;
dest[i * 4 + 3] =
(src[j * 4 + 3] * (65536 - x) + src[j * 4 + 7] * x) >> 16;
} else {
/* no right-hand neighbour inside src_width: copy pixel j unchanged */
dest[i * 4 + 0] = src[j * 4 + 0];
dest[i * 4 + 1] = src[j * 4 + 1];
dest[i * 4 + 2] = src[j * 4 + 2];
dest[i * 4 + 3] = src[j * 4 + 3];
}
acc += increment;
}
*accumulator = acc;
*accumulator += n * increment;
}
void
@ -263,7 +178,7 @@ vs_scanline_merge_linear_RGBA (uint8_t * dest, uint8_t * src1, uint8_t * src2,
if (value == 0) {
memcpy (dest, src1, n * 4);
} else {
orc_merge_linear_u8 (dest, src1, src2, 256 - value, value, n * 4);
orc_merge_linear_u8 (dest, src1, src2, value, n * 4);
}
}
@ -348,7 +263,7 @@ vs_scanline_merge_linear_RGB (uint8_t * dest, uint8_t * src1, uint8_t * src2,
if (value == 0) {
memcpy (dest, src1, n * 3);
} else {
orc_merge_linear_u8 (dest, src1, src2, 256 - value, value, n * 3);
orc_merge_linear_u8 (dest, src1, src2, value, n * 3);
}
}
@ -361,14 +276,7 @@ vs_scanline_merge_linear_RGB (uint8_t * dest, uint8_t * src1, uint8_t * src2,
/*
 * Halve a scanline stored as 4-byte groups: n groups (4 * n bytes) are
 * written to dest from 2 * n groups (8 * n bytes) read from src.
 *
 * As listed, a scalar loop builds each output group from truncating means
 * of source bytes at offsets (0, 2), (1, 5), (4, 6) and (3, 7) of an 8-byte
 * window, and gst_videoscale_orc_downsample_yuyv() is then called on the
 * same buffers.
 *
 * NOTE(review): presumably offsets 0/2/4/6 are luma and 1/5 and 3/7 are the
 * two chroma channels, per the YUYV name -- confirm.
 * NOTE(review): this listing looks like a flattened diff in which the
 * scalar loop is the removed implementation and the Orc call is its
 * replacement -- confirm against the real file.
 */
void
vs_scanline_downsample_YUYV (uint8_t * dest, uint8_t * src, int n)
{
int i;
for (i = 0; i < n; i++) {
/* even output bytes average two neighbouring even-offset source bytes */
dest[i * 4 + 0] = (src[i * 8 + 0] + src[i * 8 + 2]) / 2;
/* odd output bytes average source bytes four positions apart */
dest[i * 4 + 1] = (src[i * 8 + 1] + src[i * 8 + 5]) / 2;
dest[i * 4 + 2] = (src[i * 8 + 4] + src[i * 8 + 6]) / 2;
dest[i * 4 + 3] = (src[i * 8 + 3] + src[i * 8 + 7]) / 2;
}
/* Orc kernel invoked with the same dest, src and group count */
gst_videoscale_orc_downsample_yuyv (dest, src, n);
}
void
@ -477,7 +385,7 @@ vs_scanline_merge_linear_YUYV (uint8_t * dest, uint8_t * src1, uint8_t * src2,
if (value == 0) {
memcpy (dest, src1, quads * 4);
} else {
orc_merge_linear_u8 (dest, src1, src2, 256 - value, value, quads * 4);
orc_merge_linear_u8 (dest, src1, src2, value, quads * 4);
}
}
@ -606,7 +514,7 @@ vs_scanline_merge_linear_UYVY (uint8_t * dest, uint8_t * src1,
if (value == 0) {
memcpy (dest, src1, quads * 4);
} else {
orc_merge_linear_u8 (dest, src1, src2, 256 - value, value, quads * 4);
orc_merge_linear_u8 (dest, src1, src2, value, quads * 4);
}
}