From 0f2be22e76df1bcf93b2adbc9296e376c48407c6 Mon Sep 17 00:00:00 2001 From: Wim Taymans Date: Tue, 2 Dec 2014 11:32:28 +0100 Subject: [PATCH] video-dither: add video dither helper object Add a new object that implements various dithering methods. --- gst-libs/gst/video/Makefile.am | 4 +- gst-libs/gst/video/video-dither.c | 484 ++++++++++++ gst-libs/gst/video/video-dither.h | 77 ++ gst-libs/gst/video/video-orc-dist.c | 1102 ++++++++++++++++++++++++++- gst-libs/gst/video/video-orc-dist.h | 7 + gst-libs/gst/video/video-orc.orc | 94 +++ gst-libs/gst/video/video.h | 1 + 7 files changed, 1762 insertions(+), 7 deletions(-) create mode 100644 gst-libs/gst/video/video-dither.c create mode 100644 gst-libs/gst/video/video-dither.h diff --git a/gst-libs/gst/video/Makefile.am b/gst-libs/gst/video/Makefile.am index c220f0b1a2..e18c49a1f3 100644 --- a/gst-libs/gst/video/Makefile.am +++ b/gst-libs/gst/video/Makefile.am @@ -2,7 +2,7 @@ ORC_SOURCE=video-orc include $(top_srcdir)/common/orc.mak -glib_enum_headers = video.h video-format.h video-color.h video-info.h \ +glib_enum_headers = video.h video-format.h video-color.h video-info.h video-dither.h \ colorbalance.h navigation.h video-chroma.h video-tile.h video-converter.h \ video-resampler.h glib_enum_define = GST_VIDEO @@ -27,6 +27,7 @@ libgstvideo_@GST_API_VERSION@_la_SOURCES = \ video-chroma.c \ video-color.c \ video-converter.c \ + video-dither.c \ video-info.c \ video-frame.c \ video-scaler.c \ @@ -58,6 +59,7 @@ libgstvideo_@GST_API_VERSION@include_HEADERS = \ video-chroma.h \ video-color.h \ video-converter.h \ + video-dither.h \ video-info.h \ video-frame.h \ video-scaler.h \ diff --git a/gst-libs/gst/video/video-dither.c b/gst-libs/gst/video/video-dither.c new file mode 100644 index 0000000000..881c703c4d --- /dev/null +++ b/gst-libs/gst/video/video-dither.c @@ -0,0 +1,484 @@ +/* GStreamer + * Copyright (C) <2014> Wim Taymans + * + * This library is free software; you can redistribute it and/or + * modify it under the 
terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. + */ + +#include <string.h> + +#include "video-dither.h" +#include "video-orc.h" + +struct _GstVideoDither +{ + GstVideoDitherMethod method; + GstVideoDitherFlags flags; + GstVideoFormat format; + guint width; + + guint depth; + guint n_comp; + + void (*func) (GstVideoDither * dither, gpointer pixels, guint x, guint y, + guint width); + guint8 shift[4]; + guint16 mask[4]; + guint64 orc_mask64; + guint32 orc_mask32; + + gpointer errors; +}; + +static void +dither_none_u8_mask (GstVideoDither * dither, gpointer pixels, guint x, guint y, + guint width) +{ + guint8 *p = pixels; + + video_orc_dither_none_4u8_mask (p + (x * 4), dither->orc_mask32, width); +} + +static void +dither_none_u16_mask (GstVideoDither * dither, gpointer pixels, guint x, + guint y, guint width) +{ + guint16 *p = pixels; + + video_orc_dither_none_4u16_mask (p + (x * 4), dither->orc_mask64, width); +} + +static void +dither_verterr_u8 (GstVideoDither * dither, gpointer pixels, guint x, guint y, + guint width) +{ + guint8 *p = pixels; + guint16 *e = dither->errors; + + if (y == 0) + memset (e + (x * 4), 0, width * 8); + + video_orc_dither_verterr_4u8_mask (p + (x * 4), e + (x * 4), + dither->orc_mask64, width); +} + +static void +dither_verterr_u16 (GstVideoDither * dither, gpointer pixels, guint x, guint y, + guint width) +{ + guint16 *p = pixels; + guint16
*e = dither->errors; + + if (y == 0) + memset (e + (x * 4), 0, width * 8); + + { + gint i, end; + guint16 *m = dither->mask; + guint32 v, mp; + + end = (width + x) * 4; + for (i = x * 4; i < end; i++) { + mp = m[i & 3]; + v = p[i] + e[i]; + /* take new error and store */ + e[i] = v & mp; + /* quantize and store */ + v &= ~mp; + p[i] = CLAMP (v, 0, 65535); + } + } +} + +static void +dither_floyd_steinberg_u8 (GstVideoDither * dither, gpointer pixels, guint x, + guint y, guint width) +{ + guint8 *p = pixels; + guint16 *e = dither->errors; + + if (y == 0) + memset (e + (x * 4), 0, (width + 1) * 8); + + /* add and multiply errors from previous line */ + video_orc_dither_fs_muladd_u8 (e + x * 4, width * 4); +#if 1 + { + gint i, end; + guint16 *m = dither->mask, mp; + guint16 v; + + end = (width + x) * 4; + + for (i = x * 4; i < end; i++) { + mp = m[i & 3]; + v = p[i] + ((7 * e[i] + e[i + 4]) >> 4); + /* take new error and store */ + e[i + 4] = v & mp; + /* quantize and store */ + v &= ~mp; + p[i] = CLAMP (v, 0, 255); + } + } +#else + video_orc_dither_fs_add_4u8 (p, e + x * 4, e + (x + 1) * 4, + dither->orc_mask64, width); +#endif +} + +static void +dither_floyd_steinberg_u16 (GstVideoDither * dither, gpointer pixels, guint x, + guint y, guint width) +{ + guint16 *p = pixels; + guint16 *e = dither->errors; + + if (y == 0) + memset (e + (x * 4), 0, (width + 1) * 8); + + { + gint i, end; + guint16 *m = dither->mask, mp; + guint32 v; + + end = (width + x) * 4; + for (i = x * 4; i < end; i++) { + mp = m[i & 3]; + /* apply previous errors to pixel */ + v = p[i] + ((7 * e[i] + e[i + 4] + 5 * e[i + 8] + 3 * e[i + 12]) >> 4); + /* take new error and store */ + e[i + 4] = v & mp; + /* quantize and store */ + v &= ~mp; + p[i] = CLAMP (v, 0, 65535); + } + } +} + +static void +dither_sierra_lite_u8 (GstVideoDither * dither, gpointer pixels, guint x, + guint y, guint width) +{ + guint8 *p = pixels; + guint16 *e = dither->errors; + gint i, end; + guint16 *m = dither->mask, mp; + 
guint16 v; + + if (y == 0) + memset (e + (x * 4), 0, (width + 4) * 8); + + /* 4 components per pixel: start at the first component of pixel x */ + end = (width + x) * 4; + for (i = x * 4; i < end; i++) { + mp = m[i & 3]; + /* apply previous errors to pixel */ + v = p[i] + ((2 * e[i] + e[i + 8] + e[i + 12]) >> 2); + /* store new error */ + e[i + 4] = v & mp; + /* quantize and store */ + v &= ~mp; + p[i] = CLAMP (v, 0, 255); + } +} + +static void +dither_sierra_lite_u16 (GstVideoDither * dither, gpointer pixels, guint x, + guint y, guint width) +{ + guint16 *p = pixels; + guint16 *e = dither->errors; + gint i, end; + guint16 *m = dither->mask, mp; + guint32 v; + + if (y == 0) + memset (e + (x * 4), 0, (width + 4) * 8); + + /* 4 components per pixel: start at the first component of pixel x */ + end = (width + x) * 4; + for (i = x * 4; i < end; i++) { + mp = m[i & 3]; + /* apply previous errors to pixel */ + v = p[i] + ((2 * e[i] + e[i + 8] + e[i + 12]) >> 2); + /* store new error */ + e[i + 4] = v & mp; + /* quantize and store */ + v &= ~mp; + p[i] = CLAMP (v, 0, 65535); + } +} + +/* 16x16 ordered-dither threshold map; every value 0..255 appears exactly once */ +static const guint16 bayer_map[16][16] = { + {0, 128, 32, 160, 8, 136, 40, 168, 2, 130, 34, 162, 10, 138, 42, 170}, + {192, 64, 224, 96, 200, 72, 232, 104, 194, 66, 226, 98, 202, 74, 234, 106}, + {48, 176, 16, 144, 56, 184, 24, 152, 50, 178, 18, 146, 58, 186, 26, 154}, + {240, 112, 208, 80, 248, 120, 216, 88, 242, 114, 210, 82, 250, 122, 218, 90}, + {12, 140, 44, 172, 4, 132, 36, 164, 14, 142, 46, 174, 6, 134, 38, 166}, + {204, 76, 236, 108, 196, 68, 228, 100, 206, 78, 238, 110, 198, 70, 230, 102}, + {60, 188, 28, 156, 52, 180, 20, 148, 62, 190, 30, 158, 54, 182, 22, 150}, + {252, 124, 220, 92, 244, 116, 212, 84, 254, 126, 222, 94, 246, 118, 214, 86}, + {3, 131, 35, 163, 11, 139, 43, 171, 1, 129, 33, 161, 9, 137, 41, 169}, + {195, 67, 227, 99, 203, 75, 235, 107, 193, 65, 225, 97, 201, 73, 233, 105}, + {51, 179, 19, 147, 59, 187, 27, 155, 49, 177, 17, 145, 57, 185, 25, 153}, + {243, 115, 211, 83, 251, 123, 219, 91, 241, 113, 209, 81, 249, 121, 217, 89}, + {15, 143, 47, 175, 7, 135, 39, 167, 13, 141, 45, 173, 5, 133, 37, 165}, + {207,
79, 239, 111, 199, 71, 231, 103, 205, 77, 237, 109, 197, 69, 229, 101}, + {63, 191, 31, 159, 55, 183, 23, 151, 61, 189, 29, 157, 53, 181, 21, 149}, + {255, 127, 223, 95, 247, 119, 215, 87, 253, 125, 221, 93, 245, 117, 213, 85} +}; + +/* The pattern rows built by setup_bayer() are dither->width pixels apart, so + * the row stride below must not depend on the width of a particular call. + * The pixel pointer is advanced to pixel x like in the other dither functions. */ +static void +dither_ordered_u8 (GstVideoDither * dither, gpointer pixels, guint x, guint y, + guint width) +{ + guint8 *p = pixels; + guint8 *c = + (guint8 *) dither->errors + ((y & 15) * dither->width + (x & 15)) * 4; + + video_orc_dither_ordered_u8 (p + (x * 4), c, width * 4); +} + +static void +dither_ordered_u8_mask (GstVideoDither * dither, gpointer pixels, guint x, + guint y, guint width) +{ + guint8 *p = pixels; + guint16 *c = + (guint16 *) dither->errors + ((y & 15) * dither->width + (x & 15)) * 4; + + video_orc_dither_ordered_4u8_mask (p + (x * 4), c, dither->orc_mask64, width); +} + +static void +dither_ordered_u16_mask (GstVideoDither * dither, gpointer pixels, guint x, + guint y, guint width) +{ + guint16 *p = pixels; + guint16 *c = + (guint16 *) dither->errors + ((y & 15) * dither->width + (x & 15)) * 4; + + video_orc_dither_ordered_4u16_mask (p + (x * 4), c, dither->orc_mask64, width); +} + +static void +alloc_errors (GstVideoDither * dither, guint lines) +{ + guint width, n_comp; + + width = dither->width; + n_comp = dither->n_comp; + + dither->errors = g_malloc0 (sizeof (guint16) * (width + 8) * n_comp * lines); +} + +static void +setup_bayer (GstVideoDither * dither) +{ + guint i, j, k, width, n_comp, errdepth; + guint8 *shift; + + width = dither->width; + shift = dither->shift; + n_comp = dither->n_comp; + + if (dither->depth == 8) { + if (dither->flags & GST_VIDEO_DITHER_FLAG_QUANTIZE) { + dither->func = dither_ordered_u8_mask; + errdepth = 16; + } else { + dither->func = dither_ordered_u8; + errdepth = 8; + } + } else { + dither->func = dither_ordered_u16_mask; + errdepth = 16; + } + + alloc_errors (dither, 16); + + if (errdepth == 8) { + for (i = 0; i < 16; i++) { + guint8 *p = (guint8 *) dither->errors + (n_comp * width * i), v; + for (j = 0; j < width; j++) { + for (k =
0; k < n_comp; k++) { + v = bayer_map[i & 15][j & 15]; + if (shift[k] < 8) + v = v >> (8 - shift[k]); + p[n_comp * j + k] = v; + } + } + } + } else { + for (i = 0; i < 16; i++) { + guint16 *p = (guint16 *) dither->errors + (n_comp * width * i), v; + for (j = 0; j < width; j++) { + for (k = 0; k < n_comp; k++) { + v = bayer_map[i & 15][j & 15]; + if (shift[k] < 8) + v = v >> (8 - shift[k]); + p[n_comp * j + k] = v; + } + } + } + } +} + +static gint +count_power (guint v) +{ + gint res = 0; + while (v > 1) { + res++; + v >>= 1; + } + return res; +} + +/** + * gst_video_dither_new: + * @method: a #GstVideoDitherMethod + * @flags: a #GstVideoDitherFlags + * @format: a #GstVideoFormat + * @quantizer: array with the quantizer of each component + * @width: the width of the lines + * + * Make a new dither object for dithering lines of @format using the + * algorithm described by @method. + * + * Each component will be quantized to a multiple of @quantizer. Better + * performance is achieved when @quantizer is a power of 2. + * + * @width is the width of the lines that this ditherer will handle.
+ * + * Returns: a new #GstVideoDither, or %NULL when @format is not supported. + * Free with gst_video_dither_free(). + */ +GstVideoDither * +gst_video_dither_new (GstVideoDitherMethod method, GstVideoDitherFlags flags, + GstVideoFormat format, guint quantizer[GST_VIDEO_MAX_COMPONENTS], + guint width) +{ + GstVideoDither *dither; + gint i; + + dither = g_slice_new0 (GstVideoDither); + dither->method = method; + dither->flags = flags; + dither->format = format; + dither->width = width; + + dither->n_comp = 4; + + switch (format) { + case GST_VIDEO_FORMAT_AYUV: + case GST_VIDEO_FORMAT_ARGB: + dither->depth = 8; + break; + case GST_VIDEO_FORMAT_AYUV64: + case GST_VIDEO_FORMAT_ARGB64: + dither->depth = 16; + break; + default: + /* unsupported format: don't leak the half-initialized object */ + g_slice_free (GstVideoDither, dither); + g_return_val_if_reached (NULL); + break; + } + + for (i = 0; i < 4; i++) { + /* FIXME, only power of 2 quantizers */ + guint q = quantizer[(i + 3) & 3]; + + dither->shift[i] = count_power (q); + dither->mask[i] = (1 << dither->shift[i]) - 1; + GST_DEBUG ("%d: quant %d shift %d mask %08x", i, q, dither->shift[i], + dither->mask[i]); + dither->orc_mask64 = + (dither->orc_mask64 << 16) | GUINT16_FROM_BE (dither->mask[i]); + dither->orc_mask32 = (dither->orc_mask32 << 8) | (guint8) dither->mask[i]; + } + dither->orc_mask64 = GUINT64_FROM_BE (dither->orc_mask64); + dither->orc_mask32 = GUINT32_FROM_BE (dither->orc_mask32); + GST_DEBUG ("mask64 %08llx", (unsigned long long int) dither->orc_mask64); + GST_DEBUG ("mask32 %08x", dither->orc_mask32); + + switch (method) { + case GST_VIDEO_DITHER_NONE: + if (dither->flags & GST_VIDEO_DITHER_FLAG_QUANTIZE) + if (dither->depth == 8) + dither->func = dither_none_u8_mask; + else + dither->func = dither_none_u16_mask; + else + dither->func = NULL; + break; + case GST_VIDEO_DITHER_VERTERR: + alloc_errors (dither, 1); + if (dither->depth == 8) { + dither->func = dither_verterr_u8; + } else + dither->func = dither_verterr_u16; + break; + case GST_VIDEO_DITHER_FLOYD_STEINBERG: + alloc_errors (dither, 1); + if (dither->depth == 8) { + dither->func = dither_floyd_steinberg_u8; + } else +
dither->func = dither_floyd_steinberg_u16; + break; + case GST_VIDEO_DITHER_SIERRA_LITE: + alloc_errors (dither, 1); + if (dither->depth == 8) { + dither->func = dither_sierra_lite_u8; + } else + dither->func = dither_sierra_lite_u16; + break; + case GST_VIDEO_DITHER_BAYER: + setup_bayer (dither); + break; + } + return dither; +} + +/** + * gst_video_dither_free: + * @dither: a #GstVideoDither + * + * Free @dither + */ +void +gst_video_dither_free (GstVideoDither * dither) +{ + g_return_if_fail (dither != NULL); + + g_free (dither->errors); + g_slice_free (GstVideoDither, dither); +} + +/** + * gst_video_dither_line: + * @dither: a #GstVideoDither + * @line: pointer to the pixels of the line + * @x: x coordinate + * @y: y coordinate + * @width: the width + * + * Dither @width pixels starting from offset @x in @line using @dither. + * @x + @width must not exceed the width @dither was created with. + * + * @y is the line number of @line in the output image. + */ +void +gst_video_dither_line (GstVideoDither * dither, gpointer line, guint x, guint y, + guint width) +{ + g_return_if_fail (dither != NULL); + /* a full line (x == 0, width == dither->width) is valid, hence <= */ + g_return_if_fail (x + width <= dither->width); + + if (dither->func) + dither->func (dither, line, x, y, width); +} diff --git a/gst-libs/gst/video/video-dither.h b/gst-libs/gst/video/video-dither.h new file mode 100644 index 0000000000..06378e91c5 --- /dev/null +++ b/gst-libs/gst/video/video-dither.h @@ -0,0 +1,77 @@ +/* GStreamer + * Copyright (C) <2014> Wim Taymans + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details.
+ * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. + */ + +#ifndef __GST_VIDEO_DITHER_H__ +#define __GST_VIDEO_DITHER_H__ + +#include <gst/gst.h> + +G_BEGIN_DECLS + +/** + * GstVideoDitherMethod: + * @GST_VIDEO_DITHER_NONE: no dithering + * @GST_VIDEO_DITHER_VERTERR: propagate rounding errors downwards + * @GST_VIDEO_DITHER_FLOYD_STEINBERG: Dither with Floyd-Steinberg error diffusion + * @GST_VIDEO_DITHER_SIERRA_LITE: Dither with Sierra Lite error diffusion + * @GST_VIDEO_DITHER_BAYER: ordered dither using a Bayer pattern + * + * Different dithering methods to use. + */ +typedef enum { + GST_VIDEO_DITHER_NONE, + GST_VIDEO_DITHER_VERTERR, + GST_VIDEO_DITHER_FLOYD_STEINBERG, + GST_VIDEO_DITHER_SIERRA_LITE, + GST_VIDEO_DITHER_BAYER, +} GstVideoDitherMethod; + +/** + * GstVideoDitherFlags: + * @GST_VIDEO_DITHER_FLAG_NONE: no flags + * @GST_VIDEO_DITHER_FLAG_INTERLACED: the input is interlaced + * @GST_VIDEO_DITHER_FLAG_QUANTIZE: quantize values in addition to adding dither. + * + * Extra flags that influence the result from gst_video_dither_new().
+ */ +typedef enum { + GST_VIDEO_DITHER_FLAG_NONE = 0, + GST_VIDEO_DITHER_FLAG_INTERLACED = (1 << 0), + GST_VIDEO_DITHER_FLAG_QUANTIZE = (1 << 1), +} GstVideoDitherFlags; + +typedef struct _GstVideoDither GstVideoDither; + +/* circular dependency, need to include this after defining the enums */ +#include <gst/video/video-format.h> + +GstVideoDither * gst_video_dither_new (GstVideoDitherMethod method, + GstVideoDitherFlags flags, + GstVideoFormat format, + guint quantizer[GST_VIDEO_MAX_COMPONENTS], + guint width); + +void gst_video_dither_free (GstVideoDither *dither); + +void gst_video_dither_line (GstVideoDither *dither, + gpointer line, guint x, guint y, guint width); + +G_END_DECLS + +#endif /* __GST_VIDEO_DITHER_H__ */ diff --git a/gst-libs/gst/video/video-orc-dist.c b/gst-libs/gst/video/video-orc-dist.c index e0ac2b35e9..0072c0e7f8 100644 --- a/gst-libs/gst/video/video-orc-dist.c +++ b/gst-libs/gst/video/video-orc-dist.c @@ -389,6 +389,18 @@ void video_orc_chroma_down_v4_u8 (guint8 * ORC_RESTRICT d1, void video_orc_chroma_down_v4_u16 (guint16 * ORC_RESTRICT d1, const guint16 * ORC_RESTRICT s1, const guint16 * ORC_RESTRICT s2, const guint16 * ORC_RESTRICT s3, const guint16 * ORC_RESTRICT s4, int n); +void video_orc_dither_none_4u8_mask (guint8 * ORC_RESTRICT d1, int p1, int n); +void video_orc_dither_none_4u16_mask (guint16 * ORC_RESTRICT d1, orc_int64 p1, + int n); +void video_orc_dither_verterr_4u8_mask (guint8 * ORC_RESTRICT d1, + guint16 * ORC_RESTRICT d2, orc_int64 p1, int n); +void video_orc_dither_fs_muladd_u8 (guint16 * ORC_RESTRICT d1, int n); +void video_orc_dither_ordered_u8 (guint8 * ORC_RESTRICT d1, + const guint8 * ORC_RESTRICT s1, int n); +void video_orc_dither_ordered_4u8_mask (guint8 * ORC_RESTRICT d1, + const guint16 * ORC_RESTRICT s1, orc_int64 p1, int n); +void video_orc_dither_ordered_4u16_mask (guint16 * ORC_RESTRICT d1, + const guint16 * ORC_RESTRICT s1, orc_int64 p1, int n); /* begin Orc C target preamble */ @@ -17078,20 +17090,20 @@ video_orc_matrix8 (guint8 *
ORC_RESTRICT d1, const guint8 * ORC_RESTRICT s1, { orc_union64 tmp; tmp.i = p1; - ex->params[ORC_VAR_P1] = tmp.x2[0]; - ex->params[ORC_VAR_T1] = tmp.x2[1]; + ex->params[ORC_VAR_P1] = ((orc_uint64) tmp.i) & 0xffffffff; + ex->params[ORC_VAR_T1] = ((orc_uint64) tmp.i) >> 32; } { orc_union64 tmp; tmp.i = p2; - ex->params[ORC_VAR_P2] = tmp.x2[0]; - ex->params[ORC_VAR_T2] = tmp.x2[1]; + ex->params[ORC_VAR_P2] = ((orc_uint64) tmp.i) & 0xffffffff; + ex->params[ORC_VAR_T2] = ((orc_uint64) tmp.i) >> 32; } { orc_union64 tmp; tmp.i = p3; - ex->params[ORC_VAR_P3] = tmp.x2[0]; - ex->params[ORC_VAR_T3] = tmp.x2[1]; + ex->params[ORC_VAR_P3] = ((orc_uint64) tmp.i) & 0xffffffff; + ex->params[ORC_VAR_T3] = ((orc_uint64) tmp.i) >> 32; } func = c->exec; @@ -24470,3 +24482,1081 @@ video_orc_chroma_down_v4_u16 (guint16 * ORC_RESTRICT d1, func (ex); } #endif + + +/* video_orc_dither_none_4u8_mask */ +#ifdef DISABLE_ORC +void +video_orc_dither_none_4u8_mask (guint8 * ORC_RESTRICT d1, int p1, int n) +{ + int i; + orc_union32 *ORC_RESTRICT ptr0; + orc_union32 var33; + orc_union32 var34; + orc_union32 var35; + + ptr0 = (orc_union32 *) d1; + + /* 0: loadpl */ + var35.i = p1; + + for (i = 0; i < n; i++) { + /* 1: loadl */ + var33 = ptr0[i]; + /* 2: andnb */ + var34.x4[0] = (~var35.x4[0]) & var33.x4[0]; + var34.x4[1] = (~var35.x4[1]) & var33.x4[1]; + var34.x4[2] = (~var35.x4[2]) & var33.x4[2]; + var34.x4[3] = (~var35.x4[3]) & var33.x4[3]; + /* 3: storel */ + ptr0[i] = var34; + } + +} + +#else +static void +_backup_video_orc_dither_none_4u8_mask (OrcExecutor * ORC_RESTRICT ex) +{ + int i; + int n = ex->n; + orc_union32 *ORC_RESTRICT ptr0; + orc_union32 var33; + orc_union32 var34; + orc_union32 var35; + + ptr0 = (orc_union32 *) ex->arrays[0]; + + /* 0: loadpl */ + var35.i = ex->params[24]; + + for (i = 0; i < n; i++) { + /* 1: loadl */ + var33 = ptr0[i]; + /* 2: andnb */ + var34.x4[0] = (~var35.x4[0]) & var33.x4[0]; + var34.x4[1] = (~var35.x4[1]) & var33.x4[1]; + var34.x4[2] = (~var35.x4[2]) & 
var33.x4[2]; + var34.x4[3] = (~var35.x4[3]) & var33.x4[3]; + /* 3: storel */ + ptr0[i] = var34; + } + +} + +void +video_orc_dither_none_4u8_mask (guint8 * ORC_RESTRICT d1, int p1, int n) +{ + OrcExecutor _ex, *ex = &_ex; + static volatile int p_inited = 0; + static OrcCode *c = 0; + void (*func) (OrcExecutor *); + + if (!p_inited) { + orc_once_mutex_lock (); + if (!p_inited) { + OrcProgram *p; + +#if 1 + static const orc_uint8 bc[] = { + 1, 9, 30, 118, 105, 100, 101, 111, 95, 111, 114, 99, 95, 100, 105, 116, + 104, 101, 114, 95, 110, 111, 110, 101, 95, 52, 117, 56, 95, 109, 97, + 115, + 107, 11, 4, 4, 16, 4, 20, 4, 115, 32, 24, 21, 2, 37, 0, 32, + 0, 2, 0, + }; + p = orc_program_new_from_static_bytecode (bc); + orc_program_set_backup_function (p, + _backup_video_orc_dither_none_4u8_mask); +#else + p = orc_program_new (); + orc_program_set_name (p, "video_orc_dither_none_4u8_mask"); + orc_program_set_backup_function (p, + _backup_video_orc_dither_none_4u8_mask); + orc_program_add_destination (p, 4, "d1"); + orc_program_add_parameter (p, 4, "p1"); + orc_program_add_temporary (p, 4, "t1"); + + orc_program_append_2 (p, "loadpl", 0, ORC_VAR_T1, ORC_VAR_P1, ORC_VAR_D1, + ORC_VAR_D1); + orc_program_append_2 (p, "andnb", 2, ORC_VAR_D1, ORC_VAR_T1, ORC_VAR_D1, + ORC_VAR_D1); +#endif + + orc_program_compile (p); + c = orc_program_take_code (p); + orc_program_free (p); + } + p_inited = TRUE; + orc_once_mutex_unlock (); + } + ex->arrays[ORC_VAR_A2] = c; + ex->program = 0; + + ex->n = n; + ex->arrays[ORC_VAR_D1] = d1; + ex->params[ORC_VAR_P1] = p1; + + func = c->exec; + func (ex); +} +#endif + + +/* video_orc_dither_none_4u16_mask */ +#ifdef DISABLE_ORC +void +video_orc_dither_none_4u16_mask (guint16 * ORC_RESTRICT d1, orc_int64 p1, int n) +{ + int i; + orc_union64 *ORC_RESTRICT ptr0; + orc_union64 var33; + orc_union64 var34; + orc_union64 var35; + + ptr0 = (orc_union64 *) d1; + + /* 0: loadpq */ + var35.i = p1; + + for (i = 0; i < n; i++) { + /* 1: loadq */ + var33 = ptr0[i]; 
+ /* 2: andnw */ + var34.x4[0] = (~var35.x4[0]) & var33.x4[0]; + var34.x4[1] = (~var35.x4[1]) & var33.x4[1]; + var34.x4[2] = (~var35.x4[2]) & var33.x4[2]; + var34.x4[3] = (~var35.x4[3]) & var33.x4[3]; + /* 3: storeq */ + ptr0[i] = var34; + } + +} + +#else +static void +_backup_video_orc_dither_none_4u16_mask (OrcExecutor * ORC_RESTRICT ex) +{ + int i; + int n = ex->n; + orc_union64 *ORC_RESTRICT ptr0; + orc_union64 var33; + orc_union64 var34; + orc_union64 var35; + + ptr0 = (orc_union64 *) ex->arrays[0]; + + /* 0: loadpq */ + var35.i = + (ex->params[24] & 0xffffffff) | ((orc_uint64) (ex->params[24 + + (ORC_VAR_T1 - ORC_VAR_P1)]) << 32); + + for (i = 0; i < n; i++) { + /* 1: loadq */ + var33 = ptr0[i]; + /* 2: andnw */ + var34.x4[0] = (~var35.x4[0]) & var33.x4[0]; + var34.x4[1] = (~var35.x4[1]) & var33.x4[1]; + var34.x4[2] = (~var35.x4[2]) & var33.x4[2]; + var34.x4[3] = (~var35.x4[3]) & var33.x4[3]; + /* 3: storeq */ + ptr0[i] = var34; + } + +} + +void +video_orc_dither_none_4u16_mask (guint16 * ORC_RESTRICT d1, orc_int64 p1, int n) +{ + OrcExecutor _ex, *ex = &_ex; + static volatile int p_inited = 0; + static OrcCode *c = 0; + void (*func) (OrcExecutor *); + + if (!p_inited) { + orc_once_mutex_lock (); + if (!p_inited) { + OrcProgram *p; + +#if 1 + static const orc_uint8 bc[] = { + 1, 9, 31, 118, 105, 100, 101, 111, 95, 111, 114, 99, 95, 100, 105, 116, + 104, 101, 114, 95, 110, 111, 110, 101, 95, 52, 117, 49, 54, 95, 109, 97, + 115, 107, 11, 8, 8, 18, 8, 20, 8, 134, 32, 24, 21, 2, 74, 0, + 32, 0, 2, 0, + }; + p = orc_program_new_from_static_bytecode (bc); + orc_program_set_backup_function (p, + _backup_video_orc_dither_none_4u16_mask); +#else + p = orc_program_new (); + orc_program_set_name (p, "video_orc_dither_none_4u16_mask"); + orc_program_set_backup_function (p, + _backup_video_orc_dither_none_4u16_mask); + orc_program_add_destination (p, 8, "d1"); + orc_program_add_parameter_int64 (p, 8, "p1"); + orc_program_add_temporary (p, 8, "t1"); + + 
orc_program_append_2 (p, "loadpq", 0, ORC_VAR_T1, ORC_VAR_P1, ORC_VAR_D1, + ORC_VAR_D1); + orc_program_append_2 (p, "andnw", 2, ORC_VAR_D1, ORC_VAR_T1, ORC_VAR_D1, + ORC_VAR_D1); +#endif + + orc_program_compile (p); + c = orc_program_take_code (p); + orc_program_free (p); + } + p_inited = TRUE; + orc_once_mutex_unlock (); + } + ex->arrays[ORC_VAR_A2] = c; + ex->program = 0; + + ex->n = n; + ex->arrays[ORC_VAR_D1] = d1; + { + orc_union64 tmp; + tmp.i = p1; + ex->params[ORC_VAR_P1] = ((orc_uint64) tmp.i) & 0xffffffff; + ex->params[ORC_VAR_T1] = ((orc_uint64) tmp.i) >> 32; + } + + func = c->exec; + func (ex); +} +#endif + + +/* video_orc_dither_verterr_4u8_mask */ +#ifdef DISABLE_ORC +void +video_orc_dither_verterr_4u8_mask (guint8 * ORC_RESTRICT d1, + guint16 * ORC_RESTRICT d2, orc_int64 p1, int n) +{ + int i; + orc_union32 *ORC_RESTRICT ptr0; + orc_union64 *ORC_RESTRICT ptr1; + orc_union32 var34; + orc_union64 var35; + orc_union64 var36; + orc_union32 var37; + orc_union64 var38; + orc_union64 var39; + orc_union64 var40; + orc_union64 var41; + + ptr0 = (orc_union32 *) d1; + ptr1 = (orc_union64 *) d2; + + /* 0: loadpq */ + var38.i = p1; + + for (i = 0; i < n; i++) { + /* 1: loadl */ + var34 = ptr0[i]; + /* 2: convubw */ + var39.x4[0] = (orc_uint8) var34.x4[0]; + var39.x4[1] = (orc_uint8) var34.x4[1]; + var39.x4[2] = (orc_uint8) var34.x4[2]; + var39.x4[3] = (orc_uint8) var34.x4[3]; + /* 3: loadq */ + var35 = ptr1[i]; + /* 4: addw */ + var40.x4[0] = var35.x4[0] + var39.x4[0]; + var40.x4[1] = var35.x4[1] + var39.x4[1]; + var40.x4[2] = var35.x4[2] + var39.x4[2]; + var40.x4[3] = var35.x4[3] + var39.x4[3]; + /* 5: andw */ + var36.x4[0] = var38.x4[0] & var40.x4[0]; + var36.x4[1] = var38.x4[1] & var40.x4[1]; + var36.x4[2] = var38.x4[2] & var40.x4[2]; + var36.x4[3] = var38.x4[3] & var40.x4[3]; + /* 6: storeq */ + ptr1[i] = var36; + /* 7: andnw */ + var41.x4[0] = (~var38.x4[0]) & var40.x4[0]; + var41.x4[1] = (~var38.x4[1]) & var40.x4[1]; + var41.x4[2] = (~var38.x4[2]) & 
var40.x4[2]; + var41.x4[3] = (~var38.x4[3]) & var40.x4[3]; + /* 8: convsuswb */ + var37.x4[0] = ORC_CLAMP_UB (var41.x4[0]); + var37.x4[1] = ORC_CLAMP_UB (var41.x4[1]); + var37.x4[2] = ORC_CLAMP_UB (var41.x4[2]); + var37.x4[3] = ORC_CLAMP_UB (var41.x4[3]); + /* 9: storel */ + ptr0[i] = var37; + } + +} + +#else +static void +_backup_video_orc_dither_verterr_4u8_mask (OrcExecutor * ORC_RESTRICT ex) +{ + int i; + int n = ex->n; + orc_union32 *ORC_RESTRICT ptr0; + orc_union64 *ORC_RESTRICT ptr1; + orc_union32 var34; + orc_union64 var35; + orc_union64 var36; + orc_union32 var37; + orc_union64 var38; + orc_union64 var39; + orc_union64 var40; + orc_union64 var41; + + ptr0 = (orc_union32 *) ex->arrays[0]; + ptr1 = (orc_union64 *) ex->arrays[1]; + + /* 0: loadpq */ + var38.i = + (ex->params[24] & 0xffffffff) | ((orc_uint64) (ex->params[24 + + (ORC_VAR_T1 - ORC_VAR_P1)]) << 32); + + for (i = 0; i < n; i++) { + /* 1: loadl */ + var34 = ptr0[i]; + /* 2: convubw */ + var39.x4[0] = (orc_uint8) var34.x4[0]; + var39.x4[1] = (orc_uint8) var34.x4[1]; + var39.x4[2] = (orc_uint8) var34.x4[2]; + var39.x4[3] = (orc_uint8) var34.x4[3]; + /* 3: loadq */ + var35 = ptr1[i]; + /* 4: addw */ + var40.x4[0] = var35.x4[0] + var39.x4[0]; + var40.x4[1] = var35.x4[1] + var39.x4[1]; + var40.x4[2] = var35.x4[2] + var39.x4[2]; + var40.x4[3] = var35.x4[3] + var39.x4[3]; + /* 5: andw */ + var36.x4[0] = var38.x4[0] & var40.x4[0]; + var36.x4[1] = var38.x4[1] & var40.x4[1]; + var36.x4[2] = var38.x4[2] & var40.x4[2]; + var36.x4[3] = var38.x4[3] & var40.x4[3]; + /* 6: storeq */ + ptr1[i] = var36; + /* 7: andnw */ + var41.x4[0] = (~var38.x4[0]) & var40.x4[0]; + var41.x4[1] = (~var38.x4[1]) & var40.x4[1]; + var41.x4[2] = (~var38.x4[2]) & var40.x4[2]; + var41.x4[3] = (~var38.x4[3]) & var40.x4[3]; + /* 8: convsuswb */ + var37.x4[0] = ORC_CLAMP_UB (var41.x4[0]); + var37.x4[1] = ORC_CLAMP_UB (var41.x4[1]); + var37.x4[2] = ORC_CLAMP_UB (var41.x4[2]); + var37.x4[3] = ORC_CLAMP_UB (var41.x4[3]); + /* 9: storel */ + 
ptr0[i] = var37; + } + +} + +void +video_orc_dither_verterr_4u8_mask (guint8 * ORC_RESTRICT d1, + guint16 * ORC_RESTRICT d2, orc_int64 p1, int n) +{ + OrcExecutor _ex, *ex = &_ex; + static volatile int p_inited = 0; + static OrcCode *c = 0; + void (*func) (OrcExecutor *); + + if (!p_inited) { + orc_once_mutex_lock (); + if (!p_inited) { + OrcProgram *p; + +#if 1 + static const orc_uint8 bc[] = { + 1, 9, 33, 118, 105, 100, 101, 111, 95, 111, 114, 99, 95, 100, 105, 116, + 104, 101, 114, 95, 118, 101, 114, 116, 101, 114, 114, 95, 52, 117, 56, + 95, + 109, 97, 115, 107, 11, 4, 4, 11, 8, 8, 18, 8, 20, 8, 20, 8, + 134, 32, 24, 21, 2, 150, 33, 0, 21, 2, 70, 33, 1, 33, 21, 2, + 73, 1, 32, 33, 21, 2, 74, 33, 32, 33, 21, 2, 160, 0, 33, 2, + 0, + }; + p = orc_program_new_from_static_bytecode (bc); + orc_program_set_backup_function (p, + _backup_video_orc_dither_verterr_4u8_mask); +#else + p = orc_program_new (); + orc_program_set_name (p, "video_orc_dither_verterr_4u8_mask"); + orc_program_set_backup_function (p, + _backup_video_orc_dither_verterr_4u8_mask); + orc_program_add_destination (p, 4, "d1"); + orc_program_add_destination (p, 8, "d2"); + orc_program_add_parameter_int64 (p, 8, "p1"); + orc_program_add_temporary (p, 8, "t1"); + orc_program_add_temporary (p, 8, "t2"); + + orc_program_append_2 (p, "loadpq", 0, ORC_VAR_T1, ORC_VAR_P1, ORC_VAR_D1, + ORC_VAR_D1); + orc_program_append_2 (p, "convubw", 2, ORC_VAR_T2, ORC_VAR_D1, ORC_VAR_D1, + ORC_VAR_D1); + orc_program_append_2 (p, "addw", 2, ORC_VAR_T2, ORC_VAR_D2, ORC_VAR_T2, + ORC_VAR_D1); + orc_program_append_2 (p, "andw", 2, ORC_VAR_D2, ORC_VAR_T1, ORC_VAR_T2, + ORC_VAR_D1); + orc_program_append_2 (p, "andnw", 2, ORC_VAR_T2, ORC_VAR_T1, ORC_VAR_T2, + ORC_VAR_D1); + orc_program_append_2 (p, "convsuswb", 2, ORC_VAR_D1, ORC_VAR_T2, + ORC_VAR_D1, ORC_VAR_D1); +#endif + + orc_program_compile (p); + c = orc_program_take_code (p); + orc_program_free (p); + } + p_inited = TRUE; + orc_once_mutex_unlock (); + } + 
ex->arrays[ORC_VAR_A2] = c; + ex->program = 0; + + ex->n = n; + ex->arrays[ORC_VAR_D1] = d1; + ex->arrays[ORC_VAR_D2] = d2; + { + orc_union64 tmp; + tmp.i = p1; + ex->params[ORC_VAR_P1] = ((orc_uint64) tmp.i) & 0xffffffff; + ex->params[ORC_VAR_T1] = ((orc_uint64) tmp.i) >> 32; + } + + func = c->exec; + func (ex); +} +#endif + + +/* video_orc_dither_fs_muladd_u8 */ +#ifdef DISABLE_ORC +void +video_orc_dither_fs_muladd_u8 (guint16 * ORC_RESTRICT d1, int n) +{ + int i; + orc_union16 *ORC_RESTRICT ptr0; +#if defined(__APPLE__) && __GNUC__ == 4 && __GNUC_MINOR__ == 2 && defined (__i386__) + volatile orc_union16 var34; +#else + orc_union16 var34; +#endif + orc_union16 var35; +#if defined(__APPLE__) && __GNUC__ == 4 && __GNUC_MINOR__ == 2 && defined (__i386__) + volatile orc_union16 var36; +#else + orc_union16 var36; +#endif + orc_union16 var37; + orc_union16 var38; + orc_union16 var39; + orc_union16 var40; + orc_union16 var41; + orc_union16 var42; + + ptr0 = (orc_union16 *) d1; + + /* 1: loadpw */ + var34.i = (int) 0x00000005; /* 5 or 2.47033e-323f */ + /* 6: loadpw */ + var36.i = (int) 0x00000003; /* 3 or 1.4822e-323f */ + + for (i = 0; i < n; i++) { + /* 0: loadoffw */ + var38 = ptr0[i + 4]; + /* 2: mullw */ + var39.i = (var38.i * var34.i) & 0xffff; + /* 3: loadw */ + var35 = ptr0[i]; + /* 4: addw */ + var40.i = var39.i + var35.i; + /* 5: loadoffw */ + var41 = ptr0[i + 8]; + /* 7: mullw */ + var42.i = (var41.i * var36.i) & 0xffff; + /* 8: addw */ + var37.i = var40.i + var42.i; + /* 9: storew */ + ptr0[i] = var37; + } + +} + +#else +static void +_backup_video_orc_dither_fs_muladd_u8 (OrcExecutor * ORC_RESTRICT ex) +{ + int i; + int n = ex->n; + orc_union16 *ORC_RESTRICT ptr0; +#if defined(__APPLE__) && __GNUC__ == 4 && __GNUC_MINOR__ == 2 && defined (__i386__) + volatile orc_union16 var34; +#else + orc_union16 var34; +#endif + orc_union16 var35; +#if defined(__APPLE__) && __GNUC__ == 4 && __GNUC_MINOR__ == 2 && defined (__i386__) + volatile orc_union16 var36; +#else + 
orc_union16 var36; +#endif + orc_union16 var37; + orc_union16 var38; + orc_union16 var39; + orc_union16 var40; + orc_union16 var41; + orc_union16 var42; + + ptr0 = (orc_union16 *) ex->arrays[0]; + + /* 1: loadpw */ + var34.i = (int) 0x00000005; /* 5 or 2.47033e-323f */ + /* 6: loadpw */ + var36.i = (int) 0x00000003; /* 3 or 1.4822e-323f */ + + for (i = 0; i < n; i++) { + /* 0: loadoffw - element 4 slots ahead of i */ + var38 = ptr0[i + 4]; + /* 2: mullw */ + var39.i = (var38.i * var34.i) & 0xffff; + /* 3: loadw */ + var35 = ptr0[i]; + /* 4: addw */ + var40.i = var39.i + var35.i; + /* 5: loadoffw - element 8 slots ahead of i */ + var41 = ptr0[i + 8]; + /* 7: mullw */ + var42.i = (var41.i * var36.i) & 0xffff; + /* 8: addw */ + var37.i = var40.i + var42.i; + /* 9: storew */ + ptr0[i] = var37; + } + +} + +/* Orc-build entry point: on the first call (p_inited still 0, re-checked after orc_once_mutex_lock) it creates the program, compiles it and caches the OrcCode in the static c; every call then fills a local OrcExecutor with n and d1 and runs c->exec. */ void +video_orc_dither_fs_muladd_u8 (guint16 * ORC_RESTRICT d1, int n) +{ + OrcExecutor _ex, *ex = &_ex; + static volatile int p_inited = 0; + static OrcCode *c = 0; + void (*func) (OrcExecutor *); + + if (!p_inited) { + orc_once_mutex_lock (); + if (!p_inited) { + OrcProgram *p; + +#if 1 /* serialized form of the program; after a 3-byte prefix the bytes spell the function name in ASCII */ + static const orc_uint8 bc[] = { + 1, 9, 29, 118, 105, 100, 101, 111, 95, 111, 114, 99, 95, 100, 105, 116, + 104, 101, 114, 95, 102, 115, 95, 109, 117, 108, 97, 100, 100, 95, 117, + 56, + 11, 2, 2, 14, 4, 4, 0, 0, 0, 14, 2, 5, 0, 0, 0, 14, + 4, 8, 0, 0, 0, 14, 2, 3, 0, 0, 0, 20, 2, 20, 2, 83, + 33, 0, 16, 89, 33, 33, 17, 70, 32, 33, 0, 83, 33, 0, 18, 89, + 33, 33, 19, 70, 0, 32, 33, 2, 0, + }; + p = orc_program_new_from_static_bytecode (bc); + orc_program_set_backup_function (p, + _backup_video_orc_dither_fs_muladd_u8); +#else /* the same-named program assembled through the OrcProgram API: constants 4 and 8 are the loadoffw offsets, 5 and 3 the mullw factors */ + p = orc_program_new (); + orc_program_set_name (p, "video_orc_dither_fs_muladd_u8"); + orc_program_set_backup_function (p, + _backup_video_orc_dither_fs_muladd_u8); + orc_program_add_destination (p, 2, "d1"); + orc_program_add_constant (p, 4, 0x00000004, "c1"); + orc_program_add_constant (p, 2, 0x00000005, "c2"); + orc_program_add_constant (p, 4, 0x00000008, "c3"); + orc_program_add_constant (p, 2, 0x00000003, 
"c4"); + orc_program_add_temporary (p, 2, "t1"); + orc_program_add_temporary (p, 2, "t2"); + + orc_program_append_2 (p, "loadoffw", 0, ORC_VAR_T2, ORC_VAR_D1, + ORC_VAR_C1, ORC_VAR_D1); + orc_program_append_2 (p, "mullw", 0, ORC_VAR_T2, ORC_VAR_T2, ORC_VAR_C2, + ORC_VAR_D1); + orc_program_append_2 (p, "addw", 0, ORC_VAR_T1, ORC_VAR_T2, ORC_VAR_D1, + ORC_VAR_D1); + orc_program_append_2 (p, "loadoffw", 0, ORC_VAR_T2, ORC_VAR_D1, + ORC_VAR_C3, ORC_VAR_D1); + orc_program_append_2 (p, "mullw", 0, ORC_VAR_T2, ORC_VAR_T2, ORC_VAR_C4, + ORC_VAR_D1); + orc_program_append_2 (p, "addw", 0, ORC_VAR_D1, ORC_VAR_T1, ORC_VAR_T2, + ORC_VAR_D1); +#endif + + orc_program_compile (p); + c = orc_program_take_code (p); + orc_program_free (p); + } + p_inited = TRUE; /* set while the once-mutex is still held */ + orc_once_mutex_unlock (); + } + ex->arrays[ORC_VAR_A2] = c; + ex->program = 0; + + ex->n = n; + ex->arrays[ORC_VAR_D1] = d1; + + func = c->exec; + func (ex); +} +#endif + + +/* video_orc_dither_ordered_u8: in place, for i in [0, n), d1[i] = ORC_CLAMP_UB (d1[i] plus s1[i]) with both operands taken as unsigned bytes (Orc opcode addusb). NOTE(review): ORC_CLAMP_UB is defined outside this view; presumably it clamps to the unsigned byte range - confirm. */ +#ifdef DISABLE_ORC /* plain C build: no OrcProgram or OrcExecutor is used */ +void +video_orc_dither_ordered_u8 (guint8 * ORC_RESTRICT d1, + const guint8 * ORC_RESTRICT s1, int n) +{ + int i; + orc_int8 *ORC_RESTRICT ptr0; + const orc_int8 *ORC_RESTRICT ptr4; + orc_int8 var32; + orc_int8 var33; + orc_int8 var34; + + ptr0 = (orc_int8 *) d1; + ptr4 = (orc_int8 *) s1; + + + for (i = 0; i < n; i++) { + /* 0: loadb */ + var32 = ptr0[i]; + /* 1: loadb */ + var33 = ptr4[i]; + /* 2: addusb */ + var34 = ORC_CLAMP_UB ((orc_uint8) var32 + (orc_uint8) var33); + /* 3: storeb */ + ptr0[i] = var34; + } + +} + +#else /* Orc build: a C fallback plus the wrapper that registers it with orc_program_set_backup_function */ +/* Same loop as the DISABLE_ORC variant, but n comes from ex->n, the destination from ex->arrays[0] and the source from ex->arrays[4]. */ static void +_backup_video_orc_dither_ordered_u8 (OrcExecutor * ORC_RESTRICT ex) +{ + int i; + int n = ex->n; + orc_int8 *ORC_RESTRICT ptr0; + const orc_int8 *ORC_RESTRICT ptr4; + orc_int8 var32; + orc_int8 var33; + orc_int8 var34; + + ptr0 = (orc_int8 *) ex->arrays[0]; + ptr4 = (orc_int8 *) ex->arrays[4]; + + + for (i = 0; i < n; i++) { + /* 0: loadb */ + var32 = ptr0[i]; + /* 1: loadb */ + var33 = ptr4[i]; + /* 2: addusb */ + var34 = ORC_CLAMP_UB ((orc_uint8) var32 + 
(orc_uint8) var33); + /* 3: storeb */ + ptr0[i] = var34; + } + +} + +/* Orc-build entry point: on the first call (p_inited still 0, re-checked after orc_once_mutex_lock) it creates the single-opcode addusb program, compiles it and caches the OrcCode in the static c; every call then fills a local OrcExecutor with n, d1 and s1 and runs c->exec. */ void +video_orc_dither_ordered_u8 (guint8 * ORC_RESTRICT d1, + const guint8 * ORC_RESTRICT s1, int n) +{ + OrcExecutor _ex, *ex = &_ex; + static volatile int p_inited = 0; + static OrcCode *c = 0; + void (*func) (OrcExecutor *); + + if (!p_inited) { + orc_once_mutex_lock (); + if (!p_inited) { + OrcProgram *p; + +#if 1 /* serialized form of the program; after a 3-byte prefix the bytes spell the function name in ASCII */ + static const orc_uint8 bc[] = { + 1, 9, 27, 118, 105, 100, 101, 111, 95, 111, 114, 99, 95, 100, 105, 116, + 104, 101, 114, 95, 111, 114, 100, 101, 114, 101, 100, 95, 117, 56, 11, + 1, + 1, 12, 1, 1, 35, 0, 0, 4, 2, 0, + }; + p = orc_program_new_from_static_bytecode (bc); + orc_program_set_backup_function (p, _backup_video_orc_dither_ordered_u8); +#else /* the same-named program assembled through the OrcProgram API */ + p = orc_program_new (); + orc_program_set_name (p, "video_orc_dither_ordered_u8"); + orc_program_set_backup_function (p, _backup_video_orc_dither_ordered_u8); + orc_program_add_destination (p, 1, "d1"); + orc_program_add_source (p, 1, "s1"); + + orc_program_append_2 (p, "addusb", 0, ORC_VAR_D1, ORC_VAR_D1, ORC_VAR_S1, + ORC_VAR_D1); +#endif + + orc_program_compile (p); + c = orc_program_take_code (p); + orc_program_free (p); + } + p_inited = TRUE; /* set while the once-mutex is still held */ + orc_once_mutex_unlock (); + } + ex->arrays[ORC_VAR_A2] = c; + ex->program = 0; + + ex->n = n; + ex->arrays[ORC_VAR_D1] = d1; + ex->arrays[ORC_VAR_S1] = (void *) s1; + + func = c->exec; + func (ex); +} +#endif + + +/* video_orc_dither_ordered_4u8_mask: in place, for each of n elements (4 bytes of d1, four 16-bit words of s1) every byte is widened to 16 bits, the matching s1 word is added, the bits set in the matching 16-bit lane of p1 are cleared, and the result is narrowed back to a byte through ORC_CLAMP_UB. */ +#ifdef DISABLE_ORC /* plain C build: no OrcProgram or OrcExecutor is used */ +void +video_orc_dither_ordered_4u8_mask (guint8 * ORC_RESTRICT d1, + const guint16 * ORC_RESTRICT s1, orc_int64 p1, int n) +{ + int i; + orc_union32 *ORC_RESTRICT ptr0; + const orc_union64 *ORC_RESTRICT ptr4; + orc_union32 var34; + orc_union64 var35; + orc_union32 var36; + orc_union64 var37; + orc_union64 var38; + orc_union64 var39; + orc_union64 var40; + + ptr0 = (orc_union32 *) d1; + ptr4 = (orc_union64 *) s1; + + /* 0: loadpq */ + var37.i = p1; + + for (i = 0; i < n; i++) { + /* 1: loadl */ + var34 = ptr0[i]; + 
/* 2: convubw - widen each byte to a 16-bit lane */ + var38.x4[0] = (orc_uint8) var34.x4[0]; + var38.x4[1] = (orc_uint8) var34.x4[1]; + var38.x4[2] = (orc_uint8) var34.x4[2]; + var38.x4[3] = (orc_uint8) var34.x4[3]; + /* 3: loadq */ + var35 = ptr4[i]; + /* 4: addw */ + var39.x4[0] = var38.x4[0] + var35.x4[0]; + var39.x4[1] = var38.x4[1] + var35.x4[1]; + var39.x4[2] = var38.x4[2] + var35.x4[2]; + var39.x4[3] = var38.x4[3] + var35.x4[3]; + /* 5: andnw - clear the bits that are set in the mask lane */ + var40.x4[0] = (~var37.x4[0]) & var39.x4[0]; + var40.x4[1] = (~var37.x4[1]) & var39.x4[1]; + var40.x4[2] = (~var37.x4[2]) & var39.x4[2]; + var40.x4[3] = (~var37.x4[3]) & var39.x4[3]; + /* 6: convsuswb - narrow each lane back to a byte through ORC_CLAMP_UB */ + var36.x4[0] = ORC_CLAMP_UB (var40.x4[0]); + var36.x4[1] = ORC_CLAMP_UB (var40.x4[1]); + var36.x4[2] = ORC_CLAMP_UB (var40.x4[2]); + var36.x4[3] = ORC_CLAMP_UB (var40.x4[3]); + /* 7: storel */ + ptr0[i] = var36; + } + +} + +#else /* Orc build: a C fallback plus the wrapper that registers it with orc_program_set_backup_function */ +/* Same per-element steps as the DISABLE_ORC variant, but n comes from ex->n, the destination from ex->arrays[0], the source from ex->arrays[4] and the mask from ex->params. */ static void +_backup_video_orc_dither_ordered_4u8_mask (OrcExecutor * ORC_RESTRICT ex) +{ + int i; + int n = ex->n; + orc_union32 *ORC_RESTRICT ptr0; + const orc_union64 *ORC_RESTRICT ptr4; + orc_union32 var34; + orc_union64 var35; + orc_union32 var36; + orc_union64 var37; + orc_union64 var38; + orc_union64 var39; + orc_union64 var40; + + ptr0 = (orc_union32 *) ex->arrays[0]; + ptr4 = (orc_union64 *) ex->arrays[4]; + + /* 0: loadpq - rebuild the 64-bit mask from two 32-bit params: low half at index 24, high half ORC_VAR_T1 - ORC_VAR_P1 slots later */ + var37.i = + (ex->params[24] & 0xffffffff) | ((orc_uint64) (ex->params[24 + + (ORC_VAR_T1 - ORC_VAR_P1)]) << 32); + + for (i = 0; i < n; i++) { + /* 1: loadl */ + var34 = ptr0[i]; + /* 2: convubw */ + var38.x4[0] = (orc_uint8) var34.x4[0]; + var38.x4[1] = (orc_uint8) var34.x4[1]; + var38.x4[2] = (orc_uint8) var34.x4[2]; + var38.x4[3] = (orc_uint8) var34.x4[3]; + /* 3: loadq */ + var35 = ptr4[i]; + /* 4: addw */ + var39.x4[0] = var38.x4[0] + var35.x4[0]; + var39.x4[1] = var38.x4[1] + var35.x4[1]; + var39.x4[2] = var38.x4[2] + var35.x4[2]; + var39.x4[3] = var38.x4[3] + var35.x4[3]; + /* 5: andnw */ + var40.x4[0] = (~var37.x4[0]) & var39.x4[0]; + var40.x4[1] = (~var37.x4[1]) & var39.x4[1]; + 
var40.x4[2] = (~var37.x4[2]) & var39.x4[2]; + var40.x4[3] = (~var37.x4[3]) & var39.x4[3]; + /* 6: convsuswb */ + var36.x4[0] = ORC_CLAMP_UB (var40.x4[0]); + var36.x4[1] = ORC_CLAMP_UB (var40.x4[1]); + var36.x4[2] = ORC_CLAMP_UB (var40.x4[2]); + var36.x4[3] = ORC_CLAMP_UB (var40.x4[3]); + /* 7: storel */ + ptr0[i] = var36; + } + +} + +/* Orc-build entry point: on the first call (p_inited still 0, re-checked after orc_once_mutex_lock) it creates the program, compiles it and caches the OrcCode in the static c; every call then fills a local OrcExecutor with n, d1, s1 and the 64-bit mask p1 and runs c->exec. */ void +video_orc_dither_ordered_4u8_mask (guint8 * ORC_RESTRICT d1, + const guint16 * ORC_RESTRICT s1, orc_int64 p1, int n) +{ + OrcExecutor _ex, *ex = &_ex; + static volatile int p_inited = 0; + static OrcCode *c = 0; + void (*func) (OrcExecutor *); + + if (!p_inited) { + orc_once_mutex_lock (); + if (!p_inited) { + OrcProgram *p; + +#if 1 /* serialized form of the program; after a 3-byte prefix the bytes spell the function name in ASCII */ + static const orc_uint8 bc[] = { + 1, 9, 33, 118, 105, 100, 101, 111, 95, 111, 114, 99, 95, 100, 105, 116, + 104, 101, 114, 95, 111, 114, 100, 101, 114, 101, 100, 95, 52, 117, 56, + 95, + 109, 97, 115, 107, 11, 4, 4, 12, 8, 8, 18, 8, 20, 8, 20, 8, + 134, 33, 24, 21, 2, 150, 32, 0, 21, 2, 70, 32, 32, 4, 21, 2, + 74, 32, 33, 32, 21, 2, 160, 0, 32, 2, 0, + }; + p = orc_program_new_from_static_bytecode (bc); + orc_program_set_backup_function (p, + _backup_video_orc_dither_ordered_4u8_mask); +#else /* the same-named program assembled through the OrcProgram API: t2 holds the mask, t1 the widened pixel lanes */ + p = orc_program_new (); + orc_program_set_name (p, "video_orc_dither_ordered_4u8_mask"); + orc_program_set_backup_function (p, + _backup_video_orc_dither_ordered_4u8_mask); + orc_program_add_destination (p, 4, "d1"); + orc_program_add_source (p, 8, "s1"); + orc_program_add_parameter_int64 (p, 8, "p1"); + orc_program_add_temporary (p, 8, "t1"); + orc_program_add_temporary (p, 8, "t2"); + + orc_program_append_2 (p, "loadpq", 0, ORC_VAR_T2, ORC_VAR_P1, ORC_VAR_D1, + ORC_VAR_D1); + orc_program_append_2 (p, "convubw", 2, ORC_VAR_T1, ORC_VAR_D1, ORC_VAR_D1, + ORC_VAR_D1); + orc_program_append_2 (p, "addw", 2, ORC_VAR_T1, ORC_VAR_T1, ORC_VAR_S1, + ORC_VAR_D1); + orc_program_append_2 (p, "andnw", 2, ORC_VAR_T1, ORC_VAR_T2, ORC_VAR_T1, + ORC_VAR_D1); + orc_program_append_2 (p, "convsuswb", 2, ORC_VAR_D1, ORC_VAR_T1, 
+ ORC_VAR_D1, ORC_VAR_D1); +#endif + + orc_program_compile (p); + c = orc_program_take_code (p); + orc_program_free (p); + } + p_inited = TRUE; /* set while the once-mutex is still held */ + orc_once_mutex_unlock (); + } + ex->arrays[ORC_VAR_A2] = c; + ex->program = 0; + + ex->n = n; + ex->arrays[ORC_VAR_D1] = d1; + ex->arrays[ORC_VAR_S1] = (void *) s1; + { /* the 64-bit p1 is handed over as two 32-bit params: low half in ORC_VAR_P1, high half in ORC_VAR_T1 */ + orc_union64 tmp; + tmp.i = p1; + ex->params[ORC_VAR_P1] = ((orc_uint64) tmp.i) & 0xffffffff; + ex->params[ORC_VAR_T1] = ((orc_uint64) tmp.i) >> 32; + } + + func = c->exec; + func (ex); +} +#endif + + +/* video_orc_dither_ordered_4u16_mask: in place, for each of n elements (four 16-bit words of d1 and of s1) every d1 word has the matching s1 word added and then the bits set in the matching 16-bit lane of p1 cleared. No clamping step is applied to the sum. */ +#ifdef DISABLE_ORC /* plain C build: no OrcProgram or OrcExecutor is used */ +void +video_orc_dither_ordered_4u16_mask (guint16 * ORC_RESTRICT d1, + const guint16 * ORC_RESTRICT s1, orc_int64 p1, int n) +{ + int i; + orc_union64 *ORC_RESTRICT ptr0; + const orc_union64 *ORC_RESTRICT ptr4; + orc_union64 var34; + orc_union64 var35; + orc_union64 var36; + orc_union64 var37; + orc_union64 var38; + + ptr0 = (orc_union64 *) d1; + ptr4 = (orc_union64 *) s1; + + /* 0: loadpq */ + var37.i = p1; + + for (i = 0; i < n; i++) { + /* 1: loadq */ + var34 = ptr0[i]; + /* 2: loadq */ + var35 = ptr4[i]; + /* 3: addw */ + var38.x4[0] = var34.x4[0] + var35.x4[0]; + var38.x4[1] = var34.x4[1] + var35.x4[1]; + var38.x4[2] = var34.x4[2] + var35.x4[2]; + var38.x4[3] = var34.x4[3] + var35.x4[3]; + /* 4: andnw - clear the bits that are set in the mask lane */ + var36.x4[0] = (~var37.x4[0]) & var38.x4[0]; + var36.x4[1] = (~var37.x4[1]) & var38.x4[1]; + var36.x4[2] = (~var37.x4[2]) & var38.x4[2]; + var36.x4[3] = (~var37.x4[3]) & var38.x4[3]; + /* 5: storeq */ + ptr0[i] = var36; + } + +} + +#else /* Orc build: a C fallback plus the wrapper that registers it with orc_program_set_backup_function */ +/* Same per-element steps as the DISABLE_ORC variant, but n comes from ex->n, the destination from ex->arrays[0], the source from ex->arrays[4] and the mask from ex->params. */ static void +_backup_video_orc_dither_ordered_4u16_mask (OrcExecutor * ORC_RESTRICT ex) +{ + int i; + int n = ex->n; + orc_union64 *ORC_RESTRICT ptr0; + const orc_union64 *ORC_RESTRICT ptr4; + orc_union64 var34; + orc_union64 var35; + orc_union64 var36; + orc_union64 var37; + orc_union64 var38; + + ptr0 = (orc_union64 *) ex->arrays[0]; + ptr4 = (orc_union64 *) ex->arrays[4]; + + /* 0: loadpq - rebuild the 64-bit mask from two 32-bit params: low half at index 24, high half ORC_VAR_T1 - ORC_VAR_P1 slots later */ + var37.i = + (ex->params[24] & 0xffffffff) | ((orc_uint64) 
(ex->params[24 + + (ORC_VAR_T1 - ORC_VAR_P1)]) << 32); + + for (i = 0; i < n; i++) { + /* 1: loadq */ + var34 = ptr0[i]; + /* 2: loadq */ + var35 = ptr4[i]; + /* 3: addw */ + var38.x4[0] = var34.x4[0] + var35.x4[0]; + var38.x4[1] = var34.x4[1] + var35.x4[1]; + var38.x4[2] = var34.x4[2] + var35.x4[2]; + var38.x4[3] = var34.x4[3] + var35.x4[3]; + /* 4: andnw - clear the bits that are set in the mask lane */ + var36.x4[0] = (~var37.x4[0]) & var38.x4[0]; + var36.x4[1] = (~var37.x4[1]) & var38.x4[1]; + var36.x4[2] = (~var37.x4[2]) & var38.x4[2]; + var36.x4[3] = (~var37.x4[3]) & var38.x4[3]; + /* 5: storeq */ + ptr0[i] = var36; + } + +} + +/* Orc-build entry point: on the first call (p_inited still 0, re-checked after orc_once_mutex_lock) it creates the program, compiles it and caches the OrcCode in the static c; every call then fills a local OrcExecutor with n, d1, s1 and the 64-bit mask p1 and runs c->exec. */ void +video_orc_dither_ordered_4u16_mask (guint16 * ORC_RESTRICT d1, + const guint16 * ORC_RESTRICT s1, orc_int64 p1, int n) +{ + OrcExecutor _ex, *ex = &_ex; + static volatile int p_inited = 0; + static OrcCode *c = 0; + void (*func) (OrcExecutor *); + + if (!p_inited) { + orc_once_mutex_lock (); + if (!p_inited) { + OrcProgram *p; + +#if 1 /* serialized form of the program; after a 3-byte prefix the bytes spell the function name in ASCII */ + static const orc_uint8 bc[] = { + 1, 9, 34, 118, 105, 100, 101, 111, 95, 111, 114, 99, 95, 100, 105, 116, + 104, 101, 114, 95, 111, 114, 100, 101, 114, 101, 100, 95, 52, 117, 49, + 54, + 95, 109, 97, 115, 107, 11, 8, 8, 12, 8, 8, 18, 8, 20, 8, 20, + 8, 134, 33, 24, 21, 2, 70, 32, 0, 4, 21, 2, 74, 0, 33, 32, + 2, 0, + }; + p = orc_program_new_from_static_bytecode (bc); + orc_program_set_backup_function (p, + _backup_video_orc_dither_ordered_4u16_mask); +#else /* the same-named program assembled through the OrcProgram API: t2 holds the mask, t1 the sum of d1 and s1 */ + p = orc_program_new (); + orc_program_set_name (p, "video_orc_dither_ordered_4u16_mask"); + orc_program_set_backup_function (p, + _backup_video_orc_dither_ordered_4u16_mask); + orc_program_add_destination (p, 8, "d1"); + orc_program_add_source (p, 8, "s1"); + orc_program_add_parameter_int64 (p, 8, "p1"); + orc_program_add_temporary (p, 8, "t1"); + orc_program_add_temporary (p, 8, "t2"); + + orc_program_append_2 (p, "loadpq", 0, ORC_VAR_T2, ORC_VAR_P1, ORC_VAR_D1, + ORC_VAR_D1); + orc_program_append_2 (p, "addw", 2, ORC_VAR_T1, ORC_VAR_D1, ORC_VAR_S1, + ORC_VAR_D1); + 
orc_program_append_2 (p, "andnw", 2, ORC_VAR_D1, ORC_VAR_T2, ORC_VAR_T1, + ORC_VAR_D1); +#endif + + orc_program_compile (p); + c = orc_program_take_code (p); + orc_program_free (p); + } + p_inited = TRUE; /* set while the once-mutex is still held */ + orc_once_mutex_unlock (); + } + ex->arrays[ORC_VAR_A2] = c; + ex->program = 0; + + ex->n = n; + ex->arrays[ORC_VAR_D1] = d1; + ex->arrays[ORC_VAR_S1] = (void *) s1; + { /* the 64-bit p1 is handed over as two 32-bit params: low half in ORC_VAR_P1, high half in ORC_VAR_T1 */ + orc_union64 tmp; + tmp.i = p1; + ex->params[ORC_VAR_P1] = ((orc_uint64) tmp.i) & 0xffffffff; + ex->params[ORC_VAR_T1] = ((orc_uint64) tmp.i) >> 32; + } + + func = c->exec; + func (ex); +} +#endif diff --git a/gst-libs/gst/video/video-orc-dist.h b/gst-libs/gst/video/video-orc-dist.h index b07b241093..0cad4312ea 100644 --- a/gst-libs/gst/video/video-orc-dist.h +++ b/gst-libs/gst/video/video-orc-dist.h @@ -190,6 +190,13 @@ void video_orc_chroma_up_v2_u16 (guint16 * ORC_RESTRICT d1, guint16 * ORC_RESTRI void video_orc_chroma_down_v2_u16 (guint16 * ORC_RESTRICT d1, const guint16 * ORC_RESTRICT s1, const guint16 * ORC_RESTRICT s2, int n); void video_orc_chroma_down_v4_u8 (guint8 * ORC_RESTRICT d1, const guint8 * ORC_RESTRICT s1, const guint8 * ORC_RESTRICT s2, const guint8 * ORC_RESTRICT s3, const guint8 * ORC_RESTRICT s4, int n); void video_orc_chroma_down_v4_u16 (guint16 * ORC_RESTRICT d1, const guint16 * ORC_RESTRICT s1, const guint16 * ORC_RESTRICT s2, const guint16 * ORC_RESTRICT s3, const guint16 * ORC_RESTRICT s4, int n); +void video_orc_dither_none_4u8_mask (guint8 * ORC_RESTRICT d1, int p1, int n); +void video_orc_dither_none_4u16_mask (guint16 * ORC_RESTRICT d1, orc_int64 p1, int n); +void video_orc_dither_verterr_4u8_mask (guint8 * ORC_RESTRICT d1, guint16 * ORC_RESTRICT d2, orc_int64 p1, int n); +void video_orc_dither_fs_muladd_u8 (guint16 * ORC_RESTRICT d1, int n); +void video_orc_dither_ordered_u8 (guint8 * ORC_RESTRICT d1, const guint8 * ORC_RESTRICT s1, int n); +void video_orc_dither_ordered_4u8_mask (guint8 * ORC_RESTRICT d1, const guint16 * ORC_RESTRICT s1, orc_int64 p1, int n); 
+void video_orc_dither_ordered_4u16_mask (guint16 * ORC_RESTRICT d1, const guint16 * ORC_RESTRICT s1, orc_int64 p1, int n); #ifdef __cplusplus } diff --git a/gst-libs/gst/video/video-orc.orc b/gst-libs/gst/video/video-orc.orc index cd7a86c49a..023ebcc0f6 100644 --- a/gst-libs/gst/video/video-orc.orc +++ b/gst-libs/gst/video/video-orc.orc @@ -1986,3 +1986,97 @@ x2 addl uuvv3, uuvv3, 4 x2 shrul uuvv3, uuvv3, 3 x2 convsuslw uv1, uuvv3 mergelq d, ay1, uv1 + +.function video_orc_dither_none_4u8_mask +.dest 4 p guint8 +.param 4 masks +.temp 4 m + +loadpl m, masks +x4 andnb p, m, p + +.function video_orc_dither_none_4u16_mask +.dest 8 p guint16 +.longparam 8 masks +.temp 8 m + +loadpq m, masks +x4 andnw p, m, p + +.function video_orc_dither_verterr_4u8_mask +.dest 4 p guint8 +.dest 8 e guint16 +.longparam 8 masks +.temp 8 m +.temp 8 t1 + +loadpq m, masks +x4 convubw t1, p +x4 addw t1, e, t1 +x4 andw e, m, t1 +x4 andnw t1, m, t1 +x4 convsuswb p, t1 + +.function video_orc_dither_fs_muladd_u8 +.dest 2 e guint16 +.temp 2 t1 +.temp 2 t2 + +loadoffw t2, e, 4 +mullw t2, t2, 5 +addw t1, t2, e +loadoffw t2, e, 8 +mullw t2, t2, 3 +addw e, t1, t2 + +# due to error propagation we should disable +# loop_shift for this function and only work on +# 4 pixels at a time. 
+#.function video_orc_dither_fs_add_4u8_mask +#.flags no-unroll +#.dest 4 d guint8 +#.dest 8 e1 guint16 +#.dest 8 e2 guint16 +#.longparam 8 masks +#.temp 8 p +#.temp 8 t1 +#.temp 8 t2 +# +#x4 mullw t1, e1, 7 +#x4 addw t1, t1, e2 +#x4 shruw t1, t1, 4 +#x4 convubw p, d +#x4 addw t1, t1, p +#x4 andnw p, masks, t1 +#x4 convsuswb d, p +#x4 andw e2, t1, masks + +.function video_orc_dither_ordered_u8 +.source 1 e guint8 +.dest 1 d guint8 + +addusb d, d, e + +.function video_orc_dither_ordered_4u8_mask +.source 8 e1 guint16 +.dest 4 d guint8 +.longparam 8 masks +.temp 8 p +.temp 8 m + +loadpq m, masks +x4 convubw p, d +x4 addw p, p, e1 +x4 andnw p, m, p +x4 convsuswb d, p + +.function video_orc_dither_ordered_4u16_mask +.source 8 e1 guint16 +.dest 8 d guint16 +.longparam 8 masks +.temp 8 p +.temp 8 m + +loadpq m, masks +x4 addw p, d, e1 +x4 andnw d, m, p diff --git a/gst-libs/gst/video/video.h b/gst-libs/gst/video/video.h index 37ae7ff467..b6235b1e86 100644 --- a/gst-libs/gst/video/video.h +++ b/gst-libs/gst/video/video.h @@ -26,6 +26,7 @@ typedef struct _GstVideoAlignment GstVideoAlignment; #include #include +#include #include #include #include