video-converter: add orc optimized matrix8 function

Add an ORC implementation of the matrix8 function and regenerate video-orc-dist.[ch].
2025-04-08 00:59:48 +00:00 · 2014-09-25 17:32:32 +02:00 · 2014-09-25 17:32:32 +02:00 · b5f3e5261a
commit b5f3e5261a
parent c47b005197
4 changed files with 1512 additions and 989 deletions
--- a/gst-libs/gst/video/video-converter.c
+++ b/gst-libs/gst/video/video-converter.c
@ -66,6 +66,9 @@ struct _GstVideoConverter
  gint in_bits;
  gint out_bits;
  gint cmatrix[4][4];
+  guint64 orc_p1;
+  guint64 orc_p2;
+  guint64 orc_p3;

  GstStructure *config;
  GstVideoDitherMethod dither;
@ -344,6 +347,12 @@ gst_video_converter_frame (GstVideoConverter * convert,
 static void
 video_converter_matrix8 (GstVideoConverter * convert, gpointer pixels)
 {
+#if 1
+  video_orc_matrix8 (pixels, pixels, convert->orc_p1, convert->orc_p2,
+      convert->orc_p3, convert->width);
+#elif 0
+  /* FIXME we would like to set this as a backup function, it's faster than the
+   * orc generated one */
  int i;
  int r, g, b;
  int y, u, v;
@ -365,6 +374,7 @@ video_converter_matrix8 (GstVideoConverter * convert, gpointer pixels)
    p[i * 4 + 2] = CLAMP (u, 0, 255);
    p[i * 4 + 3] = CLAMP (v, 0, 255);
  }
+#endif
 }

 static void
@ -627,6 +637,16 @@ video_converter_compute_matrix (GstVideoConverter * convert)
  GST_DEBUG ("[%6d %6d %6d %6d]", convert->cmatrix[3][0],
      convert->cmatrix[3][1], convert->cmatrix[3][2], convert->cmatrix[3][3]);

+  convert->orc_p1 = (((guint64) (guint16) convert->cmatrix[2][0]) << 48) |
+      (((guint64) (guint16) convert->cmatrix[1][0]) << 32) |
+      (((guint64) (guint16) convert->cmatrix[0][0]) << 16);
+  convert->orc_p2 = (((guint64) (guint16) convert->cmatrix[2][1]) << 48) |
+      (((guint64) (guint16) convert->cmatrix[1][1]) << 32) |
+      (((guint64) (guint16) convert->cmatrix[0][1]) << 16);
+  convert->orc_p3 = (((guint64) (guint16) convert->cmatrix[2][2]) << 48) |
+      (((guint64) (guint16) convert->cmatrix[1][2]) << 32) |
+      (((guint64) (guint16) convert->cmatrix[0][2]) << 16);
+
  return TRUE;

  /* ERRORS */
--- a/gst-libs/gst/video/video-orc-dist.c
+++ b/gst-libs/gst/video/video-orc-dist.c
--- a/gst-libs/gst/video/video-orc-dist.h
+++ b/gst-libs/gst/video/video-orc-dist.h
@ -148,6 +148,7 @@ void video_orc_convert_AYUV_BGRA (guint8 * ORC_RESTRICT d1, int d1_stride, const
 void video_orc_convert_AYUV_ABGR (guint8 * ORC_RESTRICT d1, int d1_stride, const guint8 * ORC_RESTRICT s1, int s1_stride, int p1, int p2, int p3, int p4, int p5, int n, int m);
 void video_orc_convert_AYUV_RGBA (guint8 * ORC_RESTRICT d1, int d1_stride, const guint8 * ORC_RESTRICT s1, int s1_stride, int p1, int p2, int p3, int p4, int p5, int n, int m);
 void video_orc_convert_I420_BGRA (guint8 * ORC_RESTRICT d1, const guint8 * ORC_RESTRICT s1, const guint8 * ORC_RESTRICT s2, const guint8 * ORC_RESTRICT s3, int p1, int p2, int p3, int p4, int p5, int n);
+void video_orc_matrix8 (guint8 * ORC_RESTRICT d1, const guint8 * ORC_RESTRICT s1, orc_int64 p1, orc_int64 p2, orc_int64 p3, int n);

 #ifdef __cplusplus
 }
--- a/gst-libs/gst/video/video-orc.orc
+++ b/gst-libs/gst/video/video-orc.orc
@ -1230,3 +1230,57 @@ convssswb g, wg
 mergebw wb, b, g
 mergewl x, wb, wr
 x4 addb argb, x, c128
+
+# video_orc_matrix8: per-pixel 8-bit matrix multiply on packed 4-byte pixels.
+#
+# For each 4-byte source element, 128 is subtracted from every byte. The
+# lowest byte (select0wb of select0lw) is carried through without being
+# multiplied; each of the other three bytes is multiplied against the four
+# 16-bit lanes of p1, p2 and p3 respectively and the products are
+# accumulated with signed saturation. The four 16-bit sums are then narrowed
+# to bytes with signed saturation and 128 is added back.
+#
+# p1/p2/p3 are 64-bit parameters used as four 16-bit lanes. The caller in
+# video-converter.c fills bits 16..63 of each with one column of
+# cmatrix[0..2][] and leaves the low lane zero, so the pass-through lane
+# receives no contribution from the products.
+#
+# NOTE(review): only columns 0..2 of cmatrix are passed in; presumably any
+# constant term held in cmatrix[][3] is not applied here - confirm against
+# the C implementation.
+.function video_orc_matrix8
+.source 4 argb guint8
+.dest 4 ayuv guint8
+.longparam 8 p1
+.longparam 8 p2
+.longparam 8 p3
+.const 1 c128 128
+.temp 2 w1
+.temp 2 w2
+.temp 1 b1
+.temp 1 b2
+.temp 4 l1
+.temp 4 ayuv2
+.temp 8 aq
+.temp 8 q1
+.temp 8 pr1
+.temp 8 pr2
+.temp 8 pr3
+
+# Copy the three 64-bit coefficient parameters into temporaries.
+loadpq pr1, p1
+loadpq pr2, p2
+loadpq pr3, p3
+
+# Subtract 128 from each of the four bytes of the pixel.
+x4 subb l1, argb, c128
+
+# Split the 32-bit pixel into its low/high 16-bit halves, then take the two
+# bytes of the low half.
+select0lw w1, l1
+select1lw w2, l1
+select0wb b1, w1
+select1wb b2, w1
+
+# Seed the accumulator with the lowest byte: replicate it across 64 bits and
+# keep only the low 8 bits, leaving the three upper 16-bit lanes zero.
+# NOTE(review): the mask leaves the byte zero-extended (0..255) in lane 0
+# while the final convssswb saturates as signed; confirm the result for
+# inputs whose lowest byte is below 128.
+splatbl l1, b1
+mergelq aq, l1, l1
+andq aq, aq, 0xff
+
+# Second byte: replicate it into all eight bytes so every 16-bit lane holds
+# the byte in both halves, keep the high 16 bits of the signed product with
+# each lane of pr1, and accumulate with saturation.
+splatbl l1, b2
+mergelq q1, l1, l1
+x4 mulhsw q1, q1, pr1
+x4 addssw aq, aq, q1
+
+# Third byte (low byte of the high half), same scheme with pr2.
+select0wb b1, w2
+splatbl l1,b1
+mergelq q1, l1, l1
+x4 mulhsw q1, q1, pr2
+x4 addssw aq, aq, q1
+
+# Fourth byte (high byte of the high half), same scheme with pr3.
+select1wb b2, w2
+splatbl l1, b2
+mergelq q1, l1, l1
+x4 mulhsw q1, q1, pr3
+x4 addssw aq, aq, q1
+
+# Narrow the four 16-bit sums to bytes with signed saturation, then add 128
+# back to each byte and store the pixel.
+x4 convssswb ayuv2, aq
+x4 addb ayuv, ayuv2, c128