From 172d14acef42f60376fabb7c36c6147d312dc4b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20Dr=C3=B6ge?= Date: Fri, 7 Dec 2018 19:09:30 +0200 Subject: [PATCH] cairooverlay: Optimize premultiplication/unpremultiplication loops Pull in video frame fields into local variables. Without this the compiler must assume that they could've changed on every use and read them from memory again. This reduces the inner loop from 6 memory reads per pixel to 4, and the number of writes stays at 3. --- ext/cairo/gstcairooverlay.c | 52 +++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 16 deletions(-) diff --git a/ext/cairo/gstcairooverlay.c b/ext/cairo/gstcairooverlay.c index b5acc12bed..6cea06f540 100644 --- a/ext/cairo/gstcairooverlay.c +++ b/ext/cairo/gstcairooverlay.c @@ -230,12 +230,17 @@ static void gst_video_overlay_rectangle_premultiply_0 (GstVideoFrame * frame) { int i, j; - for (j = 0; j < GST_VIDEO_FRAME_HEIGHT (frame); ++j) { + int width = GST_VIDEO_FRAME_WIDTH (frame); + int height = GST_VIDEO_FRAME_HEIGHT (frame); + int stride = GST_VIDEO_FRAME_PLANE_STRIDE (frame, 0); + guint8 *data = GST_VIDEO_FRAME_PLANE_DATA (frame, 0); + + for (j = 0; j < height; ++j) { guint8 *line; - line = GST_VIDEO_FRAME_PLANE_DATA (frame, 0); - line += GST_VIDEO_FRAME_PLANE_STRIDE (frame, 0) * j; - for (i = 0; i < GST_VIDEO_FRAME_WIDTH (frame); ++i) { + line = data; + line += stride * j; + for (i = 0; i < width; ++i) { int a = line[0]; line[1] = line[1] * a / 255; line[2] = line[2] * a / 255; @@ -250,12 +255,17 @@ static void gst_video_overlay_rectangle_premultiply_3 (GstVideoFrame * frame) { int i, j; - for (j = 0; j < GST_VIDEO_FRAME_HEIGHT (frame); ++j) { + int width = GST_VIDEO_FRAME_WIDTH (frame); + int height = GST_VIDEO_FRAME_HEIGHT (frame); + int stride = GST_VIDEO_FRAME_PLANE_STRIDE (frame, 0); + guint8 *data = GST_VIDEO_FRAME_PLANE_DATA (frame, 0); + + for (j = 0; j < height; ++j) { guint8 *line; - line = GST_VIDEO_FRAME_PLANE_DATA (frame, 0); - line += 
GST_VIDEO_FRAME_PLANE_STRIDE (frame, 0) * j; - for (i = 0; i < GST_VIDEO_FRAME_WIDTH (frame); ++i) { + line = data; + line += stride * j; + for (i = 0; i < width; ++i) { int a = line[3]; line[0] = line[0] * a / 255; line[1] = line[1] * a / 255; @@ -290,12 +300,17 @@ static void gst_video_overlay_rectangle_unpremultiply_0 (GstVideoFrame * frame) { int i, j; - for (j = 0; j < GST_VIDEO_FRAME_HEIGHT (frame); ++j) { + int width = GST_VIDEO_FRAME_WIDTH (frame); + int height = GST_VIDEO_FRAME_HEIGHT (frame); + int stride = GST_VIDEO_FRAME_PLANE_STRIDE (frame, 0); + guint8 *data = GST_VIDEO_FRAME_PLANE_DATA (frame, 0); + + for (j = 0; j < height; ++j) { guint8 *line; - line = GST_VIDEO_FRAME_PLANE_DATA (frame, 0); - line += GST_VIDEO_FRAME_PLANE_STRIDE (frame, 0) * j; - for (i = 0; i < GST_VIDEO_FRAME_WIDTH (frame); ++i) { + line = data; + line += stride * j; + for (i = 0; i < width; ++i) { int a = line[0]; if (a) { line[1] = MIN ((line[1] * 255 + a / 2) / a, 255); @@ -312,12 +327,17 @@ static void gst_video_overlay_rectangle_unpremultiply_3 (GstVideoFrame * frame) { int i, j; - for (j = 0; j < GST_VIDEO_FRAME_HEIGHT (frame); ++j) { + int width = GST_VIDEO_FRAME_WIDTH (frame); + int height = GST_VIDEO_FRAME_HEIGHT (frame); + int stride = GST_VIDEO_FRAME_PLANE_STRIDE (frame, 0); + guint8 *data = GST_VIDEO_FRAME_PLANE_DATA (frame, 0); + + for (j = 0; j < height; ++j) { guint8 *line; - line = GST_VIDEO_FRAME_PLANE_DATA (frame, 0); - line += GST_VIDEO_FRAME_PLANE_STRIDE (frame, 0) * j; - for (i = 0; i < GST_VIDEO_FRAME_WIDTH (frame); ++i) { + line = data; + line += stride * j; + for (i = 0; i < width; ++i) { int a = line[3]; if (a) { line[0] = MIN ((line[0] * 255 + a / 2) / a, 255);