From f441b5216e7e3af806fd7a501ee50192dbe5fecf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20Dr=C3=B6ge?= Date: Fri, 7 Dec 2018 19:13:59 +0200 Subject: [PATCH] video-overlay-composition: Optimize premultiplication/unpremultiplication loops Pull in video frame fields into local variables. Without this the compiler must assume that they could've changed on every use and read them from memory again. This reduces the inner loop from 6 memory reads per pixel to 4, and the number of writes stays at 3. --- .../gst/video/video-overlay-composition.c | 52 +++++++++++++------ 1 file changed, 36 insertions(+), 16 deletions(-) diff --git a/gst-libs/gst/video/video-overlay-composition.c b/gst-libs/gst/video/video-overlay-composition.c index 565c15cb08..c1d2fcd6df 100644 --- a/gst-libs/gst/video/video-overlay-composition.c +++ b/gst-libs/gst/video/video-overlay-composition.c @@ -841,12 +841,17 @@ static void gst_video_overlay_rectangle_premultiply_0 (GstVideoFrame * frame) { int i, j; - for (j = 0; j < GST_VIDEO_FRAME_HEIGHT (frame); ++j) { + int width = GST_VIDEO_FRAME_WIDTH (frame); + int height = GST_VIDEO_FRAME_HEIGHT (frame); + int stride = GST_VIDEO_FRAME_PLANE_STRIDE (frame, 0); + guint8 *data = GST_VIDEO_FRAME_PLANE_DATA (frame, 0); + + for (j = 0; j < height; ++j) { guint8 *line; - line = GST_VIDEO_FRAME_PLANE_DATA (frame, 0); - line += GST_VIDEO_FRAME_PLANE_STRIDE (frame, 0) * j; - for (i = 0; i < GST_VIDEO_FRAME_WIDTH (frame); ++i) { + line = data; + line += stride * j; + for (i = 0; i < width; ++i) { int a = line[0]; line[1] = line[1] * a / 255; line[2] = line[2] * a / 255; @@ -860,12 +865,17 @@ static void gst_video_overlay_rectangle_premultiply_3 (GstVideoFrame * frame) { int i, j; - for (j = 0; j < GST_VIDEO_FRAME_HEIGHT (frame); ++j) { + int width = GST_VIDEO_FRAME_WIDTH (frame); + int height = GST_VIDEO_FRAME_HEIGHT (frame); + int stride = GST_VIDEO_FRAME_PLANE_STRIDE (frame, 0); + guint8 *data = GST_VIDEO_FRAME_PLANE_DATA (frame, 0); + + for (j = 0; j < 
height; ++j) { guint8 *line; - line = GST_VIDEO_FRAME_PLANE_DATA (frame, 0); - line += GST_VIDEO_FRAME_PLANE_STRIDE (frame, 0) * j; - for (i = 0; i < GST_VIDEO_FRAME_WIDTH (frame); ++i) { + line = data; + line += stride * j; + for (i = 0; i < width; ++i) { int a = line[3]; line[0] = line[0] * a / 255; line[1] = line[1] * a / 255; @@ -899,12 +909,17 @@ static void gst_video_overlay_rectangle_unpremultiply_0 (GstVideoFrame * frame) { int i, j; - for (j = 0; j < GST_VIDEO_FRAME_HEIGHT (frame); ++j) { + int width = GST_VIDEO_FRAME_WIDTH (frame); + int height = GST_VIDEO_FRAME_HEIGHT (frame); + int stride = GST_VIDEO_FRAME_PLANE_STRIDE (frame, 0); + guint8 *data = GST_VIDEO_FRAME_PLANE_DATA (frame, 0); + + for (j = 0; j < height; ++j) { guint8 *line; - line = GST_VIDEO_FRAME_PLANE_DATA (frame, 0); - line += GST_VIDEO_FRAME_PLANE_STRIDE (frame, 0) * j; - for (i = 0; i < GST_VIDEO_FRAME_WIDTH (frame); ++i) { + line = data; + line += stride * j; + for (i = 0; i < width; ++i) { int a = line[0]; if (a) { line[1] = MIN ((line[1] * 255 + a / 2) / a, 255); @@ -920,12 +935,17 @@ static void gst_video_overlay_rectangle_unpremultiply_3 (GstVideoFrame * frame) { int i, j; - for (j = 0; j < GST_VIDEO_FRAME_HEIGHT (frame); ++j) { + int width = GST_VIDEO_FRAME_WIDTH (frame); + int height = GST_VIDEO_FRAME_HEIGHT (frame); + int stride = GST_VIDEO_FRAME_PLANE_STRIDE (frame, 0); + guint8 *data = GST_VIDEO_FRAME_PLANE_DATA (frame, 0); + + for (j = 0; j < height; ++j) { guint8 *line; - line = GST_VIDEO_FRAME_PLANE_DATA (frame, 0); - line += GST_VIDEO_FRAME_PLANE_STRIDE (frame, 0) * j; - for (i = 0; i < GST_VIDEO_FRAME_WIDTH (frame); ++i) { + line = data; + line += stride * j; + for (i = 0; i < width; ++i) { int a = line[3]; if (a) { line[0] = MIN ((line[0] * 255 + a / 2) / a, 255);