diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/cuda-converter.c b/subprojects/gst-plugins-bad/sys/nvcodec/cuda-converter.c deleted file mode 100644 index f2930019a6..0000000000 --- a/subprojects/gst-plugins-bad/sys/nvcodec/cuda-converter.c +++ /dev/null @@ -1,2090 +0,0 @@ -/* GStreamer - * Copyright (C) 2010 David Schleef - * Copyright (C) 2010 Sebastian Dröge - * Copyright (C) 2019 Seungha Yang - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Library General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Library General Public License for more details. - * - * You should have received a copy of the GNU Library General Public - * License along with this library; if not, write to the - * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, - * Boston, MA 02110-1301, USA. - */ - -/** - * SECTION:cudaconverter - * @title: GstCudaConverter - * @short_description: Generic video conversion using CUDA - * - * This object is used to convert video frames from one format to another. - * The object can perform conversion of: - * - * * video format - * * video colorspace - * * video size - */ - -/** - * TODO: - * * Add more interpolation method and make it selectable, - * currently default bi-linear interpolation only - * * Add fast-path for conversion like videoconvert - * * Full colorimetry and chroma-siting support - * * cropping, and x, y position support - */ - -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif - -#include "cuda-converter.h" -#include -#include -#include -#include - -#define CUDA_BLOCK_X 16 -#define CUDA_BLOCK_Y 16 -#define DIV_UP(size,block) (((size) + ((block) - 1)) / (block)) - -static gboolean cuda_converter_lookup_path (GstCudaConverter * convert); - -#ifndef GST_DISABLE_GST_DEBUG -#define GST_CAT_DEFAULT ensure_debug_category() -static GstDebugCategory * -ensure_debug_category (void) -{ - static gsize cat_gonce = 0; - - if (g_once_init_enter (&cat_gonce)) { - gsize cat_done; - - cat_done = (gsize) _gst_debug_category_new ("cuda-converter", 0, - "cuda-converter object"); - - g_once_init_leave (&cat_gonce, cat_done); - } - - return (GstDebugCategory *) cat_gonce; -} -#else -#define ensure_debug_category() -#endif - -#define GST_CUDA_KERNEL_FUNC "gst_cuda_kernel_func" - -#define GST_CUDA_KERNEL_FUNC_TO_Y444 "gst_cuda_kernel_func_to_y444" - -#define GST_CUDA_KERNEL_FUNC_Y444_TO_YUV "gst_cuda_kernel_func_y444_to_yuv" - -#define GST_CUDA_KERNEL_FUNC_TO_ARGB "gst_cuda_kernel_func_to_argb" - -#define GST_CUDA_KERNEL_FUNC_SCALE_RGB "gst_cuda_kernel_func_scale_rgb" - -/* *INDENT-OFF* */ -/** - * read_chroma: - * @tex1: a CUDA texture object representing a semi-planar chroma plane - * @tex2: dummy object - * @x: the x coordinate to read data from @tex1 - * @y: the y coordinate to read data from @tex1 - * - * Returns: a #ushort2 vector representing both chroma pixel values - */ -static const gchar READ_CHROMA_FROM_SEMI_PLANAR[] = -"__device__ ushort2\n" -"read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, \n" -" float x, float y)\n" -"{\n" -" return tex2D(tex1, x, y);\n" -"}"; - -/** - * read_chroma: - * @tex1: a CUDA texture object representing a chroma planar plane - * @tex2: a CUDA texture object representing the other planar plane - * @x: the x coordinate to read data from @tex1 and @tex2 - * @y: the y coordinate to read data from @tex1 and @tex2 - * - * Returns: a #ushort2 vector representing both chroma pixel values - */ -static const gchar READ_CHROMA_FROM_PLANAR[] = -"__device__ ushort2\n" -"read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, \n" -" float x, float y)\n" -"{\n" -" unsigned short u, v;\n" -" u = tex2D(tex1, x, y);\n" -" v = tex2D(tex2, x, y);\n" -" return make_ushort2(u, v);\n" -"}"; - -/** - * write_chroma: - * @dst1: a CUDA global memory pointing to a semi-planar chroma plane - * @dst2: dummy - * @u: a pixel value to write @dst1 - * @v: a pixel value to write @dst1 - * @x: the x coordinate to write data into @tex1 - * @x: the y coordinate to write data into @tex1 - * @pstride: the pixel stride of @dst1 - * @mask: bitmask to be applied to high bitdepth plane - * - * Write @u and @v pixel value to @dst1 semi-planar plane - */ -static const gchar WRITE_CHROMA_TO_SEMI_PLANAR[] = -"__device__ void\n" -"write_chroma (unsigned char *dst1, unsigned char *dst2, unsigned short u,\n" -" unsigned short v, int x, int y, int pstride, int stride, int mask)\n" -"{\n" -" if (OUT_DEPTH > 8) {\n" -" *(unsigned short *)&dst1[x * pstride + y * stride] = (u & mask);\n" -" *(unsigned short *)&dst1[x * pstride + 2 + y * stride] = (v & mask);\n" -" } else {\n" -" dst1[x * pstride + y * stride] = u;\n" -" dst1[x * pstride + 1 + y * stride] = v;\n" -" }\n" -"}"; - -/** - * write_chroma: - * @dst1: a CUDA global memory pointing to a planar chroma plane - * @dst2: a CUDA global memory pointing to a the other planar chroma plane - * @u: a pixel value to write @dst1 - * @v: a pixel value to write @dst1 - * @x: the x coordinate to write data into @tex1 - * @x: the y coordinate to write data into @tex1 - * @pstride: the pixel stride of @dst1 - * @mask: bitmask to be applied to high bitdepth plane - * - * Write @u and @v pixel value into @dst1 and @dst2 planar planes - */ -static const gchar WRITE_CHROMA_TO_PLANAR[] = -"__device__ void\n" -"write_chroma (unsigned char *dst1, unsigned char *dst2, unsigned short u,\n" -" unsigned short v, int x, int y, int pstride, int stride, int mask)\n" -"{\n" -" if (OUT_DEPTH > 8) {\n" -" *(unsigned short *)&dst1[x * pstride + y * stride] = (u & mask);\n" -" *(unsigned short *)&dst2[x * pstride + y * stride] = (v & mask);\n" -" } else {\n" -" dst1[x * pstride + y * stride] = u;\n" -" dst2[x * pstride + y * stride] = v;\n" -" }\n" -"}"; - -/* CUDA kernel source for from YUV to YUV conversion and scale */ -static const gchar templ_YUV_TO_YUV[] = -"extern \"C\"{\n" -"__constant__ float SCALE_H = %s;\n" -"__constant__ float SCALE_V = %s;\n" -"__constant__ float CHROMA_SCALE_H = %s;\n" -"__constant__ float CHROMA_SCALE_V = %s;\n" -"__constant__ int WIDTH = %d;\n" -"__constant__ int HEIGHT = %d;\n" -"__constant__ int CHROMA_WIDTH = %d;\n" -"__constant__ int CHROMA_HEIGHT = %d;\n" -"__constant__ int IN_DEPTH = %d;\n" -"__constant__ int OUT_DEPTH = %d;\n" -"__constant__ int PSTRIDE = %d;\n" -"__constant__ int CHROMA_PSTRIDE = %d;\n" -"__constant__ int IN_SHIFT = %d;\n" -"__constant__ int OUT_SHIFT = %d;\n" -"__constant__ int MASK = %d;\n" -"__constant__ int SWAP_UV = %d;\n" -"\n" -"__device__ unsigned short\n" -"do_scale_pixel (unsigned short val) \n" -"{\n" -" unsigned int diff;\n" -" if (OUT_DEPTH > IN_DEPTH) {\n" -" diff = OUT_DEPTH - IN_DEPTH;\n" -" return (val << diff) | (val >> (IN_DEPTH - diff));\n" -" } else if (IN_DEPTH > OUT_DEPTH) {\n" -" return val >> (IN_DEPTH - OUT_DEPTH);\n" -" }\n" -" return val;\n" -"}\n" -"\n" -/* __device__ ushort2 - * read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, float x, float y); - */ -"%s\n" -"\n" -/* __device__ void - * write_chroma (unsigned char *dst1, unsigned char *dst2, unsigned short u, - * unsigned short v, int x, int y, int pstride, int stride, int mask); - */ -"%s\n" -"\n" -"__global__ void\n" -GST_CUDA_KERNEL_FUNC -"(cudaTextureObject_t tex0, cudaTextureObject_t tex1, cudaTextureObject_t tex2,\n" -" unsigned char *dst0, unsigned char *dst1, unsigned char *dst2,\n" -" int stride, int uv_stride)\n" -"{\n" -" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n" -" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n" -" if (x_pos < WIDTH && y_pos < HEIGHT) {\n" -" float src_xpos = SCALE_H * x_pos;\n" -" float src_ypos = SCALE_V * y_pos;\n" -" unsigned short y = tex2D(tex0, src_xpos, src_ypos);\n" -" y = y >> IN_SHIFT;\n" -" y = do_scale_pixel (y);\n" -" y = y << OUT_SHIFT;\n" -" if (OUT_DEPTH > 8) {\n" -" *(unsigned short *)&dst0[x_pos * PSTRIDE + y_pos * stride] = (y & MASK);\n" -" } else {\n" -" dst0[x_pos * PSTRIDE + y_pos * stride] = y;\n" -" }\n" -" }\n" -" if (x_pos < CHROMA_WIDTH && y_pos < CHROMA_HEIGHT) {\n" -" float src_xpos = CHROMA_SCALE_H * x_pos;\n" -" float src_ypos = CHROMA_SCALE_V * y_pos;\n" -" unsigned short u, v;\n" -" ushort2 uv = read_chroma (tex1, tex2, src_xpos, src_ypos);\n" -" u = uv.x;\n" -" v = uv.y;\n" -" u = u >> IN_SHIFT;\n" -" v = v >> IN_SHIFT;\n" -" u = do_scale_pixel (u);\n" -" v = do_scale_pixel (v);\n" -" u = u << OUT_SHIFT;\n" -" v = v << OUT_SHIFT;\n" -" if (SWAP_UV) {\n" -" unsigned short tmp = u;\n" -" u = v;\n" -" v = tmp;\n" -" }\n" -" write_chroma (dst1,\n" -" dst2, u, v, x_pos, y_pos, CHROMA_PSTRIDE, uv_stride, MASK);\n" -" }\n" -"}\n" -"\n" -"}"; - -/* CUDA kernel source for from YUV to RGB conversion and scale */ -static const gchar templ_YUV_TO_RGB[] = -"extern \"C\"{\n" -"__constant__ float offset[3] = {%s, %s, %s};\n" -"__constant__ float rcoeff[3] = {%s, %s, %s};\n" -"__constant__ float gcoeff[3] = {%s, %s, %s};\n" -"__constant__ float bcoeff[3] = {%s, %s, %s};\n" -"\n" -"__constant__ float SCALE_H = %s;\n" -"__constant__ float SCALE_V = %s;\n" -"__constant__ float CHROMA_SCALE_H = %s;\n" -"__constant__ float CHROMA_SCALE_V = %s;\n" -"__constant__ int WIDTH = %d;\n" -"__constant__ int HEIGHT = %d;\n" -"__constant__ int CHROMA_WIDTH = %d;\n" -"__constant__ int CHROMA_HEIGHT = %d;\n" -"__constant__ int IN_DEPTH = %d;\n" -"__constant__ int OUT_DEPTH = %d;\n" -"__constant__ int PSTRIDE = %d;\n" -"__constant__ int CHROMA_PSTRIDE = %d;\n" -"__constant__ int IN_SHIFT = %d;\n" -"__constant__ int OUT_SHIFT = %d;\n" -"__constant__ int MASK = %d;\n" -"__constant__ int SWAP_UV = %d;\n" -"__constant__ int MAX_IN_VAL = %d;\n" -"__constant__ int R_IDX = %d;\n" -"__constant__ int G_IDX = %d;\n" -"__constant__ int B_IDX = %d;\n" -"__constant__ int A_IDX = %d;\n" -"__constant__ int X_IDX = %d;\n" -"\n" -"__device__ unsigned short\n" -"do_scale_pixel (unsigned short val) \n" -"{\n" -" unsigned int diff;\n" -" if (OUT_DEPTH > IN_DEPTH) {\n" -" diff = OUT_DEPTH - IN_DEPTH;\n" -" return (val << diff) | (val >> (IN_DEPTH - diff));\n" -" } else if (IN_DEPTH > OUT_DEPTH) {\n" -" return val >> (IN_DEPTH - OUT_DEPTH);\n" -" }\n" -" return val;\n" -"}\n" -"\n" -"__device__ float\n" -"dot(float3 val, float *coeff)\n" -"{\n" -" return val.x * coeff[0] + val.y * coeff[1] + val.z * coeff[2];\n" -"}\n" -"\n" -"__device__ uint3\n" -"yuv_to_rgb (unsigned short y, unsigned short u, unsigned short v, unsigned int max_val)\n" -"{\n" -" float3 yuv = make_float3 (y, u, v);\n" -" uint3 rgb;\n" -" rgb.x = max ((unsigned int)(dot (yuv, rcoeff) + offset[0]), 0);\n" -" rgb.y = max ((unsigned int)(dot (yuv, gcoeff) + offset[1]), 0);\n" -" rgb.z = max ((unsigned int)(dot (yuv, bcoeff) + offset[2]), 0);\n" -" rgb.x = min (rgb.x, max_val);\n" -" rgb.y = min (rgb.y, max_val);\n" -" rgb.z = min (rgb.z, max_val);\n" -" return rgb;\n" -"}\n" -"\n" -/* __device__ ushort2 - * read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, float x, float y); - */ -"%s\n" -"\n" -"__global__ void\n" -GST_CUDA_KERNEL_FUNC -"(cudaTextureObject_t tex0, cudaTextureObject_t tex1, cudaTextureObject_t tex2,\n" -" unsigned char *dstRGB, int stride)\n" -"{\n" -" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n" -" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n" -" if (x_pos < WIDTH && y_pos < HEIGHT) {\n" -" float src_xpos = SCALE_H * x_pos;\n" -" float src_ypos = SCALE_V * y_pos;\n" -" unsigned short y = tex2D(tex0, src_xpos, src_ypos);\n" -" ushort2 uv;\n" -" unsigned short u, v;\n" -" uint3 rgb;\n" -" unsigned int clip_max = MAX_IN_VAL;\n" -" src_xpos = CHROMA_SCALE_H * x_pos;\n" -" src_ypos = CHROMA_SCALE_V * y_pos;\n" -" uv = read_chroma (tex1, tex2, src_xpos, src_ypos);\n" -" u = uv.x;\n" -" v = uv.y;\n" -" y = y >> IN_SHIFT;\n" -" u = u >> IN_SHIFT;\n" -" v = v >> IN_SHIFT;\n" -" if (SWAP_UV) {\n" -" unsigned short tmp = u;\n" -" u = v;\n" -" v = tmp;\n" -" }\n" - /* conversion matrix is scaled to higher bitdepth between in/out formats */ -" if (OUT_DEPTH > IN_DEPTH) {\n" -" y = do_scale_pixel (y);\n" -" u = do_scale_pixel (u);\n" -" v = do_scale_pixel (v);\n" -" clip_max = MASK;\n" -" }" -" rgb = yuv_to_rgb (y, u, v, clip_max);\n" -" if (OUT_DEPTH < IN_DEPTH) {\n" -" rgb.x = do_scale_pixel (rgb.x);\n" -" rgb.y = do_scale_pixel (rgb.y);\n" -" rgb.z = do_scale_pixel (rgb.z);\n" -" }" -" if (OUT_DEPTH > 8) {\n" -" unsigned int packed_rgb = 0;\n" - /* A is always MSB, we support only little endian system */ -" packed_rgb = 0xc000 << 16;\n" -" packed_rgb |= (rgb.x << (30 - (R_IDX * 10)));\n" -" packed_rgb |= (rgb.y << (30 - (G_IDX * 10)));\n" -" packed_rgb |= (rgb.z << (30 - (B_IDX * 10)));\n" -" *(unsigned int *)&dstRGB[x_pos * PSTRIDE + y_pos * stride] = packed_rgb;\n" -" } else {\n" -" dstRGB[x_pos * PSTRIDE + R_IDX + y_pos * stride] = (unsigned char) rgb.x;\n" -" dstRGB[x_pos * PSTRIDE + G_IDX + y_pos * stride] = (unsigned char) rgb.y;\n" -" dstRGB[x_pos * PSTRIDE + B_IDX + y_pos * stride] = (unsigned char) rgb.z;\n" -" if (A_IDX >= 0 || X_IDX >= 0)\n" -" dstRGB[x_pos * PSTRIDE + A_IDX + y_pos * stride] = 0xff;\n" -" }\n" -" }\n" -"}\n" -"\n" -"}"; - -/** - * GST_CUDA_KERNEL_FUNC_TO_ARGB: - * @srcRGB: a CUDA global memory containing a RGB image - * @dstRGB: a CUDA global memory to store unpacked ARGB image - * @width: the width of @srcRGB and @dstRGB - * @height: the height of @srcRGB and @dstRGB - * @src_stride: the stride of @srcRGB - * @src_pstride: the pixel stride of @srcRGB - * @dst_stride: the stride of @dstRGB - * @r_idx: the index of red component of @srcRGB - * @g_idx: the index of green component of @srcRGB - * @b_idx: the index of blue component of @srcRGB - * @a_idx: the index of alpha component of @srcRGB - * - * Unpack a RGB image from @srcRGB and write the unpacked data into @dstRGB - */ -static const gchar unpack_to_ARGB[] = -"__global__ void\n" -GST_CUDA_KERNEL_FUNC_TO_ARGB -"(unsigned char *srcRGB, unsigned char *dstRGB, int width, int height,\n" -" int src_stride, int src_pstride, int dst_stride,\n" -" int r_idx, int g_idx, int b_idx, int a_idx)\n" -"{\n" -" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n" -" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n" -" if (x_pos < width && y_pos < height) {\n" -" if (a_idx >= 0) {\n" -" dstRGB[x_pos * 4 + y_pos * dst_stride] =\n" -" srcRGB[x_pos * src_pstride + a_idx + y_pos * src_stride];\n" -" } else {\n" -" dstRGB[x_pos * 4 + y_pos * dst_stride] = 0xff;\n" -" }\n" -" dstRGB[x_pos * 4 + 1 + y_pos * dst_stride] =\n" -" srcRGB[x_pos * src_pstride + r_idx + y_pos * src_stride];\n" -" dstRGB[x_pos * 4 + 2 + y_pos * dst_stride] =\n" -" srcRGB[x_pos * src_pstride + g_idx + y_pos * src_stride];\n" -" dstRGB[x_pos * 4 + 3 + y_pos * dst_stride] =\n" -" srcRGB[x_pos * src_pstride + b_idx + y_pos * src_stride];\n" -" }\n" -"}\n"; - -/** - * GST_CUDA_KERNEL_FUNC_TO_ARGB: - * @srcRGB: a CUDA global memory containing a RGB image - * @dstRGB: a CUDA global memory to store unpacked ARGB64 image - * @width: the width of @srcRGB and @dstRGB - * @height: the height of @srcRGB and @dstRGB - * @src_stride: the stride of @srcRGB - * @src_pstride: the pixel stride of @srcRGB - * @dst_stride: the stride of @dstRGB - * @r_idx: the index of red component of @srcRGB - * @g_idx: the index of green component of @srcRGB - * @b_idx: the index of blue component of @srcRGB - * @a_idx: the index of alpha component of @srcRGB - * - * Unpack a RGB image from @srcRGB and write the unpacked data into @dstRGB - */ -static const gchar unpack_to_ARGB64[] = -"__global__ void\n" -GST_CUDA_KERNEL_FUNC_TO_ARGB -"(unsigned char *srcRGB, unsigned char *dstRGB, int width, int height,\n" -" int src_stride, int src_pstride, int dst_stride,\n" -" int r_idx, int g_idx, int b_idx, int a_idx)\n" -"{\n" -" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n" -" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n" -" if (x_pos < width && y_pos < height) {\n" -" unsigned short a, r, g, b;\n" -" unsigned int read_val;\n" -" read_val = *(unsigned int *)&srcRGB[x_pos * src_pstride + y_pos * src_stride];\n" -" a = (read_val >> 30) & 0x03;\n" -" a = (a << 14) | (a << 12) | (a << 10) | (a << 8) | (a << 6) | (a << 4) | (a << 2) | (a << 0);\n" -" r = ((read_val >> (30 - (r_idx * 10))) & 0x3ff);\n" -" r = (r << 6) | (r >> 4);\n" -" g = ((read_val >> (30 - (g_idx * 10))) & 0x3ff);\n" -" g = (g << 6) | (g >> 4);\n" -" b = ((read_val >> (30 - (b_idx * 10))) & 0x3ff);\n" -" b = (b << 6) | (b >> 4);\n" -" *(unsigned short *)&dstRGB[x_pos * 8 + y_pos * dst_stride] = 0xffff;\n" -" *(unsigned short *)&dstRGB[x_pos * 8 + 2 + y_pos * dst_stride] = r;\n" -" *(unsigned short *)&dstRGB[x_pos * 8 + 4 + y_pos * dst_stride] = g;\n" -" *(unsigned short *)&dstRGB[x_pos * 8 + 6 + y_pos * dst_stride] = b;\n" -" }\n" -"}\n"; - -/* CUDA kernel source for from RGB to YUV conversion and scale */ -static const gchar templ_RGB_TO_YUV[] = -"extern \"C\"{\n" -"__constant__ float offset[3] = {%s, %s, %s};\n" -"__constant__ float ycoeff[3] = {%s, %s, %s};\n" -"__constant__ float ucoeff[3] = {%s, %s, %s};\n" -"__constant__ float vcoeff[3] = {%s, %s, %s};\n" -"\n" -"__constant__ float SCALE_H = %s;\n" -"__constant__ float SCALE_V = %s;\n" -"__constant__ float CHROMA_SCALE_H = %s;\n" -"__constant__ float CHROMA_SCALE_V = %s;\n" -"__constant__ int WIDTH = %d;\n" -"__constant__ int HEIGHT = %d;\n" -"__constant__ int CHROMA_WIDTH = %d;\n" -"__constant__ int CHROMA_HEIGHT = %d;\n" -"__constant__ int IN_DEPTH = %d;\n" -"__constant__ int OUT_DEPTH = %d;\n" -"__constant__ int PSTRIDE = %d;\n" -"__constant__ int CHROMA_PSTRIDE = %d;\n" -"__constant__ int IN_SHIFT = %d;\n" -"__constant__ int OUT_SHIFT = %d;\n" -"__constant__ int MASK = %d;\n" -"__constant__ int SWAP_UV = %d;\n" -"\n" -"__device__ unsigned short\n" -"do_scale_pixel (unsigned short val) \n" -"{\n" -" unsigned int diff;\n" -" if (OUT_DEPTH > IN_DEPTH) {\n" -" diff = OUT_DEPTH - IN_DEPTH;\n" -" return (val << diff) | (val >> (IN_DEPTH - diff));\n" -" } else if (IN_DEPTH > OUT_DEPTH) {\n" -" return val >> (IN_DEPTH - OUT_DEPTH);\n" -" }\n" -" return val;\n" -"}\n" -"\n" -"__device__ float\n" -"dot(float3 val, float *coeff)\n" -"{\n" -" return val.x * coeff[0] + val.y * coeff[1] + val.z * coeff[2];\n" -"}\n" -"\n" -"__device__ uint3\n" -"rgb_to_yuv (unsigned short r, unsigned short g, unsigned short b,\n" -" unsigned int max_val)\n" -"{\n" -" float3 rgb = make_float3 (r, g, b);\n" -" uint3 yuv;\n" -" yuv.x = max ((unsigned int)(dot (rgb, ycoeff) + offset[0]), 0);\n" -" yuv.y = max ((unsigned int)(dot (rgb, ucoeff) + offset[1]), 0);\n" -" yuv.z = max ((unsigned int)(dot (rgb, vcoeff) + offset[2]), 0);\n" -" yuv.x = min (yuv.x, max_val);\n" -" yuv.y = min (yuv.y, max_val);\n" -" yuv.z = min (yuv.z, max_val);\n" -" return yuv;\n" -"}\n" -"\n" -/* __global__ void - * GST_CUDA_KERNEL_FUNC_TO_ARGB - */ -"%s\n" -"\n" -/* __device__ ushort2 - * read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, float x, float y); - */ -"%s\n" -"\n" -/* __device__ void - * write_chroma (unsigned char *dst1, unsigned char *dst2, unsigned short u, - * unsigned short v, int x, int y, int pstride, int stride, int mask); - */ -"%s\n" -"\n" -"__global__ void\n" -GST_CUDA_KERNEL_FUNC_TO_Y444 -"(cudaTextureObject_t srcRGB, unsigned char *dstY, int y_stride,\n" -" unsigned char *dstU, int u_stride, unsigned char *dstV, int v_stride,\n" -" int width, int height, int dst_pstride, int in_depth)\n" -"{\n" -" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n" -" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n" -" if (x_pos < width && y_pos < height) {\n" -" ushort4 argb = tex2D(srcRGB, x_pos, y_pos);\n" -" uint3 yuv;\n" -" yuv = rgb_to_yuv (argb.y, argb.z, argb.w, (1 << in_depth) - 1);\n" -" if (in_depth > 8) {\n" -" *(unsigned short *)&dstY[x_pos * dst_pstride + y_pos * y_stride] = yuv.x;\n" -" *(unsigned short *)&dstU[x_pos * dst_pstride + y_pos * u_stride] = yuv.y;\n" -" *(unsigned short *)&dstV[x_pos * dst_pstride + y_pos * v_stride] = yuv.z;\n" -" } else {\n" -" dstY[x_pos * dst_pstride + y_pos * y_stride] = yuv.x;\n" -" dstU[x_pos * dst_pstride + y_pos * u_stride] = yuv.y;\n" -" dstV[x_pos * dst_pstride + y_pos * v_stride] = yuv.z;\n" -" }\n" -" }\n" -"}\n" -"\n" -"__global__ void\n" -GST_CUDA_KERNEL_FUNC_Y444_TO_YUV -"(cudaTextureObject_t tex0, cudaTextureObject_t tex1, cudaTextureObject_t tex2,\n" -" unsigned char *dst0, unsigned char *dst1, unsigned char *dst2,\n" -" int stride, int uv_stride)\n" -"{\n" -" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n" -" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n" -" if (x_pos < WIDTH && y_pos < HEIGHT) {\n" -" float src_xpos = SCALE_H * x_pos;\n" -" float src_ypos = SCALE_V * y_pos;\n" -" unsigned short y = tex2D(tex0, src_xpos, src_ypos);\n" -" y = y >> IN_SHIFT;\n" -" y = do_scale_pixel (y);\n" -" y = y << OUT_SHIFT;\n" -" if (OUT_DEPTH > 8) {\n" -" *(unsigned short *)&dst0[x_pos * PSTRIDE + y_pos * stride] = (y & MASK);\n" -" } else {\n" -" dst0[x_pos * PSTRIDE + y_pos * stride] = y;\n" -" }\n" -" }\n" -" if (x_pos < CHROMA_WIDTH && y_pos < CHROMA_HEIGHT) {\n" -" float src_xpos = CHROMA_SCALE_H * x_pos;\n" -" float src_ypos = CHROMA_SCALE_V * y_pos;\n" -" unsigned short u, v;\n" -" ushort2 uv;\n" -" uv = read_chroma (tex1, tex2, src_xpos, src_ypos);\n" -" u = uv.x;\n" -" v = uv.y;\n" -" u = u >> IN_SHIFT;\n" -" v = v >> IN_SHIFT;\n" -" u = do_scale_pixel (u);\n" -" v = do_scale_pixel (v);\n" -" u = u << OUT_SHIFT;\n" -" v = v << OUT_SHIFT;\n" -" if (SWAP_UV) {\n" -" unsigned short tmp = u;\n" -" u = v;\n" -" v = tmp;\n" -" }\n" -" write_chroma (dst1,\n" -" dst2, u, v, x_pos, y_pos, CHROMA_PSTRIDE, uv_stride, MASK);\n" -" }\n" -"}\n" -"\n" -"}"; - -/* CUDA kernel source for from RGB to RGB conversion and scale */ -static const gchar templ_RGB_to_RGB[] = -"extern \"C\"{\n" -"__constant__ float SCALE_H = %s;\n" -"__constant__ float SCALE_V = %s;\n" -"__constant__ int WIDTH = %d;\n" -"__constant__ int HEIGHT = %d;\n" -"__constant__ int IN_DEPTH = %d;\n" -"__constant__ int OUT_DEPTH = %d;\n" -"__constant__ int PSTRIDE = %d;\n" -"__constant__ int R_IDX = %d;\n" -"__constant__ int G_IDX = %d;\n" -"__constant__ int B_IDX = %d;\n" -"__constant__ int A_IDX = %d;\n" -"__constant__ int X_IDX = %d;\n" -"\n" -"__device__ unsigned short\n" -"do_scale_pixel (unsigned short val) \n" -"{\n" -" unsigned int diff;\n" -" if (OUT_DEPTH > IN_DEPTH) {\n" -" diff = OUT_DEPTH - IN_DEPTH;\n" -" return (val << diff) | (val >> (IN_DEPTH - diff));\n" -" } else if (IN_DEPTH > OUT_DEPTH) {\n" -" return val >> (IN_DEPTH - OUT_DEPTH);\n" -" }\n" -" return val;\n" -"}\n" -"\n" -/* __global__ void - * GST_CUDA_KERNEL_FUNC_TO_ARGB - */ -"%s\n" -"\n" -/* convert ARGB or ARGB64 to other RGB formats with scale */ -"__global__ void\n" -GST_CUDA_KERNEL_FUNC_SCALE_RGB -"(cudaTextureObject_t srcRGB, unsigned char *dstRGB, int dst_stride)\n" -"{\n" -" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n" -" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n" -" if (x_pos < WIDTH && y_pos < HEIGHT) {\n" -" float src_xpos = SCALE_H * x_pos;\n" -" float src_ypos = SCALE_V * y_pos;\n" -" ushort4 argb = tex2D(srcRGB, src_xpos, src_ypos);\n" -" argb.x = do_scale_pixel(argb.x);\n" -" argb.y = do_scale_pixel(argb.y);\n" -" argb.z = do_scale_pixel(argb.z);\n" -" argb.w = do_scale_pixel(argb.w);\n" - /* FIXME: RGB10A2_LE or BGR10A2_LE only */ -" if (OUT_DEPTH > 8) {\n" -" unsigned int packed_rgb = 0;\n" -" unsigned int a, r, g, b;" -" a = (argb.x >> 8) & 0x3;\n" -" r = argb.y & 0x3ff;\n" -" g = argb.z & 0x3ff;\n" -" b = argb.w & 0x3ff;\n" - /* A is always MSB, we support only little endian system */ -" packed_rgb = a << 30;\n" -" packed_rgb |= (r << (30 - (R_IDX * 10)));\n" -" packed_rgb |= (g << (30 - (G_IDX * 10)));\n" -" packed_rgb |= (b << (30 - (B_IDX * 10)));\n" -" *(unsigned int *)&dstRGB[x_pos * 4 + y_pos * dst_stride] = packed_rgb;\n" -" } else {\n" -" if (A_IDX >= 0) {\n" -" argb.x = do_scale_pixel(argb.x);\n" -" dstRGB[x_pos * PSTRIDE + A_IDX + y_pos * dst_stride] = argb.x;\n" -" } else if (X_IDX >= 0) {\n" -" dstRGB[x_pos * PSTRIDE + X_IDX + y_pos * dst_stride] = 0xff;\n" -" }\n" -" dstRGB[x_pos * PSTRIDE + R_IDX + y_pos * dst_stride] = argb.y;\n" -" dstRGB[x_pos * PSTRIDE + G_IDX + y_pos * dst_stride] = argb.z;\n" -" dstRGB[x_pos * PSTRIDE + B_IDX + y_pos * dst_stride] = argb.w;\n" -" }\n" -" }\n" -"}\n" -"\n" -"}"; -/* *INDENT-ON* */ - -typedef struct -{ - gint R; - gint G; - gint B; - gint A; - gint X; -} GstCudaRGBOrder; - -typedef struct -{ - CUdeviceptr device_ptr; - gsize cuda_stride; -} GstCudaStageBuffer; - -#define CONVERTER_MAX_NUM_FUNC 4 - -struct _GstCudaConverter -{ - GstVideoInfo in_info; - GstVideoInfo out_info; - gboolean keep_size; - - gint texture_alignment; - - GstCudaContext *cuda_ctx; - CUmodule cuda_module; - CUfunction kernel_func[CONVERTER_MAX_NUM_FUNC]; - const gchar *func_names[CONVERTER_MAX_NUM_FUNC]; - gchar *kernel_source; - gchar *ptx; - GstCudaStageBuffer fallback_buffer[GST_VIDEO_MAX_PLANES]; - - /* *INDENT-OFF* */ - gboolean (*convert) (GstCudaConverter * convert, GstVideoFrame * src_frame, - GstVideoFrame * dst_frame, CUstream cuda_stream); - /* *INDENT-ON* */ - - const CUdeviceptr src; - GstVideoInfo *cur_in_info; - - CUdeviceptr dest; - GstVideoInfo *cur_out_info; - - /* rgb to {rgb, yuv} only */ - GstCudaRGBOrder in_rgb_order; - GstCudaStageBuffer unpack_surface; - GstCudaStageBuffer y444_surface[GST_VIDEO_MAX_PLANES]; -}; - -#define LOAD_CUDA_FUNC(module,func,name) G_STMT_START { \ - if (!gst_cuda_result (CuModuleGetFunction (&(func), (module), name))) { \ - GST_ERROR ("failed to get %s function", (name)); \ - goto error; \ - } \ -} G_STMT_END - -/** - * gst_cuda_converter_new: - * @in_info: a #GstVideoInfo - * @out_info: a #GstVideoInfo - * @cuda_ctx: (transfer none): a #GstCudaContext - * - * Create a new converter object to convert between @in_info and @out_info - * with @config. - * - * Returns: a #GstCudaConverter or %NULL if conversion is not possible. - */ -GstCudaConverter * -gst_cuda_converter_new (GstVideoInfo * in_info, GstVideoInfo * out_info, - GstCudaContext * cuda_ctx) -{ - GstCudaConverter *convert; - gint i; - - g_return_val_if_fail (in_info != NULL, NULL); - g_return_val_if_fail (out_info != NULL, NULL); - g_return_val_if_fail (cuda_ctx != NULL, NULL); - /* we won't ever do framerate conversion */ - g_return_val_if_fail (in_info->fps_n == out_info->fps_n, NULL); - g_return_val_if_fail (in_info->fps_d == out_info->fps_d, NULL); - /* we won't ever do deinterlace */ - g_return_val_if_fail (in_info->interlace_mode == out_info->interlace_mode, - NULL); - - convert = g_new0 (GstCudaConverter, 1); - - convert->in_info = *in_info; - convert->out_info = *out_info; - - /* FIXME: should return kernel source */ - if (!gst_cuda_context_push (cuda_ctx)) { - GST_ERROR ("cannot push context"); - goto error; - } - - if (!cuda_converter_lookup_path (convert)) - goto error; - - convert->ptx = gst_cuda_nvrtc_compile (convert->kernel_source); - if (!convert->ptx) { - GST_ERROR ("no PTX data to load"); - goto error; - } - - GST_TRACE ("compiled convert ptx \n%s", convert->ptx); - - if (!gst_cuda_result (CuModuleLoadData (&convert->cuda_module, convert->ptx))) { - gst_cuda_context_pop (NULL); - GST_ERROR ("failed to load cuda module data"); - - goto error; - } - - for (i = 0; i < CONVERTER_MAX_NUM_FUNC; i++) { - if (!convert->func_names[i]) - break; - - LOAD_CUDA_FUNC (convert->cuda_module, convert->kernel_func[i], - convert->func_names[i]); - GST_DEBUG ("kernel function \"%s\" loaded", convert->func_names[i]); - } - - gst_cuda_context_pop (NULL); - convert->cuda_ctx = gst_object_ref (cuda_ctx); - convert->texture_alignment = - gst_cuda_context_get_texture_alignment (cuda_ctx); - - g_free (convert->kernel_source); - g_free (convert->ptx); - convert->kernel_source = NULL; - convert->ptx = NULL; - - return convert; - -error: - gst_cuda_context_pop (NULL); - gst_cuda_converter_free (convert); - - return NULL; -} - -/** - * gst_video_converter_free: - * @convert: a #GstCudaConverter - * - * Free @convert - */ -void -gst_cuda_converter_free (GstCudaConverter * convert) -{ - g_return_if_fail (convert != NULL); - - if (convert->cuda_ctx) { - if (gst_cuda_context_push (convert->cuda_ctx)) { - gint i; - - if (convert->cuda_module) { - gst_cuda_result (CuModuleUnload (convert->cuda_module)); - } - - for (i = 0; i < GST_VIDEO_MAX_PLANES; i++) { - if (convert->fallback_buffer[i].device_ptr) - gst_cuda_result (CuMemFree (convert->fallback_buffer[i].device_ptr)); - if (convert->y444_surface[i].device_ptr) - gst_cuda_result (CuMemFree (convert->y444_surface[i].device_ptr)); - } - - if (convert->unpack_surface.device_ptr) - gst_cuda_result (CuMemFree (convert->unpack_surface.device_ptr)); - - gst_cuda_context_pop (NULL); - } - - gst_object_unref (convert->cuda_ctx); - } - - g_free (convert->kernel_source); - g_free (convert->ptx); - g_free (convert); -} - -gboolean -gst_cuda_converter_convert_frame (GstCudaConverter * convert, - GstVideoFrame * src_frame, GstVideoFrame * dst_frame, CUstream cuda_stream) -{ - gboolean ret; - - g_return_val_if_fail (convert, FALSE); - g_return_val_if_fail (src_frame, FALSE); - g_return_val_if_fail (dst_frame, FALSE); - - gst_cuda_context_push (convert->cuda_ctx); - - ret = convert->convert (convert, src_frame, dst_frame, cuda_stream); - - gst_cuda_context_pop (NULL); - - return ret; -} - -/* allocate fallback memory for texture alignment requirement */ -static gboolean -convert_ensure_fallback_memory (GstCudaConverter * convert, - GstVideoInfo * info, guint plane) -{ - CUresult ret; - guint element_size = 8; - - if (convert->fallback_buffer[plane].device_ptr) - return TRUE; - - if (GST_VIDEO_INFO_COMP_DEPTH (info, 0) > 8) - element_size = 16; - - ret = CuMemAllocPitch (&convert->fallback_buffer[plane].device_ptr, - &convert->fallback_buffer[plane].cuda_stride, - GST_VIDEO_INFO_COMP_WIDTH (info, plane) * - GST_VIDEO_INFO_COMP_PSTRIDE (info, plane), - GST_VIDEO_INFO_COMP_HEIGHT (info, plane), element_size); - - if (!gst_cuda_result (ret)) { - GST_ERROR ("failed to allocated fallback memory"); - return FALSE; - } - - return TRUE; -} - -/* create a 2D CUDA texture without alignment check */ -static CUtexObject -convert_create_texture_unchecked (const CUdeviceptr src, gint width, - gint height, gint channels, gint stride, CUarray_format format, - CUfilter_mode mode, CUstream cuda_stream) -{ - CUDA_TEXTURE_DESC texture_desc; - CUDA_RESOURCE_DESC resource_desc; - CUtexObject texture = 0; - CUresult cuda_ret; - - memset (&texture_desc, 0, sizeof (CUDA_TEXTURE_DESC)); - memset (&resource_desc, 0, sizeof (CUDA_RESOURCE_DESC)); - - resource_desc.resType = CU_RESOURCE_TYPE_PITCH2D; - resource_desc.res.pitch2D.format = format; - resource_desc.res.pitch2D.numChannels = channels; - resource_desc.res.pitch2D.width = width; - resource_desc.res.pitch2D.height = height; - resource_desc.res.pitch2D.pitchInBytes = stride; - resource_desc.res.pitch2D.devPtr = src; - - texture_desc.filterMode = mode; - texture_desc.flags = CU_TRSF_READ_AS_INTEGER; - - gst_cuda_result (CuStreamSynchronize (cuda_stream)); - cuda_ret = CuTexObjectCreate (&texture, &resource_desc, &texture_desc, NULL); - - if (!gst_cuda_result (cuda_ret)) { - GST_ERROR ("couldn't create texture"); - - return 0; - } - - return texture; -} - -static CUtexObject -convert_create_texture (GstCudaConverter * convert, GstVideoFrame * src_frame, - guint plane, CUstream cuda_stream) -{ - CUarray_format format = CU_AD_FORMAT_UNSIGNED_INT8; - guint channels = 1; - CUdeviceptr src_ptr; - gsize stride; - CUresult cuda_ret; - CUfilter_mode mode; - - if (GST_VIDEO_FRAME_COMP_DEPTH (src_frame, plane) > 8) - format = CU_AD_FORMAT_UNSIGNED_INT16; - - /* FIXME: more graceful method ? */ - if (plane != 0 && - GST_VIDEO_FRAME_N_PLANES (src_frame) != - GST_VIDEO_FRAME_N_COMPONENTS (src_frame)) { - channels = 2; - } - - src_ptr = (CUdeviceptr) GST_VIDEO_FRAME_PLANE_DATA (src_frame, plane); - stride = GST_VIDEO_FRAME_PLANE_STRIDE (src_frame, plane); - - if (convert->texture_alignment && (src_ptr % convert->texture_alignment)) { - CUDA_MEMCPY2D copy_params = { 0, }; - - if (!convert_ensure_fallback_memory (convert, &src_frame->info, plane)) - return 0; - - GST_LOG ("device memory was not aligned, copy to fallback memory"); - - copy_params.srcMemoryType = CU_MEMORYTYPE_DEVICE; - copy_params.srcPitch = stride; - copy_params.srcDevice = (CUdeviceptr) src_ptr; - - copy_params.dstMemoryType = CU_MEMORYTYPE_DEVICE; - copy_params.dstPitch = convert->fallback_buffer[plane].cuda_stride; - copy_params.dstDevice = convert->fallback_buffer[plane].device_ptr; - copy_params.WidthInBytes = GST_VIDEO_FRAME_COMP_WIDTH (src_frame, plane) - * GST_VIDEO_FRAME_COMP_PSTRIDE (src_frame, plane); - copy_params.Height = GST_VIDEO_FRAME_COMP_HEIGHT (src_frame, plane); - - cuda_ret = CuMemcpy2DAsync (©_params, cuda_stream); - if (!gst_cuda_result (cuda_ret)) { - GST_ERROR ("failed to copy to fallback buffer"); - return 0; - } - - src_ptr = convert->fallback_buffer[plane].device_ptr; - stride = convert->fallback_buffer[plane].cuda_stride; - } - - /* Use h/w linear interpolation only when resize is required. - * Otherwise the image might be blurred */ - if (convert->keep_size) - mode = CU_TR_FILTER_MODE_POINT; - else - mode = CU_TR_FILTER_MODE_LINEAR; - - return convert_create_texture_unchecked (src_ptr, - GST_VIDEO_FRAME_COMP_WIDTH (src_frame, plane), - GST_VIDEO_FRAME_COMP_HEIGHT (src_frame, plane), channels, stride, format, - mode, cuda_stream); -} - -/* main conversion function for YUV to YUV conversion */ -static gboolean -convert_YUV_TO_YUV (GstCudaConverter * convert, GstVideoFrame * src_frame, - GstVideoFrame * dst_frame, CUstream cuda_stream) -{ - CUtexObject texture[GST_VIDEO_MAX_PLANES] = { 0, }; - CUresult cuda_ret; - gboolean ret = FALSE; - CUdeviceptr dst_ptr[GST_VIDEO_MAX_PLANES] = { 0, }; - gint dst_stride, dst_uv_stride; - gint width, height; - gint i; - - gpointer kernel_args[] = { &texture[0], &texture[1], &texture[2], - &dst_ptr[0], &dst_ptr[1], &dst_ptr[2], &dst_stride, &dst_uv_stride - }; - - /* conversion step - * STEP 1: create CUtexObject per plane - * STEP 2: call YUV to YUV conversion kernel function. - * resize, uv reordering and bitdepth conversion will be performed in - * the CUDA kernel function - */ - - /* map CUDA device memory to CUDA texture object */ - for (i = 0; i < GST_VIDEO_FRAME_N_PLANES (src_frame); i++) { - texture[i] = convert_create_texture (convert, src_frame, i, cuda_stream); - if (!texture[i]) { - GST_ERROR ("couldn't create texture for %d th plane", i); - goto done; - } - } - - for (i = 0; i < GST_VIDEO_FRAME_N_PLANES (dst_frame); i++) { - dst_ptr[i] = (CUdeviceptr) GST_VIDEO_FRAME_PLANE_DATA (dst_frame, i); - } - - dst_stride = GST_VIDEO_FRAME_PLANE_STRIDE (dst_frame, 0); - dst_uv_stride = GST_VIDEO_FRAME_PLANE_STRIDE (dst_frame, 1); - - width = GST_VIDEO_FRAME_WIDTH (dst_frame); - height = GST_VIDEO_FRAME_HEIGHT (dst_frame); - - cuda_ret = - CuLaunchKernel (convert->kernel_func[0], DIV_UP (width, CUDA_BLOCK_X), - DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0, - cuda_stream, kernel_args, NULL); - - if (!gst_cuda_result (cuda_ret)) { - GST_ERROR ("could not rescale plane"); - goto done; - } - - ret = TRUE; - gst_cuda_result (CuStreamSynchronize (cuda_stream)); - -done: - for (i = 0; i < GST_VIDEO_FRAME_N_PLANES (src_frame); i++) { - if (texture[i]) - gst_cuda_result (CuTexObjectDestroy (texture[i])); - } - - return ret; -} - -/* main conversion function for YUV to RGB conversion */ -static gboolean -convert_YUV_TO_RGB (GstCudaConverter * convert, GstVideoFrame * src_frame, - GstVideoFrame * dst_frame, CUstream cuda_stream) -{ - CUtexObject texture[GST_VIDEO_MAX_PLANES] = { 0, }; - CUresult cuda_ret; - gboolean ret = FALSE; - CUdeviceptr dstRGB = 0; - gint dst_stride; - gint width, height; - gint i; - - gpointer kernel_args[] = { &texture[0], &texture[1], &texture[2], - &dstRGB, &dst_stride - }; - - /* conversion step - * STEP 1: create CUtexObject per plane - * STEP 2: call YUV to RGB conversion kernel function. - * resizing, argb ordering and bitdepth conversion will be performed in - * the CUDA kernel function - */ - - /* map CUDA device memory to CUDA texture object */ - for (i = 0; i < GST_VIDEO_FRAME_N_PLANES (src_frame); i++) { - texture[i] = convert_create_texture (convert, src_frame, i, cuda_stream); - if (!texture[i]) { - GST_ERROR ("couldn't create texture for %d th plane", i); - goto done; - } - } - - dstRGB = (CUdeviceptr) GST_VIDEO_FRAME_PLANE_DATA (dst_frame, 0); - dst_stride = GST_VIDEO_FRAME_PLANE_STRIDE (dst_frame, 0); - - width = GST_VIDEO_FRAME_WIDTH (dst_frame); - height = GST_VIDEO_FRAME_HEIGHT (dst_frame); - - cuda_ret = - CuLaunchKernel (convert->kernel_func[0], DIV_UP (width, CUDA_BLOCK_X), - DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0, - cuda_stream, kernel_args, NULL); - - if (!gst_cuda_result (cuda_ret)) { - GST_ERROR ("could not rescale plane"); - goto done; - } - - ret = TRUE; - gst_cuda_result (CuStreamSynchronize (cuda_stream)); - -done: - for (i = 0; i < GST_VIDEO_FRAME_N_PLANES (src_frame); i++) { - if (texture[i]) - gst_cuda_result (CuTexObjectDestroy (texture[i])); - } - - return ret; -} - -static gboolean -convert_UNPACK_RGB (GstCudaConverter * convert, CUfunction kernel_func, - CUstream cuda_stream, GstVideoFrame * src_frame, - CUdeviceptr dst, gint dst_stride, GstCudaRGBOrder * rgb_order) -{ - CUdeviceptr srcRGB = 0; - gint width, height; - gint src_stride, src_pstride; - CUresult cuda_ret; - - gpointer unpack_kernel_args[] = { &srcRGB, &dst, - &width, &height, - &src_stride, &src_pstride, &dst_stride, - &convert->in_rgb_order.R, &convert->in_rgb_order.G, - &convert->in_rgb_order.B, &convert->in_rgb_order.A, - }; - - srcRGB = (CUdeviceptr) GST_VIDEO_FRAME_PLANE_DATA (src_frame, 0); - src_stride = GST_VIDEO_FRAME_PLANE_STRIDE (src_frame, 0); - - width = GST_VIDEO_FRAME_WIDTH (src_frame); - height = GST_VIDEO_FRAME_HEIGHT (src_frame); - src_pstride = GST_VIDEO_FRAME_COMP_PSTRIDE (src_frame, 0); - - cuda_ret = - CuLaunchKernel (kernel_func, DIV_UP (width, CUDA_BLOCK_X), - DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0, - cuda_stream, unpack_kernel_args, NULL); - - if (!gst_cuda_result (cuda_ret)) { - GST_ERROR ("could not unpack rgb"); - return FALSE; - } - - return TRUE; -} - -static gboolean -convert_TO_Y444 (GstCudaConverter * convert, CUfunction kernel_func, - CUstream cuda_stream, CUtexObject srcRGB, CUdeviceptr dstY, gint y_stride, - CUdeviceptr dstU, gint u_stride, CUdeviceptr dstV, gint v_stride, - gint width, gint height, gint pstride, gint bitdepth) -{ - CUresult cuda_ret; - - gpointer kernel_args[] = { &srcRGB, &dstY, &y_stride, &dstU, &u_stride, &dstV, - &v_stride, &width, &height, &pstride, &bitdepth, - }; - - cuda_ret = - CuLaunchKernel (kernel_func, DIV_UP (width, CUDA_BLOCK_X), - DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0, - cuda_stream, kernel_args, NULL); - - if (!gst_cuda_result (cuda_ret)) { - GST_ERROR ("could not unpack rgb"); - return FALSE; - } - - return TRUE; -} - -/* main conversion function for RGB to YUV conversion */ -static gboolean -convert_RGB_TO_YUV (GstCudaConverter * convert, GstVideoFrame * src_frame, - GstVideoFrame * dst_frame, CUstream cuda_stream) -{ - CUtexObject texture = 0; - CUtexObject yuv_texture[3] = { 0, }; - CUdeviceptr dst_ptr[GST_VIDEO_MAX_PLANES] = { 0, }; - CUresult cuda_ret; - gboolean ret = FALSE; - gint in_width, in_height; - gint out_width, out_height; - gint dst_stride, dst_uv_stride; - CUarray_format format = CU_AD_FORMAT_UNSIGNED_INT8; - CUfilter_mode mode = CU_TR_FILTER_MODE_POINT; - gint pstride = 1; - gint bitdepth = 8; - gint i; - - gpointer kernel_args[] = { &yuv_texture[0], &yuv_texture[1], &yuv_texture[2], - &dst_ptr[0], &dst_ptr[1], &dst_ptr[2], &dst_stride, &dst_uv_stride - }; - - /* conversion step - * STEP 1: unpack src RGB into ARGB or ARGB64 format - * STEP 2: convert unpacked ARGB (or ARGB64) to Y444 (or Y444_16LE) - * STEP 3: convert Y444 (or Y444_16LE) to final YUV format. - * resizing, bitdepth conversion, uv reordering will be performed in - * the CUDA kernel function - */ - if (!convert_UNPACK_RGB (convert, convert->kernel_func[0], cuda_stream, - src_frame, convert->unpack_surface.device_ptr, - convert->unpack_surface.cuda_stride, &convert->in_rgb_order)) { - GST_ERROR ("could not unpack input rgb"); - - goto done; - } - - in_width = GST_VIDEO_FRAME_WIDTH (src_frame); - in_height = GST_VIDEO_FRAME_HEIGHT (src_frame); - - out_width = GST_VIDEO_FRAME_WIDTH (dst_frame); - out_height = GST_VIDEO_FRAME_HEIGHT (dst_frame); - dst_stride = GST_VIDEO_FRAME_PLANE_STRIDE (dst_frame, 0); - dst_uv_stride = GST_VIDEO_FRAME_PLANE_STRIDE (dst_frame, 1); - - if (GST_VIDEO_FRAME_COMP_DEPTH (src_frame, 0) > 8) { - pstride = 2; - bitdepth = 16; - format = CU_AD_FORMAT_UNSIGNED_INT16; - } - - texture = - convert_create_texture_unchecked (convert->unpack_surface.device_ptr, - in_width, in_height, 4, convert->unpack_surface.cuda_stride, format, - mode, cuda_stream); - - if (!texture) { - GST_ERROR ("could not create texture"); - goto done; - } - - if (!convert_TO_Y444 (convert, convert->kernel_func[1], cuda_stream, texture, - convert->y444_surface[0].device_ptr, - convert->y444_surface[0].cuda_stride, - convert->y444_surface[1].device_ptr, - convert->y444_surface[1].cuda_stride, - convert->y444_surface[2].device_ptr, - convert->y444_surface[2].cuda_stride, in_width, in_height, pstride, - bitdepth)) { - GST_ERROR ("could not convert to Y444 or Y444_16LE"); - goto done; - } - - /* Use h/w linear interpolation only when resize is required. - * Otherwise the image might be blurred */ - if (convert->keep_size) - mode = CU_TR_FILTER_MODE_POINT; - else - mode = CU_TR_FILTER_MODE_LINEAR; - - for (i = 0; i < 3; i++) { - yuv_texture[i] = - convert_create_texture_unchecked (convert->y444_surface[i].device_ptr, - in_width, in_height, 1, convert->y444_surface[i].cuda_stride, format, - mode, cuda_stream); - - if (!yuv_texture[i]) { - GST_ERROR ("could not create %dth yuv texture", i); - goto done; - } - } - - for (i = 0; i < GST_VIDEO_FRAME_N_PLANES (dst_frame); i++) - dst_ptr[i] = (CUdeviceptr) GST_VIDEO_FRAME_PLANE_DATA (dst_frame, i); - - cuda_ret = - CuLaunchKernel (convert->kernel_func[2], DIV_UP (out_width, CUDA_BLOCK_X), - DIV_UP (out_height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0, - cuda_stream, kernel_args, NULL); - - if (!gst_cuda_result (cuda_ret)) { - GST_ERROR ("could not rescale plane"); - goto done; - } - - ret = TRUE; - gst_cuda_result (CuStreamSynchronize (cuda_stream)); - -done: - if (texture) - gst_cuda_result (CuTexObjectDestroy (texture)); - for (i = 0; i < 3; i++) { - if (yuv_texture[i]) - gst_cuda_result (CuTexObjectDestroy (yuv_texture[i])); - } - - return ret; -} - -/* main conversion function for RGB to RGB conversion */ -static gboolean -convert_RGB_TO_RGB (GstCudaConverter * convert, GstVideoFrame * src_frame, - GstVideoFrame * dst_frame, CUstream cuda_stream) -{ - CUtexObject texture = 0; - CUresult cuda_ret; - gboolean ret = FALSE; - CUdeviceptr dstRGB = 0; - gint in_width, in_height; - gint out_width, out_height; - gint dst_stride; - CUfilter_mode mode; - CUarray_format format = CU_AD_FORMAT_UNSIGNED_INT8; - - gpointer rescale_kernel_args[] = { &texture, &dstRGB, &dst_stride }; - - /* conversion step - * STEP 1: unpack src RGB into ARGB or ARGB64 format - * STEP 2: convert ARGB (or ARGB64) to final RGB format. - * resizing, bitdepth conversion, argb reordering will be performed in - * the CUDA kernel function - */ - - if (!convert_UNPACK_RGB (convert, convert->kernel_func[0], cuda_stream, - src_frame, convert->unpack_surface.device_ptr, - convert->unpack_surface.cuda_stride, &convert->in_rgb_order)) { - GST_ERROR ("could not unpack input rgb"); - - goto done; - } - - in_width = GST_VIDEO_FRAME_WIDTH (src_frame); - in_height = GST_VIDEO_FRAME_HEIGHT (src_frame); - - out_width = GST_VIDEO_FRAME_WIDTH (dst_frame); - out_height = GST_VIDEO_FRAME_HEIGHT (dst_frame); - - dstRGB = (CUdeviceptr) GST_VIDEO_FRAME_PLANE_DATA (dst_frame, 0); - dst_stride = GST_VIDEO_FRAME_PLANE_STRIDE (dst_frame, 0); - - if (GST_VIDEO_FRAME_COMP_DEPTH (src_frame, 0) > 8) - format = CU_AD_FORMAT_UNSIGNED_INT16; - - /* Use h/w linear interpolation only when resize is required. - * Otherwise the image might be blurred */ - if (convert->keep_size) - mode = CU_TR_FILTER_MODE_POINT; - else - mode = CU_TR_FILTER_MODE_LINEAR; - - texture = - convert_create_texture_unchecked (convert->unpack_surface.device_ptr, - in_width, in_height, 4, convert->unpack_surface.cuda_stride, format, - mode, cuda_stream); - - if (!texture) { - GST_ERROR ("could not create texture"); - goto done; - } - - cuda_ret = - CuLaunchKernel (convert->kernel_func[1], DIV_UP (out_width, CUDA_BLOCK_X), - DIV_UP (out_height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0, - cuda_stream, rescale_kernel_args, NULL); - - if (!gst_cuda_result (cuda_ret)) { - GST_ERROR ("could not rescale plane"); - goto done; - } - - ret = TRUE; - gst_cuda_result (CuStreamSynchronize (cuda_stream)); - -done: - if (texture) - gst_cuda_result (CuTexObjectDestroy (texture)); - - return ret; -} - -/* from video-converter.c */ -typedef struct -{ - gdouble dm[4][4]; -} MatrixData; - -static void -color_matrix_set_identity (MatrixData * m) -{ - gint i, j; - - for (i = 0; i < 4; i++) { - for (j = 0; j < 4; j++) { - m->dm[i][j] = (i == j); - } - } -} - -static void -color_matrix_copy (MatrixData * d, const MatrixData * s) -{ - gint i, j; - - for (i = 0; i < 4; i++) - for (j = 0; j < 4; j++) - d->dm[i][j] = s->dm[i][j]; -} - -/* Perform 4x4 matrix multiplication: - * - @dst@ = @a@ * @b@ - * - @dst@ may be a pointer to @a@ andor @b@ - */ -static void -color_matrix_multiply (MatrixData * dst, MatrixData * a, MatrixData * b) -{ - MatrixData tmp; - gint i, j, k; - - for (i = 0; i < 4; i++) { - for (j = 0; j < 4; j++) { - gdouble x = 0; - for (k = 0; k < 4; k++) { - x += a->dm[i][k] * b->dm[k][j]; - } - tmp.dm[i][j] = x; - } - } - color_matrix_copy (dst, &tmp); -} - -static void -color_matrix_offset_components (MatrixData * m, gdouble a1, gdouble a2, - gdouble a3) -{ - MatrixData a; - - color_matrix_set_identity (&a); - a.dm[0][3] = a1; - a.dm[1][3] = a2; - a.dm[2][3] = a3; - color_matrix_multiply (m, &a, m); -} - -static void -color_matrix_scale_components (MatrixData * m, gdouble a1, gdouble a2, - gdouble a3) -{ - MatrixData a; - - color_matrix_set_identity (&a); - a.dm[0][0] = a1; - a.dm[1][1] = a2; - a.dm[2][2] = a3; - color_matrix_multiply (m, &a, m); -} - -static void -color_matrix_debug (const MatrixData * s) -{ - GST_DEBUG ("[%f %f %f %f]", s->dm[0][0], s->dm[0][1], s->dm[0][2], - s->dm[0][3]); - GST_DEBUG ("[%f %f %f %f]", s->dm[1][0], s->dm[1][1], s->dm[1][2], - s->dm[1][3]); - GST_DEBUG ("[%f %f %f %f]", s->dm[2][0], s->dm[2][1], s->dm[2][2], - s->dm[2][3]); - GST_DEBUG ("[%f %f %f %f]", s->dm[3][0], s->dm[3][1], s->dm[3][2], - s->dm[3][3]); -} - -static void -color_matrix_YCbCr_to_RGB (MatrixData * m, gdouble Kr, gdouble Kb) -{ - gdouble Kg = 1.0 - Kr - Kb; - MatrixData k = { - { - {1., 0., 2 * (1 - Kr), 0.}, - {1., -2 * Kb * (1 - Kb) / Kg, -2 * Kr * (1 - Kr) / Kg, 0.}, - {1., 2 * (1 - Kb), 0., 0.}, - {0., 0., 0., 1.}, - } - }; - - color_matrix_multiply (m, &k, m); -} - -static void -color_matrix_RGB_to_YCbCr (MatrixData * m, gdouble Kr, gdouble Kb) -{ - gdouble Kg = 1.0 - Kr - Kb; - MatrixData k; - gdouble x; - - k.dm[0][0] = Kr; - k.dm[0][1] = Kg; - k.dm[0][2] = Kb; - k.dm[0][3] = 0; - - x = 1 / (2 * (1 - Kb)); - k.dm[1][0] = -x * Kr; - k.dm[1][1] = -x * Kg; - k.dm[1][2] = x * (1 - Kb); - k.dm[1][3] = 0; - - x = 1 / (2 * (1 - Kr)); - k.dm[2][0] = x * (1 - Kr); - k.dm[2][1] = -x * Kg; - k.dm[2][2] = -x * Kb; - k.dm[2][3] = 0; - - k.dm[3][0] = 0; - k.dm[3][1] = 0; - k.dm[3][2] = 0; - k.dm[3][3] = 1; - - color_matrix_multiply (m, &k, m); -} - -static void -compute_matrix_to_RGB (GstCudaConverter * convert, MatrixData * data, - GstVideoInfo * info) -{ - gdouble Kr = 0, Kb = 0; - gint offset[4], scale[4]; - - /* bring color components to [0..1.0] range */ - gst_video_color_range_offsets (info->colorimetry.range, info->finfo, offset, - scale); - - color_matrix_offset_components (data, -offset[0], -offset[1], -offset[2]); - color_matrix_scale_components (data, 1 / ((float) scale[0]), - 1 / ((float) scale[1]), 1 / ((float) scale[2])); - - if (!GST_VIDEO_INFO_IS_RGB (info)) { - /* bring components to R'G'B' space */ - if (gst_video_color_matrix_get_Kr_Kb (info->colorimetry.matrix, &Kr, &Kb)) - color_matrix_YCbCr_to_RGB (data, Kr, Kb); - } - color_matrix_debug (data); -} - -static void -compute_matrix_to_YUV (GstCudaConverter * convert, MatrixData * data, - GstVideoInfo * info) -{ - gdouble Kr = 0, Kb = 0; - gint offset[4], scale[4]; - - if (!GST_VIDEO_INFO_IS_RGB (info)) { - /* bring components to YCbCr space */ - if (gst_video_color_matrix_get_Kr_Kb (info->colorimetry.matrix, &Kr, &Kb)) - color_matrix_RGB_to_YCbCr (data, Kr, Kb); - } - - /* bring color components to nominal range */ - gst_video_color_range_offsets (info->colorimetry.range, info->finfo, offset, - scale); - - color_matrix_scale_components (data, (float) scale[0], (float) scale[1], - (float) scale[2]); - color_matrix_offset_components (data, offset[0], offset[1], offset[2]); - - color_matrix_debug (data); -} - -static gboolean -cuda_converter_get_matrix (GstCudaConverter * convert, MatrixData * matrix, - GstVideoInfo * in_info, GstVideoInfo * out_info) -{ - gboolean same_matrix, same_bits; - guint in_bits, out_bits; - - in_bits = GST_VIDEO_INFO_COMP_DEPTH (in_info, 0); - out_bits = GST_VIDEO_INFO_COMP_DEPTH (out_info, 0); - - same_bits = in_bits == out_bits; - same_matrix = in_info->colorimetry.matrix == out_info->colorimetry.matrix; - - GST_DEBUG ("matrix %d -> %d (%d)", in_info->colorimetry.matrix, - out_info->colorimetry.matrix, same_matrix); - GST_DEBUG ("bits %d -> %d (%d)", in_bits, out_bits, same_bits); - - color_matrix_set_identity (matrix); - - if (same_bits && same_matrix) { - GST_DEBUG ("conversion matrix is not required"); - - return FALSE; - } - - if (in_bits < out_bits) { - gint scale = 1 << (out_bits - in_bits); - color_matrix_scale_components (matrix, - 1 / (float) scale, 1 / (float) scale, 1 / (float) scale); - } - - GST_DEBUG ("to RGB matrix"); - compute_matrix_to_RGB (convert, matrix, in_info); - GST_DEBUG ("current matrix"); - color_matrix_debug (matrix); - - GST_DEBUG ("to YUV matrix"); - compute_matrix_to_YUV (convert, matrix, out_info); - GST_DEBUG ("current matrix"); - color_matrix_debug (matrix); - - if (in_bits > out_bits) { - gint scale = 1 << (in_bits - out_bits); - color_matrix_scale_components (matrix, - (float) scale, (float) scale, (float) scale); - } - - GST_DEBUG ("final matrix"); - color_matrix_debug (matrix); - - return TRUE; -} - -static gboolean -is_uv_swapped (GstVideoFormat format) -{ - static GstVideoFormat swapped_formats[] = { - GST_VIDEO_FORMAT_YV12, - GST_VIDEO_FORMAT_NV21, - }; - gint i; - - for (i = 0; i < G_N_ELEMENTS (swapped_formats); i++) { - if (format == swapped_formats[i]) - return TRUE; - } - - return FALSE; -} - -typedef struct -{ - const gchar *read_chroma; - const gchar *write_chroma; - const gchar *unpack_function; - gfloat scale_h, scale_v; - gfloat chroma_scale_h, chroma_scale_v; - gint width, height; - gint chroma_width, chroma_height; - gint in_depth; - gint out_depth; - gint pstride, chroma_pstride; - gint in_shift, out_shift; - gint mask; - gint swap_uv; - /* RGBA specific variables */ - gint max_in_val; - GstCudaRGBOrder rgb_order; -} GstCudaKernelTempl; - -static gchar * -cuda_converter_generate_yuv_to_yuv_kernel_code (GstCudaConverter * convert, - GstCudaKernelTempl * templ) -{ - gchar scale_h_str[G_ASCII_DTOSTR_BUF_SIZE]; - gchar scale_v_str[G_ASCII_DTOSTR_BUF_SIZE]; - gchar chroma_scale_h_str[G_ASCII_DTOSTR_BUF_SIZE]; - gchar chroma_scale_v_str[G_ASCII_DTOSTR_BUF_SIZE]; - g_ascii_formatd (scale_h_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", templ->scale_h); - g_ascii_formatd (scale_v_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", templ->scale_v); - g_ascii_formatd (chroma_scale_h_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", - templ->chroma_scale_h); - g_ascii_formatd (chroma_scale_v_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", - templ->chroma_scale_v); - return g_strdup_printf (templ_YUV_TO_YUV, scale_h_str, scale_v_str, - chroma_scale_h_str, chroma_scale_v_str, templ->width, templ->height, - templ->chroma_width, templ->chroma_height, templ->in_depth, - templ->out_depth, templ->pstride, templ->chroma_pstride, templ->in_shift, - templ->out_shift, templ->mask, templ->swap_uv, templ->read_chroma, - templ->write_chroma); -} - -static gchar * -cuda_converter_generate_yuv_to_rgb_kernel_code (GstCudaConverter * convert, - GstCudaKernelTempl * templ, MatrixData * matrix) -{ - gchar matrix_dm[4][4][G_ASCII_DTOSTR_BUF_SIZE]; - gchar scale_h_str[G_ASCII_DTOSTR_BUF_SIZE]; - gchar scale_v_str[G_ASCII_DTOSTR_BUF_SIZE]; - gchar chroma_scale_h_str[G_ASCII_DTOSTR_BUF_SIZE]; - gchar chroma_scale_v_str[G_ASCII_DTOSTR_BUF_SIZE]; - gint i, j; - for (i = 0; i < 4; i++) { - for (j = 0; j < 4; j++) { - g_ascii_formatd (matrix_dm[i][j], G_ASCII_DTOSTR_BUF_SIZE, "%f", - matrix->dm[i][j]); - } - } - g_ascii_formatd (scale_h_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", templ->scale_h); - g_ascii_formatd (scale_v_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", templ->scale_v); - g_ascii_formatd (chroma_scale_h_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", - templ->chroma_scale_h); - g_ascii_formatd (chroma_scale_v_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", - templ->chroma_scale_v); - return g_strdup_printf (templ_YUV_TO_RGB, matrix_dm[0][3], matrix_dm[1][3], - matrix_dm[2][3], matrix_dm[0][0], matrix_dm[0][1], matrix_dm[0][2], - matrix_dm[1][0], matrix_dm[1][1], matrix_dm[1][2], matrix_dm[2][0], - matrix_dm[2][1], matrix_dm[2][2], scale_h_str, scale_v_str, - chroma_scale_h_str, chroma_scale_v_str, templ->width, templ->height, - templ->chroma_width, templ->chroma_height, templ->in_depth, - templ->out_depth, templ->pstride, templ->chroma_pstride, templ->in_shift, - templ->out_shift, templ->mask, templ->swap_uv, templ->max_in_val, - templ->rgb_order.R, templ->rgb_order.G, templ->rgb_order.B, - templ->rgb_order.A, templ->rgb_order.X, templ->read_chroma); -} - -static gchar * -cuda_converter_generate_rgb_to_yuv_kernel_code (GstCudaConverter * convert, - GstCudaKernelTempl * templ, MatrixData * matrix) -{ - gchar matrix_dm[4][4][G_ASCII_DTOSTR_BUF_SIZE]; - gchar scale_h_str[G_ASCII_DTOSTR_BUF_SIZE]; - gchar scale_v_str[G_ASCII_DTOSTR_BUF_SIZE]; - gchar chroma_scale_h_str[G_ASCII_DTOSTR_BUF_SIZE]; - gchar chroma_scale_v_str[G_ASCII_DTOSTR_BUF_SIZE]; - gint i, j; - for (i = 0; i < 4; i++) { - for (j = 0; j < 4; j++) { - g_ascii_formatd (matrix_dm[i][j], G_ASCII_DTOSTR_BUF_SIZE, "%f", - matrix->dm[i][j]); - } - } - g_ascii_formatd (scale_h_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", templ->scale_h); - g_ascii_formatd (scale_v_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", templ->scale_v); - g_ascii_formatd (chroma_scale_h_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", - templ->chroma_scale_h); - g_ascii_formatd (chroma_scale_v_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", - templ->chroma_scale_v); - return g_strdup_printf (templ_RGB_TO_YUV, matrix_dm[0][3], matrix_dm[1][3], - matrix_dm[2][3], matrix_dm[0][0], matrix_dm[0][1], matrix_dm[0][2], - matrix_dm[1][0], matrix_dm[1][1], matrix_dm[1][2], matrix_dm[2][0], - matrix_dm[2][1], matrix_dm[2][2], scale_h_str, scale_v_str, - chroma_scale_h_str, chroma_scale_v_str, templ->width, templ->height, - templ->chroma_width, templ->chroma_height, templ->in_depth, - templ->out_depth, templ->pstride, templ->chroma_pstride, templ->in_shift, - templ->out_shift, templ->mask, templ->swap_uv, templ->unpack_function, - templ->read_chroma, templ->write_chroma); -} - -static gchar * -cuda_converter_generate_rgb_to_rgb_kernel_code (GstCudaConverter * convert, - GstCudaKernelTempl * templ) -{ - gchar scale_h_str[G_ASCII_DTOSTR_BUF_SIZE]; - gchar scale_v_str[G_ASCII_DTOSTR_BUF_SIZE]; - g_ascii_formatd (scale_h_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", templ->scale_h); - g_ascii_formatd (scale_v_str, G_ASCII_DTOSTR_BUF_SIZE, "%f", templ->scale_v); - return g_strdup_printf (templ_RGB_to_RGB, - scale_h_str, scale_v_str, - templ->width, templ->height, - templ->in_depth, templ->out_depth, templ->pstride, - templ->rgb_order.R, templ->rgb_order.G, - templ->rgb_order.B, templ->rgb_order.A, templ->rgb_order.X, - templ->unpack_function); -} - -#define SET_ORDER(o,r,g,b,a,x) G_STMT_START { \ - (o)->R = (r); \ - (o)->G = (g); \ - (o)->B = (b); \ - (o)->A = (a); \ - (o)->X = (x); \ -} G_STMT_END - -static void -cuda_converter_get_rgb_order (GstVideoFormat format, GstCudaRGBOrder * order) -{ - switch (format) { - case GST_VIDEO_FORMAT_RGBA: - SET_ORDER (order, 0, 1, 2, 3, -1); - break; - case GST_VIDEO_FORMAT_RGBx: - SET_ORDER (order, 0, 1, 2, -1, 3); - break; - case GST_VIDEO_FORMAT_BGRA: - SET_ORDER (order, 2, 1, 0, 3, -1); - break; - case GST_VIDEO_FORMAT_BGRx: - SET_ORDER (order, 2, 1, 0, -1, 3); - break; - case GST_VIDEO_FORMAT_ARGB: - SET_ORDER (order, 1, 2, 3, 0, -1); - break; - case GST_VIDEO_FORMAT_ABGR: - SET_ORDER (order, 3, 2, 1, 0, -1); - break; - case GST_VIDEO_FORMAT_RGB: - SET_ORDER (order, 0, 1, 2, -1, -1); - break; - case GST_VIDEO_FORMAT_BGR: - SET_ORDER (order, 2, 1, 0, -1, -1); - break; - case GST_VIDEO_FORMAT_BGR10A2_LE: - SET_ORDER (order, 1, 2, 3, 0, -1); - break; - case GST_VIDEO_FORMAT_RGB10A2_LE: - SET_ORDER (order, 3, 2, 1, 0, -1); - break; - default: - g_assert_not_reached (); - break; - } -} - -static gboolean -cuda_converter_lookup_path (GstCudaConverter * convert) -{ - GstVideoFormat in_format, out_format; - gboolean src_yuv, dst_yuv; - gboolean src_planar, dst_planar; - GstCudaKernelTempl templ = { 0, }; - GstVideoInfo *in_info, *out_info; - gboolean ret = FALSE; - CUresult cuda_ret; - - in_info = &convert->in_info; - out_info = &convert->out_info; - - in_format = GST_VIDEO_INFO_FORMAT (in_info); - out_format = GST_VIDEO_INFO_FORMAT (out_info); - - src_yuv = GST_VIDEO_INFO_IS_YUV (in_info); - dst_yuv = GST_VIDEO_INFO_IS_YUV (out_info); - - src_planar = GST_VIDEO_INFO_N_PLANES (in_info) == - GST_VIDEO_INFO_N_COMPONENTS (in_info); - dst_planar = GST_VIDEO_INFO_N_PLANES (out_info) == - GST_VIDEO_INFO_N_COMPONENTS (out_info); - - convert->keep_size = (GST_VIDEO_INFO_WIDTH (&convert->in_info) == - GST_VIDEO_INFO_WIDTH (&convert->out_info) && - GST_VIDEO_INFO_HEIGHT (&convert->in_info) == - GST_VIDEO_INFO_HEIGHT (&convert->out_info)); - - templ.scale_h = (gfloat) GST_VIDEO_INFO_COMP_WIDTH (in_info, 0) / - (gfloat) GST_VIDEO_INFO_COMP_WIDTH (out_info, 0); - templ.scale_v = (gfloat) GST_VIDEO_INFO_COMP_HEIGHT (in_info, 0) / - (gfloat) GST_VIDEO_INFO_COMP_HEIGHT (out_info, 0); - templ.chroma_scale_h = (gfloat) GST_VIDEO_INFO_COMP_WIDTH (in_info, 1) / - (gfloat) GST_VIDEO_INFO_COMP_WIDTH (out_info, 1); - templ.chroma_scale_v = (gfloat) GST_VIDEO_INFO_COMP_HEIGHT (in_info, 1) / - (gfloat) GST_VIDEO_INFO_COMP_HEIGHT (out_info, 1); - templ.width = GST_VIDEO_INFO_COMP_WIDTH (out_info, 0); - templ.height = GST_VIDEO_INFO_COMP_HEIGHT (out_info, 0); - templ.chroma_width = GST_VIDEO_INFO_COMP_WIDTH (out_info, 1); - templ.chroma_height = GST_VIDEO_INFO_COMP_HEIGHT (out_info, 1); - - templ.in_depth = GST_VIDEO_INFO_COMP_DEPTH (in_info, 0); - templ.out_depth = GST_VIDEO_INFO_COMP_DEPTH (out_info, 0); - templ.pstride = GST_VIDEO_INFO_COMP_PSTRIDE (out_info, 0); - templ.chroma_pstride = GST_VIDEO_INFO_COMP_PSTRIDE (out_info, 1); - templ.in_shift = in_info->finfo->shift[0]; - templ.out_shift = out_info->finfo->shift[0]; - templ.mask = ((1 << templ.out_depth) - 1) << templ.out_shift; - templ.swap_uv = (is_uv_swapped (in_format) != is_uv_swapped (out_format)); - - if (src_yuv && dst_yuv) { - convert->convert = convert_YUV_TO_YUV; - - if (src_planar && dst_planar) { - templ.read_chroma = READ_CHROMA_FROM_PLANAR; - templ.write_chroma = WRITE_CHROMA_TO_PLANAR; - } else if (!src_planar && dst_planar) { - templ.read_chroma = READ_CHROMA_FROM_SEMI_PLANAR; - templ.write_chroma = WRITE_CHROMA_TO_PLANAR; - } else if (src_planar && !dst_planar) { - templ.read_chroma = READ_CHROMA_FROM_PLANAR; - templ.write_chroma = WRITE_CHROMA_TO_SEMI_PLANAR; - } else { - templ.read_chroma = READ_CHROMA_FROM_SEMI_PLANAR; - templ.write_chroma = WRITE_CHROMA_TO_SEMI_PLANAR; - } - - convert->kernel_source = - cuda_converter_generate_yuv_to_yuv_kernel_code (convert, &templ); - convert->func_names[0] = GST_CUDA_KERNEL_FUNC; - - ret = TRUE; - } else if (src_yuv && !dst_yuv) { - MatrixData matrix; - - if (src_planar) { - templ.read_chroma = READ_CHROMA_FROM_PLANAR; - } else { - templ.read_chroma = READ_CHROMA_FROM_SEMI_PLANAR; - } - - templ.max_in_val = (1 << templ.in_depth) - 1; - cuda_converter_get_rgb_order (out_format, &templ.rgb_order); - - cuda_converter_get_matrix (convert, &matrix, in_info, out_info); - convert->kernel_source = - cuda_converter_generate_yuv_to_rgb_kernel_code (convert, - &templ, &matrix); - convert->func_names[0] = GST_CUDA_KERNEL_FUNC; - - convert->convert = convert_YUV_TO_RGB; - - ret = TRUE; - } else if (!src_yuv && dst_yuv) { - MatrixData matrix; - gsize element_size = 8; - GstVideoFormat unpack_format; - GstVideoFormat y444_format; - GstVideoInfo unpack_info; - GstVideoInfo y444_info; - gint i; - - if (dst_planar) { - templ.write_chroma = WRITE_CHROMA_TO_PLANAR; - } else { - templ.write_chroma = WRITE_CHROMA_TO_SEMI_PLANAR; - } - templ.read_chroma = READ_CHROMA_FROM_PLANAR; - - cuda_converter_get_rgb_order (in_format, &convert->in_rgb_order); - - if (templ.in_depth > 8) { - /* FIXME: RGB10A2_LE and BGR10A2_LE only */ - element_size = 16; - unpack_format = GST_VIDEO_FORMAT_ARGB64; - y444_format = GST_VIDEO_FORMAT_Y444_16LE; - templ.unpack_function = unpack_to_ARGB64; - } else { - unpack_format = GST_VIDEO_FORMAT_ARGB; - y444_format = GST_VIDEO_FORMAT_Y444; - templ.unpack_function = unpack_to_ARGB; - } - - gst_video_info_set_format (&unpack_info, - unpack_format, GST_VIDEO_INFO_WIDTH (in_info), - GST_VIDEO_INFO_HEIGHT (in_info)); - gst_video_info_set_format (&y444_info, - y444_format, GST_VIDEO_INFO_WIDTH (in_info), - GST_VIDEO_INFO_HEIGHT (in_info)); - - templ.in_depth = GST_VIDEO_INFO_COMP_DEPTH (&unpack_info, 0); - - cuda_ret = CuMemAllocPitch (&convert->unpack_surface.device_ptr, - &convert->unpack_surface.cuda_stride, - GST_VIDEO_INFO_COMP_WIDTH (&unpack_info, 0) * - GST_VIDEO_INFO_COMP_PSTRIDE (&unpack_info, 0), - GST_VIDEO_INFO_HEIGHT (&unpack_info), element_size); - - if (!gst_cuda_result (cuda_ret)) { - GST_ERROR ("couldn't alloc unpack surface"); - return FALSE; - } - - for (i = 0; i < 3; i++) { - cuda_ret = CuMemAllocPitch (&convert->y444_surface[i].device_ptr, - &convert->y444_surface[i].cuda_stride, - GST_VIDEO_INFO_COMP_WIDTH (&y444_info, i) * - GST_VIDEO_INFO_COMP_PSTRIDE (&y444_info, i), - GST_VIDEO_INFO_COMP_HEIGHT (&y444_info, i), element_size); - - if (!gst_cuda_result (cuda_ret)) { - GST_ERROR ("couldn't alloc %dth y444 surface", i); - return FALSE; - } - } - - cuda_converter_get_matrix (convert, &matrix, &unpack_info, &y444_info); - - convert->kernel_source = - cuda_converter_generate_rgb_to_yuv_kernel_code (convert, - &templ, &matrix); - - convert->func_names[0] = GST_CUDA_KERNEL_FUNC_TO_ARGB; - convert->func_names[1] = GST_CUDA_KERNEL_FUNC_TO_Y444; - convert->func_names[2] = GST_CUDA_KERNEL_FUNC_Y444_TO_YUV; - - convert->convert = convert_RGB_TO_YUV; - - ret = TRUE; - } else { - gsize element_size = 8; - GstVideoFormat unpack_format; - GstVideoInfo unpack_info; - - cuda_converter_get_rgb_order (in_format, &convert->in_rgb_order); - cuda_converter_get_rgb_order (out_format, &templ.rgb_order); - - if (templ.in_depth > 8) { - /* FIXME: RGB10A2_LE and BGR10A2_LE only */ - element_size = 16; - unpack_format = GST_VIDEO_FORMAT_ARGB64; - templ.unpack_function = unpack_to_ARGB64; - } else { - unpack_format = GST_VIDEO_FORMAT_ARGB; - templ.unpack_function = unpack_to_ARGB; - } - - gst_video_info_set_format (&unpack_info, - unpack_format, GST_VIDEO_INFO_WIDTH (in_info), - GST_VIDEO_INFO_HEIGHT (in_info)); - - templ.in_depth = GST_VIDEO_INFO_COMP_DEPTH (&unpack_info, 0); - - cuda_ret = CuMemAllocPitch (&convert->unpack_surface.device_ptr, - &convert->unpack_surface.cuda_stride, - GST_VIDEO_INFO_COMP_WIDTH (&unpack_info, 0) * - GST_VIDEO_INFO_COMP_PSTRIDE (&unpack_info, 0), - GST_VIDEO_INFO_HEIGHT (&unpack_info), element_size); - - if (!gst_cuda_result (cuda_ret)) { - GST_ERROR ("couldn't alloc unpack surface"); - return FALSE; - } - - convert->kernel_source = - cuda_converter_generate_rgb_to_rgb_kernel_code (convert, &templ); - - convert->func_names[0] = GST_CUDA_KERNEL_FUNC_TO_ARGB; - convert->func_names[1] = GST_CUDA_KERNEL_FUNC_SCALE_RGB; - - convert->convert = convert_RGB_TO_RGB; - - ret = TRUE; - } - - if (!ret) { - GST_DEBUG ("no path found"); - - return FALSE; - } - - GST_TRACE ("configured CUDA kernel source\n%s", convert->kernel_source); - - return TRUE; -} diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/cuda-converter.h b/subprojects/gst-plugins-bad/sys/nvcodec/cuda-converter.h deleted file mode 100644 index 82c5f169eb..0000000000 --- a/subprojects/gst-plugins-bad/sys/nvcodec/cuda-converter.h +++ /dev/null @@ -1,44 +0,0 @@ -/* GStreamer - * Copyright (C) 2019 Seungha Yang - * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Library General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Library General Public License for more details. - * - * You should have received a copy of the GNU Library General Public - * License along with this library; if not, write to the - * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, - * Boston, MA 02110-1301, USA. - */ - -#ifndef __GST_CUDA_CONVERTER_H__ -#define __GST_CUDA_CONVERTER_H__ - -#include -#include -#include - -G_BEGIN_DECLS - -typedef struct _GstCudaConverter GstCudaConverter; - -GstCudaConverter * gst_cuda_converter_new (GstVideoInfo * in_info, - GstVideoInfo * out_info, - GstCudaContext * cuda_ctx); - -void gst_cuda_converter_free (GstCudaConverter * convert); - -gboolean gst_cuda_converter_convert_frame (GstCudaConverter * convert, - GstVideoFrame * src_frame, - GstVideoFrame * dst_frame, - CUstream cuda_stream); - -G_END_DECLS - -#endif /* __GST_CUDA_CONVERTER_H__ */ diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/gstcudaconverter.c b/subprojects/gst-plugins-bad/sys/nvcodec/gstcudaconverter.c new file mode 100644 index 0000000000..a7ea73d141 --- /dev/null +++ b/subprojects/gst-plugins-bad/sys/nvcodec/gstcudaconverter.c @@ -0,0 +1,2243 @@ +/* GStreamer + * Copyright (C) 2022 Seungha Yang + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "gstcudaconverter.h" +#include +#include +#include +#include + +GST_DEBUG_CATEGORY_STATIC (gst_cuda_converter_debug); +#define GST_CAT_DEFAULT gst_cuda_converter_debug + +#define CUDA_BLOCK_X 16 +#define CUDA_BLOCK_Y 16 +#define DIV_UP(size,block) (((size) + ((block) - 1)) / (block)) + +/* from GstD3D11 */ +typedef struct _GstCudaColorMatrix +{ + gdouble matrix[3][3]; + gdouble offset[3]; + gdouble min[3]; + gdouble max[3]; +} GstCudaColorMatrix; + +static gchar * +gst_cuda_dump_color_matrix (GstCudaColorMatrix * matrix) +{ + /* *INDENT-OFF* */ + static const gchar format[] = + "[MATRIX]\n" + "|% .6f, % .6f, % .6f|\n" + "|% .6f, % .6f, % .6f|\n" + "|% .6f, % .6f, % .6f|\n" + "[OFFSET]\n" + "|% .6f, % .6f, % .6f|\n" + "[MIN]\n" + "|% .6f, % .6f, % .6f|\n" + "[MAX]\n" + "|% .6f, % .6f, % .6f|"; + /* *INDENT-ON* */ + + return g_strdup_printf (format, + matrix->matrix[0][0], matrix->matrix[0][1], matrix->matrix[0][2], + matrix->matrix[1][0], matrix->matrix[1][1], matrix->matrix[1][2], + matrix->matrix[2][0], matrix->matrix[2][1], matrix->matrix[2][2], + matrix->offset[0], matrix->offset[1], matrix->offset[2], + matrix->min[0], matrix->min[1], matrix->min[2], + matrix->max[0], matrix->max[1], matrix->max[2]); +} + +static void +color_matrix_copy (GstCudaColorMatrix * dst, const GstCudaColorMatrix * src) +{ + for (guint i = 0; i < 3; i++) { + for (guint j = 0; j < 3; j++) { + dst->matrix[i][j] = src->matrix[i][j]; + } + } +} + +static void +color_matrix_multiply (GstCudaColorMatrix * dst, GstCudaColorMatrix * a, + GstCudaColorMatrix * b) +{ + GstCudaColorMatrix tmp; + + for (guint i = 0; i < 3; i++) { + for (guint j = 0; j < 3; j++) { + gdouble val = 0; + for (guint k = 0; k < 3; k++) { + val += a->matrix[i][k] * b->matrix[k][j]; + } + + tmp.matrix[i][j] = val; + } + } + + color_matrix_copy (dst, &tmp); +} + +static void +color_matrix_identity (GstCudaColorMatrix * m) +{ + for (guint i = 0; i < 3; i++) { + for (guint j = 0; j < 3; j++) { + if (i == j) + m->matrix[i][j] = 1.0; + else + m->matrix[i][j] = 0; + } + } +} + +/** + * gst_cuda_color_range_adjust_matrix_unorm: + * @in_info: a #GstVideoInfo + * @out_info: a #GstVideoInfo + * @matrix: a #GstCudaColorMatrix + * + * Calculates matrix for color range adjustment. Both input and output + * signals are in normalized [0.0..1.0] space. + * + * Resulting values can be calculated by + * | Yout | | Yin | | matrix.offset[0] | + * | Uout | = clamp ( matrix.matrix * | Uin | + | matrix.offset[1] |, matrix.min, matrix.max ) + * | Vout | | Vin | | matrix.offset[2] | + * + * Returns: %TRUE if successful + */ +static gboolean +gst_cuda_color_range_adjust_matrix_unorm (const GstVideoInfo * in_info, + const GstVideoInfo * out_info, GstCudaColorMatrix * matrix) +{ + gboolean in_rgb, out_rgb; + gint in_offset[GST_VIDEO_MAX_COMPONENTS]; + gint in_scale[GST_VIDEO_MAX_COMPONENTS]; + gint out_offset[GST_VIDEO_MAX_COMPONENTS]; + gint out_scale[GST_VIDEO_MAX_COMPONENTS]; + GstVideoColorRange in_range; + GstVideoColorRange out_range; + gdouble src_fullscale, dst_fullscale; + + memset (matrix, 0, sizeof (GstCudaColorMatrix)); + for (guint i = 0; i < 3; i++) { + matrix->matrix[i][i] = 1.0; + matrix->matrix[i][i] = 1.0; + matrix->matrix[i][i] = 1.0; + matrix->max[i] = 1.0; + } + + in_rgb = GST_VIDEO_INFO_IS_RGB (in_info); + out_rgb = GST_VIDEO_INFO_IS_RGB (out_info); + + if (in_rgb != out_rgb) { + GST_WARNING ("Invalid format conversion"); + return FALSE; + } + + in_range = in_info->colorimetry.range; + out_range = out_info->colorimetry.range; + + if (in_range == GST_VIDEO_COLOR_RANGE_UNKNOWN) { + GST_WARNING ("Unknown input color range"); + if (in_rgb || GST_VIDEO_INFO_IS_GRAY (in_info)) + in_range = GST_VIDEO_COLOR_RANGE_0_255; + else + in_range = GST_VIDEO_COLOR_RANGE_16_235; + } + + if (out_range == GST_VIDEO_COLOR_RANGE_UNKNOWN) { + GST_WARNING ("Unknown output color range"); + if (out_rgb || GST_VIDEO_INFO_IS_GRAY (out_info)) + out_range = GST_VIDEO_COLOR_RANGE_0_255; + else + out_range = GST_VIDEO_COLOR_RANGE_16_235; + } + + src_fullscale = (gdouble) ((1 << in_info->finfo->depth[0]) - 1); + dst_fullscale = (gdouble) ((1 << out_info->finfo->depth[0]) - 1); + + gst_video_color_range_offsets (in_range, in_info->finfo, in_offset, in_scale); + gst_video_color_range_offsets (out_range, + out_info->finfo, out_offset, out_scale); + + matrix->min[0] = matrix->min[1] = matrix->min[2] = + (gdouble) out_offset[0] / dst_fullscale; + + matrix->max[0] = (out_scale[0] + out_offset[0]) / dst_fullscale; + matrix->max[1] = matrix->max[2] = + (out_scale[1] + out_offset[0]) / dst_fullscale; + + if (in_info->colorimetry.range == out_info->colorimetry.range) { + GST_DEBUG ("Same color range"); + return TRUE; + } + + /* Formula + * + * 1) Scales and offset compensates input to [0..1] range + * SRC_NORM[i] = (src[i] * src_fullscale - in_offset[i]) / in_scale[i] + * = (src[i] * src_fullscale / in_scale[i]) - in_offset[i] / in_scale[i] + * + * 2) Reverse to output UNIT scale + * DST_UINT[i] = SRC_NORM[i] * out_scale[i] + out_offset[i] + * = src[i] * src_fullscale * out_scale[i] / in_scale[i] + * - in_offset[i] * out_scale[i] / in_scale[i] + * + out_offset[i] + * + * 3) Back to [0..1] scale + * dst[i] = DST_UINT[i] / dst_fullscale + * = COEFF[i] * src[i] + OFF[i] + * where + * src_fullscale * out_scale[i] + * COEFF[i] = ------------------------------ + * dst_fullscale * in_scale[i] + * + * out_offset[i] in_offset[i] * out_scale[i] + * OFF[i] = -------------- - ------------------------------ + * dst_fullscale dst_fullscale * in_scale[i] + */ + for (guint i = 0; i < 3; i++) { + matrix->matrix[i][i] = (src_fullscale * out_scale[i]) / + (dst_fullscale * in_scale[i]); + matrix->offset[i] = (out_offset[i] / dst_fullscale) - + ((gdouble) in_offset[i] * out_scale[i] / (dst_fullscale * in_scale[i])); + } + + return TRUE; +} + +/** + * gst_cuda_yuv_to_rgb_matrix_unorm: + * @in_yuv_info: a #GstVideoInfo of input YUV signal + * @out_rgb_info: a #GstVideoInfo of output RGB signal + * @matrix: a #GstCudaColorMatrix + * + * Calculates transform matrix from YUV to RGB conversion. Both input and output + * signals are in normalized [0.0..1.0] space and additional gamma decoding + * or primary/transfer function transform is not performed by this matrix. + * + * Resulting non-linear RGB values can be calculated by + * | R' | | Y' | | matrix.offset[0] | + * | G' | = clamp ( matrix.matrix * | Cb | + | matrix.offset[1] | matrix.min, matrix.max ) + * | B' | | Cr | | matrix.offset[2] | + * + * Returns: %TRUE if successful + */ +static gboolean +gst_cuda_yuv_to_rgb_matrix_unorm (const GstVideoInfo * in_yuv_info, + const GstVideoInfo * out_rgb_info, GstCudaColorMatrix * matrix) +{ + gint offset[4], scale[4]; + gdouble Kr, Kb, Kg; + + /* + * + * + * Input: Unsigned normalized Y'CbCr(unorm), [0.0..1.0] range + * Output: Unsigned normalized non-linear R'G'B'(unorm), [0.0..1.0] range + * + * 1) Y'CbCr(unorm) to scaled Y'CbCr + * | Y' | | Y'(unorm) | + * | Cb | = S | Cb(unorm) | + * | Cb | | Cr(unorm) | + * where S = (2 ^ bitdepth) - 1 + * + * 2) Y'CbCr to YPbPr + * Y = (Y' - offsetY ) / scaleY + * Pb = [(Cb - offsetCbCr) / scaleCbCr] + * Pr = [(Cr - offsetCrCr) / scaleCrCr] + * => + * Y = Y'(unorm) * Sy + Oy + * Pb = Cb(unorm) * Suv + Ouv + * Pb = Cr(unorm) * Suv + Ouv + * where + * Sy = S / scaleY + * Suv = S / scaleCbCr + * Oy = -(offsetY / scaleY) + * Ouv = -(offsetCbCr / scaleCbCr) + * + * 3) YPbPr to R'G'B' + * | R' | | Y | + * | G' | = M *| Pb | + * | B' | | Pr | + * where + * | vecR | + * M = | vecG | + * | vecB | + * vecR = | 1, 0 , 2(1 - Kr) | + * vecG = | 1, -(Kb/Kg) * 2(1 - Kb), -(Kr/Kg) * 2(1 - Kr) | + * vecB = | 1, 2(1 - Kb) , 0 | + * => + * R' = dot(vecR, (Syuv * Y'CbCr(unorm))) + dot(vecR, Offset) + * G' = dot(vecG, (Svuy * Y'CbCr(unorm))) + dot(vecG, Offset) + * B' = dot(vecB, (Syuv * Y'CbCr(unorm)) + dot(vecB, Offset) + * where + * | Sy, 0, 0 | + * Syuv = | 0, Suv, 0 | + * | 0 0, Suv | + * + * | Oy | + * Offset = | Ouv | + * | Ouv | + * + * 4) YUV -> RGB matrix + * | R' | | Y'(unorm) | | offsetA | + * | G' | = Matrix * | Cb(unorm) | + | offsetB | + * | B' | | Cr(unorm) | | offsetC | + * + * where + * | vecR | + * Matrix = | vecG | * Syuv + * | vecB | + * + * offsetA = dot(vecR, Offset) + * offsetB = dot(vecG, Offset) + * offsetC = dot(vecB, Offset) + * + * 4) Consider 16-235 scale RGB + * RGBfull(0..255) -> RGBfull(16..235) matrix is represented by + * | Rs | | Rf | | Or | + * | Gs | = Ms | Gf | + | Og | + * | Bs | | Bf | | Ob | + * + * Combining all matrix into + * | Rs | | Y'(unorm) | | offsetA | | Or | + * | Gs | = Ms * ( Matrix * | Cb(unorm) | + | offsetB | ) + | Og | + * | Bs | | Cr(unorm) | | offsetC | | Ob | + * + * | Y'(unorm) | | offsetA | | Or | + * = Ms * Matrix * | Cb(unorm) | + Ms | offsetB | + | Og | + * | Cr(unorm) | | offsetC | | Ob | + */ + + memset (matrix, 0, sizeof (GstCudaColorMatrix)); + for (guint i = 0; i < 3; i++) + matrix->max[i] = 1.0; + + gst_video_color_range_offsets (in_yuv_info->colorimetry.range, + in_yuv_info->finfo, offset, scale); + + if (gst_video_color_matrix_get_Kr_Kb (in_yuv_info->colorimetry.matrix, + &Kr, &Kb)) { + guint S; + gdouble Sy, Suv; + gdouble Oy, Ouv; + gdouble vecR[3], vecG[3], vecB[3]; + + Kg = 1.0 - Kr - Kb; + + vecR[0] = 1.0; + vecR[1] = 0; + vecR[2] = 2 * (1 - Kr); + + vecG[0] = 1.0; + vecG[1] = -(Kb / Kg) * 2 * (1 - Kb); + vecG[2] = -(Kr / Kg) * 2 * (1 - Kr); + + vecB[0] = 1.0; + vecB[1] = 2 * (1 - Kb); + vecB[2] = 0; + + /* Assume all components has the same bitdepth */ + S = (1 << in_yuv_info->finfo->depth[0]) - 1; + Sy = (gdouble) S / scale[0]; + Suv = (gdouble) S / scale[1]; + Oy = -((gdouble) offset[0] / scale[0]); + Ouv = -((gdouble) offset[1] / scale[1]); + + matrix->matrix[0][0] = Sy * vecR[0]; + matrix->matrix[1][0] = Sy * vecG[0]; + matrix->matrix[2][0] = Sy * vecB[0]; + + matrix->matrix[0][1] = Suv * vecR[1]; + matrix->matrix[1][1] = Suv * vecG[1]; + matrix->matrix[2][1] = Suv * vecB[1]; + + matrix->matrix[0][2] = Suv * vecR[2]; + matrix->matrix[1][2] = Suv * vecG[2]; + matrix->matrix[2][2] = Suv * vecB[2]; + + matrix->offset[0] = vecR[0] * Oy + vecR[1] * Ouv + vecR[2] * Ouv; + matrix->offset[1] = vecG[0] * Oy + vecG[1] * Ouv + vecG[2] * Ouv; + matrix->offset[2] = vecB[0] * Oy + vecB[1] * Ouv + vecB[2] * Ouv; + + /* Apply RGB range scale matrix */ + if (out_rgb_info->colorimetry.range == GST_VIDEO_COLOR_RANGE_16_235) { + GstCudaColorMatrix scale_matrix, rst; + GstVideoInfo full_rgb = *out_rgb_info; + + full_rgb.colorimetry.range = GST_VIDEO_COLOR_RANGE_0_255; + + if (gst_cuda_color_range_adjust_matrix_unorm (&full_rgb, + out_rgb_info, &scale_matrix)) { + /* Ms * Matrix */ + color_matrix_multiply (&rst, &scale_matrix, matrix); + + /* Ms * transform offsets */ + for (guint i = 0; i < 3; i++) { + gdouble val = 0; + for (guint j = 0; j < 3; j++) { + val += scale_matrix.matrix[i][j] * matrix->offset[j]; + } + rst.offset[i] = val + scale_matrix.offset[i]; + } + + /* copy back to output matrix */ + for (guint i = 0; i < 3; i++) { + for (guint j = 0; j < 3; j++) { + matrix->matrix[i][j] = rst.matrix[i][j]; + } + matrix->offset[i] = rst.offset[i]; + matrix->min[i] = scale_matrix.min[i]; + matrix->max[i] = scale_matrix.max[i]; + } + } + } + } else { + /* Unknown matrix */ + matrix->matrix[0][0] = 1.0; + matrix->matrix[1][1] = 1.0; + matrix->matrix[2][2] = 1.0; + } + + return TRUE; +} + +/** + * gst_cuda_rgb_to_yuv_matrix_unorm: + * @in_rgb_info: a #GstVideoInfo of input RGB signal + * @out_yuv_info: a #GstVideoInfo of output YUV signal + * @matrix: a #GstCudaColorMatrix + * + * Calculates transform matrix from RGB to YUV conversion. Both input and output + * signals are in normalized [0.0..1.0] space and additional gamma decoding + * or primary/transfer function transform is not performed by this matrix. + * + * Resulting RGB values can be calculated by + * | Y' | | R' | | matrix.offset[0] | + * | Cb | = clamp ( matrix.matrix * | G' | + | matrix.offset[1] |, matrix.min, matrix.max ) + * | Cr | | B' | | matrix.offset[2] | + * + * Returns: %TRUE if successful + */ +static gboolean +gst_cuda_rgb_to_yuv_matrix_unorm (const GstVideoInfo * in_rgb_info, + const GstVideoInfo * out_yuv_info, GstCudaColorMatrix * matrix) +{ + gint offset[4], scale[4]; + gdouble Kr, Kb, Kg; + + /* + * + * + * Input: Unsigned normalized non-linear R'G'B'(unorm), [0.0..1.0] range + * Output: Unsigned normalized Y'CbCr(unorm), [0.0..1.0] range + * + * 1) R'G'B' to YPbPr + * | Y | | R' | + * | Pb | = M *| G' | + * | Pr | | B' | + * where + * | vecY | + * M = | vecU | + * | vecV | + * vecY = | Kr , Kg , Kb | + * vecU = | -0.5*Kr/(1-Kb), -0.5*Kg/(1-Kb), 0.5 | + * vecV = | 0.5 , -0.5*Kg/(1-Kr), -0.5*Kb(1-Kr) | + * + * 2) YPbPr to Y'CbCr(unorm) + * Y'(unorm) = (Y * scaleY + offsetY) / S + * Cb(unorm) = (Pb * scaleCbCr + offsetCbCr) / S + * Cr(unorm) = (Pr * scaleCbCr + offsetCbCr) / S + * => + * Y'(unorm) = (Y * scaleY / S) + (offsetY / S) + * Cb(unorm) = (Pb * scaleCbCr / S) + (offsetCbCr / S) + * Cr(unorm) = (Pb * scaleCbCr / S) + (offsetCbCr / S) + * where S = (2 ^ bitdepth) - 1 + * + * 3) RGB -> YUV matrix + * | Y'(unorm) | | R' | | offsetA | + * | Cb(unorm) | = Matrix * | G' | + | offsetB | + * | Cr(unorm) | | B' | | offsetC | + * + * where + * | (scaleY/S) * vecY | + * Matrix = | (scaleCbCr/S) * vecU | + * | (scaleCbCr/S) * vecV | + * + * offsetA = offsetY / S + * offsetB = offsetCbCr / S + * offsetC = offsetCbCr / S + * + * 4) Consider 16-235 scale RGB + * RGBstudio(16..235) -> RGBfull(0..255) matrix is represented by + * | Rf | | Rs | | Or | + * | Gf | = Ms | Gs | + | Og | + * | Bf | | Bs | | Ob | + * + * Combining all matrix into + * | Y'(unorm) | | Rs | | Or | | offsetA | + * | Cb(unorm) | = Matrix * ( Ms | Gs | + | Og | ) + | offsetB | + * | Cr(unorm) | | Bs | | Ob | | offsetC | + * + * | Rs | | Or | | offsetA | + * = Matrix * Ms | Gs | + Matrix | Og | + | offsetB | + * | Bs | | Ob | | offsetB | + */ + + memset (matrix, 0, sizeof (GstCudaColorMatrix)); + for (guint i = 0; i < 3; i++) + matrix->max[i] = 1.0; + + gst_video_color_range_offsets (out_yuv_info->colorimetry.range, + out_yuv_info->finfo, offset, scale); + + if (gst_video_color_matrix_get_Kr_Kb (out_yuv_info->colorimetry.matrix, + &Kr, &Kb)) { + guint S; + gdouble Sy, Suv; + gdouble Oy, Ouv; + gdouble vecY[3], vecU[3], vecV[3]; + + Kg = 1.0 - Kr - Kb; + + vecY[0] = Kr; + vecY[1] = Kg; + vecY[2] = Kb; + + vecU[0] = -0.5 * Kr / (1 - Kb); + vecU[1] = -0.5 * Kg / (1 - Kb); + vecU[2] = 0.5; + + vecV[0] = 0.5; + vecV[1] = -0.5 * Kg / (1 - Kr); + vecV[2] = -0.5 * Kb / (1 - Kr); + + /* Assume all components has the same bitdepth */ + S = (1 << out_yuv_info->finfo->depth[0]) - 1; + Sy = (gdouble) scale[0] / S; + Suv = (gdouble) scale[1] / S; + Oy = (gdouble) offset[0] / S; + Ouv = (gdouble) offset[1] / S; + + for (guint i = 0; i < 3; i++) { + matrix->matrix[0][i] = Sy * vecY[i]; + matrix->matrix[1][i] = Suv * vecU[i]; + matrix->matrix[2][i] = Suv * vecV[i]; + } + + matrix->offset[0] = Oy; + matrix->offset[1] = Ouv; + matrix->offset[2] = Ouv; + + matrix->min[0] = Oy; + matrix->min[1] = Oy; + matrix->min[2] = Oy; + + matrix->max[0] = ((gdouble) scale[0] + offset[0]) / S; + matrix->max[1] = ((gdouble) scale[1] + offset[0]) / S; + matrix->max[2] = ((gdouble) scale[1] + offset[0]) / S; + + /* Apply RGB range scale matrix */ + if (in_rgb_info->colorimetry.range == GST_VIDEO_COLOR_RANGE_16_235) { + GstCudaColorMatrix scale_matrix, rst; + GstVideoInfo full_rgb = *in_rgb_info; + + full_rgb.colorimetry.range = GST_VIDEO_COLOR_RANGE_0_255; + + if (gst_cuda_color_range_adjust_matrix_unorm (in_rgb_info, + &full_rgb, &scale_matrix)) { + /* Matrix * Ms */ + color_matrix_multiply (&rst, matrix, &scale_matrix); + + /* Matrix * scale offsets */ + for (guint i = 0; i < 3; i++) { + gdouble val = 0; + for (guint j = 0; j < 3; j++) { + val += matrix->matrix[i][j] * scale_matrix.offset[j]; + } + rst.offset[i] = val + matrix->offset[i]; + } + + /* copy back to output matrix */ + for (guint i = 0; i < 3; i++) { + for (guint j = 0; j < 3; j++) { + matrix->matrix[i][j] = rst.matrix[i][j]; + } + matrix->offset[i] = rst.offset[i]; + } + } + } + } else { + /* Unknown matrix */ + matrix->matrix[0][0] = 1.0; + matrix->matrix[1][1] = 1.0; + matrix->matrix[2][2] = 1.0; + } + + return TRUE; +} + +typedef struct +{ + float coeffX[3]; + float coeffY[3]; + float coeffZ[3]; + float offset[3]; + float min[3]; + float max[3]; +} ColorMatrix; + +typedef struct +{ + ColorMatrix toRGBCoeff; + ColorMatrix toYuvCoeff; + ColorMatrix primariesCoeff; +} ConstBuffer; + +#define COLOR_SPACE_IDENTITY "color_space_identity" +#define COLOR_SPACE_CONVERT "color_space_convert" + +#define SAMPLE_YUV_PLANAR "sample_yuv_planar" +#define SAMPLE_YV12 "sample_yv12" +#define SAMPLE_YUV_PLANAR_10BIS "sample_yuv_planar_10bits" +#define SAMPLE_YUV_PLANAR_12BIS "sample_yuv_planar_12bits" +#define SAMPLE_SEMI_PLANAR "sample_semi_planar" +#define SAMPLE_SEMI_PLANAR_SWAP "sample_semi_planar_swap" +#define SAMPLE_RGBA "sample_rgba" +#define SAMPLE_BGRA "sample_bgra" +#define SAMPLE_RGBx "sample_rgbx" +#define SAMPLE_BGRx "sample_bgrx" +#define SAMPLE_ARGB "sample_argb" +/* same as ARGB */ +#define SAMPLE_ARGB64 "sample_argb" +#define SAMPLE_AGBR "sample_abgr" +#define SAMPLE_RGBP "sample_rgbp" +#define SAMPLE_BGRP "sample_bgrp" +#define SAMPLE_GBR "sample_gbr" +#define SAMPLE_GBRA "sample_gbra" + +#define WRITE_I420 "write_i420" +#define WRITE_YV12 "write_yv12" +#define WRITE_NV12 "write_nv12" +#define WRITE_NV21 "write_nv21" +#define WRITE_P010 "write_p010" +/* same as P010 */ +#define WRITE_P016 "write_p010" +#define WRITE_I420_10 "write_i420_10" +#define WRITE_Y444 "write_y444" +#define WRITE_Y444_16 "write_y444_16" +#define WRITE_RGBA "write_rgba" +#define WRITE_RGBx "write_rgbx" +#define WRITE_BGRA "write_bgra" +#define WRITE_BGRx "write_bgrx" +#define WRITE_ARGB "write_argb" +#define WRITE_ABGR "write_abgr" +#define WRITE_RGB "write_rgb" +#define WRITE_BGR "write_bgr" +#define WRITE_RGB10A2 "write_rgb10a2" +#define WRITE_BGR10A2 "write_bgr10a2" +#define WRITE_Y42B "write_y42b" +#define WRITE_I422_10 "write_i422_10" +#define WRITE_I422_12 "write_i422_12" +#define WRITE_RGBP "write_rgbp" +#define WRITE_BGRP "write_bgrp" +#define WRITE_GBR "write_gbr" +#define WRITE_GBRA "write_gbra" + +/* *INDENT-OFF* */ +const static gchar KERNEL_COMMON[] = +"struct ColorMatrix\n" +"{\n" +" float CoeffX[3];\n" +" float CoeffY[3];\n" +" float CoeffZ[3];\n" +" float Offset[3];\n" +" float Min[3];\n" +" float Max[3];\n" +"};\n" +"\n" +"__device__ inline float\n" +"dot (const float coeff[3], float3 val)\n" +"{\n" +" return coeff[0] * val.x + coeff[1] * val.y + coeff[2] * val.z;\n" +"}\n" +"\n" +"__device__ inline float\n" +"clamp (float val, float min_val, float max_val)\n" +"{\n" +" return max (min_val, min (val, max_val));\n" +"}\n" +"\n" +"__device__ inline float3\n" +"clamp3 (float3 val, const float min_val[3], const float max_val[3])\n" +"{\n" +" return make_float3 (clamp (val.x, min_val[0], max_val[0]),\n" +" clamp (val.y, min_val[1], max_val[2]),\n" +" clamp (val.z, min_val[1], max_val[2]));\n" +"}\n" +"\n" +"__device__ inline unsigned char\n" +"scale_to_2bits (float val)\n" +"{\n" +" return (unsigned short) __float2int_rz (val * 3.0);\n" +"}\n" +"\n" +"__device__ inline unsigned char\n" +"scale_to_uchar (float val)\n" +"{\n" +" return (unsigned char) __float2int_rz (val * 255.0);\n" +"}\n" +"\n" +"__device__ inline unsigned short\n" +"scale_to_ushort (float val)\n" +"{\n" +" return (unsigned short) __float2int_rz (val * 65535.0);\n" +"}\n" +"\n" +"__device__ inline unsigned short\n" +"scale_to_10bits (float val)\n" +"{\n" +" return (unsigned short) __float2int_rz (val * 1023.0);\n" +"}\n" +"\n" +"__device__ inline unsigned short\n" +"scale_to_12bits (float val)\n" +"{\n" +" return (unsigned short) __float2int_rz (val * 4095.0);\n" +"}\n" +"\n" +"__device__ inline float3\n" +COLOR_SPACE_IDENTITY "(float3 sample, const ColorMatrix * matrix)\n" +"{\n" +" return sample;\n" +"}\n" +"\n" +"__device__ inline float3\n" +COLOR_SPACE_CONVERT "(float3 sample, const ColorMatrix * matrix)\n" +"{\n" +" float3 out;\n" +" out.x = dot (matrix->CoeffX, sample);\n" +" out.y = dot (matrix->CoeffY, sample);\n" +" out.z = dot (matrix->CoeffZ, sample);\n" +" out.x += matrix->Offset[0];\n" +" out.y += matrix->Offset[1];\n" +" out.z += matrix->Offset[2];\n" +" return clamp3 (out, matrix->Min, matrix->Max);\n" +"}\n" +"/* All 8bits yuv planar except for yv12 */\n" +"__device__ inline float4\n" +SAMPLE_YUV_PLANAR "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n" +" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n" +"{\n" +" float luma = tex2D(tex0, x, y);\n" +" float u = tex2D(tex1, x, y);\n" +" float v = tex2D(tex2, x, y);\n" +" return make_float4 (luma, u, v, 1);\n" +"}\n" +"\n" +"__device__ inline float4\n" +SAMPLE_YV12 "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n" +" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n" +"{\n" +" float luma = tex2D(tex0, x, y);\n" +" float u = tex2D(tex2, x, y);\n" +" float v = tex2D(tex1, x, y);\n" +" return make_float4 (luma, u, v, 1);\n" +"}\n" +"\n" +"__device__ inline float4\n" +SAMPLE_YUV_PLANAR_10BIS "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n" +" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n" +"{\n" +" float luma = tex2D(tex0, x, y);\n" +" float u = tex2D(tex1, x, y);\n" +" float v = tex2D(tex2, x, y);\n" +" /* (1 << 6) to scale [0, 1.0) range */\n" +" return make_float4 (luma * 64, u * 64, v * 64, 1);\n" +"}\n" +"\n" +"__device__ inline float4\n" +SAMPLE_YUV_PLANAR_12BIS "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n" +" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n" +"{\n" +" float luma = tex2D(tex0, x, y);\n" +" float u = tex2D(tex1, x, y);\n" +" float v = tex2D(tex2, x, y);\n" +" /* (1 << 6) to scale [0, 1.0) range */\n" +" return make_float4 (luma * 16, u * 16, v * 16, 1);\n" +"}\n" +"\n" +"/* NV12, P010, and P016 */\n" +"__device__ inline float4\n" +SAMPLE_SEMI_PLANAR "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n" +" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n" +"{\n" +" float luma = tex2D(tex0, x, y);\n" +" float2 uv = tex2D(tex1, x, y);\n" +" return make_float4 (luma, uv.x, uv.y, 1);\n" +"}\n" +"\n" +"__device__ inline float4\n" +SAMPLE_SEMI_PLANAR_SWAP "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n" +" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n" +"{\n" +" float luma = tex2D(tex0, x, y);\n" +" float2 vu = tex2D(tex1, x, y);\n" +" return make_float4 (luma, vu.y, vu.x, 1);\n" +"}\n" +"\n" +"__device__ inline float4\n" +SAMPLE_RGBA "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n" +" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n" +"{\n" +" return tex2D(tex0, x, y);\n" +"}\n" +"\n" +"__device__ inline float4\n" +SAMPLE_BGRA "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n" +" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n" +"{\n" +" float4 bgra = tex2D(tex0, x, y);\n" +" return make_float4 (bgra.z, bgra.y, bgra.x, bgra.w);\n" +"}\n" +"\n" +"__device__ inline float4\n" +SAMPLE_RGBx "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n" +" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n" +"{\n" +" float4 rgbx = tex2D(tex0, x, y);\n" +" rgbx.w = 1;\n" +" return rgbx;\n" +"}\n" +"\n" +"__device__ inline float4\n" +SAMPLE_BGRx "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n" +" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n" +"{\n" +" float4 bgrx = tex2D(tex0, x, y);\n" +" return make_float4 (bgrx.z, bgrx.y, bgrx.x, 1);\n" +"}\n" +"\n" +"__device__ inline float4\n" +SAMPLE_ARGB "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n" +" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n" +"{\n" +" float4 argb = tex2D(tex0, x, y);\n" +" return make_float4 (argb.y, argb.z, argb.w, argb.x);\n" +"}\n" +"\n" +"__device__ inline float4\n" +SAMPLE_AGBR "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n" +" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n" +"{\n" +" float4 abgr = tex2D(tex0, x, y);\n" +" return make_float4 (abgr.w, abgr.z, abgr.y, abgr.x);\n" +"}\n" +"\n" +"__device__ inline float4\n" +SAMPLE_RGBP "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n" +" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n" +"{\n" +" float r = tex2D(tex0, x, y);\n" +" float g = tex2D(tex1, x, y);\n" +" float b = tex2D(tex2, x, y);\n" +" return make_float4 (r, g, b, 1);\n" +"}\n" +"\n" +"__device__ inline float4\n" +SAMPLE_BGRP "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n" +" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n" +"{\n" +" float b = tex2D(tex0, x, y);\n" +" float g = tex2D(tex1, x, y);\n" +" float r = tex2D(tex2, x, y);\n" +" return make_float4 (r, g, b, 1);\n" +"}\n" +"\n" +"__device__ inline float4\n" +SAMPLE_GBR "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n" +" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n" +"{\n" +" float g = tex2D(tex0, x, y);\n" +" float b = tex2D(tex1, x, y);\n" +" float r = tex2D(tex2, x, y);\n" +" return make_float4 (r, g, b, 1);\n" +"}\n" +"\n" +"__device__ inline float4\n" +SAMPLE_GBRA "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n" +" cudaTextureObject_t tex2, cudaTextureObject_t tex3, float x, float y)\n" +"{\n" +" float g = tex2D(tex0, x, y);\n" +" float b = tex2D(tex1, x, y);\n" +" float r = tex2D(tex2, x, y);\n" +" float a = tex2D(tex3, x, y);\n" +" return make_float4 (r, g, b, a);\n" +"}\n" +"\n" +"__device__ inline void\n" +WRITE_I420 "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n" +" unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n" +"{\n" +" dst0[x + y * stride0] = scale_to_uchar (sample.x);\n" +" if (x % 2 == 0 && y % 2 == 0) {\n" +" unsigned int pos = x / 2 + (y / 2) * stride1;\n" +" dst1[pos] = scale_to_uchar (sample.y);\n" +" dst2[pos] = scale_to_uchar (sample.z);\n" +" }\n" +"}\n" +"\n" +"__device__ inline void\n" +WRITE_YV12 "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n" +" unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n" +"{\n" +" dst0[x + y * stride0] = scale_to_uchar (sample.x);\n" +" if (x % 2 == 0 && y % 2 == 0) {\n" +" unsigned int pos = x / 2 + (y / 2) * stride1;\n" +" dst1[pos] = scale_to_uchar (sample.z);\n" +" dst2[pos] = scale_to_uchar (sample.y);\n" +" }\n" +"}\n" +"\n" +"__device__ inline void\n" +WRITE_NV12 "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n" +" unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n" +"{\n" +" dst0[x + y * stride0] = scale_to_uchar (sample.x);\n" +" if (x % 2 == 0 && y % 2 == 0) {\n" +" unsigned int pos = x + (y / 2) * stride1;\n" +" dst1[pos] = scale_to_uchar (sample.y);\n" +" dst1[pos + 1] = scale_to_uchar (sample.z);\n" +" }\n" +"}\n" +"\n" +"__device__ inline void\n" +WRITE_NV21 "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n" +" unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n" +"{\n" +" dst0[x + y * stride0] = scale_to_uchar (sample.x);\n" +" if (x % 2 == 0 && y % 2 == 0) {\n" +" unsigned int pos = x + (y / 2) * stride1;\n" +" dst1[pos] = scale_to_uchar (sample.z);\n" +" dst1[pos + 1] = scale_to_uchar (sample.y);\n" +" }\n" +"}\n" +"\n" +"__device__ inline void\n" +WRITE_P010 "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n" +" unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n" +"{\n" +" *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_ushort (sample.x);\n" +" if (x % 2 == 0 && y % 2 == 0) {\n" +" unsigned int pos = x * 2 + (y / 2) * stride1;\n" +" *(unsigned short *) &dst1[pos] = scale_to_ushort (sample.y);\n" +" *(unsigned short *) &dst1[pos + 2] = scale_to_ushort (sample.z);\n" +" }\n" +"}\n" +"\n" +"__device__ inline void\n" +WRITE_I420_10 "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n" +" unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n" +"{\n" +" *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_10bits (sample.x);\n" +" if (x % 2 == 0 && y % 2 == 0) {\n" +" unsigned int pos = x + (y / 2) * stride1;\n" +" *(unsigned short *) &dst1[pos] = scale_to_10bits (sample.y);\n" +" *(unsigned short *) &dst2[pos] = scale_to_10bits (sample.z);\n" +" }\n" +"}\n" +"\n" +"__device__ inline void\n" +WRITE_Y444 "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n" +" unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n" +"{\n" +" int pos = x + y * stride0;\n" +" dst0[pos] = scale_to_uchar (sample.x);\n" +" dst1[pos] = scale_to_uchar (sample.y);\n" +" dst2[pos] = scale_to_uchar (sample.z);\n" +"}\n" +"\n" +"__device__ inline void\n" +WRITE_Y444_16 "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n" +" unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n" +"{\n" +" int pos = x * 2 + y * stride0;\n" +" *(unsigned short *) &dst0[pos] = scale_to_ushort (sample.x);\n" +" *(unsigned short *) &dst1[pos] = scale_to_ushort (sample.y);\n" +" *(unsigned short *) &dst2[pos] = scale_to_ushort (sample.z);\n" +"}\n" +"\n" +"__device__ inline void\n" +WRITE_RGBA "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n" +" unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n" +"{\n" +" int pos = x * 4 + y * stride0;\n" +" dst0[pos] = scale_to_uchar (sample.x);\n" +" dst0[pos + 1] = scale_to_uchar (sample.y);\n" +" dst0[pos + 2] = scale_to_uchar (sample.z);\n" +" dst0[pos + 3] = scale_to_uchar (sample.w);\n" +"}\n" +"\n" +"__device__ inline void\n" +WRITE_RGBx "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n" +" unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n" +"{\n" +" int pos = x * 4 + y * stride0;\n" +" dst0[pos] = scale_to_uchar (sample.x);\n" +" dst0[pos + 1] = scale_to_uchar (sample.y);\n" +" dst0[pos + 2] = scale_to_uchar (sample.z);\n" +" dst0[pos + 3] = 255;\n" +"}\n" +"\n" +"__device__ inline void\n" +WRITE_BGRA "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n" +" unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n" +"{\n" +" int pos = x * 4 + y * stride0;\n" +" dst0[pos] = scale_to_uchar (sample.z);\n" +" dst0[pos + 1] = scale_to_uchar (sample.y);\n" +" dst0[pos + 2] = scale_to_uchar (sample.x);\n" +" dst0[pos + 3] = scale_to_uchar (sample.w);\n" +"}\n" +"\n" +"__device__ inline void\n" +WRITE_BGRx "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n" +" unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n" +"{\n" +" int pos = x * 4 + y * stride0;\n" +" dst0[pos] = scale_to_uchar (sample.z);\n" +" dst0[pos + 1] = scale_to_uchar (sample.y);\n" +" dst0[pos + 2] = scale_to_uchar (sample.x);\n" +" dst0[pos + 3] = 255;\n" +"}\n" +"\n" +"__device__ inline void\n" +WRITE_ARGB "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n" +" unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n" +"{\n" +" int pos = x * 4 + y * stride0;\n" +" dst0[pos] = scale_to_uchar (sample.w);\n" +" dst0[pos + 1] = scale_to_uchar (sample.x);\n" +" dst0[pos + 2] = scale_to_uchar (sample.y);\n" +" dst0[pos + 3] = scale_to_uchar (sample.z);\n" +"}\n" +"\n" +"__device__ inline void\n" +WRITE_ABGR "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n" +" unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n" +"{\n" +" int pos = x * 4 + y * stride0;\n" +" dst0[pos] = scale_to_uchar (sample.w);\n" +" dst0[pos + 1] = scale_to_uchar (sample.z);\n" +" dst0[pos + 2] = scale_to_uchar (sample.y);\n" +" dst0[pos + 3] = scale_to_uchar (sample.x);\n" +"}\n" +"\n" +"__device__ inline void\n" +WRITE_RGB "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n" +" unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n" +"{\n" +" int pos = x * 3 + y * stride0;\n" +" dst0[pos] = scale_to_uchar (sample.x);\n" +" dst0[pos + 1] = scale_to_uchar (sample.y);\n" +" dst0[pos + 2] = scale_to_uchar (sample.z);\n" +"}\n" +"\n" +"__device__ inline void\n" +WRITE_BGR "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n" +" unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n" +"{\n" +" int pos = x * 3 + y * stride0;\n" +" dst0[pos] = scale_to_uchar (sample.z);\n" +" dst0[pos + 1] = scale_to_uchar (sample.y);\n" +" dst0[pos + 2] = scale_to_uchar (sample.x);\n" +"}\n" +"\n" +"__device__ inline void\n" +WRITE_RGB10A2 "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n" +" unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n" +"{\n" +" unsigned int alpha = (unsigned int) scale_to_2bits (sample.x);\n" +" unsigned int packed_rgb = alpha << 30;\n" +" packed_rgb |= ((unsigned int) scale_to_10bits (sample.x));\n" +" packed_rgb |= ((unsigned int) scale_to_10bits (sample.y)) << 10;\n" +" packed_rgb |= ((unsigned int) scale_to_10bits (sample.z)) << 20;\n" +" *(unsigned int *) &dst0[x * 4 + y * stride0] = packed_rgb;\n" +"}\n" +"\n" +"__device__ inline void\n" +WRITE_BGR10A2 "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n" +" unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n" +"{\n" +" unsigned int alpha = (unsigned int) scale_to_2bits (sample.x);\n" +" unsigned int packed_rgb = alpha << 30;\n" +" packed_rgb |= ((unsigned int) scale_to_10bits (sample.x)) << 20;\n" +" packed_rgb |= ((unsigned int) scale_to_10bits (sample.y)) << 10;\n" +" packed_rgb |= ((unsigned int) scale_to_10bits (sample.z));\n" +" *(unsigned int *) &dst0[x * 4 + y * stride0] = packed_rgb;\n" +"}\n" +"\n" +"__device__ inline void\n" +WRITE_Y42B "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n" +" unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n" +"{\n" +" dst0[x + y * stride0] = scale_to_uchar (sample.x);\n" +" if (x % 2 == 0) {\n" +" unsigned int pos = x / 2 + y * stride1;\n" +" dst1[pos] = scale_to_uchar (sample.y);\n" +" dst2[pos] = scale_to_uchar (sample.z);\n" +" }\n" +"}\n" +"\n" +"__device__ inline void\n" +WRITE_I422_10 "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n" +" unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n" +"{\n" +" *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_10bits (sample.x);\n" +" if (x % 2 == 0) {\n" +" unsigned int pos = x + y * stride1;\n" +" *(unsigned short *) &dst1[pos] = scale_to_10bits (sample.y);\n" +" *(unsigned short *) &dst2[pos] = scale_to_10bits (sample.z);\n" +" }\n" +"}\n" +"\n" +"__device__ inline void\n" +WRITE_I422_12 "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n" +" unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n" +"{\n" +" *(unsigned short *) &dst0[x * 2 + y * stride0] = scale_to_12bits (sample.x);\n" +" if (x % 2 == 0) {\n" +" unsigned int pos = x + y * stride1;\n" +" *(unsigned short *) &dst1[pos] = scale_to_12bits (sample.y);\n" +" *(unsigned short *) &dst2[pos] = scale_to_12bits (sample.z);\n" +" }\n" +"}\n" +"\n" +"__device__ inline void\n" +WRITE_RGBP "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n" +" unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n" +"{\n" +" int pos = x + y * stride0;\n" +" dst0[pos] = scale_to_uchar (sample.x);\n" +" dst1[pos] = scale_to_uchar (sample.y);\n" +" dst2[pos] = scale_to_uchar (sample.z);\n" +"}\n" +"\n" +"__device__ inline void\n" +WRITE_BGRP "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n" +" unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n" +"{\n" +" int pos = x + y * stride0;\n" +" dst0[pos] = scale_to_uchar (sample.z);\n" +" dst1[pos] = scale_to_uchar (sample.y);\n" +" dst2[pos] = scale_to_uchar (sample.x);\n" +"}\n" +"\n" +"__device__ inline void\n" +WRITE_GBR "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n" +" unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n" +"{\n" +" int pos = x + y * stride0;\n" +" dst0[pos] = scale_to_uchar (sample.y);\n" +" dst1[pos] = scale_to_uchar (sample.z);\n" +" dst2[pos] = scale_to_uchar (sample.x);\n" +"}\n" +"\n" +"__device__ inline void\n" +WRITE_GBRA "(unsigned char * dst0, unsigned char * dst1, unsigned char * dst2,\n" +" unsigned char * dst3, float4 sample, int x, int y, int stride0, int stride1)\n" +"{\n" +" int pos = x + y * stride0;\n" +" dst0[pos] = scale_to_uchar (sample.y);\n" +" dst1[pos] = scale_to_uchar (sample.z);\n" +" dst2[pos] = scale_to_uchar (sample.x);\n" +" dst3[pos] = scale_to_uchar (sample.w);\n" +"}\n"; + +#define GST_CUDA_KERNEL_UNPACK_FUNC "gst_cuda_kernel_unpack_func" +static const gchar RGB_TO_RGBx[] = +"extern \"C\" {\n" +"__global__ void\n" +GST_CUDA_KERNEL_UNPACK_FUNC +"(unsigned char *src, unsigned char *dst, int width, int height,\n" +" int src_stride, int dst_stride)\n" +"{\n" +" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n" +" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n" +" if (x_pos < width && y_pos < height) {\n" +" int dst_pos = x_pos * 4 + y_pos * dst_stride;\n" +" int src_pos = x_pos * 3 + y_pos * src_stride;\n" +" dst[dst_pos] = src[src_pos];\n" +" dst[dst_pos + 1] = src[src_pos + 1];\n" +" dst[dst_pos + 2] = src[src_pos + 2];\n" +" dst[dst_pos + 3] = 0xff;\n" +" }\n" +"}\n" +"}\n"; + +static const gchar RGB10A2_TO_ARGB64[] = +"extern \"C\" {\n" +"__global__ void\n" +GST_CUDA_KERNEL_UNPACK_FUNC +"(unsigned char *src, unsigned char *dst, int width, int height,\n" +" int src_stride, int dst_stride)\n" +"{\n" +" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n" +" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n" +" if (x_pos < width && y_pos < height) {\n" +" unsigned short a, r, g, b;\n" +" unsigned int val;\n" +" int dst_pos = x_pos * 8 + y_pos * dst_stride;\n" +" val = *(unsigned int *)&src[x_pos * 4 + y_pos * src_stride];\n" +" a = (val >> 30) & 0x03;\n" +" a = (a << 14) | (a << 12) | (a << 10) | (a << 8) | (a << 6) | (a << 4) | (a << 2) | (a << 0);\n" +" r = (val & 0x3ff);\n" +" r = (r << 6) | (r >> 4);\n" +" g = ((val >> 10) & 0x3ff);\n" +" g = (g << 6) | (g >> 4);\n" +" b = ((val >> 20) & 0x3ff);\n" +" b = (b << 6) | (b >> 4);\n" +" *(unsigned short *) &dst[dst_pos] = a;\n" +" *(unsigned short *) &dst[dst_pos + 2] = r;\n" +" *(unsigned short *) &dst[dst_pos + 4] = g;\n" +" *(unsigned short *) &dst[dst_pos + 6] = b;\n" +" }\n" +"}\n" +"}\n"; + +static const gchar BGR10A2_TO_ARGB64[] = +"extern \"C\" {\n" +"__global__ void\n" +GST_CUDA_KERNEL_UNPACK_FUNC +"(unsigned char *src, unsigned char *dst, int width, int height,\n" +" int src_stride, int dst_stride)\n" +"{\n" +" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n" +" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n" +" if (x_pos < width && y_pos < height) {\n" +" unsigned short a, r, g, b;\n" +" unsigned int val;\n" +" int dst_pos = x_pos * 8 + y_pos * dst_stride;\n" +" val = *(unsigned int *)&src[x_pos * 4 + y_pos * src_stride];\n" +" a = (val >> 30) & 0x03;\n" +" a = (a << 14) | (a << 12) | (a << 10) | (a << 8) | (a << 6) | (a << 4) | (a << 2) | (a << 0);\n" +" b = (val & 0x3ff);\n" +" b = (b << 6) | (b >> 4);\n" +" g = ((val >> 10) & 0x3ff);\n" +" g = (g << 6) | (g >> 4);\n" +" r = ((val >> 20) & 0x3ff);\n" +" r = (r << 6) | (r >> 4);\n" +" *(unsigned short *) &dst[dst_pos] = a;\n" +" *(unsigned short *) &dst[dst_pos + 2] = r;\n" +" *(unsigned short *) &dst[dst_pos + 4] = g;\n" +" *(unsigned short *) &dst[dst_pos + 6] = b;\n" +" }\n" +"}\n" +"}\n"; + +#define GST_CUDA_KERNEL_MAIN_FUNC "KernelMain" + +static const gchar TEMPLETA_KERNEL[] = +/* KERNEL_COMMON */ +"%s\n" +/* UNPACK FUNCTION */ +"%s\n" +"__constant__ ColorMatrix TO_RGB_MATRIX = { { %s, %s, %s },\n" +" { %s, %s, %s },\n" +" { %s, %s, %s },\n" +" { %s, %s, %s },\n" +" { %s, %s, %s },\n" +" { %s, %s, %s } };\n" +"__constant__ ColorMatrix TO_YUV_MATRIX = { { %s, %s, %s },\n" +" { %s, %s, %s },\n" +" { %s, %s, %s },\n" +" { %s, %s, %s },\n" +" { %s, %s, %s },\n" +" { %s, %s, %s } };\n" +"__constant__ int WIDTH = %d;\n" +"__constant__ int HEIGHT = %d;\n" +"__constant__ int LEFT = %d;\n" +"__constant__ int TOP = %d;\n" +"__constant__ int RIGHT = %d;\n" +"__constant__ int BOTTOM = %d;\n" +"__constant__ int VIEW_WIDTH = %d;\n" +"__constant__ int VIEW_HEIGHT = %d;\n" +"__constant__ float OFFSET_X = %s;\n" +"__constant__ float OFFSET_Y = %s;\n" +"__constant__ float BORDER_X = %s;\n" +"__constant__ float BORDER_Y = %s;\n" +"__constant__ float BORDER_Z = %s;\n" +"__constant__ float BORDER_W = %s;\n" +"\n" +"extern \"C\" {\n" +"__global__ void\n" +GST_CUDA_KERNEL_MAIN_FUNC "(cudaTextureObject_t tex0, cudaTextureObject_t tex1,\n" +" cudaTextureObject_t tex2, cudaTextureObject_t tex3, unsigned char * dst0,\n" +" unsigned char * dst1, unsigned char * dst2, unsigned char * dst3,\n" +" int stride0, int stride1)\n" +"{\n" +" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n" +" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n" +" float4 sample;\n" +" if (x_pos >= WIDTH || y_pos >= HEIGHT)\n" +" return;\n" +" if (x_pos < LEFT || x_pos >= RIGHT || y_pos < TOP || y_pos >= BOTTOM) {\n" +" sample = make_float4 (BORDER_X, BORDER_Y, BORDER_Z, BORDER_W);\n" +" } else {\n" +" float x = OFFSET_X + (float) (x_pos - LEFT) / VIEW_WIDTH;\n" +" float y = OFFSET_Y + (float) (y_pos - TOP) / VIEW_HEIGHT;\n" +" float4 s = %s (tex0, tex1, tex2, tex3, x, y);\n" +" float3 xyz = make_float3 (s.x, s.y, s.z);\n" +" float3 rgb = %s (xyz, &TO_RGB_MATRIX);\n" +" float3 yuv = %s (rgb, &TO_YUV_MATRIX);\n" +" sample = make_float4 (yuv.x, yuv.y, yuv.z, s.w);\n" +" }\n" +" %s (dst0, dst1, dst2, dst3, sample, x_pos, y_pos, stride0, stride1);\n" +"}\n" +"}\n"; +/* *INDENT-ON* */ + +typedef struct _TextureFormat +{ + GstVideoFormat format; + CUarray_format array_format[GST_VIDEO_MAX_COMPONENTS]; + guint channels[GST_VIDEO_MAX_COMPONENTS]; + const gchar *sample_func; +} TextureFormat; + +#define CU_AD_FORMAT_NONE 0 +#define MAKE_FORMAT_YUV_PLANAR(f,cf,sample_func) \ + { GST_VIDEO_FORMAT_ ##f, { CU_AD_FORMAT_ ##cf, CU_AD_FORMAT_ ##cf, \ + CU_AD_FORMAT_ ##cf, CU_AD_FORMAT_NONE }, {1, 1, 1, 0}, sample_func } +#define MAKE_FORMAT_YUV_SEMI_PLANAR(f,cf,sample_func) \ + { GST_VIDEO_FORMAT_ ##f, { CU_AD_FORMAT_ ##cf, CU_AD_FORMAT_ ##cf, \ + CU_AD_FORMAT_NONE, CU_AD_FORMAT_NONE }, {1, 2, 0, 0}, sample_func } +#define MAKE_FORMAT_RGB(f,cf,sample_func) \ + { GST_VIDEO_FORMAT_ ##f, { CU_AD_FORMAT_ ##cf, CU_AD_FORMAT_NONE, \ + CU_AD_FORMAT_NONE, CU_AD_FORMAT_NONE }, {4, 0, 0, 0}, sample_func } +#define MAKE_FORMAT_RGBP(f,cf,sample_func) \ + { GST_VIDEO_FORMAT_ ##f, { CU_AD_FORMAT_ ##cf, CU_AD_FORMAT_ ##cf, \ + CU_AD_FORMAT_ ##cf, CU_AD_FORMAT_NONE }, {1, 1, 1, 0}, sample_func } +#define MAKE_FORMAT_RGBAP(f,cf,sample_func) \ + { GST_VIDEO_FORMAT_ ##f, { CU_AD_FORMAT_ ##cf, CU_AD_FORMAT_ ##cf, \ + CU_AD_FORMAT_ ##cf, CU_AD_FORMAT_ ##cf }, {1, 1, 1, 1}, sample_func } + +static const TextureFormat format_map[] = { + MAKE_FORMAT_YUV_PLANAR (I420, UNSIGNED_INT8, SAMPLE_YUV_PLANAR), + MAKE_FORMAT_YUV_PLANAR (YV12, UNSIGNED_INT8, SAMPLE_YV12), + MAKE_FORMAT_YUV_SEMI_PLANAR (NV12, UNSIGNED_INT8, SAMPLE_SEMI_PLANAR), + MAKE_FORMAT_YUV_SEMI_PLANAR (NV21, UNSIGNED_INT8, SAMPLE_SEMI_PLANAR_SWAP), + MAKE_FORMAT_YUV_SEMI_PLANAR (P010_10LE, UNSIGNED_INT16, SAMPLE_SEMI_PLANAR), + MAKE_FORMAT_YUV_SEMI_PLANAR (P016_LE, UNSIGNED_INT16, SAMPLE_SEMI_PLANAR), + MAKE_FORMAT_YUV_PLANAR (I420_10LE, UNSIGNED_INT16, SAMPLE_YUV_PLANAR_10BIS), + MAKE_FORMAT_YUV_PLANAR (Y444, UNSIGNED_INT8, SAMPLE_YUV_PLANAR), + MAKE_FORMAT_YUV_PLANAR (Y444_16LE, UNSIGNED_INT16, SAMPLE_YUV_PLANAR), + MAKE_FORMAT_RGB (RGBA, UNSIGNED_INT8, SAMPLE_RGBA), + MAKE_FORMAT_RGB (BGRA, UNSIGNED_INT8, SAMPLE_BGRA), + MAKE_FORMAT_RGB (RGBx, UNSIGNED_INT8, SAMPLE_RGBx), + MAKE_FORMAT_RGB (BGRx, UNSIGNED_INT8, SAMPLE_BGRx), + MAKE_FORMAT_RGB (ARGB, UNSIGNED_INT8, SAMPLE_ARGB), + MAKE_FORMAT_RGB (ARGB64, UNSIGNED_INT16, SAMPLE_ARGB64), + MAKE_FORMAT_RGB (ABGR, UNSIGNED_INT8, SAMPLE_AGBR), + MAKE_FORMAT_YUV_PLANAR (Y42B, UNSIGNED_INT8, SAMPLE_YUV_PLANAR), + MAKE_FORMAT_YUV_PLANAR (I422_10LE, UNSIGNED_INT16, SAMPLE_YUV_PLANAR_10BIS), + MAKE_FORMAT_YUV_PLANAR (I422_12LE, UNSIGNED_INT16, SAMPLE_YUV_PLANAR_12BIS), + MAKE_FORMAT_RGBP (RGBP, UNSIGNED_INT8, SAMPLE_RGBP), + MAKE_FORMAT_RGBP (BGRP, UNSIGNED_INT8, SAMPLE_BGRP), + MAKE_FORMAT_RGBP (GBR, UNSIGNED_INT8, SAMPLE_GBR), + MAKE_FORMAT_RGBAP (GBRA, UNSIGNED_INT8, SAMPLE_GBRA), +}; + +typedef struct _TextureBuffer +{ + CUdeviceptr ptr; + gsize stride; +} TextureBuffer; + +typedef struct +{ + gint x; + gint y; + gint width; + gint height; +} ConverterRect; + +struct _GstCudaConverterPrivate +{ + GstVideoInfo in_info; + GstVideoInfo out_info; + + GstStructure *config; + + GstVideoInfo texture_info; + const TextureFormat *texture_fmt; + gint texture_align; + ConverterRect dest_rect; + + TextureBuffer fallback_buffer[GST_VIDEO_MAX_COMPONENTS]; + CUfilter_mode filter_mode[GST_VIDEO_MAX_COMPONENTS]; + TextureBuffer unpack_buffer; + + CUmodule module; + CUfunction main_func; + CUfunction unpack_func; +}; + +static void gst_cuda_converter_dispose (GObject * object); +static void gst_cuda_converter_finalize (GObject * object); + +#define gst_cuda_converter_parent_class parent_class +G_DEFINE_TYPE_WITH_PRIVATE (GstCudaConverter, gst_cuda_converter, + GST_TYPE_OBJECT); + +static void +gst_cuda_converter_class_init (GstCudaConverterClass * klass) +{ + GObjectClass *object_class = G_OBJECT_CLASS (klass); + + object_class->dispose = gst_cuda_converter_dispose; + object_class->finalize = gst_cuda_converter_finalize; + + GST_DEBUG_CATEGORY_INIT (gst_cuda_converter_debug, + "cudaconverter", 0, "cudaconverter"); +} + +static void +gst_cuda_converter_init (GstCudaConverter * self) +{ + GstCudaConverterPrivate *priv; + + self->priv = priv = gst_cuda_converter_get_instance_private (self); + priv->config = gst_structure_new_empty ("GstCudaConverter"); +} + +static void +gst_cuda_converter_dispose (GObject * object) +{ + GstCudaConverter *self = GST_CUDA_CONVERTER (object); + GstCudaConverterPrivate *priv = self->priv; + guint i; + + if (self->context && gst_cuda_context_push (self->context)) { + if (priv->module) { + CuModuleUnload (priv->module); + priv->module = NULL; + } + + for (i = 0; i < G_N_ELEMENTS (priv->fallback_buffer); i++) { + if (priv->fallback_buffer[i].ptr) { + CuMemFree (priv->fallback_buffer[i].ptr); + priv->fallback_buffer[i].ptr = 0; + } + } + + if (priv->unpack_buffer.ptr) { + CuMemFree (priv->unpack_buffer.ptr); + priv->unpack_buffer.ptr = 0; + } + + gst_cuda_context_pop (NULL); + } + + gst_clear_object (&self->context); + + G_OBJECT_CLASS (parent_class)->dispose (object); +} + +static void +gst_cuda_converter_finalize (GObject * object) +{ + GstCudaConverter *self = GST_CUDA_CONVERTER (object); + GstCudaConverterPrivate *priv = self->priv; + + gst_structure_free (priv->config); + + G_OBJECT_CLASS (parent_class)->finalize (object); +} + +static const gchar * +get_color_range_name (GstVideoColorRange range) +{ + switch (range) { + case GST_VIDEO_COLOR_RANGE_0_255: + return "FULL"; + case GST_VIDEO_COLOR_RANGE_16_235: + return "STUDIO"; + default: + break; + } + + return "UNKNOWN"; +} + +typedef struct _GstCudaColorMatrixString +{ + gchar matrix[3][3][G_ASCII_DTOSTR_BUF_SIZE]; + gchar offset[3][G_ASCII_DTOSTR_BUF_SIZE]; + gchar min[3][G_ASCII_DTOSTR_BUF_SIZE]; + gchar max[3][G_ASCII_DTOSTR_BUF_SIZE]; +} GstCudaColorMatrixString; + +static void +color_matrix_to_string (const GstCudaColorMatrix * m, + GstCudaColorMatrixString * str) +{ + guint i, j; + for (i = 0; i < 3; i++) { + for (j = 0; j < 3; j++) { + g_ascii_formatd (str->matrix[i][j], G_ASCII_DTOSTR_BUF_SIZE, "%f", + m->matrix[i][j]); + } + + g_ascii_formatd (str->offset[i], + G_ASCII_DTOSTR_BUF_SIZE, "%f", m->offset[i]); + g_ascii_formatd (str->min[i], G_ASCII_DTOSTR_BUF_SIZE, "%f", m->min[i]); + g_ascii_formatd (str->max[i], G_ASCII_DTOSTR_BUF_SIZE, "%f", m->max[i]); + } +} + +static gboolean +gst_cuda_converter_setup (GstCudaConverter * self) +{ + GstCudaConverterPrivate *priv = self->priv; + const GstVideoInfo *in_info; + const GstVideoInfo *out_info; + const GstVideoInfo *texture_info; + GstCudaColorMatrix to_rgb_matrix; + GstCudaColorMatrix to_yuv_matrix; + GstCudaColorMatrix border_color_matrix; + GstCudaColorMatrixString to_rgb_matrix_str; + GstCudaColorMatrixString to_yuv_matrix_str; + gchar border_color_str[4][G_ASCII_DTOSTR_BUF_SIZE]; + gdouble border_color[4]; + gchar offset_x[G_ASCII_DTOSTR_BUF_SIZE]; + gchar offset_y[G_ASCII_DTOSTR_BUF_SIZE]; + gint i, j; + const gchar *unpack_function = NULL; + const gchar *write_func = NULL; + const gchar *to_rgb_func = COLOR_SPACE_IDENTITY; + const gchar *to_yuv_func = COLOR_SPACE_IDENTITY; + const GstVideoColorimetry *in_color; + const GstVideoColorimetry *out_color; + gchar *str; + gchar *ptx; + CUresult ret; + + in_info = &priv->in_info; + out_info = &priv->out_info; + texture_info = &priv->texture_info; + in_color = &in_info->colorimetry; + out_color = &out_info->colorimetry; + + memset (&to_rgb_matrix, 0, sizeof (GstCudaColorMatrix)); + color_matrix_identity (&to_rgb_matrix); + + memset (&to_yuv_matrix, 0, sizeof (GstCudaColorMatrix)); + color_matrix_identity (&to_yuv_matrix); + + switch (GST_VIDEO_INFO_FORMAT (out_info)) { + case GST_VIDEO_FORMAT_I420: + write_func = WRITE_I420; + break; + case GST_VIDEO_FORMAT_YV12: + write_func = WRITE_YV12; + break; + case GST_VIDEO_FORMAT_NV12: + write_func = WRITE_NV12; + break; + case GST_VIDEO_FORMAT_NV21: + write_func = WRITE_NV21; + break; + case GST_VIDEO_FORMAT_P010_10LE: + write_func = WRITE_P010; + break; + case GST_VIDEO_FORMAT_P016_LE: + write_func = WRITE_P016; + break; + case GST_VIDEO_FORMAT_I420_10LE: + write_func = WRITE_I420_10; + break; + case GST_VIDEO_FORMAT_Y444: + write_func = WRITE_Y444; + break; + case GST_VIDEO_FORMAT_Y444_16LE: + write_func = WRITE_Y444_16; + break; + case GST_VIDEO_FORMAT_RGBA: + write_func = WRITE_RGBA; + break; + case GST_VIDEO_FORMAT_RGBx: + write_func = WRITE_RGBx; + break; + case GST_VIDEO_FORMAT_BGRA: + write_func = WRITE_BGRA; + break; + case GST_VIDEO_FORMAT_BGRx: + write_func = WRITE_BGRx; + break; + case GST_VIDEO_FORMAT_ARGB: + write_func = WRITE_ARGB; + break; + case GST_VIDEO_FORMAT_ABGR: + write_func = WRITE_ABGR; + break; + case GST_VIDEO_FORMAT_RGB: + write_func = WRITE_RGB; + break; + case GST_VIDEO_FORMAT_BGR: + write_func = WRITE_BGR; + break; + case GST_VIDEO_FORMAT_RGB10A2_LE: + write_func = WRITE_RGB10A2; + break; + case GST_VIDEO_FORMAT_BGR10A2_LE: + write_func = WRITE_BGR10A2; + break; + case GST_VIDEO_FORMAT_Y42B: + write_func = WRITE_Y42B; + break; + case GST_VIDEO_FORMAT_I422_10LE: + write_func = WRITE_I422_10; + break; + case GST_VIDEO_FORMAT_I422_12LE: + write_func = WRITE_I422_12; + break; + case GST_VIDEO_FORMAT_RGBP: + write_func = WRITE_RGBP; + break; + case GST_VIDEO_FORMAT_BGRP: + write_func = WRITE_BGRP; + break; + case GST_VIDEO_FORMAT_GBR: + write_func = WRITE_GBR; + break; + case GST_VIDEO_FORMAT_GBRA: + write_func = WRITE_GBRA; + break; + default: + break; + } + + if (!write_func) { + GST_ERROR_OBJECT (self, "Unknown write function for format %s", + gst_video_format_to_string (GST_VIDEO_INFO_FORMAT (out_info))); + return FALSE; + } + + /* Decide texture info to use, 3 channel RGB or 10bits packed RGB + * need be converted to other format */ + priv->texture_info = priv->in_info; + switch (GST_VIDEO_INFO_FORMAT (in_info)) { + case GST_VIDEO_FORMAT_RGB: + gst_video_info_set_format (&priv->texture_info, + GST_VIDEO_FORMAT_RGBx, GST_VIDEO_INFO_WIDTH (in_info), + GST_VIDEO_INFO_HEIGHT (in_info)); + unpack_function = RGB_TO_RGBx; + break; + case GST_VIDEO_FORMAT_BGR: + gst_video_info_set_format (&priv->texture_info, + GST_VIDEO_FORMAT_BGRx, GST_VIDEO_INFO_WIDTH (in_info), + GST_VIDEO_INFO_HEIGHT (in_info)); + unpack_function = RGB_TO_RGBx; + break; + case GST_VIDEO_FORMAT_RGB10A2_LE: + gst_video_info_set_format (&priv->texture_info, + GST_VIDEO_FORMAT_ARGB64, GST_VIDEO_INFO_WIDTH (in_info), + GST_VIDEO_INFO_HEIGHT (in_info)); + unpack_function = RGB10A2_TO_ARGB64; + break; + case GST_VIDEO_FORMAT_BGR10A2_LE: + gst_video_info_set_format (&priv->texture_info, + GST_VIDEO_FORMAT_ARGB64, GST_VIDEO_INFO_WIDTH (in_info), + GST_VIDEO_INFO_HEIGHT (in_info)); + unpack_function = BGR10A2_TO_ARGB64; + break; + default: + break; + } + + for (i = 0; i < G_N_ELEMENTS (format_map); i++) { + if (format_map[i].format == GST_VIDEO_INFO_FORMAT (texture_info)) { + priv->texture_fmt = &format_map[i]; + break; + } + } + + if (!priv->texture_fmt) { + GST_ERROR_OBJECT (self, "Couldn't find texture format for %s (%s)", + gst_video_format_to_string (GST_VIDEO_INFO_FORMAT (in_info)), + gst_video_format_to_string (GST_VIDEO_INFO_FORMAT (texture_info))); + return FALSE; + } + + /* calculate black color + * TODO: add support border color */ + if (GST_VIDEO_INFO_IS_RGB (out_info)) { + GstVideoInfo rgb_info = *out_info; + rgb_info.colorimetry.range = GST_VIDEO_COLOR_RANGE_0_255; + gst_cuda_color_range_adjust_matrix_unorm (&rgb_info, out_info, + &border_color_matrix); + } else { + GstVideoInfo rgb_info; + + gst_video_info_set_format (&rgb_info, GST_VIDEO_FORMAT_RGBA64_LE, + out_info->width, out_info->height); + + gst_cuda_rgb_to_yuv_matrix_unorm (&rgb_info, + out_info, &border_color_matrix); + } + + for (i = 0; i < 3; i++) { + /* TODO: property */ + gdouble border_rgba[4] = { 0, 0, 0 }; + border_color[i] = 0; + for (j = 0; j < 3; j++) + border_color[i] += border_color_matrix.matrix[i][j] * border_rgba[i]; + border_color[i] = border_color_matrix.offset[i]; + border_color[i] = CLAMP (border_color[i], + border_color_matrix.min[i], border_color_matrix.max[i]); + + g_ascii_formatd (border_color_str[i], + G_ASCII_DTOSTR_BUF_SIZE, "%f", border_color[i]); + } + g_ascii_formatd (border_color_str[3], G_ASCII_DTOSTR_BUF_SIZE, "%f", 1); + + /* FIXME: handle primaries and transfer functions */ + if (GST_VIDEO_INFO_IS_RGB (texture_info)) { + if (GST_VIDEO_INFO_IS_RGB (out_info)) { + /* RGB -> RGB */ + if (in_color->range == out_color->range) { + GST_DEBUG_OBJECT (self, "RGB -> RGB conversion without matrix"); + } else { + if (!gst_cuda_color_range_adjust_matrix_unorm (in_info, out_info, + &to_rgb_matrix)) { + GST_ERROR_OBJECT (self, "Failed to get RGB range adjust matrix"); + return FALSE; + } + + str = gst_cuda_dump_color_matrix (&to_rgb_matrix); + GST_DEBUG_OBJECT (self, "RGB range adjust %s -> %s\n%s", + get_color_range_name (in_color->range), + get_color_range_name (out_color->range), str); + g_free (str); + + to_rgb_func = COLOR_SPACE_CONVERT; + } + } else { + /* RGB -> YUV */ + if (!gst_cuda_rgb_to_yuv_matrix_unorm (in_info, out_info, &to_yuv_matrix)) { + GST_ERROR_OBJECT (self, "Failed to get RGB -> YUV transform matrix"); + return FALSE; + } + + str = gst_cuda_dump_color_matrix (&to_yuv_matrix); + GST_DEBUG_OBJECT (self, "RGB -> YUV matrix:\n%s", str); + g_free (str); + + to_yuv_func = COLOR_SPACE_CONVERT; + } + } else { + if (GST_VIDEO_INFO_IS_RGB (out_info)) { + /* YUV -> RGB */ + if (!gst_cuda_yuv_to_rgb_matrix_unorm (in_info, out_info, &to_rgb_matrix)) { + GST_ERROR_OBJECT (self, "Failed to get YUV -> RGB transform matrix"); + return FALSE; + } + + str = gst_cuda_dump_color_matrix (&to_rgb_matrix); + GST_DEBUG_OBJECT (self, "YUV -> RGB matrix:\n%s", str); + g_free (str); + + to_rgb_func = COLOR_SPACE_CONVERT; + } else { + /* YUV -> YUV */ + if (in_color->range == out_color->range) { + GST_DEBUG_OBJECT (self, "YUV -> YU conversion without matrix"); + } else { + if (!gst_cuda_color_range_adjust_matrix_unorm (in_info, out_info, + &to_yuv_matrix)) { + GST_ERROR_OBJECT (self, "Failed to get GRAY range adjust matrix"); + return FALSE; + } + + str = gst_cuda_dump_color_matrix (&to_yuv_matrix); + GST_DEBUG_OBJECT (self, "YUV range adjust matrix:\n%s", str); + g_free (str); + + to_yuv_func = COLOR_SPACE_CONVERT; + } + } + } + + color_matrix_to_string (&to_rgb_matrix, &to_rgb_matrix_str); + color_matrix_to_string (&to_yuv_matrix, &to_yuv_matrix_str); + + /* half pixel offset, to sample texture at center of the pixel position */ + g_ascii_formatd (offset_x, G_ASCII_DTOSTR_BUF_SIZE, "%f", + (gdouble) 0.5 / priv->dest_rect.width); + g_ascii_formatd (offset_y, G_ASCII_DTOSTR_BUF_SIZE, "%f", + (gdouble) 0.5 / priv->dest_rect.height); + + str = g_strdup_printf (TEMPLETA_KERNEL, KERNEL_COMMON, + unpack_function ? unpack_function : "", + /* TO RGB matrix */ + to_rgb_matrix_str.matrix[0][0], + to_rgb_matrix_str.matrix[0][1], + to_rgb_matrix_str.matrix[0][2], + to_rgb_matrix_str.matrix[1][0], + to_rgb_matrix_str.matrix[1][1], + to_rgb_matrix_str.matrix[1][2], + to_rgb_matrix_str.matrix[2][0], + to_rgb_matrix_str.matrix[2][1], + to_rgb_matrix_str.matrix[2][2], + to_rgb_matrix_str.offset[0], + to_rgb_matrix_str.offset[1], + to_rgb_matrix_str.offset[2], + to_rgb_matrix_str.min[0], + to_rgb_matrix_str.min[1], + to_rgb_matrix_str.min[2], + to_rgb_matrix_str.max[0], + to_rgb_matrix_str.max[1], to_rgb_matrix_str.max[2], + /* TO YUV matrix */ + to_yuv_matrix_str.matrix[0][0], + to_yuv_matrix_str.matrix[0][1], + to_yuv_matrix_str.matrix[0][2], + to_yuv_matrix_str.matrix[1][0], + to_yuv_matrix_str.matrix[1][1], + to_yuv_matrix_str.matrix[1][2], + to_yuv_matrix_str.matrix[2][0], + to_yuv_matrix_str.matrix[2][1], + to_yuv_matrix_str.matrix[2][2], + to_yuv_matrix_str.offset[0], + to_yuv_matrix_str.offset[1], + to_yuv_matrix_str.offset[2], + to_yuv_matrix_str.min[0], + to_yuv_matrix_str.min[1], + to_yuv_matrix_str.min[2], + to_yuv_matrix_str.max[0], + to_yuv_matrix_str.max[1], to_yuv_matrix_str.max[2], + /* width/height */ + GST_VIDEO_INFO_WIDTH (out_info), GST_VIDEO_INFO_HEIGHT (out_info), + /* viewport */ + priv->dest_rect.x, priv->dest_rect.y, + priv->dest_rect.x + priv->dest_rect.width, + priv->dest_rect.y + priv->dest_rect.height, + priv->dest_rect.width, priv->dest_rect.height, + /* half pixel offsets */ + offset_x, offset_y, + /* border colors */ + border_color_str[0], border_color_str[1], + border_color_str[2], border_color_str[3], + /* sampler function name */ + priv->texture_fmt->sample_func, + /* TO RGB conversion function name */ + to_rgb_func, + /* TO YUV conversion function name */ + to_yuv_func, + /* write function name */ + write_func); + + GST_LOG_OBJECT (self, "kernel code:\n%s\n", str); + ptx = gst_cuda_nvrtc_compile (str); + g_free (str); + + if (!ptx) { + GST_ERROR_OBJECT (self, "Could not compile code"); + return FALSE; + } + + if (!gst_cuda_context_push (self->context)) { + GST_ERROR_OBJECT (self, "Couldn't push context"); + return FALSE; + } + + /* Allocates intermediate memory for texture */ + if (unpack_function) { + ret = CuMemAllocPitch (&priv->unpack_buffer.ptr, + &priv->unpack_buffer.stride, + GST_VIDEO_INFO_COMP_WIDTH (texture_info, 0) * + GST_VIDEO_INFO_COMP_PSTRIDE (texture_info, 0), + GST_VIDEO_INFO_HEIGHT (texture_info), 16); + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "Couldn't allocate unpack buffer"); + goto error; + } + } + + ret = CuModuleLoadData (&priv->module, ptx); + g_free (ptx); + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "Could not load module"); + priv->module = NULL; + goto error; + } + + ret = CuModuleGetFunction (&priv->main_func, + priv->module, GST_CUDA_KERNEL_MAIN_FUNC); + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "Could not get main function"); + goto error; + } + + if (unpack_function) { + ret = CuModuleGetFunction (&priv->unpack_func, + priv->module, GST_CUDA_KERNEL_UNPACK_FUNC); + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "Could not get unpack function"); + goto error; + } + } + + gst_cuda_context_pop (NULL); + + if (priv->dest_rect.x != 0 || priv->dest_rect.y != 0 || + priv->dest_rect.width != out_info->width || + priv->dest_rect.height != out_info->height || + in_info->width != out_info->width + || in_info->height != out_info->height) { + for (i = 0; i < G_N_ELEMENTS (priv->filter_mode); i++) + priv->filter_mode[i] = CU_TR_FILTER_MODE_LINEAR; + } else { + for (i = 0; i < G_N_ELEMENTS (priv->filter_mode); i++) + priv->filter_mode[i] = CU_TR_FILTER_MODE_POINT; + } + + return TRUE; + +error: + gst_cuda_context_pop (NULL); + return FALSE; +} + +static gboolean +copy_config (GQuark field_id, const GValue * value, gpointer user_data) +{ + GstCudaConverter *self = (GstCudaConverter *) user_data; + + gst_structure_id_set_value (self->priv->config, field_id, value); + + return TRUE; +} + +static void +gst_cuda_converter_set_config (GstCudaConverter * self, GstStructure * config) +{ + gst_structure_foreach (config, copy_config, self); + gst_structure_free (config); +} + +static gint +get_opt_int (GstCudaConverter * self, const gchar * opt, gint def) +{ + gint res; + if (!gst_structure_get_int (self->priv->config, opt, &res)) + res = def; + return res; +} + +GstCudaConverter * +gst_cuda_converter_new (const GstVideoInfo * in_info, + const GstVideoInfo * out_info, GstCudaContext * context, + GstStructure * config) +{ + GstCudaConverter *self; + GstCudaConverterPrivate *priv; + + g_return_val_if_fail (in_info != NULL, NULL); + g_return_val_if_fail (out_info != NULL, NULL); + g_return_val_if_fail (GST_IS_CUDA_CONTEXT (context), NULL); + + self = g_object_new (GST_TYPE_CUDA_CONVERTER, NULL); + + if (!GST_IS_CUDA_CONTEXT (context)) { + GST_WARNING_OBJECT (self, "Not a valid cuda context object"); + goto error; + } + + self->context = gst_object_ref (context); + priv = self->priv; + priv->in_info = *in_info; + priv->out_info = *out_info; + + if (config) + gst_cuda_converter_set_config (self, config); + + priv->dest_rect.x = get_opt_int (self, GST_CUDA_CONVERTER_OPT_DEST_X, 0); + priv->dest_rect.y = get_opt_int (self, GST_CUDA_CONVERTER_OPT_DEST_Y, 0); + priv->dest_rect.width = get_opt_int (self, + GST_CUDA_CONVERTER_OPT_DEST_WIDTH, out_info->width); + priv->dest_rect.height = get_opt_int (self, + GST_CUDA_CONVERTER_OPT_DEST_HEIGHT, out_info->height); + + if (!gst_cuda_converter_setup (self)) + goto error; + + priv->texture_align = gst_cuda_context_get_texture_alignment (context); + + gst_object_ref_sink (self); + return self; + +error: + gst_object_unref (self); + return NULL; +} + + +static CUtexObject +gst_cuda_converter_create_texture_unchecked (GstCudaConverter * self, + CUdeviceptr src, gint width, gint height, CUarray_format format, + guint channels, gint stride, gint plane, CUfilter_mode mode) +{ + CUDA_TEXTURE_DESC texture_desc; + CUDA_RESOURCE_DESC resource_desc; + CUtexObject texture = 0; + CUresult cuda_ret; + + memset (&texture_desc, 0, sizeof (CUDA_TEXTURE_DESC)); + memset (&resource_desc, 0, sizeof (CUDA_RESOURCE_DESC)); + + resource_desc.resType = CU_RESOURCE_TYPE_PITCH2D; + resource_desc.res.pitch2D.format = format; + resource_desc.res.pitch2D.numChannels = channels; + resource_desc.res.pitch2D.width = width; + resource_desc.res.pitch2D.height = height; + resource_desc.res.pitch2D.pitchInBytes = stride; + resource_desc.res.pitch2D.devPtr = src; + + texture_desc.filterMode = mode; + /* Will read texture value as a normalized [0, 1] float value + * with [0, 1) coordinates */ + /* CU_TRSF_NORMALIZED_COORDINATES */ + texture_desc.flags = 0x2; + /* CU_TR_ADDRESS_MODE_CLAMP */ + texture_desc.addressMode[0] = 1; + texture_desc.addressMode[1] = 1; + texture_desc.addressMode[2] = 1; + + cuda_ret = CuTexObjectCreate (&texture, &resource_desc, &texture_desc, NULL); + + if (!gst_cuda_result (cuda_ret)) { + GST_ERROR_OBJECT (self, "Could not create texture"); + return 0; + } + + return texture; +} + +static gboolean +ensure_fallback_buffer (GstCudaConverter * self, gint width_in_bytes, + gint height, guint plane) +{ + GstCudaConverterPrivate *priv = self->priv; + CUresult ret; + + if (priv->fallback_buffer[plane].ptr) + return TRUE; + + ret = CuMemAllocPitch (&priv->fallback_buffer[plane].ptr, + &priv->fallback_buffer[plane].stride, width_in_bytes, height, 16); + + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "Couldn't allocate fallback buffer"); + return FALSE; + } + + return TRUE; +} + +static CUtexObject +gst_cuda_converter_create_texture (GstCudaConverter * self, + CUdeviceptr src, gint width, gint height, gint stride, CUfilter_mode mode, + CUarray_format format, guint channles, gint plane, CUstream stream) +{ + GstCudaConverterPrivate *priv = self->priv; + CUresult ret; + CUdeviceptr src_ptr; + + src_ptr = src; + + if (priv->texture_align > 0 && (src_ptr % priv->texture_align) != 0) { + CUDA_MEMCPY2D params = { 0, }; + + GST_DEBUG_OBJECT (self, "Plane %d is not aligned, copying", plane); + + if (!ensure_fallback_buffer (self, stride, height, plane)) + return 0; + + params.srcMemoryType = CU_MEMORYTYPE_DEVICE; + params.srcPitch = stride; + params.srcDevice = (CUdeviceptr) src_ptr; + + params.dstMemoryType = CU_MEMORYTYPE_DEVICE; + params.dstPitch = priv->fallback_buffer[plane].stride; + params.dstDevice = priv->fallback_buffer[plane].ptr; + params.WidthInBytes = GST_VIDEO_INFO_COMP_WIDTH (&priv->in_info, plane) + * GST_VIDEO_INFO_COMP_PSTRIDE (&priv->in_info, plane), + params.Height = GST_VIDEO_INFO_COMP_HEIGHT (&priv->in_info, plane); + + ret = CuMemcpy2D (¶ms); + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "Couldn't copy to fallback buffer"); + return 0; + } + + src_ptr = priv->fallback_buffer[plane].ptr; + stride = priv->fallback_buffer[plane].stride; + } + + return gst_cuda_converter_create_texture_unchecked (self, + src_ptr, width, height, format, channles, stride, plane, mode); +} + +static gboolean +gst_cuda_converter_unpack_rgb (GstCudaConverter * self, + GstVideoFrame * src_frame, CUstream stream) +{ + GstCudaConverterPrivate *priv = self->priv; + CUdeviceptr src; + gint width, height, src_stride, dst_stride; + CUresult ret; + gpointer args[] = { &src, &priv->unpack_buffer.ptr, + &width, &height, &src_stride, &dst_stride + }; + + g_assert (priv->unpack_buffer.ptr); + g_assert (priv->unpack_buffer.stride > 0); + + src = (CUdeviceptr) GST_VIDEO_FRAME_PLANE_DATA (src_frame, 0); + width = GST_VIDEO_FRAME_WIDTH (src_frame); + height = GST_VIDEO_FRAME_HEIGHT (src_frame); + src_stride = GST_VIDEO_FRAME_PLANE_STRIDE (src_frame, 0); + dst_stride = (gint) priv->unpack_buffer.stride; + + ret = CuLaunchKernel (priv->unpack_func, DIV_UP (width, CUDA_BLOCK_X), + DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0, + stream, args, NULL); + + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "Couldn't unpack source RGB"); + return FALSE; + } + + return TRUE; +} + +gboolean +gst_cuda_converter_convert_frame (GstCudaConverter * converter, + GstVideoFrame * src_frame, GstVideoFrame * dst_frame, CUstream stream) +{ + GstCudaConverterPrivate *priv; + const TextureFormat *format; + CUtexObject texture[GST_VIDEO_MAX_COMPONENTS] = { 0, }; + guint8 *dst[GST_VIDEO_MAX_COMPONENTS] = { NULL, }; + gint stride[2] = { 0, }; + gint i; + gboolean ret = FALSE; + CUresult cuda_ret; + gint width, height; + gpointer args[] = { &texture[0], &texture[1], &texture[2], &texture[3], + &dst[0], &dst[1], &dst[2], &dst[3], &stride[0], &stride[1] + }; + + g_return_val_if_fail (GST_IS_CUDA_CONVERTER (converter), FALSE); + g_return_val_if_fail (src_frame != NULL, FALSE); + g_return_val_if_fail (dst_frame != NULL, FALSE); + + priv = converter->priv; + format = priv->texture_fmt; + + g_assert (format); + + if (!gst_cuda_context_push (converter->context)) { + GST_ERROR_OBJECT (converter, "Couldn't push context"); + return FALSE; + } + + if (priv->unpack_func) { + if (!gst_cuda_converter_unpack_rgb (converter, src_frame, stream)) + goto out; + + texture[0] = gst_cuda_converter_create_texture_unchecked (converter, + priv->unpack_buffer.ptr, priv->in_info.width, priv->in_info.height, + format->array_format[0], 4, priv->unpack_buffer.stride, 0, + priv->filter_mode[0]); + if (!texture[0]) { + GST_ERROR_OBJECT (converter, "Couldn't create texture"); + goto out; + } + } else { + for (i = 0; i < GST_VIDEO_FRAME_N_PLANES (src_frame); i++) { + CUdeviceptr src; + + src = (CUdeviceptr) GST_VIDEO_FRAME_PLANE_DATA (src_frame, i); + texture[i] = gst_cuda_converter_create_texture (converter, + src, GST_VIDEO_FRAME_COMP_WIDTH (src_frame, i), + GST_VIDEO_FRAME_COMP_HEIGHT (src_frame, i), + GST_VIDEO_FRAME_PLANE_STRIDE (src_frame, i), + priv->filter_mode[i], format->array_format[i], format->channels[i], i, + stream); + if (!texture[i]) { + GST_ERROR_OBJECT (converter, "Couldn't create texture %d", i); + goto out; + } + } + } + + width = GST_VIDEO_FRAME_WIDTH (dst_frame); + height = GST_VIDEO_FRAME_HEIGHT (dst_frame); + + for (i = 0; i < GST_VIDEO_FRAME_N_PLANES (dst_frame); i++) + dst[i] = GST_VIDEO_FRAME_PLANE_DATA (dst_frame, i); + + stride[0] = stride[1] = GST_VIDEO_FRAME_PLANE_STRIDE (dst_frame, 0); + if (GST_VIDEO_FRAME_N_PLANES (dst_frame) > 1) + stride[1] = GST_VIDEO_FRAME_PLANE_STRIDE (dst_frame, 1); + + cuda_ret = CuLaunchKernel (priv->main_func, DIV_UP (width, CUDA_BLOCK_X), + DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0, + stream, args, NULL); + + if (!gst_cuda_result (cuda_ret)) { + GST_ERROR_OBJECT (converter, "Couldn't convert frame"); + goto out; + } + + CuStreamSynchronize (stream); + + ret = TRUE; + +out: + for (i = 0; i < G_N_ELEMENTS (texture); i++) { + if (texture[i]) + CuTexObjectDestroy (texture[i]); + else + break; + } + + gst_cuda_context_pop (NULL); + return ret; +} diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/gstcudaconverter.h b/subprojects/gst-plugins-bad/sys/nvcodec/gstcudaconverter.h new file mode 100644 index 0000000000..d7b009a545 --- /dev/null +++ b/subprojects/gst-plugins-bad/sys/nvcodec/gstcudaconverter.h @@ -0,0 +1,99 @@ +/* GStreamer + * Copyright (C) 2019 Seungha Yang + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. + */ + +#pragma once + +#include +#include + +G_BEGIN_DECLS + +#define GST_TYPE_CUDA_CONVERTER (gst_cuda_converter_get_type()) +#define GST_CUDA_CONVERTER(obj) (G_TYPE_CHECK_INSTANCE_CAST((obj),GST_TYPE_CUDA_CONVERTER,GstCudaConverter)) +#define GST_CUDA_CONVERTER_CLASS(klass) (G_TYPE_CHECK_CLASS_CAST((klass),GST_TYPE_CUDA_CONVERTER,GstCudaConverterClass)) +#define GST_CUDA_CONVERTER_GET_CLASS(obj) (GST_CUDA_CONVERTER_CLASS(G_OBJECT_GET_CLASS(obj))) +#define GST_IS_CUDA_CONVERTER(obj) (G_TYPE_CHECK_INSTANCE_TYPE((obj),GST_TYPE_CUDA_CONVERTER)) +#define GST_IS_CUDA_CONVERTER_CLASS(klass) (G_TYPE_CHECK_CLASS_TYPE((klass),GST_TYPE_CUDA_CONVERTER)) +#define GST_CUDA_CONVERTER_CAST(obj) ((GstCudaConverter*)(obj)) + +typedef struct _GstCudaConverter GstCudaConverter; +typedef struct _GstCudaConverterClass GstCudaConverterClass; +typedef struct _GstCudaConverterPrivate GstCudaConverterPrivate; + +/** + * GST_CUDA_CONVERTER_OPT_DEST_X: + * + * #G_TYPE_INT, x position in the destination frame, default 0 + */ +#define GST_CUDA_CONVERTER_OPT_DEST_X "GstCudaConverter.dest-x" + +/** + * GST_CUDA_CONVERTER_OPT_DEST_Y: + * + * #G_TYPE_INT, y position in the destination frame, default 0 + */ +#define GST_CUDA_CONVERTER_OPT_DEST_Y "GstCudaConverter.dest-y" + +/** + * GST_CUDA_CONVERTER_OPT_DEST_WIDTH: + * + * #G_TYPE_INT, width in the destination frame, default destination width + */ +#define GST_CUDA_CONVERTER_OPT_DEST_WIDTH "GstCudaConverter.dest-width" + +/** + * GST_CUDA_CONVERTER_OPT_DEST_HEIGHT: + * + * #G_TYPE_INT, height in the destination frame, default destination height + */ +#define GST_CUDA_CONVERTER_OPT_DEST_HEIGHT "GstCudaConverter.dest-height" + +struct _GstCudaConverter +{ + GstObject parent; + + GstCudaContext *context; + + /*< private >*/ + GstCudaConverterPrivate *priv; + gpointer _gst_reserved[GST_PADDING]; +}; + +struct _GstCudaConverterClass +{ + GstObjectClass parent_class; + + /*< private >*/ + gpointer _gst_reserved[GST_PADDING]; +}; + +GType gst_cuda_converter_get_type (void); + +GstCudaConverter * gst_cuda_converter_new (const GstVideoInfo * in_info, + const GstVideoInfo * out_info, + GstCudaContext * context, + GstStructure * config); + +gboolean gst_cuda_converter_convert_frame (GstCudaConverter * converter, + GstVideoFrame * src_frame, + GstVideoFrame * dst_frame, + CUstream cuda_stream); + +G_END_DECLS + diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/gstcudaconvertscale.c b/subprojects/gst-plugins-bad/sys/nvcodec/gstcudaconvertscale.c index 3fda6d7a6d..ad696ebd96 100644 --- a/subprojects/gst-plugins-bad/sys/nvcodec/gstcudaconvertscale.c +++ b/subprojects/gst-plugins-bad/sys/nvcodec/gstcudaconvertscale.c @@ -24,7 +24,7 @@ #include #include "gstcudaconvertscale.h" -#include "cuda-converter.h" +#include "gstcudaconverter.h" GST_DEBUG_CATEGORY_STATIC (gst_cuda_base_convert_debug); #define GST_CAT_DEFAULT gst_cuda_base_convert_debug @@ -133,10 +133,7 @@ gst_cuda_base_convert_dispose (GObject * object) { GstCudaBaseConvert *self = GST_CUDA_BASE_CONVERT (object); - if (self->converter) { - gst_cuda_converter_free (self->converter); - self->converter = NULL; - } + gst_clear_object (&self->converter); G_OBJECT_CLASS (parent_class)->dispose (object); } @@ -1229,7 +1226,7 @@ gst_cuda_base_convert_set_info (GstCudaBaseTransform * btrans, gint from_dar_n, from_dar_d, to_dar_n, to_dar_d; GstVideoInfo tmp_info; - g_clear_pointer (&self->converter, gst_cuda_converter_free); + gst_clear_object (&self->converter); if (!gst_util_fraction_multiply (in_info->width, in_info->height, in_info->par_n, in_info->par_d, &from_dar_n, @@ -1288,7 +1285,7 @@ gst_cuda_base_convert_set_info (GstCudaBaseTransform * btrans, gst_base_transform_set_passthrough (GST_BASE_TRANSFORM (self), FALSE); self->converter = gst_cuda_converter_new (in_info, - out_info, btrans->context); + out_info, btrans->context, NULL); if (!self->converter) { GST_ERROR_OBJECT (self, "Couldn't create converter"); return FALSE; diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/meson.build b/subprojects/gst-plugins-bad/sys/nvcodec/meson.build index ec871216fb..9010b7c7a1 100644 --- a/subprojects/gst-plugins-bad/sys/nvcodec/meson.build +++ b/subprojects/gst-plugins-bad/sys/nvcodec/meson.build @@ -1,6 +1,6 @@ nvcodec_sources = [ - 'cuda-converter.c', 'gstcudabasetransform.c', + 'gstcudaconverter.c', 'gstcudaconvertscale.c', 'gstcudafilter.c', 'gstcudamemorycopy.c',