From 592a8d540067df6456d5d126e7a5c27f8d91e375 Mon Sep 17 00:00:00 2001 From: Seungha Yang Date: Wed, 16 Oct 2019 22:42:39 +0900 Subject: [PATCH] nvcodec: Add generic CUDA video convert object Introducing generic video convert object similar to video-converter but using CUDA. Part-of: --- sys/nvcodec/cuda-converter.c | 2085 ++++++++++++++++++++++++++++++++++ sys/nvcodec/cuda-converter.h | 62 + sys/nvcodec/gstcudacontext.c | 26 +- sys/nvcodec/gstcudacontext.h | 3 + sys/nvcodec/gstcudaloader.c | 90 ++ sys/nvcodec/gstcudaloader.h | 37 + sys/nvcodec/meson.build | 1 + sys/nvcodec/stub/cuda.h | 100 ++ 8 files changed, 2403 insertions(+), 1 deletion(-) create mode 100644 sys/nvcodec/cuda-converter.c create mode 100644 sys/nvcodec/cuda-converter.h diff --git a/sys/nvcodec/cuda-converter.c b/sys/nvcodec/cuda-converter.c new file mode 100644 index 0000000000..5b81acc29b --- /dev/null +++ b/sys/nvcodec/cuda-converter.c @@ -0,0 +1,2085 @@ +/* GStreamer + * Copyright (C) 2010 David Schleef + * Copyright (C) 2010 Sebastian Dröge + * Copyright (C) 2019 Seungha Yang + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. + */ + +/** + * SECTION:cudaconverter + * @title: GstCudaConverter + * @short_description: Generic video conversion using CUDA + * + * This object is used to convert video frames from one format to another. + * The object can perform conversion of: + * + * * video format + * * video colorspace + * * video size + */ + +/** + * TODO: + * * Add more interpolation method and make it selectable, + * currently default bi-linear interpolation only + * * Add fast-path for conversion like videoconvert + * * Full colorimetiry and chroma-siting support + * * cropping, and x, y position support + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "cuda-converter.h" +#include "gstcudautils.h" +#include "gstcudaloader.h" +#include "gstcudanvrtc.h" +#include + +#define CUDA_BLOCK_X 16 +#define CUDA_BLOCK_Y 16 +#define DIV_UP(size,block) (((size) + ((block) - 1)) / (block)) + +static gboolean cuda_converter_lookup_path (GstCudaConverter * convert); + +#ifndef GST_DISABLE_GST_DEBUG +#define GST_CAT_DEFAULT ensure_debug_category() +static GstDebugCategory * +ensure_debug_category (void) +{ + static gsize cat_gonce = 0; + + if (g_once_init_enter (&cat_gonce)) { + gsize cat_done; + + cat_done = (gsize) _gst_debug_category_new ("cuda-converter", 0, + "cuda-converter object"); + + g_once_init_leave (&cat_gonce, cat_done); + } + + return (GstDebugCategory *) cat_gonce; +} +#else +#define ensure_debug_category() +#endif + +#define GST_CUDA_KERNEL_FUNC "gst_cuda_kernel_func" + +#define GST_CUDA_KERNEL_FUNC_TO_Y444 "gst_cuda_kernel_func_to_y444" + +#define GST_CUDA_KERNEL_FUNC_Y444_TO_YUV "gst_cuda_kernel_func_y444_to_yuv" + +#define GST_CUDA_KERNEL_FUNC_TO_ARGB "gst_cuda_kernel_func_to_argb" + +#define GST_CUDA_KERNEL_FUNC_SCALE_RGB "gst_cuda_kernel_func_scale_rgb" + +/* *INDENT-OFF* */ +/** + * read_chroma: + * @tex1: a CUDA texture object representing a semi-planar chroma plane + * @tex2: dummy object + * @x: the x coordinate to read data from @tex1 + * @y: the y coordinate to read data from @tex1 + * + * Returns: a #ushort2 vector representing both chroma pixel values + */ +static const gchar READ_CHROMA_FROM_SEMI_PLANAR[] = +"__device__ ushort2\n" +"read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, \n" +" float x, float y)\n" +"{\n" +" return tex2D(tex1, x, y);\n" +"}"; + +/** + * read_chroma: + * @tex1: a CUDA texture object representing a chroma planar plane + * @tex2: a CUDA texture object representing the orher planar plane + * @x: the x coordinate to read data from @tex1 and @tex2 + * @y: the y coordinate to read data from @tex1 and @tex2 + * + * Returns: a #ushort2 vector representing both chroma pixel values + */ +static const gchar READ_CHROMA_FROM_PLANAR[] = +"__device__ ushort2\n" +"read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, \n" +" float x, float y)\n" +"{\n" +" unsigned short u, v;\n" +" u = tex2D(tex1, x, y);\n" +" v = tex2D(tex2, x, y);\n" +" return make_ushort2(u, v);\n" +"}"; + +/** + * write_chroma: + * @dst1: a CUDA global memory pointing to a semi-planar chroma plane + * @dst2: dummy + * @u: a pixel value to write @dst1 + * @v: a pixel value to write @dst1 + * @x: the x coordinate to wrtie data into @tex1 + * @x: the y coordinate to wrtie data into @tex1 + * @pstride: the pixel stride of @dst1 + * @mask: bitmask to be applied to high bitdepth plane + * + * Write @u and @v pixel value to @dst1 semi-planar plane + */ +static const gchar WRITE_CHROMA_TO_SEMI_PLANAR[] = +"__device__ void\n" +"write_chroma (unsigned char *dst1, unsigned char *dst2, unsigned short u,\n" +" unsigned short v, int x, int y, int pstride, int stride, int mask)\n" +"{\n" +" if (OUT_DEPTH > 8) {\n" +" *(unsigned short *)&dst1[x * pstride + y * stride] = (u & mask);\n" +" *(unsigned short *)&dst1[x * pstride + 2 + y * stride] = (v & mask);\n" +" } else {\n" +" dst1[x * pstride + y * stride] = u;\n" +" dst1[x * pstride + 1 + y * stride] = v;\n" +" }\n" +"}"; + +/** + * write_chroma: + * @dst1: a CUDA global memory pointing to a planar chroma plane + * @dst2: a CUDA global memory pointing to a the other planar chroma plane + * @u: a pixel value to write @dst1 + * @v: a pixel value to write @dst1 + * @x: the x coordinate to wrtie data into @tex1 + * @x: the y coordinate to wrtie data into @tex1 + * @pstride: the pixel stride of @dst1 + * @mask: bitmask to be applied to high bitdepth plane + * + * Write @u and @v pixel value into @dst1 and @dst2 planar planes + */ +static const gchar WRITE_CHROMA_TO_PLANAR[] = +"__device__ void\n" +"write_chroma (unsigned char *dst1, unsigned char *dst2, unsigned short u,\n" +" unsigned short v, int x, int y, int pstride, int stride, int mask)\n" +"{\n" +" if (OUT_DEPTH > 8) {\n" +" *(unsigned short *)&dst1[x * pstride + y * stride] = (u & mask);\n" +" *(unsigned short *)&dst2[x * pstride + y * stride] = (v & mask);\n" +" } else {\n" +" dst1[x * pstride + y * stride] = u;\n" +" dst2[x * pstride + y * stride] = v;\n" +" }\n" +"}"; + +/* CUDA kernel source for from YUV to YUV conversion and scale */ +static const gchar templ_YUV_TO_YUV[] = +"extern \"C\"{\n" +"__constant__ float SCALE_H = %f;\n" +"__constant__ float SCALE_V = %f;\n" +"__constant__ float CHROMA_SCALE_H = %f;\n" +"__constant__ float CHROMA_SCALE_V = %f;\n" +"__constant__ int WIDTH = %d;\n" +"__constant__ int HEIGHT = %d;\n" +"__constant__ int CHROMA_WIDTH = %d;\n" +"__constant__ int CHROMA_HEIGHT = %d;\n" +"__constant__ int IN_DEPTH = %d;\n" +"__constant__ int OUT_DEPTH = %d;\n" +"__constant__ int PSTRIDE = %d;\n" +"__constant__ int CHROMA_PSTRIDE = %d;\n" +"__constant__ int IN_SHIFT = %d;\n" +"__constant__ int OUT_SHIFT = %d;\n" +"__constant__ int MASK = %d;\n" +"__constant__ int SWAP_UV = %d;\n" +"\n" +"__device__ unsigned short\n" +"do_scale_pixel (unsigned short val) \n" +"{\n" +" unsigned int diff;\n" +" if (OUT_DEPTH > IN_DEPTH) {\n" +" diff = OUT_DEPTH - IN_DEPTH;\n" +" return (val << diff) | (val >> (IN_DEPTH - diff));\n" +" } else if (IN_DEPTH > OUT_DEPTH) {\n" +" return val >> (IN_DEPTH - OUT_DEPTH);\n" +" }\n" +" return val;\n" +"}\n" +"\n" +/* __device__ ushort2 + * read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, float x, float y); + */ +"%s\n" +"\n" +/* __device__ void + * write_chroma (unsigned char *dst1, unsigned char *dst2, unsigned short u, + * unsigned short v, int x, int y, int pstride, int stride, int mask); + */ +"%s\n" +"\n" +"__global__ void\n" +GST_CUDA_KERNEL_FUNC +"(cudaTextureObject_t tex0, cudaTextureObject_t tex1, cudaTextureObject_t tex2,\n" +" unsigned char *dst0, unsigned char *dst1, unsigned char *dst2,\n" +" int stride)\n" +"{\n" +" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n" +" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n" +" if (x_pos < WIDTH && y_pos < HEIGHT) {\n" +" float src_xpos = SCALE_H * x_pos;\n" +" float src_ypos = SCALE_V * y_pos;\n" +" unsigned short y = tex2D(tex0, src_xpos, src_ypos);\n" +" y = y >> IN_SHIFT;\n" +" y = do_scale_pixel (y);\n" +" y = y << OUT_SHIFT;\n" +" if (OUT_DEPTH > 8) {\n" +" *(unsigned short *)&dst0[x_pos * PSTRIDE + y_pos * stride] = (y & MASK);\n" +" } else {\n" +" dst0[x_pos * PSTRIDE + y_pos * stride] = y;\n" +" }\n" +" }\n" +" if (x_pos < CHROMA_WIDTH && y_pos < CHROMA_HEIGHT) {\n" +" float src_xpos = CHROMA_SCALE_H * x_pos;\n" +" float src_ypos = CHROMA_SCALE_V * y_pos;\n" +" unsigned short u, v;\n" +" ushort2 uv = read_chroma (tex1, tex2, src_xpos, src_ypos);\n" +" u = uv.x;\n" +" v = uv.y;\n" +" u = u >> IN_SHIFT;\n" +" v = v >> IN_SHIFT;\n" +" u = do_scale_pixel (u);\n" +" v = do_scale_pixel (v);\n" +" u = u << OUT_SHIFT;\n" +" v = v << OUT_SHIFT;\n" +" if (SWAP_UV) {\n" +" unsigned short tmp = u;\n" +" u = v;\n" +" v = tmp;\n" +" }\n" +" write_chroma (dst1,\n" +" dst2, u, v, x_pos, y_pos, CHROMA_PSTRIDE, stride, MASK);\n" +" }\n" +"}\n" +"\n" +"}"; + +/* CUDA kernel source for from YUV to RGB conversion and scale */ +static const gchar templ_YUV_TO_RGB[] = +"extern \"C\"{\n" +"__constant__ float offset[3] = {%f, %f, %f};\n" +"__constant__ float rcoeff[3] = {%f, %f, %f};\n" +"__constant__ float gcoeff[3] = {%f, %f, %f};\n" +"__constant__ float bcoeff[3] = {%f, %f, %f};\n" +"\n" +"__constant__ float SCALE_H = %f;\n" +"__constant__ float SCALE_V = %f;\n" +"__constant__ float CHROMA_SCALE_H = %f;\n" +"__constant__ float CHROMA_SCALE_V = %f;\n" +"__constant__ int WIDTH = %d;\n" +"__constant__ int HEIGHT = %d;\n" +"__constant__ int CHROMA_WIDTH = %d;\n" +"__constant__ int CHROMA_HEIGHT = %d;\n" +"__constant__ int IN_DEPTH = %d;\n" +"__constant__ int OUT_DEPTH = %d;\n" +"__constant__ int PSTRIDE = %d;\n" +"__constant__ int CHROMA_PSTRIDE = %d;\n" +"__constant__ int IN_SHIFT = %d;\n" +"__constant__ int OUT_SHIFT = %d;\n" +"__constant__ int MASK = %d;\n" +"__constant__ int SWAP_UV = %d;\n" +"__constant__ int MAX_IN_VAL = %d;\n" +"__constant__ int R_IDX = %d;\n" +"__constant__ int G_IDX = %d;\n" +"__constant__ int B_IDX = %d;\n" +"__constant__ int A_IDX = %d;\n" +"__constant__ int X_IDX = %d;\n" +"\n" +"__device__ unsigned short\n" +"do_scale_pixel (unsigned short val) \n" +"{\n" +" unsigned int diff;\n" +" if (OUT_DEPTH > IN_DEPTH) {\n" +" diff = OUT_DEPTH - IN_DEPTH;\n" +" return (val << diff) | (val >> (IN_DEPTH - diff));\n" +" } else if (IN_DEPTH > OUT_DEPTH) {\n" +" return val >> (IN_DEPTH - OUT_DEPTH);\n" +" }\n" +" return val;\n" +"}\n" +"\n" +"__device__ float\n" +"dot(float3 val, float *coeff)\n" +"{\n" +" return val.x * coeff[0] + val.y * coeff[1] + val.z * coeff[2];\n" +"}\n" +"\n" +"__device__ uint3\n" +"yuv_to_rgb (unsigned short y, unsigned short u, unsigned short v, unsigned int max_val)\n" +"{\n" +" float3 yuv = make_float3 (y, u, v);\n" +" uint3 rgb;\n" +" rgb.x = max ((unsigned int)(dot (yuv, rcoeff) + offset[0]), 0);\n" +" rgb.y = max ((unsigned int)(dot (yuv, gcoeff) + offset[1]), 0);\n" +" rgb.z = max ((unsigned int)(dot (yuv, bcoeff) + offset[2]), 0);\n" +" rgb.x = min (rgb.x, max_val);\n" +" rgb.y = min (rgb.y, max_val);\n" +" rgb.z = min (rgb.z, max_val);\n" +" return rgb;\n" +"}\n" +"\n" +/* __device__ ushort2 + * read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, float x, float y); + */ +"%s\n" +"\n" +"__global__ void\n" +GST_CUDA_KERNEL_FUNC +"(cudaTextureObject_t tex0, cudaTextureObject_t tex1, cudaTextureObject_t tex2,\n" +" unsigned char *dstRGB, int stride)\n" +"{\n" +" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n" +" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n" +" if (x_pos < WIDTH && y_pos < HEIGHT) {\n" +" float src_xpos = SCALE_H * x_pos;\n" +" float src_ypos = SCALE_V * y_pos;\n" +" unsigned short y = tex2D(tex0, src_xpos, src_ypos);\n" +" ushort2 uv;\n" +" unsigned short u, v;\n" +" uint3 rgb;\n" +" unsigned int clip_max = MAX_IN_VAL;\n" +" src_xpos = CHROMA_SCALE_H * x_pos;\n" +" src_ypos = CHROMA_SCALE_V * y_pos;\n" +" uv = read_chroma (tex1, tex2, src_xpos, src_ypos);\n" +" u = uv.x;\n" +" v = uv.y;\n" +" y = y >> IN_SHIFT;\n" +" u = u >> IN_SHIFT;\n" +" v = v >> IN_SHIFT;\n" +" if (SWAP_UV) {\n" +" unsigned short tmp = u;\n" +" u = v;\n" +" v = tmp;\n" +" }\n" + /* conversion matrix is scaled to higher bitdepth between in/out formats */ +" if (OUT_DEPTH > IN_DEPTH) {\n" +" y = do_scale_pixel (y);\n" +" u = do_scale_pixel (u);\n" +" v = do_scale_pixel (v);\n" +" clip_max = MASK;\n" +" }" +" rgb = yuv_to_rgb (y, u, v, clip_max);\n" +" if (OUT_DEPTH < IN_DEPTH) {\n" +" rgb.x = do_scale_pixel (rgb.x);\n" +" rgb.y = do_scale_pixel (rgb.y);\n" +" rgb.z = do_scale_pixel (rgb.z);\n" +" }" +" if (OUT_DEPTH > 8) {\n" +" unsigned int packed_rgb = 0;\n" + /* A is always MSB, we support only little endian system */ +" packed_rgb = 0xc000 << 16;\n" +" packed_rgb |= (rgb.x << (30 - (R_IDX * 10)));\n" +" packed_rgb |= (rgb.y << (30 - (G_IDX * 10)));\n" +" packed_rgb |= (rgb.z << (30 - (B_IDX * 10)));\n" +" *(unsigned int *)&dstRGB[x_pos * PSTRIDE + y_pos * stride] = packed_rgb;\n" +" } else {\n" +" dstRGB[x_pos * PSTRIDE + R_IDX + y_pos * stride] = (unsigned char) rgb.x;\n" +" dstRGB[x_pos * PSTRIDE + G_IDX + y_pos * stride] = (unsigned char) rgb.y;\n" +" dstRGB[x_pos * PSTRIDE + B_IDX + y_pos * stride] = (unsigned char) rgb.z;\n" +" if (A_IDX >= 0 || X_IDX >= 0)\n" +" dstRGB[x_pos * PSTRIDE + A_IDX + y_pos * stride] = 0xff;\n" +" }\n" +" }\n" +"}\n" +"\n" +"}"; + +/** + * GST_CUDA_KERNEL_FUNC_TO_ARGB: + * @srcRGB: a CUDA global memory containing a RGB image + * @dstRGB: a CUDA global memory to store unpacked ARGB image + * @width: the width of @srcRGB and @dstRGB + * @height: the height of @srcRGB and @dstRGB + * @src_stride: the stride of @srcRGB + * @src_pstride: the pixel stride of @srcRGB + * @dst_stride: the stride of @dstRGB + * @r_idx: the index of red component of @srcRGB + * @g_idx: the index of green component of @srcRGB + * @b_idx: the index of blue component of @srcRGB + * @a_idx: the index of alpha component of @srcRGB + * + * Unpack a RGB image from @srcRGB and write the unpacked data into @dstRGB + */ +static const gchar unpack_to_ARGB[] = +"__global__ void\n" +GST_CUDA_KERNEL_FUNC_TO_ARGB +"(unsigned char *srcRGB, unsigned char *dstRGB, int width, int height,\n" +" int src_stride, int src_pstride, int dst_stride,\n" +" int r_idx, int g_idx, int b_idx, int a_idx)\n" +"{\n" +" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n" +" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n" +" if (x_pos < width && y_pos < height) {\n" +" if (a_idx >= 0) {\n" +" dstRGB[x_pos * 4 + y_pos * dst_stride] =\n" +" srcRGB[x_pos * src_pstride + a_idx + y_pos * src_stride];\n" +" } else {\n" +" dstRGB[x_pos * 4 + y_pos * dst_stride] = 0xff;\n" +" }\n" +" dstRGB[x_pos * 4 + 1 + y_pos * dst_stride] =\n" +" srcRGB[x_pos * src_pstride + r_idx + y_pos * src_stride];\n" +" dstRGB[x_pos * 4 + 2 + y_pos * dst_stride] =\n" +" srcRGB[x_pos * src_pstride + g_idx + y_pos * src_stride];\n" +" dstRGB[x_pos * 4 + 3 + y_pos * dst_stride] =\n" +" srcRGB[x_pos * src_pstride + b_idx + y_pos * src_stride];\n" +" }\n" +"}\n"; + +/** + * GST_CUDA_KERNEL_FUNC_TO_ARGB: + * @srcRGB: a CUDA global memory containing a RGB image + * @dstRGB: a CUDA global memory to store unpacked ARGB64 image + * @width: the width of @srcRGB and @dstRGB + * @height: the height of @srcRGB and @dstRGB + * @src_stride: the stride of @srcRGB + * @src_pstride: the pixel stride of @srcRGB + * @dst_stride: the stride of @dstRGB + * @r_idx: the index of red component of @srcRGB + * @g_idx: the index of green component of @srcRGB + * @b_idx: the index of blue component of @srcRGB + * @a_idx: the index of alpha component of @srcRGB + * + * Unpack a RGB image from @srcRGB and write the unpacked data into @dstRGB + */ +static const gchar unpack_to_ARGB64[] = +"__global__ void\n" +GST_CUDA_KERNEL_FUNC_TO_ARGB +"(unsigned char *srcRGB, unsigned char *dstRGB, int width, int height,\n" +" int src_stride, int src_pstride, int dst_stride,\n" +" int r_idx, int g_idx, int b_idx, int a_idx)\n" +"{\n" +" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n" +" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n" +" if (x_pos < width && y_pos < height) {\n" +" unsigned short a, r, g, b;\n" +" unsigned int read_val;\n" +" read_val = *(unsigned int *)&srcRGB[x_pos * src_pstride + y_pos * src_stride];\n" +" a = (read_val >> 30) & 0x03;\n" +" a = (a << 14) | (a << 12) | (a << 10) | (a << 8) | (a << 6) | (a << 4) | (a << 2) | (a << 0);\n" +" r = ((read_val >> (30 - (r_idx * 10))) & 0x3ff);\n" +" r = (r << 6) | (r >> 4);\n" +" g = ((read_val >> (30 - (g_idx * 10))) & 0x3ff);\n" +" g = (g << 6) | (g >> 4);\n" +" b = ((read_val >> (30 - (b_idx * 10))) & 0x3ff);\n" +" b = (b << 6) | (b >> 4);\n" +" *(unsigned short *)&dstRGB[x_pos * 8 + y_pos * dst_stride] = 0xffff;\n" +" *(unsigned short *)&dstRGB[x_pos * 8 + 2 + y_pos * dst_stride] = r;\n" +" *(unsigned short *)&dstRGB[x_pos * 8 + 4 + y_pos * dst_stride] = g;\n" +" *(unsigned short *)&dstRGB[x_pos * 8 + 6 + y_pos * dst_stride] = b;\n" +" }\n" +"}\n"; + +/* CUDA kernel source for from RGB to YUV conversion and scale */ +static const gchar templ_RGB_TO_YUV[] = +"extern \"C\"{\n" +"__constant__ float offset[3] = {%f, %f, %f};\n" +"__constant__ float ycoeff[3] = {%f, %f, %f};\n" +"__constant__ float ucoeff[3] = {%f, %f, %f};\n" +"__constant__ float vcoeff[3] = {%f, %f, %f};\n" +"\n" +"__constant__ float SCALE_H = %f;\n" +"__constant__ float SCALE_V = %f;\n" +"__constant__ float CHROMA_SCALE_H = %f;\n" +"__constant__ float CHROMA_SCALE_V = %f;\n" +"__constant__ int WIDTH = %d;\n" +"__constant__ int HEIGHT = %d;\n" +"__constant__ int CHROMA_WIDTH = %d;\n" +"__constant__ int CHROMA_HEIGHT = %d;\n" +"__constant__ int IN_DEPTH = %d;\n" +"__constant__ int OUT_DEPTH = %d;\n" +"__constant__ int PSTRIDE = %d;\n" +"__constant__ int CHROMA_PSTRIDE = %d;\n" +"__constant__ int IN_SHIFT = %d;\n" +"__constant__ int OUT_SHIFT = %d;\n" +"__constant__ int MASK = %d;\n" +"__constant__ int SWAP_UV = %d;\n" +"\n" +"__device__ unsigned short\n" +"do_scale_pixel (unsigned short val) \n" +"{\n" +" unsigned int diff;\n" +" if (OUT_DEPTH > IN_DEPTH) {\n" +" diff = OUT_DEPTH - IN_DEPTH;\n" +" return (val << diff) | (val >> (IN_DEPTH - diff));\n" +" } else if (IN_DEPTH > OUT_DEPTH) {\n" +" return val >> (IN_DEPTH - OUT_DEPTH);\n" +" }\n" +" return val;\n" +"}\n" +"\n" +"__device__ float\n" +"dot(float3 val, float *coeff)\n" +"{\n" +" return val.x * coeff[0] + val.y * coeff[1] + val.z * coeff[2];\n" +"}\n" +"\n" +"__device__ uint3\n" +"rgb_to_yuv (unsigned short r, unsigned short g, unsigned short b,\n" +" unsigned int max_val)\n" +"{\n" +" float3 rgb = make_float3 (r, g, b);\n" +" uint3 yuv;\n" +" yuv.x = max ((unsigned int)(dot (rgb, ycoeff) + offset[0]), 0);\n" +" yuv.y = max ((unsigned int)(dot (rgb, ucoeff) + offset[1]), 0);\n" +" yuv.z = max ((unsigned int)(dot (rgb, vcoeff) + offset[2]), 0);\n" +" yuv.x = min (yuv.x, max_val);\n" +" yuv.y = min (yuv.y, max_val);\n" +" yuv.z = min (yuv.z, max_val);\n" +" return yuv;\n" +"}\n" +"\n" +/* __global__ void + * GST_CUDA_KERNEL_FUNC_TO_ARGB + */ +"%s\n" +"\n" +/* __device__ ushort2 + * read_chroma (cudaTextureObject_t tex1, cudaTextureObject_t tex2, float x, float y); + */ +"%s\n" +"\n" +/* __device__ void + * write_chroma (unsigned char *dst1, unsigned char *dst2, unsigned short u, + * unsigned short v, int x, int y, int pstride, int stride, int mask); + */ +"%s\n" +"\n" +"__global__ void\n" +GST_CUDA_KERNEL_FUNC_TO_Y444 +"(cudaTextureObject_t srcRGB, unsigned char *dstY, int y_stride,\n" +" unsigned char *dstU, int u_stride, unsigned char *dstV, int v_stride,\n" +" int width, int height, int dst_pstride, int in_depth)\n" +"{\n" +" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n" +" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n" +" if (x_pos < width && y_pos < height) {\n" +" ushort4 argb = tex2D(srcRGB, x_pos, y_pos);\n" +" uint3 yuv;\n" +" yuv = rgb_to_yuv (argb.y, argb.z, argb.w, (1 << in_depth) - 1);\n" +" if (in_depth > 8) {\n" +" *(unsigned short *)&dstY[x_pos * dst_pstride + y_pos * y_stride] = yuv.x;\n" +" *(unsigned short *)&dstU[x_pos * dst_pstride + y_pos * u_stride] = yuv.y;\n" +" *(unsigned short *)&dstV[x_pos * dst_pstride + y_pos * v_stride] = yuv.z;\n" +" } else {\n" +" dstY[x_pos * dst_pstride + y_pos * y_stride] = yuv.x;\n" +" dstU[x_pos * dst_pstride + y_pos * u_stride] = yuv.y;\n" +" dstV[x_pos * dst_pstride + y_pos * v_stride] = yuv.z;\n" +" }\n" +" }\n" +"}\n" +"\n" +"__global__ void\n" +GST_CUDA_KERNEL_FUNC_Y444_TO_YUV +"(cudaTextureObject_t tex0, cudaTextureObject_t tex1, cudaTextureObject_t tex2,\n" +" unsigned char *dst0, unsigned char *dst1, unsigned char *dst2,\n" +" int stride)\n" +"{\n" +" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n" +" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n" +" if (x_pos < WIDTH && y_pos < HEIGHT) {\n" +" float src_xpos = SCALE_H * x_pos;\n" +" float src_ypos = SCALE_V * y_pos;\n" +" unsigned short y = tex2D(tex0, src_xpos, src_ypos);\n" +" y = y >> IN_SHIFT;\n" +" y = do_scale_pixel (y);\n" +" y = y << OUT_SHIFT;\n" +" if (OUT_DEPTH > 8) {\n" +" *(unsigned short *)&dst0[x_pos * PSTRIDE + y_pos * stride] = (y & MASK);\n" +" } else {\n" +" dst0[x_pos * PSTRIDE + y_pos * stride] = y;\n" +" }\n" +" }\n" +" if (x_pos < CHROMA_WIDTH && y_pos < CHROMA_HEIGHT) {\n" +" float src_xpos = CHROMA_SCALE_H * x_pos;\n" +" float src_ypos = CHROMA_SCALE_V * y_pos;\n" +" unsigned short u, v;\n" +" ushort2 uv;\n" +" uv = read_chroma (tex1, tex2, src_xpos, src_ypos);\n" +" u = uv.x;\n" +" v = uv.y;\n" +" u = u >> IN_SHIFT;\n" +" v = v >> IN_SHIFT;\n" +" u = do_scale_pixel (u);\n" +" v = do_scale_pixel (v);\n" +" u = u << OUT_SHIFT;\n" +" v = v << OUT_SHIFT;\n" +" if (SWAP_UV) {\n" +" unsigned short tmp = u;\n" +" u = v;\n" +" v = tmp;\n" +" }\n" +" write_chroma (dst1,\n" +" dst2, u, v, x_pos, y_pos, CHROMA_PSTRIDE, stride, MASK);\n" +" }\n" +"}\n" +"\n" +"}"; + +/* CUDA kernel source for from RGB to RGB conversion and scale */ +static const gchar templ_RGB_to_RGB[] = +"extern \"C\"{\n" +"__constant__ float SCALE_H = %f;\n" +"__constant__ float SCALE_V = %f;\n" +"__constant__ int WIDTH = %d;\n" +"__constant__ int HEIGHT = %d;\n" +"__constant__ int IN_DEPTH = %d;\n" +"__constant__ int OUT_DEPTH = %d;\n" +"__constant__ int PSTRIDE = %d;\n" +"__constant__ int R_IDX = %d;\n" +"__constant__ int G_IDX = %d;\n" +"__constant__ int B_IDX = %d;\n" +"__constant__ int A_IDX = %d;\n" +"__constant__ int X_IDX = %d;\n" +"\n" +"__device__ unsigned short\n" +"do_scale_pixel (unsigned short val) \n" +"{\n" +" unsigned int diff;\n" +" if (OUT_DEPTH > IN_DEPTH) {\n" +" diff = OUT_DEPTH - IN_DEPTH;\n" +" return (val << diff) | (val >> (IN_DEPTH - diff));\n" +" } else if (IN_DEPTH > OUT_DEPTH) {\n" +" return val >> (IN_DEPTH - OUT_DEPTH);\n" +" }\n" +" return val;\n" +"}\n" +"\n" +/* __global__ void + * GST_CUDA_KERNEL_FUNC_TO_ARGB + */ +"%s\n" +"\n" +/* convert ARGB or ARGB64 to other RGB formats with scale */ +"__global__ void\n" +GST_CUDA_KERNEL_FUNC_SCALE_RGB +"(cudaTextureObject_t srcRGB, unsigned char *dstRGB, int dst_stride)\n" +"{\n" +" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n" +" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n" +" if (x_pos < WIDTH && y_pos < HEIGHT) {\n" +" float src_xpos = SCALE_H * x_pos;\n" +" float src_ypos = SCALE_V * y_pos;\n" +" ushort4 argb = tex2D(srcRGB, src_xpos, src_ypos);\n" +" argb.x = do_scale_pixel(argb.x);\n" +" argb.y = do_scale_pixel(argb.y);\n" +" argb.z = do_scale_pixel(argb.z);\n" +" argb.w = do_scale_pixel(argb.w);\n" + /* FIXME: RGB10A2_LE or BGR10A2_LE only */ +" if (OUT_DEPTH > 8) {\n" +" unsigned int packed_rgb = 0;\n" +" unsigned int a, r, g, b;" +" a = (argb.x >> 8) & 0x3;\n" +" r = argb.y & 0x3ff;\n" +" g = argb.z & 0x3ff;\n" +" b = argb.w & 0x3ff;\n" + /* A is always MSB, we support only little endian system */ +" packed_rgb = a << 30;\n" +" packed_rgb |= (r << (30 - (R_IDX * 10)));\n" +" packed_rgb |= (g << (30 - (G_IDX * 10)));\n" +" packed_rgb |= (b << (30 - (B_IDX * 10)));\n" +" *(unsigned int *)&dstRGB[x_pos * 4 + y_pos * dst_stride] = packed_rgb;\n" +" } else {\n" +" if (A_IDX >= 0) {\n" +" argb.x = do_scale_pixel(argb.x);\n" +" dstRGB[x_pos * PSTRIDE + A_IDX + y_pos * dst_stride] = argb.x;\n" +" } else if (X_IDX >= 0) {\n" +" dstRGB[x_pos * PSTRIDE + X_IDX + y_pos * dst_stride] = 0xff;\n" +" }\n" +" dstRGB[x_pos * PSTRIDE + R_IDX + y_pos * dst_stride] = argb.y;\n" +" dstRGB[x_pos * PSTRIDE + G_IDX + y_pos * dst_stride] = argb.z;\n" +" dstRGB[x_pos * PSTRIDE + B_IDX + y_pos * dst_stride] = argb.w;\n" +" }\n" +" }\n" +"}\n" +"\n" +"}"; +/* *INDENT-ON* */ + +typedef struct +{ + gint R; + gint G; + gint B; + gint A; + gint X; +} GstCudaRGBOrder; + +typedef struct +{ + CUdeviceptr device_ptr; + gsize cuda_stride; +} GstCudaStageBuffer; + +#define CONVERTER_MAX_NUM_FUNC 4 + +struct _GstCudaConverter +{ + GstVideoInfo in_info; + GstVideoInfo out_info; + gboolean keep_size; + + gint texture_alignment; + + GstCudaContext *cuda_ctx; + CUmodule cuda_module; + CUfunction kernel_func[CONVERTER_MAX_NUM_FUNC]; + const gchar *func_names[CONVERTER_MAX_NUM_FUNC]; + gchar *kernel_source; + gchar *ptx; + GstCudaStageBuffer fallback_buffer[GST_VIDEO_MAX_PLANES]; + + gboolean (*convert) (GstCudaConverter * convert, const GstCudaMemory * src, + GstVideoInfo * in_info, GstCudaMemory * dst, GstVideoInfo * out_info, + CUstream cuda_stream); + + const CUdeviceptr src; + GstVideoInfo *cur_in_info; + + CUdeviceptr dest; + GstVideoInfo *cur_out_info; + + /* rgb to {rgb, yuv} only */ + GstCudaRGBOrder in_rgb_order; + GstCudaStageBuffer unpack_surface; + GstCudaStageBuffer y444_surface[GST_VIDEO_MAX_PLANES]; +}; + +#define LOAD_CUDA_FUNC(module,func,name) G_STMT_START { \ + if (!gst_cuda_result (CuModuleGetFunction (&(func), (module), name))) { \ + GST_ERROR ("failed to get %s function", (name)); \ + goto error; \ + } \ +} G_STMT_END + +/** + * gst_cuda_converter_new: + * @in_info: a #GstVideoInfo + * @out_info: a #GstVideoInfo + * @cuda_ctx: (transfer none): a #GstCudaContext + * + * Create a new converter object to convert between @in_info and @out_info + * with @config. + * + * Returns: a #GstCudaConverter or %NULL if conversion is not possible. + */ +GstCudaConverter * +gst_cuda_converter_new (GstVideoInfo * in_info, GstVideoInfo * out_info, + GstCudaContext * cuda_ctx) +{ + GstCudaConverter *convert; + gint i; + + g_return_val_if_fail (in_info != NULL, NULL); + g_return_val_if_fail (out_info != NULL, NULL); + g_return_val_if_fail (cuda_ctx != NULL, NULL); + /* we won't ever do framerate conversion */ + g_return_val_if_fail (in_info->fps_n == out_info->fps_n, NULL); + g_return_val_if_fail (in_info->fps_d == out_info->fps_d, NULL); + /* we won't ever do deinterlace */ + g_return_val_if_fail (in_info->interlace_mode == out_info->interlace_mode, + NULL); + + convert = g_new0 (GstCudaConverter, 1); + + convert->in_info = *in_info; + convert->out_info = *out_info; + + /* FIXME: should return kernel source */ + if (!gst_cuda_context_push (cuda_ctx)) { + GST_ERROR ("cannot push context"); + goto error; + } + + if (!cuda_converter_lookup_path (convert)) + goto error; + + convert->ptx = gst_cuda_nvrtc_compile (convert->kernel_source); + if (!convert->ptx) { + GST_ERROR ("no PTX data to load"); + goto error; + } + + GST_TRACE ("compiled convert ptx \n%s", convert->ptx); + + if (!gst_cuda_result (CuModuleLoadData (&convert->cuda_module, convert->ptx))) { + gst_cuda_context_pop (NULL); + GST_ERROR ("failed to load cuda module data"); + + goto error; + } + + for (i = 0; i < CONVERTER_MAX_NUM_FUNC; i++) { + if (!convert->func_names[i]) + break; + + LOAD_CUDA_FUNC (convert->cuda_module, convert->kernel_func[i], + convert->func_names[i]); + GST_DEBUG ("kernel function \"%s\" loaded", convert->func_names[i]); + } + + gst_cuda_context_pop (NULL); + convert->cuda_ctx = gst_object_ref (cuda_ctx); + convert->texture_alignment = + gst_cuda_context_get_texture_alignment (cuda_ctx); + + g_free (convert->kernel_source); + g_free (convert->ptx); + convert->kernel_source = NULL; + convert->ptx = NULL; + + return convert; + +error: + gst_cuda_context_pop (NULL); + gst_cuda_converter_free (convert); + + return NULL; +} + +/** + * gst_video_converter_free: + * @convert: a #GstCudaConverter + * + * Free @convert + */ +void +gst_cuda_converter_free (GstCudaConverter * convert) +{ + g_return_if_fail (convert != NULL); + + if (convert->cuda_ctx) { + if (gst_cuda_context_push (convert->cuda_ctx)) { + gint i; + + if (convert->cuda_module) { + gst_cuda_result (CuModuleUnload (convert->cuda_module)); + } + + for (i = 0; i < GST_VIDEO_MAX_PLANES; i++) { + if (convert->fallback_buffer[i].device_ptr) + gst_cuda_result (CuMemFree (convert->fallback_buffer[i].device_ptr)); + if (convert->y444_surface[i].device_ptr) + gst_cuda_result (CuMemFree (convert->y444_surface[i].device_ptr)); + } + + if (convert->unpack_surface.device_ptr) + gst_cuda_result (CuMemFree (convert->unpack_surface.device_ptr)); + + gst_cuda_context_pop (NULL); + } + + gst_object_unref (convert->cuda_ctx); + } + + g_free (convert->kernel_source); + g_free (convert->ptx); + g_free (convert); +} + +/** + * gst_cuda_converter_frame: + * @convert: a #GstCudaConverter + * @src: a #GstCudaMemory + * @in_info: a #GstVideoInfo representing @src + * @dst: a #GstCudaMemory + * @out_info: a #GstVideoInfo representing @dst + * @cuda_stream: a #CUstream + * + * Convert the pixels of @src into @dest using @convert. + * Called without gst_cuda_context_push() and gst_cuda_context_pop() by caller + */ +gboolean +gst_cuda_converter_frame (GstCudaConverter * convert, const GstCudaMemory * src, + GstVideoInfo * in_info, GstCudaMemory * dst, GstVideoInfo * out_info, + CUstream cuda_stream) +{ + gboolean ret; + + g_return_val_if_fail (convert, FALSE); + g_return_val_if_fail (src, FALSE); + g_return_val_if_fail (in_info, FALSE); + g_return_val_if_fail (dst, FALSE); + g_return_val_if_fail (out_info, FALSE); + + gst_cuda_context_push (convert->cuda_ctx); + + ret = gst_cuda_converter_frame_unlocked (convert, + src, in_info, dst, out_info, cuda_stream); + + gst_cuda_context_pop (NULL); + + return ret; +} + +/** + * gst_cuda_converter_frame_unlocked: + * @convert: a #GstCudaConverter + * @src: a #GstCudaMemory + * @in_info: a #GstVideoInfo representing @src + * @dst: a #GstCudaMemory + * @out_info: a #GstVideoInfo representing @dest + * @cuda_stream: a #CUstream + * + * Convert the pixels of @src into @dest using @convert. + * Caller should call this method after gst_cuda_context_push() + */ +gboolean +gst_cuda_converter_frame_unlocked (GstCudaConverter * convert, + const GstCudaMemory * src, GstVideoInfo * in_info, GstCudaMemory * dst, + GstVideoInfo * out_info, CUstream cuda_stream) +{ + g_return_val_if_fail (convert, FALSE); + g_return_val_if_fail (src, FALSE); + g_return_val_if_fail (in_info, FALSE); + g_return_val_if_fail (dst, FALSE); + g_return_val_if_fail (out_info, FALSE); + + return convert->convert (convert, src, in_info, dst, out_info, cuda_stream); +} + +/* allocate fallback memory for texture alignment requirement */ +static gboolean +convert_ensure_fallback_memory (GstCudaConverter * convert, + GstVideoInfo * info, guint plane) +{ + CUresult ret; + guint element_size = 8; + + if (convert->fallback_buffer[plane].device_ptr) + return TRUE; + + if (GST_VIDEO_INFO_COMP_DEPTH (info, 0) > 8) + element_size = 16; + + ret = CuMemAllocPitch (&convert->fallback_buffer[plane].device_ptr, + &convert->fallback_buffer[plane].cuda_stride, + GST_VIDEO_INFO_COMP_WIDTH (info, plane) * + GST_VIDEO_INFO_COMP_PSTRIDE (info, plane), + GST_VIDEO_INFO_COMP_HEIGHT (info, plane), element_size); + + if (!gst_cuda_result (ret)) { + GST_ERROR ("failed to allocated fallback memory"); + return FALSE; + } + + return TRUE; +} + +/* create a 2D CUDA texture without alignment check */ +static CUtexObject +convert_create_texture_unchecked (const CUdeviceptr src, gint width, + gint height, gint channels, gint stride, CUarray_format format, + CUfilter_mode mode, CUstream cuda_stream) +{ + CUDA_TEXTURE_DESC texture_desc; + CUDA_RESOURCE_DESC resource_desc; + CUtexObject texture = 0; + CUresult cuda_ret; + + memset (&texture_desc, 0, sizeof (CUDA_TEXTURE_DESC)); + memset (&resource_desc, 0, sizeof (CUDA_RESOURCE_DESC)); + + resource_desc.resType = CU_RESOURCE_TYPE_PITCH2D; + resource_desc.res.pitch2D.format = format; + resource_desc.res.pitch2D.numChannels = channels; + resource_desc.res.pitch2D.width = width; + resource_desc.res.pitch2D.height = height; + resource_desc.res.pitch2D.pitchInBytes = stride; + resource_desc.res.pitch2D.devPtr = src; + + texture_desc.filterMode = mode; + texture_desc.flags = CU_TRSF_READ_AS_INTEGER; + + gst_cuda_result (CuStreamSynchronize (cuda_stream)); + cuda_ret = CuTexObjectCreate (&texture, &resource_desc, &texture_desc, NULL); + + if (!gst_cuda_result (cuda_ret)) { + GST_ERROR ("couldn't create texture"); + + return 0; + } + + return texture; +} + +static CUtexObject +convert_create_texture (GstCudaConverter * convert, const GstCudaMemory * src, + GstVideoInfo * info, guint plane, CUstream cuda_stream) +{ + CUarray_format format = CU_AD_FORMAT_UNSIGNED_INT8; + guint channels = 1; + CUdeviceptr src_ptr; + gsize stride; + CUresult cuda_ret; + CUfilter_mode mode; + + if (GST_VIDEO_INFO_COMP_DEPTH (info, plane) > 8) + format = CU_AD_FORMAT_UNSIGNED_INT16; + + /* FIXME: more graceful method ? */ + if (plane != 0 && + GST_VIDEO_INFO_N_PLANES (info) != GST_VIDEO_INFO_N_COMPONENTS (info)) { + channels = 2; + } + + src_ptr = src->data + src->offset[plane]; + stride = src->stride; + + if (convert->texture_alignment && (src_ptr % convert->texture_alignment)) { + CUDA_MEMCPY2D copy_params = { 0, }; + + if (!convert_ensure_fallback_memory (convert, info, plane)) + return 0; + + GST_LOG ("device memory was not aligned, copy to fallback memory"); + + copy_params.srcMemoryType = CU_MEMORYTYPE_DEVICE; + copy_params.srcPitch = stride; + copy_params.srcDevice = (CUdeviceptr) src_ptr; + + copy_params.dstMemoryType = CU_MEMORYTYPE_DEVICE; + copy_params.dstPitch = convert->fallback_buffer[plane].cuda_stride; + copy_params.dstDevice = convert->fallback_buffer[plane].device_ptr; + copy_params.WidthInBytes = GST_VIDEO_INFO_COMP_WIDTH (info, plane) + * GST_VIDEO_INFO_COMP_PSTRIDE (info, plane); + copy_params.Height = GST_VIDEO_INFO_COMP_HEIGHT (info, plane); + + cuda_ret = CuMemcpy2DAsync (©_params, cuda_stream); + if (!gst_cuda_result (cuda_ret)) { + GST_ERROR ("failed to copy to fallback buffer"); + return 0; + } + + src_ptr = convert->fallback_buffer[plane].device_ptr; + stride = convert->fallback_buffer[plane].cuda_stride; + } + + /* Use h/w linear interpolation only when resize is required. + * Otherwise the image might be blurred */ + if (convert->keep_size) + mode = CU_TR_FILTER_MODE_POINT; + else + mode = CU_TR_FILTER_MODE_LINEAR; + + return convert_create_texture_unchecked (src_ptr, + GST_VIDEO_INFO_COMP_WIDTH (info, plane), + GST_VIDEO_INFO_COMP_HEIGHT (info, plane), channels, stride, format, mode, + cuda_stream); +} + +/* main conversion function for YUV to YUV conversion */ +static gboolean +convert_YUV_TO_YUV (GstCudaConverter * convert, + const GstCudaMemory * src, GstVideoInfo * in_info, GstCudaMemory * dst, + GstVideoInfo * out_info, CUstream cuda_stream) +{ + CUtexObject texture[GST_VIDEO_MAX_PLANES] = { 0, }; + CUresult cuda_ret; + gboolean ret = FALSE; + CUdeviceptr dst_ptr[GST_VIDEO_MAX_PLANES] = { 0, }; + gint dst_stride; + gint width, height; + gint i; + + gpointer kernel_args[] = { &texture[0], &texture[1], &texture[2], + &dst_ptr[0], &dst_ptr[1], &dst_ptr[2], &dst_stride + }; + + /* conversion step + * STEP 1: create CUtexObject per plane + * STEP 2: call YUV to YUV conversion kernel function. + * resize, uv reordering and bitdepth conversion will be performed in + * the CUDA kernel function + */ + + /* map CUDA device memory to CUDA texture object */ + for (i = 0; i < GST_VIDEO_INFO_N_PLANES (in_info); i++) { + texture[i] = convert_create_texture (convert, src, in_info, i, cuda_stream); + if (!texture[i]) { + GST_ERROR ("couldn't create texture for %d th plane", i); + goto done; + } + } + + for (i = 0; i < GST_VIDEO_INFO_N_PLANES (out_info); i++) + dst_ptr[i] = dst->data + dst->offset[i]; + + dst_stride = dst->stride; + + width = GST_VIDEO_INFO_WIDTH (out_info); + height = GST_VIDEO_INFO_HEIGHT (out_info); + + cuda_ret = + CuLaunchKernel (convert->kernel_func[0], DIV_UP (width, CUDA_BLOCK_X), + DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0, + cuda_stream, kernel_args, NULL); + + if (!gst_cuda_result (cuda_ret)) { + GST_ERROR ("could not rescale plane"); + goto done; + } + + ret = TRUE; + gst_cuda_result (CuStreamSynchronize (cuda_stream)); + +done: + for (i = 0; i < GST_VIDEO_INFO_N_PLANES (in_info); i++) { + if (texture[i]) + gst_cuda_result (CuTexObjectDestroy (texture[i])); + } + + return ret; +} + +/* main conversion function for YUV to RGB conversion */ +static gboolean +convert_YUV_TO_RGB (GstCudaConverter * convert, + const GstCudaMemory * src, GstVideoInfo * in_info, GstCudaMemory * dst, + GstVideoInfo * out_info, CUstream cuda_stream) +{ + CUtexObject texture[GST_VIDEO_MAX_PLANES] = { 0, }; + CUresult cuda_ret; + gboolean ret = FALSE; + CUdeviceptr dstRGB = 0; + gint dst_stride; + gint width, height; + gint i; + + gpointer kernel_args[] = { &texture[0], &texture[1], &texture[2], + &dstRGB, &dst_stride + }; + + /* conversion step + * STEP 1: create CUtexObject per plane + * STEP 2: call YUV to RGB conversion kernel function. + * resizing, argb ordering and bitdepth conversion will be performed in + * the CUDA kernel function + */ + + /* map CUDA device memory to CUDA texture object */ + for (i = 0; i < GST_VIDEO_INFO_N_PLANES (in_info); i++) { + texture[i] = convert_create_texture (convert, src, in_info, i, cuda_stream); + if (!texture[i]) { + GST_ERROR ("couldn't create texture for %d th plane", i); + goto done; + } + } + + dstRGB = dst->data; + dst_stride = dst->stride; + + width = GST_VIDEO_INFO_WIDTH (out_info); + height = GST_VIDEO_INFO_HEIGHT (out_info); + + cuda_ret = + CuLaunchKernel (convert->kernel_func[0], DIV_UP (width, CUDA_BLOCK_X), + DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0, + cuda_stream, kernel_args, NULL); + + if (!gst_cuda_result (cuda_ret)) { + GST_ERROR ("could not rescale plane"); + goto done; + } + + ret = TRUE; + gst_cuda_result (CuStreamSynchronize (cuda_stream)); + +done: + for (i = 0; i < GST_VIDEO_INFO_N_PLANES (in_info); i++) { + if (texture[i]) + gst_cuda_result (CuTexObjectDestroy (texture[i])); + } + + return ret; +} + +static gboolean +convert_UNPACK_RGB (GstCudaConverter * convert, CUfunction kernel_func, + CUstream cuda_stream, const GstCudaMemory * src, GstVideoInfo * in_info, + CUdeviceptr dst, gint dst_stride, GstCudaRGBOrder * rgb_order) +{ + CUdeviceptr srcRGB = 0; + gint width, height; + gint src_stride, src_pstride; + CUresult cuda_ret; + + gpointer unpack_kernel_args[] = { &srcRGB, &dst, + &width, &height, + &src_stride, &src_pstride, &dst_stride, + &convert->in_rgb_order.R, &convert->in_rgb_order.G, + &convert->in_rgb_order.B, &convert->in_rgb_order.A, + }; + + srcRGB = src->data; + src_stride = src->stride; + + width = GST_VIDEO_INFO_WIDTH (in_info); + height = GST_VIDEO_INFO_HEIGHT (in_info); + src_pstride = GST_VIDEO_INFO_COMP_PSTRIDE (in_info, 0); + + cuda_ret = + CuLaunchKernel (kernel_func, DIV_UP (width, CUDA_BLOCK_X), + DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0, + cuda_stream, unpack_kernel_args, NULL); + + if (!gst_cuda_result (cuda_ret)) { + GST_ERROR ("could not unpack rgb"); + return FALSE; + } + + return TRUE; +} + +static gboolean +convert_TO_Y444 (GstCudaConverter * convert, CUfunction kernel_func, + CUstream cuda_stream, CUtexObject srcRGB, CUdeviceptr dstY, gint y_stride, + CUdeviceptr dstU, gint u_stride, CUdeviceptr dstV, gint v_stride, + gint width, gint height, gint pstride, gint bitdepth) +{ + CUresult cuda_ret; + + gpointer kernel_args[] = { &srcRGB, &dstY, &y_stride, &dstU, &u_stride, &dstV, + &v_stride, &width, &height, &pstride, &bitdepth, + }; + + cuda_ret = + CuLaunchKernel (kernel_func, DIV_UP (width, CUDA_BLOCK_X), + DIV_UP (height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0, + cuda_stream, kernel_args, NULL); + + if (!gst_cuda_result (cuda_ret)) { + GST_ERROR ("could not unpack rgb"); + return FALSE; + } + + return TRUE; +} + +/* main conversion function for RGB to YUV conversion */ +static gboolean +convert_RGB_TO_YUV (GstCudaConverter * convert, + const GstCudaMemory * src, GstVideoInfo * in_info, GstCudaMemory * dst, + GstVideoInfo * out_info, CUstream cuda_stream) +{ + CUtexObject texture = 0; + CUtexObject yuv_texture[3] = { 0, }; + CUdeviceptr dst_ptr[GST_VIDEO_MAX_PLANES] = { 0, }; + CUresult cuda_ret; + gboolean ret = FALSE; + gint in_width, in_height; + gint out_width, out_height; + gint dst_stride; + CUarray_format format = CU_AD_FORMAT_UNSIGNED_INT8; + CUfilter_mode mode = CU_TR_FILTER_MODE_POINT; + gint pstride = 1; + gint bitdepth = 8; + gint i; + + gpointer kernel_args[] = { &yuv_texture[0], &yuv_texture[1], &yuv_texture[2], + &dst_ptr[0], &dst_ptr[1], &dst_ptr[2], &dst_stride + }; + + /* conversion step + * STEP 1: unpack src RGB into ARGB or ARGB64 format + * STEP 2: convert unpacked ARGB (or ARGB64) to Y444 (or Y444_16LE) + * STEP 3: convert Y444 (or Y444_16LE) to final YUV format. + * resizing, bitdepth conversion, uv reordering will be performed in + * the CUDA kernel function + */ + if (!convert_UNPACK_RGB (convert, convert->kernel_func[0], cuda_stream, + src, in_info, convert->unpack_surface.device_ptr, + convert->unpack_surface.cuda_stride, &convert->in_rgb_order)) { + GST_ERROR ("could not unpack input rgb"); + + goto done; + } + + in_width = GST_VIDEO_INFO_WIDTH (in_info); + in_height = GST_VIDEO_INFO_HEIGHT (in_info); + + out_width = GST_VIDEO_INFO_WIDTH (out_info); + out_height = GST_VIDEO_INFO_HEIGHT (out_info); + dst_stride = dst->stride; + + if (GST_VIDEO_INFO_COMP_DEPTH (in_info, 0) > 8) { + pstride = 2; + bitdepth = 16; + format = CU_AD_FORMAT_UNSIGNED_INT16; + } + + texture = + convert_create_texture_unchecked (convert->unpack_surface.device_ptr, + in_width, in_height, 4, convert->unpack_surface.cuda_stride, format, + mode, cuda_stream); + + if (!texture) { + GST_ERROR ("could not create texture"); + goto done; + } + + if (!convert_TO_Y444 (convert, convert->kernel_func[1], cuda_stream, texture, + convert->y444_surface[0].device_ptr, + convert->y444_surface[0].cuda_stride, + convert->y444_surface[1].device_ptr, + convert->y444_surface[1].cuda_stride, + convert->y444_surface[2].device_ptr, + convert->y444_surface[2].cuda_stride, in_width, in_height, pstride, + bitdepth)) { + GST_ERROR ("could not convert to Y444 or Y444_16LE"); + goto done; + } + + /* Use h/w linear interpolation only when resize is required. + * Otherwise the image might be blurred */ + if (convert->keep_size) + mode = CU_TR_FILTER_MODE_POINT; + else + mode = CU_TR_FILTER_MODE_LINEAR; + + for (i = 0; i < 3; i++) { + yuv_texture[i] = + convert_create_texture_unchecked (convert->y444_surface[i].device_ptr, + in_width, in_height, 1, convert->y444_surface[i].cuda_stride, format, + mode, cuda_stream); + + if (!yuv_texture[i]) { + GST_ERROR ("could not create %dth yuv texture", i); + goto done; + } + } + + for (i = 0; i < GST_VIDEO_INFO_N_PLANES (out_info); i++) + dst_ptr[i] = dst->data + dst->offset[i]; + + cuda_ret = + CuLaunchKernel (convert->kernel_func[2], DIV_UP (out_width, CUDA_BLOCK_X), + DIV_UP (out_height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0, + cuda_stream, kernel_args, NULL); + + if (!gst_cuda_result (cuda_ret)) { + GST_ERROR ("could not rescale plane"); + goto done; + } + + ret = TRUE; + gst_cuda_result (CuStreamSynchronize (cuda_stream)); + +done: + if (texture) + gst_cuda_result (CuTexObjectDestroy (texture)); + for (i = 0; i < 3; i++) { + if (yuv_texture[i]) + gst_cuda_result (CuTexObjectDestroy (yuv_texture[i])); + } + + return ret; +} + +/* main conversion function for RGB to RGB conversion */ +static gboolean +convert_RGB_TO_RGB (GstCudaConverter * convert, + const GstCudaMemory * src, GstVideoInfo * in_info, GstCudaMemory * dst, + GstVideoInfo * out_info, CUstream cuda_stream) +{ + CUtexObject texture = 0; + CUresult cuda_ret; + gboolean ret = FALSE; + CUdeviceptr dstRGB = 0; + gint in_width, in_height; + gint out_width, out_height; + gint dst_stride; + CUfilter_mode mode; + CUarray_format format = CU_AD_FORMAT_UNSIGNED_INT8; + + gpointer rescale_kernel_args[] = { &texture, &dstRGB, &dst_stride }; + + /* conversion step + * STEP 1: unpack src RGB into ARGB or ARGB64 format + * STEP 2: convert ARGB (or ARGB64) to final RGB format. + * resizing, bitdepth conversion, argb reordering will be performed in + * the CUDA kernel function + */ + + if (!convert_UNPACK_RGB (convert, convert->kernel_func[0], cuda_stream, + src, in_info, convert->unpack_surface.device_ptr, + convert->unpack_surface.cuda_stride, &convert->in_rgb_order)) { + GST_ERROR ("could not unpack input rgb"); + + goto done; + } + + in_width = GST_VIDEO_INFO_WIDTH (in_info); + in_height = GST_VIDEO_INFO_HEIGHT (in_info); + + out_width = GST_VIDEO_INFO_WIDTH (out_info); + out_height = GST_VIDEO_INFO_HEIGHT (out_info); + + dstRGB = dst->data; + dst_stride = dst->stride; + + if (GST_VIDEO_INFO_COMP_DEPTH (in_info, 0) > 8) + format = CU_AD_FORMAT_UNSIGNED_INT16; + + /* Use h/w linear interpolation only when resize is required. + * Otherwise the image might be blurred */ + if (convert->keep_size) + mode = CU_TR_FILTER_MODE_POINT; + else + mode = CU_TR_FILTER_MODE_LINEAR; + + texture = + convert_create_texture_unchecked (convert->unpack_surface.device_ptr, + in_width, in_height, 4, convert->unpack_surface.cuda_stride, format, + mode, cuda_stream); + + if (!texture) { + GST_ERROR ("could not create texture"); + goto done; + } + + cuda_ret = + CuLaunchKernel (convert->kernel_func[1], DIV_UP (out_width, CUDA_BLOCK_X), + DIV_UP (out_height, CUDA_BLOCK_Y), 1, CUDA_BLOCK_X, CUDA_BLOCK_Y, 1, 0, + cuda_stream, rescale_kernel_args, NULL); + + if (!gst_cuda_result (cuda_ret)) { + GST_ERROR ("could not rescale plane"); + goto done; + } + + ret = TRUE; + gst_cuda_result (CuStreamSynchronize (cuda_stream)); + +done: + if (texture) + gst_cuda_result (CuTexObjectDestroy (texture)); + + return ret; +} + +/* from video-converter.c */ +typedef struct +{ + gdouble dm[4][4]; +} MatrixData; + +static void +color_matrix_set_identity (MatrixData * m) +{ + gint i, j; + + for (i = 0; i < 4; i++) { + for (j = 0; j < 4; j++) { + m->dm[i][j] = (i == j); + } + } +} + +static void +color_matrix_copy (MatrixData * d, const MatrixData * s) +{ + gint i, j; + + for (i = 0; i < 4; i++) + for (j = 0; j < 4; j++) + d->dm[i][j] = s->dm[i][j]; +} + +/* Perform 4x4 matrix multiplication: + * - @dst@ = @a@ * @b@ + * - @dst@ may be a pointer to @a@ andor @b@ + */ +static void +color_matrix_multiply (MatrixData * dst, MatrixData * a, MatrixData * b) +{ + MatrixData tmp; + gint i, j, k; + + for (i = 0; i < 4; i++) { + for (j = 0; j < 4; j++) { + gdouble x = 0; + for (k = 0; k < 4; k++) { + x += a->dm[i][k] * b->dm[k][j]; + } + tmp.dm[i][j] = x; + } + } + color_matrix_copy (dst, &tmp); +} + +static void +color_matrix_offset_components (MatrixData * m, gdouble a1, gdouble a2, + gdouble a3) +{ + MatrixData a; + + color_matrix_set_identity (&a); + a.dm[0][3] = a1; + a.dm[1][3] = a2; + a.dm[2][3] = a3; + color_matrix_multiply (m, &a, m); +} + +static void +color_matrix_scale_components (MatrixData * m, gdouble a1, gdouble a2, + gdouble a3) +{ + MatrixData a; + + color_matrix_set_identity (&a); + a.dm[0][0] = a1; + a.dm[1][1] = a2; + a.dm[2][2] = a3; + color_matrix_multiply (m, &a, m); +} + +static void +color_matrix_debug (const MatrixData * s) +{ + GST_DEBUG ("[%f %f %f %f]", s->dm[0][0], s->dm[0][1], s->dm[0][2], + s->dm[0][3]); + GST_DEBUG ("[%f %f %f %f]", s->dm[1][0], s->dm[1][1], s->dm[1][2], + s->dm[1][3]); + GST_DEBUG ("[%f %f %f %f]", s->dm[2][0], s->dm[2][1], s->dm[2][2], + s->dm[2][3]); + GST_DEBUG ("[%f %f %f %f]", s->dm[3][0], s->dm[3][1], s->dm[3][2], + s->dm[3][3]); +} + +static void +color_matrix_YCbCr_to_RGB (MatrixData * m, gdouble Kr, gdouble Kb) +{ + gdouble Kg = 1.0 - Kr - Kb; + MatrixData k = { + { + {1., 0., 2 * (1 - Kr), 0.}, + {1., -2 * Kb * (1 - Kb) / Kg, -2 * Kr * (1 - Kr) / Kg, 0.}, + {1., 2 * (1 - Kb), 0., 0.}, + {0., 0., 0., 1.}, + } + }; + + color_matrix_multiply (m, &k, m); +} + +static void +color_matrix_RGB_to_YCbCr (MatrixData * m, gdouble Kr, gdouble Kb) +{ + gdouble Kg = 1.0 - Kr - Kb; + MatrixData k; + gdouble x; + + k.dm[0][0] = Kr; + k.dm[0][1] = Kg; + k.dm[0][2] = Kb; + k.dm[0][3] = 0; + + x = 1 / (2 * (1 - Kb)); + k.dm[1][0] = -x * Kr; + k.dm[1][1] = -x * Kg; + k.dm[1][2] = x * (1 - Kb); + k.dm[1][3] = 0; + + x = 1 / (2 * (1 - Kr)); + k.dm[2][0] = x * (1 - Kr); + k.dm[2][1] = -x * Kg; + k.dm[2][2] = -x * Kb; + k.dm[2][3] = 0; + + k.dm[3][0] = 0; + k.dm[3][1] = 0; + k.dm[3][2] = 0; + k.dm[3][3] = 1; + + color_matrix_multiply (m, &k, m); +} + +static void +compute_matrix_to_RGB (GstCudaConverter * convert, MatrixData * data, + GstVideoInfo * info) +{ + gdouble Kr = 0, Kb = 0; + gint offset[4], scale[4]; + + /* bring color components to [0..1.0] range */ + gst_video_color_range_offsets (info->colorimetry.range, info->finfo, offset, + scale); + + color_matrix_offset_components (data, -offset[0], -offset[1], -offset[2]); + color_matrix_scale_components (data, 1 / ((float) scale[0]), + 1 / ((float) scale[1]), 1 / ((float) scale[2])); + + if (!GST_VIDEO_INFO_IS_RGB (info)) { + /* bring components to R'G'B' space */ + if (gst_video_color_matrix_get_Kr_Kb (info->colorimetry.matrix, &Kr, &Kb)) + color_matrix_YCbCr_to_RGB (data, Kr, Kb); + } + color_matrix_debug (data); +} + +static void +compute_matrix_to_YUV (GstCudaConverter * convert, MatrixData * data, + GstVideoInfo * info) +{ + gdouble Kr = 0, Kb = 0; + gint offset[4], scale[4]; + + if (!GST_VIDEO_INFO_IS_RGB (info)) { + /* bring components to YCbCr space */ + if (gst_video_color_matrix_get_Kr_Kb (info->colorimetry.matrix, &Kr, &Kb)) + color_matrix_RGB_to_YCbCr (data, Kr, Kb); + } + + /* bring color components to nominal range */ + gst_video_color_range_offsets (info->colorimetry.range, info->finfo, offset, + scale); + + color_matrix_scale_components (data, (float) scale[0], (float) scale[1], + (float) scale[2]); + color_matrix_offset_components (data, offset[0], offset[1], offset[2]); + + color_matrix_debug (data); +} + +static gboolean +cuda_converter_get_matrix (GstCudaConverter * convert, MatrixData * matrix, + GstVideoInfo * in_info, GstVideoInfo * out_info) +{ + gboolean same_matrix, same_bits; + guint in_bits, out_bits; + + in_bits = GST_VIDEO_INFO_COMP_DEPTH (in_info, 0); + out_bits = GST_VIDEO_INFO_COMP_DEPTH (out_info, 0); + + same_bits = in_bits == out_bits; + same_matrix = in_info->colorimetry.matrix == out_info->colorimetry.matrix; + + GST_DEBUG ("matrix %d -> %d (%d)", in_info->colorimetry.matrix, + out_info->colorimetry.matrix, same_matrix); + GST_DEBUG ("bits %d -> %d (%d)", in_bits, out_bits, same_bits); + + color_matrix_set_identity (matrix); + + if (same_bits && same_matrix) { + GST_DEBUG ("conversion matrix is not required"); + + return FALSE; + } + + if (in_bits < out_bits) { + gint scale = 1 << (out_bits - in_bits); + color_matrix_scale_components (matrix, + 1 / (float) scale, 1 / (float) scale, 1 / (float) scale); + } + + GST_DEBUG ("to RGB matrix"); + compute_matrix_to_RGB (convert, matrix, in_info); + GST_DEBUG ("current matrix"); + color_matrix_debug (matrix); + + GST_DEBUG ("to YUV matrix"); + compute_matrix_to_YUV (convert, matrix, out_info); + GST_DEBUG ("current matrix"); + color_matrix_debug (matrix); + + if (in_bits > out_bits) { + gint scale = 1 << (in_bits - out_bits); + color_matrix_scale_components (matrix, + (float) scale, (float) scale, (float) scale); + } + + GST_DEBUG ("final matrix"); + color_matrix_debug (matrix); + + return TRUE; +} + +static gboolean +is_uv_swapped (GstVideoFormat format) +{ + static GstVideoFormat swapped_formats[] = { + GST_VIDEO_FORMAT_YV12, + GST_VIDEO_FORMAT_NV21, + }; + gint i; + + for (i = 0; i < G_N_ELEMENTS (swapped_formats); i++) { + if (format == swapped_formats[i]) + return TRUE; + } + + return FALSE; +} + +typedef struct +{ + const gchar *read_chroma; + const gchar *write_chroma; + const gchar *unpack_function; + gfloat scale_h, scale_v; + gfloat chroma_scale_h, chroma_scale_v; + gint width, height; + gint chroma_width, chroma_height; + gint in_depth; + gint out_depth; + gint pstride, chroma_pstride; + gint in_shift, out_shift; + gint mask; + gint swap_uv; + /* RGBA specific variables */ + gint max_in_val; + GstCudaRGBOrder rgb_order; +} GstCudaKernelTempl; + +static gchar * +cuda_converter_generate_yuv_to_yuv_kernel_code (GstCudaConverter * convert, + GstCudaKernelTempl * templ) +{ + return g_strdup_printf (templ_YUV_TO_YUV, + templ->scale_h, templ->scale_v, templ->chroma_scale_h, + templ->chroma_scale_v, templ->width, templ->height, templ->chroma_width, + templ->chroma_height, templ->in_depth, templ->out_depth, templ->pstride, + templ->chroma_pstride, templ->in_shift, templ->out_shift, templ->mask, + templ->swap_uv, templ->read_chroma, templ->write_chroma); +} + +static gchar * +cuda_converter_generate_yuv_to_rgb_kernel_code (GstCudaConverter * convert, + GstCudaKernelTempl * templ, MatrixData * matrix) +{ + return g_strdup_printf (templ_YUV_TO_RGB, + matrix->dm[0][3], matrix->dm[1][3], matrix->dm[2][3], + matrix->dm[0][0], matrix->dm[0][1], matrix->dm[0][2], + matrix->dm[1][0], matrix->dm[1][1], matrix->dm[1][2], + matrix->dm[2][0], matrix->dm[2][1], matrix->dm[2][2], + templ->scale_h, templ->scale_v, templ->chroma_scale_h, + templ->chroma_scale_v, templ->width, templ->height, templ->chroma_width, + templ->chroma_height, templ->in_depth, templ->out_depth, templ->pstride, + templ->chroma_pstride, templ->in_shift, templ->out_shift, templ->mask, + templ->swap_uv, templ->max_in_val, templ->rgb_order.R, + templ->rgb_order.G, templ->rgb_order.B, templ->rgb_order.A, + templ->rgb_order.X, templ->read_chroma); +} + +static gchar * +cuda_converter_generate_rgb_to_yuv_kernel_code (GstCudaConverter * convert, + GstCudaKernelTempl * templ, MatrixData * matrix) +{ + return g_strdup_printf (templ_RGB_TO_YUV, + matrix->dm[0][3], matrix->dm[1][3], matrix->dm[2][3], + matrix->dm[0][0], matrix->dm[0][1], matrix->dm[0][2], + matrix->dm[1][0], matrix->dm[1][1], matrix->dm[1][2], + matrix->dm[2][0], matrix->dm[2][1], matrix->dm[2][2], + templ->scale_h, templ->scale_v, templ->chroma_scale_h, + templ->chroma_scale_v, templ->width, templ->height, templ->chroma_width, + templ->chroma_height, templ->in_depth, templ->out_depth, templ->pstride, + templ->chroma_pstride, templ->in_shift, templ->out_shift, templ->mask, + templ->swap_uv, templ->unpack_function, templ->read_chroma, + templ->write_chroma); +} + +static gchar * +cuda_converter_generate_rgb_to_rgb_kernel_code (GstCudaConverter * convert, + GstCudaKernelTempl * templ) +{ + return g_strdup_printf (templ_RGB_to_RGB, + templ->scale_h, templ->scale_v, + templ->width, templ->height, + templ->in_depth, templ->out_depth, templ->pstride, + templ->rgb_order.R, templ->rgb_order.G, + templ->rgb_order.B, templ->rgb_order.A, templ->rgb_order.X, + templ->unpack_function); +} + +#define SET_ORDER(o,r,g,b,a,x) G_STMT_START { \ + (o)->R = (r); \ + (o)->G = (g); \ + (o)->B = (b); \ + (o)->A = (a); \ + (o)->X = (x); \ +} G_STMT_END + +static void +cuda_converter_get_rgb_order (GstVideoFormat format, GstCudaRGBOrder * order) +{ + switch (format) { + case GST_VIDEO_FORMAT_RGBA: + SET_ORDER (order, 0, 1, 2, 3, -1); + break; + case GST_VIDEO_FORMAT_RGBx: + SET_ORDER (order, 0, 1, 2, -1, 3); + break; + case GST_VIDEO_FORMAT_BGRA: + SET_ORDER (order, 2, 1, 0, 3, -1); + break; + case GST_VIDEO_FORMAT_BGRx: + SET_ORDER (order, 2, 1, 0, -1, 3); + break; + case GST_VIDEO_FORMAT_ARGB: + SET_ORDER (order, 1, 2, 3, 0, -1); + break; + case GST_VIDEO_FORMAT_ABGR: + SET_ORDER (order, 3, 2, 1, 0, -1); + break; + case GST_VIDEO_FORMAT_RGB: + SET_ORDER (order, 0, 1, 2, -1, -1); + break; + case GST_VIDEO_FORMAT_BGR: + SET_ORDER (order, 2, 1, 0, -1, -1); + break; + case GST_VIDEO_FORMAT_BGR10A2_LE: + SET_ORDER (order, 1, 2, 3, 0, -1); + break; + case GST_VIDEO_FORMAT_RGB10A2_LE: + SET_ORDER (order, 3, 2, 1, 0, -1); + break; + default: + g_assert_not_reached (); + break; + } +} + +static gboolean +cuda_converter_lookup_path (GstCudaConverter * convert) +{ + GstVideoFormat in_format, out_format; + gboolean src_yuv, dst_yuv; + gboolean src_planar, dst_planar; + GstCudaKernelTempl templ = { 0, }; + GstVideoInfo *in_info, *out_info; + gboolean ret = FALSE; + CUresult cuda_ret; + + in_info = &convert->in_info; + out_info = &convert->out_info; + + in_format = GST_VIDEO_INFO_FORMAT (in_info); + out_format = GST_VIDEO_INFO_FORMAT (out_info); + + src_yuv = GST_VIDEO_INFO_IS_YUV (in_info); + dst_yuv = GST_VIDEO_INFO_IS_YUV (out_info); + + src_planar = GST_VIDEO_INFO_N_PLANES (in_info) == + GST_VIDEO_INFO_N_COMPONENTS (in_info); + dst_planar = GST_VIDEO_INFO_N_PLANES (out_info) == + GST_VIDEO_INFO_N_COMPONENTS (out_info); + + convert->keep_size = (GST_VIDEO_INFO_WIDTH (&convert->in_info) == + GST_VIDEO_INFO_WIDTH (&convert->out_info) && + GST_VIDEO_INFO_HEIGHT (&convert->in_info) == + GST_VIDEO_INFO_HEIGHT (&convert->out_info)); + + templ.scale_h = (gfloat) GST_VIDEO_INFO_COMP_WIDTH (in_info, 0) / + (gfloat) GST_VIDEO_INFO_COMP_WIDTH (out_info, 0); + templ.scale_v = (gfloat) GST_VIDEO_INFO_COMP_HEIGHT (in_info, 0) / + (gfloat) GST_VIDEO_INFO_COMP_HEIGHT (out_info, 0); + templ.chroma_scale_h = (gfloat) GST_VIDEO_INFO_COMP_WIDTH (in_info, 1) / + (gfloat) GST_VIDEO_INFO_COMP_WIDTH (out_info, 1); + templ.chroma_scale_v = (gfloat) GST_VIDEO_INFO_COMP_HEIGHT (in_info, 1) / + (gfloat) GST_VIDEO_INFO_COMP_HEIGHT (out_info, 1); + templ.width = GST_VIDEO_INFO_COMP_WIDTH (out_info, 0); + templ.height = GST_VIDEO_INFO_COMP_HEIGHT (out_info, 0); + templ.chroma_width = GST_VIDEO_INFO_COMP_WIDTH (out_info, 1); + templ.chroma_height = GST_VIDEO_INFO_COMP_HEIGHT (out_info, 1); + + templ.in_depth = GST_VIDEO_INFO_COMP_DEPTH (in_info, 0); + templ.out_depth = GST_VIDEO_INFO_COMP_DEPTH (out_info, 0); + templ.pstride = GST_VIDEO_INFO_COMP_PSTRIDE (out_info, 0); + templ.chroma_pstride = GST_VIDEO_INFO_COMP_PSTRIDE (out_info, 1); + templ.in_shift = in_info->finfo->shift[0]; + templ.out_shift = out_info->finfo->shift[0]; + templ.mask = ((1 << templ.out_depth) - 1) << templ.out_shift; + templ.swap_uv = (is_uv_swapped (in_format) != is_uv_swapped (out_format)); + + if (src_yuv && dst_yuv) { + convert->convert = convert_YUV_TO_YUV; + + if (src_planar && dst_planar) { + templ.read_chroma = READ_CHROMA_FROM_PLANAR; + templ.write_chroma = WRITE_CHROMA_TO_PLANAR; + } else if (!src_planar && dst_planar) { + templ.read_chroma = READ_CHROMA_FROM_SEMI_PLANAR; + templ.write_chroma = WRITE_CHROMA_TO_PLANAR; + } else if (src_planar && !dst_planar) { + templ.read_chroma = READ_CHROMA_FROM_PLANAR; + templ.write_chroma = WRITE_CHROMA_TO_SEMI_PLANAR; + } else { + templ.read_chroma = READ_CHROMA_FROM_SEMI_PLANAR; + templ.write_chroma = WRITE_CHROMA_TO_SEMI_PLANAR; + } + + convert->kernel_source = + cuda_converter_generate_yuv_to_yuv_kernel_code (convert, &templ); + convert->func_names[0] = GST_CUDA_KERNEL_FUNC; + + ret = TRUE; + } else if (src_yuv && !dst_yuv) { + MatrixData matrix; + + if (src_planar) { + templ.read_chroma = READ_CHROMA_FROM_PLANAR; + } else { + templ.read_chroma = READ_CHROMA_FROM_SEMI_PLANAR; + } + + templ.max_in_val = (1 << templ.in_depth) - 1; + cuda_converter_get_rgb_order (out_format, &templ.rgb_order); + + cuda_converter_get_matrix (convert, &matrix, in_info, out_info); + convert->kernel_source = + cuda_converter_generate_yuv_to_rgb_kernel_code (convert, + &templ, &matrix); + convert->func_names[0] = GST_CUDA_KERNEL_FUNC; + + convert->convert = convert_YUV_TO_RGB; + + ret = TRUE; + } else if (!src_yuv && dst_yuv) { + MatrixData matrix; + gsize element_size = 8; + GstVideoFormat unpack_format; + GstVideoFormat y444_format; + GstVideoInfo unpack_info; + GstVideoInfo y444_info; + gint i; + + if (dst_planar) { + templ.write_chroma = WRITE_CHROMA_TO_PLANAR; + } else { + templ.write_chroma = WRITE_CHROMA_TO_SEMI_PLANAR; + } + templ.read_chroma = READ_CHROMA_FROM_PLANAR; + + cuda_converter_get_rgb_order (in_format, &convert->in_rgb_order); + + if (templ.in_depth > 8) { + /* FIXME: RGB10A2_LE and BGR10A2_LE only */ + element_size = 16; + unpack_format = GST_VIDEO_FORMAT_ARGB64; + y444_format = GST_VIDEO_FORMAT_Y444_16LE; + templ.unpack_function = unpack_to_ARGB64; + } else { + unpack_format = GST_VIDEO_FORMAT_ARGB; + y444_format = GST_VIDEO_FORMAT_Y444; + templ.unpack_function = unpack_to_ARGB; + } + + gst_video_info_set_format (&unpack_info, + unpack_format, GST_VIDEO_INFO_WIDTH (in_info), + GST_VIDEO_INFO_HEIGHT (in_info)); + gst_video_info_set_format (&y444_info, + y444_format, GST_VIDEO_INFO_WIDTH (in_info), + GST_VIDEO_INFO_HEIGHT (in_info)); + + templ.in_depth = GST_VIDEO_INFO_COMP_DEPTH (&unpack_info, 0); + + cuda_ret = CuMemAllocPitch (&convert->unpack_surface.device_ptr, + &convert->unpack_surface.cuda_stride, + GST_VIDEO_INFO_COMP_WIDTH (&unpack_info, 0) * + GST_VIDEO_INFO_COMP_PSTRIDE (&unpack_info, 0), + GST_VIDEO_INFO_HEIGHT (&unpack_info), element_size); + + if (!gst_cuda_result (cuda_ret)) { + GST_ERROR ("couldn't alloc unpack surface"); + return FALSE; + } + + for (i = 0; i < 3; i++) { + cuda_ret = CuMemAllocPitch (&convert->y444_surface[i].device_ptr, + &convert->y444_surface[i].cuda_stride, + GST_VIDEO_INFO_COMP_WIDTH (&y444_info, i) * + GST_VIDEO_INFO_COMP_PSTRIDE (&y444_info, i), + GST_VIDEO_INFO_COMP_HEIGHT (&y444_info, i), element_size); + + if (!gst_cuda_result (cuda_ret)) { + GST_ERROR ("couldn't alloc %dth y444 surface", i); + return FALSE; + } + } + + cuda_converter_get_matrix (convert, &matrix, &unpack_info, &y444_info); + + convert->kernel_source = + cuda_converter_generate_rgb_to_yuv_kernel_code (convert, + &templ, &matrix); + + convert->func_names[0] = GST_CUDA_KERNEL_FUNC_TO_ARGB; + convert->func_names[1] = GST_CUDA_KERNEL_FUNC_TO_Y444; + convert->func_names[2] = GST_CUDA_KERNEL_FUNC_Y444_TO_YUV; + + convert->convert = convert_RGB_TO_YUV; + + ret = TRUE; + } else { + gsize element_size = 8; + GstVideoFormat unpack_format; + GstVideoInfo unpack_info; + + cuda_converter_get_rgb_order (in_format, &convert->in_rgb_order); + cuda_converter_get_rgb_order (out_format, &templ.rgb_order); + + if (templ.in_depth > 8) { + /* FIXME: RGB10A2_LE and BGR10A2_LE only */ + element_size = 16; + unpack_format = GST_VIDEO_FORMAT_ARGB64; + templ.unpack_function = unpack_to_ARGB64; + } else { + unpack_format = GST_VIDEO_FORMAT_ARGB; + templ.unpack_function = unpack_to_ARGB; + } + + gst_video_info_set_format (&unpack_info, + unpack_format, GST_VIDEO_INFO_WIDTH (in_info), + GST_VIDEO_INFO_HEIGHT (in_info)); + + templ.in_depth = GST_VIDEO_INFO_COMP_DEPTH (&unpack_info, 0); + + cuda_ret = CuMemAllocPitch (&convert->unpack_surface.device_ptr, + &convert->unpack_surface.cuda_stride, + GST_VIDEO_INFO_COMP_WIDTH (&unpack_info, 0) * + GST_VIDEO_INFO_COMP_PSTRIDE (&unpack_info, 0), + GST_VIDEO_INFO_HEIGHT (&unpack_info), element_size); + + if (!gst_cuda_result (cuda_ret)) { + GST_ERROR ("couldn't alloc unpack surface"); + return FALSE; + } + + convert->kernel_source = + cuda_converter_generate_rgb_to_rgb_kernel_code (convert, &templ); + + convert->func_names[0] = GST_CUDA_KERNEL_FUNC_TO_ARGB; + convert->func_names[1] = GST_CUDA_KERNEL_FUNC_SCALE_RGB; + + convert->convert = convert_RGB_TO_RGB; + + ret = TRUE; + } + + if (!ret) { + GST_DEBUG ("no path found"); + + return FALSE; + } + + GST_TRACE ("configured CUDA kernel source\n%s", convert->kernel_source); + + return TRUE; +} diff --git a/sys/nvcodec/cuda-converter.h b/sys/nvcodec/cuda-converter.h new file mode 100644 index 0000000000..ee67f50b56 --- /dev/null +++ b/sys/nvcodec/cuda-converter.h @@ -0,0 +1,62 @@ +/* GStreamer + * Copyright (C) 2019 Seungha Yang + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. + */ + +#ifndef __GST_CUDA_CONVERTER_H__ +#define __GST_CUDA_CONVERTER_H__ + +#include +#include "gstcudacontext.h" +#include "gstcudamemory.h" + +G_BEGIN_DECLS + +typedef struct _GstCudaConverter GstCudaConverter; + +#define GST_CUDA_CONVERTER_FORMATS \ + "{ I420, YV12, NV12, NV21, P010_10LE, P016_LE, I420_10LE, Y444, Y444_16LE, " \ + "BGRA, RGBA, RGBx, BGRx, ARGB, ABGR, RGB, BGR, BGR10A2_LE, RGB10A2_LE }" + +G_GNUC_INTERNAL +GstCudaConverter * gst_cuda_converter_new (GstVideoInfo * in_info, + GstVideoInfo * out_info, + GstCudaContext * cuda_ctx); + +G_GNUC_INTERNAL +void gst_cuda_converter_free (GstCudaConverter * convert); + +G_GNUC_INTERNAL +gboolean gst_cuda_converter_frame (GstCudaConverter * convert, + const GstCudaMemory * src, + GstVideoInfo * in_info, + GstCudaMemory * dst, + GstVideoInfo * out_info, + CUstream cuda_stream); + +G_GNUC_INTERNAL +gboolean gst_cuda_converter_frame_unlocked (GstCudaConverter * convert, + const GstCudaMemory * src, + GstVideoInfo * in_info, + GstCudaMemory * dst, + GstVideoInfo * out_info, + CUstream cuda_stream); + + +G_END_DECLS + +#endif /* __GST_CUDA_CONVERTER_H__ */ diff --git a/sys/nvcodec/gstcudacontext.c b/sys/nvcodec/gstcudacontext.c index 8962ff3281..ff1b350c98 100644 --- a/sys/nvcodec/gstcudacontext.c +++ b/sys/nvcodec/gstcudacontext.c @@ -46,6 +46,8 @@ struct _GstCudaContextPrivate CUdevice device; gint device_id; + gint tex_align; + GHashTable *accessible_peer; }; @@ -142,6 +144,7 @@ gst_cuda_context_constructed (GObject * object) gchar name[256]; gint min = 0, maj = 0; gint i; + gint tex_align = 0; GList *iter; if (g_once_init_enter (&once)) { @@ -166,11 +169,15 @@ gst_cuda_context_constructed (GObject * object) gst_cuda_result (CuDeviceGetAttribute (&maj, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cdev)) && gst_cuda_result (CuDeviceGetAttribute (&min, - CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cdev))) { + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cdev)) && + gst_cuda_result (CuDeviceGetAttribute (&tex_align, + CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT, cdev))) { GST_INFO ("GPU #%d supports NVENC: %s (%s) (Compute SM %d.%d)", i, (((maj << 4) + min) >= 0x30) ? "yes" : "no", name, maj, min); if (priv->device_id == -1 || priv->device_id == cdev) { priv->device_id = cuda_dev = cdev; + priv->tex_align = tex_align; + break; } } } @@ -382,6 +389,23 @@ gst_cuda_context_get_handle (GstCudaContext * ctx) return ctx->priv->context; } +/** + * gst_cuda_context_get_texture_alignment: + * @ctx: a #GstCudaContext + * + * Get required texture alignment by device + * + * Returns: the #CUcontext of @ctx + */ +gint +gst_cuda_context_get_texture_alignment (GstCudaContext * ctx) +{ + g_return_val_if_fail (ctx, 0); + g_return_val_if_fail (GST_IS_CUDA_CONTEXT (ctx), 0); + + return ctx->priv->tex_align; +} + /** * gst_cuda_context_can_access_peer: * @ctx: a #GstCudaContext diff --git a/sys/nvcodec/gstcudacontext.h b/sys/nvcodec/gstcudacontext.h index 5b5d640a76..d91bee9ad3 100644 --- a/sys/nvcodec/gstcudacontext.h +++ b/sys/nvcodec/gstcudacontext.h @@ -72,6 +72,9 @@ gboolean gst_cuda_context_pop (CUcontext * cuda_ctx); G_GNUC_INTERNAL gpointer gst_cuda_context_get_handle (GstCudaContext * ctx); +G_GNUC_INTERNAL +gint gst_cuda_context_get_texture_alignment (GstCudaContext * ctx); + G_GNUC_INTERNAL gboolean gst_cuda_context_can_access_peer (GstCudaContext * ctx, GstCudaContext * peer); diff --git a/sys/nvcodec/gstcudaloader.c b/sys/nvcodec/gstcudaloader.c index 116e68117d..4223ba1fbd 100644 --- a/sys/nvcodec/gstcudaloader.c +++ b/sys/nvcodec/gstcudaloader.c @@ -92,6 +92,22 @@ typedef struct _GstNvCodecCudaVTable CUdevice_attribute attrib, CUdevice dev); CUresult (CUDAAPI * CuDeviceCanAccessPeer) (int *canAccessPeer, CUdevice dev, CUdevice peerDev); + CUresult (CUDAAPI * CuDriverGetVersion) (int *driverVersion); + + CUresult (CUDAAPI * CuModuleLoadData) (CUmodule * module, + const void *image); + CUresult (CUDAAPI * CuModuleUnload) (CUmodule module); + CUresult (CUDAAPI * CuModuleGetFunction) (CUfunction * hfunc, + CUmodule hmod, const char *name); + CUresult (CUDAAPI * CuTexObjectCreate) (CUtexObject * pTexObject, + const CUDA_RESOURCE_DESC * pResDesc, const CUDA_TEXTURE_DESC * pTexDesc, + const CUDA_RESOURCE_VIEW_DESC * pResViewDesc); + CUresult (CUDAAPI * CuTexObjectDestroy) (CUtexObject texObject); + CUresult (CUDAAPI * CuLaunchKernel) (CUfunction f, unsigned int gridDimX, + unsigned int gridDimY, unsigned int gridDimZ, + unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, + unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, + void **extra); CUresult (CUDAAPI * CuGraphicsGLRegisterImage) (CUgraphicsResource * pCudaResource, unsigned int image, unsigned int target, @@ -164,6 +180,15 @@ gst_cuda_load_library (void) LOAD_SYMBOL (cuDeviceGetAttribute, CuDeviceGetAttribute); LOAD_SYMBOL (cuDeviceCanAccessPeer, CuDeviceCanAccessPeer); + LOAD_SYMBOL (cuDriverGetVersion, CuDriverGetVersion); + + LOAD_SYMBOL (cuModuleLoadData, CuModuleLoadData); + LOAD_SYMBOL (cuModuleUnload, CuModuleUnload); + LOAD_SYMBOL (cuModuleGetFunction, CuModuleGetFunction); + LOAD_SYMBOL (cuTexObjectCreate, CuTexObjectCreate); + LOAD_SYMBOL (cuTexObjectDestroy, CuTexObjectDestroy); + LOAD_SYMBOL (cuLaunchKernel, CuLaunchKernel); + /* cudaGL.h */ LOAD_SYMBOL (cuGraphicsGLRegisterImage, CuGraphicsGLRegisterImage); LOAD_SYMBOL (cuGraphicsGLRegisterBuffer, CuGraphicsGLRegisterBuffer); @@ -420,6 +445,71 @@ CuDeviceCanAccessPeer (int *canAccessPeer, CUdevice dev, CUdevice peerDev) return gst_cuda_vtable.CuDeviceCanAccessPeer (canAccessPeer, dev, peerDev); } +CUresult CUDAAPI +CuDriverGetVersion (int *driverVersion) +{ + g_assert (gst_cuda_vtable.CuDriverGetVersion != NULL); + + return gst_cuda_vtable.CuDriverGetVersion (driverVersion); +} + +CUresult CUDAAPI +CuModuleLoadData (CUmodule * module, const void *image) +{ + g_assert (gst_cuda_vtable.CuModuleLoadData != NULL); + + return gst_cuda_vtable.CuModuleLoadData (module, image); +} + +CUresult CUDAAPI +CuModuleUnload (CUmodule module) +{ + g_assert (gst_cuda_vtable.CuModuleUnload != NULL); + + return gst_cuda_vtable.CuModuleUnload (module); +} + +CUresult CUDAAPI +CuModuleGetFunction (CUfunction * hfunc, CUmodule hmod, const char *name) +{ + g_assert (gst_cuda_vtable.CuModuleGetFunction != NULL); + + return gst_cuda_vtable.CuModuleGetFunction (hfunc, hmod, name); +} + +CUresult CUDAAPI +CuTexObjectCreate (CUtexObject * pTexObject, + const CUDA_RESOURCE_DESC * pResDesc, const CUDA_TEXTURE_DESC * pTexDesc, + const CUDA_RESOURCE_VIEW_DESC * pResViewDesc) +{ + g_assert (gst_cuda_vtable.CuTexObjectCreate != NULL); + + return gst_cuda_vtable.CuTexObjectCreate (pTexObject, pResDesc, pTexDesc, + pResViewDesc); +} + +CUresult CUDAAPI +CuTexObjectDestroy (CUtexObject texObject) +{ + g_assert (gst_cuda_vtable.CuTexObjectDestroy != NULL); + + return gst_cuda_vtable.CuTexObjectDestroy (texObject); +} + +CUresult CUDAAPI +CuLaunchKernel (CUfunction f, unsigned int gridDimX, + unsigned int gridDimY, unsigned int gridDimZ, + unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, + unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, + void **extra) +{ + g_assert (gst_cuda_vtable.CuLaunchKernel != NULL); + + return gst_cuda_vtable.CuLaunchKernel (f, gridDimX, gridDimY, gridDimZ, + blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, + extra); +} + /* cudaGL.h */ CUresult CUDAAPI CuGraphicsGLRegisterImage (CUgraphicsResource * pCudaResource, diff --git a/sys/nvcodec/gstcudaloader.h b/sys/nvcodec/gstcudaloader.h index 64e8831e72..510eb03350 100644 --- a/sys/nvcodec/gstcudaloader.h +++ b/sys/nvcodec/gstcudaloader.h @@ -145,6 +145,43 @@ CUresult CUDAAPI CuDeviceCanAccessPeer (int *canAccessPeer, CUdevice dev, CUdevice peerDev); +G_GNUC_INTERNAL +CUresult CUDAAPI CuDriverGetVersion (int * driverVersion); + +G_GNUC_INTERNAL +CUresult CUDAAPI CuModuleLoadData (CUmodule* module, + const void *image); + +G_GNUC_INTERNAL +CUresult CUDAAPI CuModuleUnload (CUmodule module); + +G_GNUC_INTERNAL +CUresult CUDAAPI CuModuleGetFunction (CUfunction* hfunc, + CUmodule hmod, + const char* name); + +G_GNUC_INTERNAL +CUresult CUDAAPI CuTexObjectCreate (CUtexObject *pTexObject, + const CUDA_RESOURCE_DESC *pResDesc, + const CUDA_TEXTURE_DESC *pTexDesc, + const CUDA_RESOURCE_VIEW_DESC *pResViewDesc); + +G_GNUC_INTERNAL +CUresult CUDAAPI CuTexObjectDestroy (CUtexObject texObject); + +G_GNUC_INTERNAL +CUresult CUDAAPI CuLaunchKernel (CUfunction f, + unsigned int gridDimX, + unsigned int gridDimY, + unsigned int gridDimZ, + unsigned int blockDimX, + unsigned int blockDimY, + unsigned int blockDimZ, + unsigned int sharedMemBytes, + CUstream hStream, + void **kernelParams, + void **extra); + /* cudaGL.h */ G_GNUC_INTERNAL CUresult CUDAAPI CuGraphicsGLRegisterImage (CUgraphicsResource * pCudaResource, diff --git a/sys/nvcodec/meson.build b/sys/nvcodec/meson.build index 1a5170056e..f35c0c0d76 100644 --- a/sys/nvcodec/meson.build +++ b/sys/nvcodec/meson.build @@ -19,6 +19,7 @@ nvcodec_sources = [ 'gstcudaupload.c', 'gstcudanvrtc.c', 'gstnvrtcloader.c', + 'cuda-converter.c', ] if get_option('nvcodec').disabled() diff --git a/sys/nvcodec/stub/cuda.h b/sys/nvcodec/stub/cuda.h index b844cbc09e..a63d0ae129 100644 --- a/sys/nvcodec/stub/cuda.h +++ b/sys/nvcodec/stub/cuda.h @@ -27,7 +27,11 @@ typedef gpointer CUcontext; typedef gpointer CUgraphicsResource; typedef gpointer CUstream; typedef gpointer CUarray; +typedef gpointer CUmodule; +typedef gpointer CUfunction; +typedef gpointer CUmipmappedArray; +typedef guint64 CUtexObject; typedef guintptr CUdeviceptr; typedef gint CUdevice; @@ -46,6 +50,7 @@ typedef enum typedef enum { + CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76, } CUdevice_attribute; @@ -70,6 +75,39 @@ typedef enum CU_STREAM_NON_BLOCKING = 0x1 } CUstream_flags; +typedef enum +{ + CU_TR_FILTER_MODE_POINT = 0, + CU_TR_FILTER_MODE_LINEAR = 1 +} CUfilter_mode; + +typedef enum +{ + CU_TR_ADDRESS_MODE_WRAP = 0, + CU_TR_ADDRESS_MODE_CLAMP = 1, + CU_TR_ADDRESS_MODE_MIRROR = 2, + CU_TR_ADDRESS_MODE_BORDER = 3 +} CUaddress_mode; + +typedef enum +{ + CU_RESOURCE_TYPE_ARRAY = 0, + CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 1, + CU_RESOURCE_TYPE_LINEAR = 2, + CU_RESOURCE_TYPE_PITCH2D = 3 +} CUresourcetype; + +typedef enum +{ + CU_AD_FORMAT_UNSIGNED_INT8 = 1, + CU_AD_FORMAT_UNSIGNED_INT16 = 2, +} CUarray_format; + +typedef enum +{ + CU_RES_VIEW_FORMAT_NONE = 0, +} CUresourceViewFormat; + typedef struct { gsize srcXInBytes; @@ -97,6 +135,66 @@ typedef enum CU_GL_DEVICE_LIST_ALL = 0x01, } CUGLDeviceList; +typedef struct +{ + CUaddress_mode addressMode[3]; + CUfilter_mode filterMode; + guint flags; + guint maxAnisotropy; + CUfilter_mode mipmapFilterMode; + gfloat mipmapLevelBias; + gfloat minMipmapLevelClamp; + gfloat maxMipmapLevelClamp; + gfloat borderColor[4]; + gint reserved[12]; +} CUDA_TEXTURE_DESC; + +typedef struct +{ + CUresourcetype resType; + + union { + struct { + CUarray hArray; + } array; + struct { + CUmipmappedArray hMipmappedArray; + } mipmap; + struct { + CUdeviceptr devPtr; + CUarray_format format; + guint numChannels; + gsize sizeInBytes; + } linear; + struct { + CUdeviceptr devPtr; + CUarray_format format; + guint numChannels; + gsize width; + gsize height; + gsize pitchInBytes; + } pitch2D; + struct { + gint reserved[32]; + } reserved; + } res; + + guint flags; +} CUDA_RESOURCE_DESC; + +typedef struct +{ + CUresourceViewFormat format; + gsize width; + gsize height; + gsize depth; + guint firstMipmapLevel; + guint lastMipmapLevel; + guint firstLayer; + guint lastLayer; + guint reserved[16]; +} CUDA_RESOURCE_VIEW_DESC; + #define CUDA_VERSION 10000 #ifdef _WIN32 @@ -120,6 +218,8 @@ typedef enum #define cuMemFree cuMemFree_v2 #define cuGLGetDevices cuGLGetDevices_v2 +#define CU_TRSF_READ_AS_INTEGER 1 + G_END_DECLS #endif /* __GST_CUDA_STUB_H__ */