gstreamer/subprojects/gst-plugins-bad/sys/nvcodec/kernel/gstcudaconverter-unpack.cu
Seungha Yang 9a8f3a65a3 nvcodec: Add support for CUDA kernel precompile
Enable build time CUDA kernel compile if nvcc is detected.
Precompile is disabled by default and controlled by
"nvcodec-cuda-precompile" build option.

Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/8536>
2025-02-22 07:09:03 +00:00

168 lines
No EOL
6 KiB
Text

/* GStreamer
* Copyright (C) 2025 Seungha Yang <seungha@centricular.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301, USA.
*/
#ifdef __NVCC__
extern "C" {
/* Expands 3-byte pixels into 4-byte pixels, one pixel per thread.
 *
 * The thread's x index selects the pixel column and its y index selects the
 * row. The three source bytes are copied in order and 0xff is written as the
 * fourth destination byte.
 *
 * src/dst: source and destination byte buffers
 * width/height: number of pixel columns/rows to process
 * src_stride/dst_stride: distance in bytes between consecutive rows
 *
 * NOTE(review): byte offsets are computed in int; presumably the caller keeps
 * frame sizes small enough for that -- verify. */
__global__ void
GstCudaConverterUnpack_RGB_RGBx
(unsigned char *src, unsigned char *dst, int width, int height,
    int src_stride, int dst_stride)
{
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int row = blockIdx.y * blockDim.y + threadIdx.y;

  /* Threads mapped outside the width x height area have nothing to do */
  if (col >= width || row >= height)
    return;

  int in_off = row * src_stride + col * 3;
  int out_off = row * dst_stride + col * 4;
  const unsigned char *in = src + in_off;
  unsigned char *out = dst + out_off;

  out[0] = in[0];
  out[1] = in[1];
  out[2] = in[2];
  /* The fourth byte is always written as fully set */
  out[3] = 0xff;
}
/* Unpacks one 32-bit packed pixel per thread into four 16-bit components.
 *
 * The thread's x index selects the pixel column and its y index selects the
 * row. From the 32-bit source word, bits 0-9 are taken as red, bits 10-19 as
 * green, bits 20-29 as blue and bits 30-31 as alpha. The destination receives
 * four consecutive 16-bit values in the order alpha, red, green, blue.
 *
 * src/dst: source and destination byte buffers
 * width/height: number of pixel columns/rows to process
 * src_stride/dst_stride: distance in bytes between consecutive rows
 *
 * NOTE(review): the 32-bit load and the 16-bit stores assume the base
 * pointers and strides yield suitably aligned addresses, and offsets are
 * computed in int -- confirm both against the caller. */
__global__ void
GstCudaConverterUnpack_RGB10A2_ARGB64
(unsigned char *src, unsigned char *dst, int width, int height,
    int src_stride, int dst_stride)
{
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int row = blockIdx.y * blockDim.y + threadIdx.y;

  /* Threads mapped outside the width x height area have nothing to do */
  if (col >= width || row >= height)
    return;

  int out_off = col * 8 + row * dst_stride;
  unsigned int packed = *(unsigned int *) &src[col * 4 + row * src_stride];

  unsigned int a2 = (packed >> 30) & 0x03;
  unsigned int r10 = packed & 0x3ff;
  unsigned int g10 = (packed >> 10) & 0x3ff;
  unsigned int b10 = (packed >> 20) & 0x3ff;

  /* Multiplying by 0x5555 repeats the 2-bit value in all eight 2-bit slots
   * of the 16-bit result (0 -> 0x0000, 3 -> 0xffff) */
  unsigned short alpha = (unsigned short) (a2 * 0x5555);
  /* 10 -> 16 bits: shift up and refill the low 6 bits from the top bits */
  unsigned short red = (unsigned short) ((r10 << 6) | (r10 >> 4));
  unsigned short green = (unsigned short) ((g10 << 6) | (g10 >> 4));
  unsigned short blue = (unsigned short) ((b10 << 6) | (b10 >> 4));

  unsigned short *out = (unsigned short *) (dst + out_off);
  out[0] = alpha;
  out[1] = red;
  out[2] = green;
  out[3] = blue;
}
/* Unpacks one 32-bit packed pixel per thread into four 16-bit components.
 *
 * The thread's x index selects the pixel column and its y index selects the
 * row. From the 32-bit source word, bits 0-9 are taken as blue, bits 10-19 as
 * green, bits 20-29 as red and bits 30-31 as alpha. The destination receives
 * four consecutive 16-bit values in the order alpha, red, green, blue.
 *
 * src/dst: source and destination byte buffers
 * width/height: number of pixel columns/rows to process
 * src_stride/dst_stride: distance in bytes between consecutive rows
 *
 * NOTE(review): the 32-bit load and the 16-bit stores assume the base
 * pointers and strides yield suitably aligned addresses, and offsets are
 * computed in int -- confirm both against the caller. */
__global__ void
GstCudaConverterUnpack_BGR10A2_ARGB64
(unsigned char *src, unsigned char *dst, int width, int height,
    int src_stride, int dst_stride)
{
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int row = blockIdx.y * blockDim.y + threadIdx.y;

  /* Threads mapped outside the width x height area have nothing to do */
  if (col >= width || row >= height)
    return;

  int out_off = col * 8 + row * dst_stride;
  unsigned int packed = *(unsigned int *) &src[col * 4 + row * src_stride];

  unsigned int a2 = (packed >> 30) & 0x03;
  unsigned int b10 = packed & 0x3ff;
  unsigned int g10 = (packed >> 10) & 0x3ff;
  unsigned int r10 = (packed >> 20) & 0x3ff;

  /* Multiplying by 0x5555 repeats the 2-bit value in all eight 2-bit slots
   * of the 16-bit result (0 -> 0x0000, 3 -> 0xffff) */
  unsigned short alpha = (unsigned short) (a2 * 0x5555);
  /* 10 -> 16 bits: shift up and refill the low 6 bits from the top bits */
  unsigned short blue = (unsigned short) ((b10 << 6) | (b10 >> 4));
  unsigned short green = (unsigned short) ((g10 << 6) | (g10 >> 4));
  unsigned short red = (unsigned short) ((r10 << 6) | (r10 >> 4));

  unsigned short *out = (unsigned short *) (dst + out_off);
  out[0] = alpha;
  out[1] = red;
  out[2] = green;
  out[3] = blue;
}
}
#else
/* Source text of the three unpack kernels
 * (GstCudaConverterUnpack_RGB_RGBx, GstCudaConverterUnpack_RGB10A2_ARGB64 and
 * GstCudaConverterUnpack_BGR10A2_ARGB64) stored as a C string. This
 * definition is only visible when __NVCC__ is not defined; the __NVCC__
 * branch of this file defines the same kernels as real device code, so both
 * copies should be kept in sync when either one is edited.
 * NOTE(review): presumably this text is compiled at runtime (e.g. via NVRTC)
 * by the code that includes this file -- confirm against the user of this
 * symbol. */
static const char GstCudaConverterUnpack_str[] =
"extern \"C\" {\n"
"__global__ void\n"
"GstCudaConverterUnpack_RGB_RGBx\n"
"(unsigned char *src, unsigned char *dst, int width, int height,\n"
" int src_stride, int dst_stride)\n"
"{\n"
" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
" if (x_pos < width && y_pos < height) {\n"
" int dst_pos = x_pos * 4 + y_pos * dst_stride;\n"
" int src_pos = x_pos * 3 + y_pos * src_stride;\n"
" dst[dst_pos] = src[src_pos];\n"
" dst[dst_pos + 1] = src[src_pos + 1];\n"
" dst[dst_pos + 2] = src[src_pos + 2];\n"
" dst[dst_pos + 3] = 0xff;\n"
" }\n"
"}\n"
"\n"
"__global__ void\n"
"GstCudaConverterUnpack_RGB10A2_ARGB64\n"
"(unsigned char *src, unsigned char *dst, int width, int height,\n"
" int src_stride, int dst_stride)\n"
"{\n"
" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
" if (x_pos < width && y_pos < height) {\n"
" unsigned short a, r, g, b;\n"
" unsigned int val;\n"
" int dst_pos = x_pos * 8 + y_pos * dst_stride;\n"
" val = *(unsigned int *)&src[x_pos * 4 + y_pos * src_stride];\n"
" a = (val >> 30) & 0x03;\n"
" a = (a << 14) | (a << 12) | (a << 10) | (a << 8) | (a << 6) | (a << 4) | (a << 2) | (a << 0);\n"
" r = (val & 0x3ff);\n"
" r = (r << 6) | (r >> 4);\n"
" g = ((val >> 10) & 0x3ff);\n"
" g = (g << 6) | (g >> 4);\n"
" b = ((val >> 20) & 0x3ff);\n"
" b = (b << 6) | (b >> 4);\n"
" *(unsigned short *) &dst[dst_pos] = a;\n"
" *(unsigned short *) &dst[dst_pos + 2] = r;\n"
" *(unsigned short *) &dst[dst_pos + 4] = g;\n"
" *(unsigned short *) &dst[dst_pos + 6] = b;\n"
" }\n"
"}\n"
"\n"
"__global__ void\n"
"GstCudaConverterUnpack_BGR10A2_ARGB64\n"
"(unsigned char *src, unsigned char *dst, int width, int height,\n"
" int src_stride, int dst_stride)\n"
"{\n"
" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
" if (x_pos < width && y_pos < height) {\n"
" unsigned short a, r, g, b;\n"
" unsigned int val;\n"
" int dst_pos = x_pos * 8 + y_pos * dst_stride;\n"
" val = *(unsigned int *)&src[x_pos * 4 + y_pos * src_stride];\n"
" a = (val >> 30) & 0x03;\n"
" a = (a << 14) | (a << 12) | (a << 10) | (a << 8) | (a << 6) | (a << 4) | (a << 2) | (a << 0);\n"
" b = (val & 0x3ff);\n"
" b = (b << 6) | (b >> 4);\n"
" g = ((val >> 10) & 0x3ff);\n"
" g = (g << 6) | (g >> 4);\n"
" r = ((val >> 20) & 0x3ff);\n"
" r = (r << 6) | (r >> 4);\n"
" *(unsigned short *) &dst[dst_pos] = a;\n"
" *(unsigned short *) &dst[dst_pos + 2] = r;\n"
" *(unsigned short *) &dst[dst_pos + 4] = g;\n"
" *(unsigned short *) &dst[dst_pos + 6] = b;\n"
" }\n"
"}\n"
"}\n"
"\n";
#endif