examples: cuda: Add CUDA memory synchronization example

Add an example code for external CUDA context sharing and
gst_cuda_memory_sync()
This commit is contained in:
Seungha Yang 2024-05-15 02:29:12 +09:00
parent 0a2e0a4f64
commit b02cf3fdcc
3 changed files with 598 additions and 0 deletions

View file

@ -0,0 +1,529 @@
/*
* GStreamer
* Copyright (C) 2024 Seungha Yang <seungha@centricular.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301, USA.
*/
/* This example demonstrates how to share application's CUDA context with
* GStreamer, and CUDA synchronization.
*
* In case that application wants to read CUDA device memory produced by
* GStreamer directly, buffer/memory map with GST_MAP_CUDA flag will return
* CUDA device memory instead of staging system memory. Also, GStreamer will not
* wait for pending CUDA operation associated with the device memory when
* GST_MAP_CUDA is specified. Thus, synchronization is user's responsibility.
* For the synchronization, the app needs to use GStreamer's CUDA stream, or
* wait for possibly pending GPU operations queued by GStreamer.
* 1) Executes operations with GStreamer's CUDA stream:
* GstCudaMemory will hold associated CUDA stream. User can access the
* CUDA stream via gst_cuda_memory_get_stream() which returns GstCudaStream
* object. The GstCudaStream is a wrapper of CUstream, so that the native
* handle can be used as a refcounted manner. To get native CUstream handle,
* use gst_cuda_stream_get_handle(). Since GPU commands are serialized in
* the CUDA stream already, user-side CUDA operation using the shared
* CUDA stream will be automatically serialized.
* 2) Executes CUDA operation without GStreamer's CUDA stream:
* Since queued GPU commands may or may not be finished at the moment
* when application executes any CUDA operation using application's own
* CUDA stream, application should wait for GStreamer side CUDA operation.
* gst_cuda_memory_sync() will execute synchronization operation if needed
* and will block the calling CPU thread.
*
* This example consists of following steps
* - Prepares CUDA resources (context, memory, etc)
* - Launches GStreamer pipeline with shared CUDA context.
* The pipeline will produce GstCudaMemory rendered by cudaconvert element.
* - Executes the scale CUDA kernel function and downloads the scaled frame to host memory
* - Encodes downloaded host memory to JPEG, write to a file.
*
* NOTE: In this example code, GStreamer's dlopen-ed CUDA functions
* (declared in cuda-gst.h) will be used instead of the ones declared
* in cuda.h.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <gst/gst.h>
#ifdef G_OS_WIN32
#include <windows.h>
#endif
#include <gst/video/video.h>
#include <gst/cuda/gstcuda.h>
#include <cuda.h>
#include <string.h>
#include <stdio.h>
#define RENDER_TARGET_WIDTH 640
#define RENDER_TARGET_HEIGHT 480
typedef struct
{
  /* Protects @buffer; @cond is signalled once @buffer is set */
  GMutex lock;
  GCond cond;
  /* Application-created CUDA context, shared with the pipeline via
   * the sync bus handler */
  GstCudaContext *cuda_ctx;
  /* Buffer captured from fakesink's handoff signal */
  GstBuffer *buffer;
} AppData;
/* fakesink "handoff" callback: stores a reference to the rendered buffer
 * in @data and wakes up the main thread waiting on @data->cond.
 * NOTE(review): an earlier stored buffer would be overwritten unreleased;
 * the pipeline uses num-buffers=1 so this fires only once. */
static void
on_handoff_cb (GstElement * sink, GstBuffer * buf, GstPad * pad, AppData * data)
{
  GstBuffer *taken = gst_buffer_ref (buf);

  g_mutex_lock (&data->lock);
  data->buffer = taken;
  g_cond_signal (&data->cond);
  g_mutex_unlock (&data->lock);
}
/* Synchronous bus handler: answers need-context queries from pipeline
 * elements with the application's wrapped CUDA context. Context sharing
 * must happen from the sync handler so the context is set before the
 * element finishes its state change. */
static GstBusSyncReply
bus_sync_handler (GstBus * bus, GstMessage * msg, AppData * data)
{
  if (GST_MESSAGE_TYPE (msg) == GST_MESSAGE_NEED_CONTEXT) {
    const gchar *ctx_type = NULL;

    gst_message_parse_context_type (msg, &ctx_type);
    gst_println ("Got need-context %s", ctx_type);

    if (g_strcmp0 (ctx_type, GST_CUDA_CONTEXT_TYPE) == 0) {
      GstContext *gst_ctx = gst_context_new_cuda_context (data->cuda_ctx);

      gst_element_set_context (GST_ELEMENT (msg->src), gst_ctx);
      gst_context_unref (gst_ctx);
    }
  }

  /* Always let the message continue to any async bus watch */
  return GST_BUS_PASS;
}
/* *INDENT-OFF* */
/* CUDA kernel source, compiled at runtime with NVRTC via
 * gst_cuda_nvrtc_compile_cubin(). One thread per output pixel: samples the
 * source frame through a normalized-coordinate texture (bilinear filtering
 * is configured host-side) and writes a 640x480 RGBA image into the pitched
 * destination buffer. The 640/480 constants match
 * RENDER_TARGET_WIDTH/HEIGHT */
static const gchar kernel_func_str[] =
"extern \"C\" {\n"
"__device__ inline unsigned char\n"
"scale_to_uchar (float val)\n"
"{\n"
/* Texture fetch returns normalized floats; convert back to 8-bit */
" return (unsigned char) __float2int_rz (val * 255.0);\n"
"}\n"
"__global__ void\n"
"scale_func (cudaTextureObject_t tex, unsigned char * dst, size_t stride)\n"
"{\n"
" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
/* Bounds check: the grid is rounded up to a multiple of the block size */
" if (x_pos >= 640 || y_pos >= 480)"
" return;\n"
/* Normalized texture coordinates in [0, 1) */
" float x = (float) x_pos / 640.0f;\n"
" float y = (float) y_pos / 480.0f;\n"
" float4 sample = tex2D<float4>(tex, x, y);\n"
" int dst_pos = (x_pos * 4) + (y_pos * stride);\n"
" dst[dst_pos] = scale_to_uchar (sample.x);\n"
" dst[dst_pos + 1] = scale_to_uchar (sample.y);\n"
" dst[dst_pos + 2] = scale_to_uchar (sample.z);\n"
" dst[dst_pos + 3] = scale_to_uchar (sample.w);\n"
"}\n"
"}\n";
/* *INDENT-ON* */
/* Entry point: prepares application-owned CUDA resources, runs a short
 * GStreamer pipeline that produces one RGBA GstCudaMemory frame, scales it
 * with a custom kernel, downloads the result and writes it out as JPEG.
 *
 * Fixes over the original:
 * - error message after CuCtxPushCurrent() wrongly said "cuCtxPopCurrent"
 * - typo in the "No available CUDA device" error message
 * Error paths simply return 1 and rely on process teardown for cleanup,
 * which is intentional for example code. */
gint
main (gint argc, gchar ** argv)
{
  gchar *location = NULL;
  gboolean shared_stream = FALSE;
  GOptionEntry options[] = {
    {"location", 'l', 0, G_OPTION_ARG_STRING, &location,
        "Output jpeg file location", NULL},
    {"shared-stream", 's', 0, G_OPTION_ARG_NONE, &shared_stream,
        "Use GStreamer's CUDA stream", NULL},
    {NULL}
  };
  GOptionContext *option_ctx;
  gboolean ret;
  GError *err = NULL;
  CUresult cuda_ret;
  CUcontext cuda_ctx;
  CUdevice cuda_dev;
  int dev_cnt = 0;
  GstElement *pipeline;
  CUdeviceptr render_target;
  void *host_mem;
  gsize mem_size;
  size_t pitch;
  GstElement *sink;
  GstBus *bus;
  AppData app_data;
  gchar *cubin;
  CUmodule module;
  CUfunction kernel_func;
  GstBuffer *converted_buf;
  GstVideoInfo info;
  GstCaps *caps;
  GstSample *sample;
  gsize offset[GST_VIDEO_MAX_PLANES] = { 0, };
  gint stride[GST_VIDEO_MAX_PLANES] = { 0, };
  GstSample *jpeg_sample;
  GstCaps *jpeg_caps;
  CUstream app_stream = NULL;

  option_ctx = g_option_context_new ("CUDA memory sync example");
  g_option_context_add_main_entries (option_ctx, options, NULL);
  g_option_context_add_group (option_ctx, gst_init_get_option_group ());
  ret = g_option_context_parse (option_ctx, &argc, &argv, &err);
  g_option_context_free (option_ctx);

  if (!ret) {
    gst_printerrln ("option parsing failed: %s", err->message);
    g_clear_error (&err);
    return 1;
  }

  if (!location) {
    gst_printerrln ("File location must be specified");
    return 1;
  }

  /* Loads the CUDA driver library (GStreamer dlopen-s libcuda) */
  if (!gst_cuda_load_library ()) {
    gst_printerrln ("Unable to initialize GstCUDA library");
    return 1;
  }

  /* NVRTC is needed to compile the kernel source at runtime */
  if (!gst_cuda_nvrtc_load_library ()) {
    gst_printerrln ("Unable to load CUDA runtime compiler library");
    return 1;
  }

  /* Initialize CUDA and create device */
  cuda_ret = CuInit (0);
  if (cuda_ret != CUDA_SUCCESS) {
    gst_printerrln ("cuInit failed");
    return 1;
  }

  cuda_ret = CuDeviceGetCount (&dev_cnt);
  if (cuda_ret != CUDA_SUCCESS || dev_cnt == 0) {
    gst_printerrln ("No available CUDA device");
    return 1;
  }

  cuda_ret = CuDeviceGet (&cuda_dev, 0);
  if (cuda_ret != CUDA_SUCCESS) {
    gst_printerrln ("Couldn't get CUDA device");
    return 1;
  }

  /* Creates the application-owned context; it becomes current on this
   * thread until the CuCtxPopCurrent() below */
  cuda_ret = CuCtxCreate (&cuda_ctx, 0, cuda_dev);
  if (cuda_ret != CUDA_SUCCESS) {
    gst_printerrln ("Couldn't create CUDA context");
    return 1;
  }

  /* Application's own stream is only needed when we don't reuse the
   * stream attached to GstCudaMemory */
  if (!shared_stream) {
    cuda_ret = CuStreamCreate (&app_stream, CU_STREAM_DEFAULT);
    if (cuda_ret != CUDA_SUCCESS) {
      gst_printerrln ("Couldn't create CUDA stream");
      return 1;
    }
  }

  /* Allocate render target device memory, 4 bytes per RGBA pixel */
  cuda_ret = CuMemAllocPitch (&render_target,
      &pitch, RENDER_TARGET_WIDTH * 4, RENDER_TARGET_HEIGHT, 16);
  if (cuda_ret != CUDA_SUCCESS) {
    gst_printerrln ("cuMemAllocPitch failed");
    return 1;
  }

  /* Pinned host memory for fast async device-to-host download */
  mem_size = pitch * RENDER_TARGET_HEIGHT;
  cuda_ret = CuMemAllocHost (&host_mem, mem_size);
  if (cuda_ret != CUDA_SUCCESS) {
    gst_printerrln ("cuMemAllocHost failed");
    return 1;
  }

  /* We will download converted CUDA device memory to this system memory */
  converted_buf = gst_buffer_new_wrapped_full (0,
      host_mem, mem_size, 0, mem_size, NULL, NULL);
  gst_video_info_set_format (&info, GST_VIDEO_FORMAT_RGBA,
      RENDER_TARGET_WIDTH, RENDER_TARGET_HEIGHT);
  stride[0] = pitch;

  /* Since we allocated system memory with the same size of CUDA device
   * memory, need to attach video meta to signal memory layout. The pitch
   * can be different from default stride */
  gst_buffer_add_video_meta_full (converted_buf, GST_VIDEO_FRAME_FLAG_NONE,
      GST_VIDEO_FORMAT_RGBA, RENDER_TARGET_WIDTH, RENDER_TARGET_HEIGHT,
      1, offset, stride);

  /* Compile the kernel source for the selected device and load it */
  cubin = gst_cuda_nvrtc_compile_cubin (kernel_func_str, (gint) cuda_dev);
  if (!cubin) {
    gst_printerrln ("Couldn't compile cubin");
    return 1;
  }

  cuda_ret = CuModuleLoadData (&module, cubin);
  g_free (cubin);
  if (cuda_ret != CUDA_SUCCESS) {
    gst_printerrln ("cuModuleLoadData failed");
    return 1;
  }

  cuda_ret = CuModuleGetFunction (&kernel_func, module, "scale_func");
  if (cuda_ret != CUDA_SUCCESS) {
    gst_printerrln ("cuModuleGetFunction failed");
    return 1;
  }

  /* Done with setup on this thread; release the context so it can be
   * made current elsewhere */
  cuda_ret = CuCtxPopCurrent (NULL);
  if (cuda_ret != CUDA_SUCCESS) {
    gst_printerrln ("cuCtxPopCurrent failed");
    return 1;
  }

  /* Create GstCudaContext wrapping our context */
  app_data.cuda_ctx = gst_cuda_context_new_wrapped (cuda_ctx, cuda_dev);
  if (!app_data.cuda_ctx) {
    gst_printerrln ("Couldn't create wrapped context");
    return 1;
  }

  /* Single NV12 test frame, uploaded and converted to RGBA CUDA memory */
  pipeline = gst_parse_launch ("videotestsrc num-buffers=1 ! "
      "video/x-raw,format=NV12 ! cudaupload ! cudaconvert ! "
      "video/x-raw(memory:CUDAMemory),format=RGBA ! "
      "fakesink signal-handoffs=true name=sink", NULL);
  if (!pipeline) {
    gst_printerrln ("Couldn't create pipeline");
    return 1;
  }

  g_mutex_init (&app_data.lock);
  g_cond_init (&app_data.cond);
  app_data.buffer = NULL;

  sink = gst_bin_get_by_name (GST_BIN (pipeline), "sink");
  g_assert (sink);

  /* Install handoff signal to get GstCudaMemory processed by cudaconvert */
  g_signal_connect (sink, "handoff", G_CALLBACK (on_handoff_cb), &app_data);
  gst_object_unref (sink);

  /* Setup **SYNC** bus handler. In case that an application wants to
   * share its own CUDA context with GStreamer pipeline, GstContext
   * should be configured using sync bus handler */
  bus = gst_element_get_bus (pipeline);
  gst_bus_set_sync_handler (bus, (GstBusSyncHandler) bus_sync_handler,
      &app_data, NULL);
  gst_object_unref (bus);

  if (gst_element_set_state (pipeline, GST_STATE_PLAYING) ==
      GST_STATE_CHANGE_FAILURE) {
    gst_printerrln ("State change failed");
    return 1;
  }

  /* Wait for processed buffer */
  g_mutex_lock (&app_data.lock);
  while (!app_data.buffer)
    g_cond_wait (&app_data.cond, &app_data.lock);
  g_mutex_unlock (&app_data.lock);

  gst_element_set_state (pipeline, GST_STATE_NULL);

  /* Launch image scale kernel func and download to host memory */
  {
    CUtexObject texture;
    GstMemory *mem;
    GstCudaMemory *cmem;
    GstCudaStream *gst_stream;
    CUstream stream;
    CUDA_MEMCPY2D copy_params = { 0, };
    CUDA_RESOURCE_DESC resource_desc;
    CUDA_TEXTURE_DESC texture_desc;
    GstMapInfo src_map;
    void *kernel_args[] = { &texture, &render_target, &pitch };

    mem = gst_buffer_peek_memory (app_data.buffer, 0);
    g_assert (gst_is_cuda_memory (mem));

    /* GST_MAP_CUDA maps the memory as a CUDA device pointer instead of
     * staging system memory */
    if (!gst_memory_map (mem, &src_map, GST_MAP_READ | GST_MAP_CUDA)) {
      gst_printerrln ("gst_memory_map failed");
      return 1;
    }

    cmem = GST_CUDA_MEMORY_CAST (mem);

    /* In case of GST_MAP_CUDA, GStreamer will not wait for CUDA sync.
     * Application can use CUDA stream attached in GstCudaMemory
     * or need to call gst_cuda_memory_sync() to ensure synchronization */
    if (shared_stream) {
      /* GPU commands are serialized in GStreamer's stream already, so no
       * explicit synchronization is required */
      gst_stream = gst_cuda_memory_get_stream (cmem);
      stream = gst_cuda_stream_get_handle (gst_stream);
    } else {
      /* Blocks until GStreamer's pending GPU operations on this memory
       * are finished, then our own stream can be used safely */
      gst_cuda_memory_sync (cmem);
      stream = app_stream;
    }

    /* Prepare texture resource backed by the mapped device memory */
    memset (&resource_desc, 0, sizeof (CUDA_RESOURCE_DESC));
    memset (&texture_desc, 0, sizeof (CUDA_TEXTURE_DESC));

    resource_desc.resType = CU_RESOURCE_TYPE_PITCH2D;
    resource_desc.res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT8;
    resource_desc.res.pitch2D.numChannels = 4;
    resource_desc.res.pitch2D.width = cmem->info.width;
    resource_desc.res.pitch2D.height = cmem->info.height;
    resource_desc.res.pitch2D.pitchInBytes = cmem->info.stride[0];
    resource_desc.res.pitch2D.devPtr = (CUdeviceptr) src_map.data;

    /* Bilinear filtering with normalized coordinates, as expected by the
     * scale_func kernel */
    texture_desc.filterMode = CU_TR_FILTER_MODE_LINEAR;
    texture_desc.flags = CU_TRSF_NORMALIZED_COORDINATES;
    texture_desc.addressMode[0] = CU_TR_ADDRESS_MODE_CLAMP;
    texture_desc.addressMode[1] = CU_TR_ADDRESS_MODE_CLAMP;
    texture_desc.addressMode[2] = CU_TR_ADDRESS_MODE_CLAMP;

    cuda_ret = CuCtxPushCurrent (cuda_ctx);
    if (cuda_ret != CUDA_SUCCESS) {
      gst_printerrln ("cuCtxPushCurrent failed");
      return 1;
    }

    /* Create texture for sampling */
    cuda_ret = CuTexObjectCreate (&texture,
        &resource_desc, &texture_desc, NULL);
    if (cuda_ret != CUDA_SUCCESS) {
      gst_printerrln ("cuTexObjectCreate failed");
      return 1;
    }

    /* 16x16 thread blocks; grid rounded up to cover the full target */
    cuda_ret = CuLaunchKernel (kernel_func,
        GST_ROUND_UP_16 (RENDER_TARGET_WIDTH) / 16,
        GST_ROUND_UP_16 (RENDER_TARGET_HEIGHT) / 16, 1, 16, 16, 1, 0,
        stream, kernel_args, NULL);
    if (cuda_ret != CUDA_SUCCESS) {
      gst_printerrln ("cuLaunchKernel failed");
      return 1;
    }

    /* Download to system memory */
    copy_params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    copy_params.srcDevice = render_target;
    copy_params.srcPitch = pitch;
    copy_params.dstMemoryType = CU_MEMORYTYPE_HOST;
    copy_params.dstHost = host_mem;
    copy_params.dstPitch = pitch;
    copy_params.WidthInBytes = RENDER_TARGET_WIDTH * 4;
    copy_params.Height = RENDER_TARGET_HEIGHT;

    cuda_ret = CuMemcpy2DAsync (&copy_params, stream);
    if (cuda_ret != CUDA_SUCCESS) {
      gst_printerrln ("cuMemcpy2DAsync failed");
      return 1;
    }

    /* Wait for conversion and memory download */
    cuda_ret = CuStreamSynchronize (stream);
    if (cuda_ret != CUDA_SUCCESS) {
      gst_printerrln ("cuStreamSynchronize failed");
      return 1;
    }

    cuda_ret = CuTexObjectDestroy (texture);
    if (cuda_ret != CUDA_SUCCESS) {
      gst_printerrln ("cuTexObjectDestroy failed");
      return 1;
    }

    cuda_ret = CuCtxPopCurrent (NULL);
    if (cuda_ret != CUDA_SUCCESS) {
      gst_printerrln ("cuCtxPopCurrent failed");
      return 1;
    }

    gst_memory_unmap (mem, &src_map);
  }

  /* Create sample and convert it to jpeg image */
  caps = gst_video_info_to_caps (&info);
  sample = gst_sample_new (converted_buf, caps, NULL, NULL);
  jpeg_caps = gst_caps_new_empty_simple ("image/jpeg");
  jpeg_sample = gst_video_convert_sample (sample,
      jpeg_caps, GST_CLOCK_TIME_NONE, NULL);
  if (!jpeg_sample) {
    gst_printerrln ("gst_video_convert_sample failed");
    return 1;
  }

  {
    GstBuffer *jpeg_buf = gst_sample_get_buffer (jpeg_sample);
    GstMapInfo map;
    FILE *fp;

    if (!gst_buffer_map (jpeg_buf, &map, GST_MAP_READ)) {
      gst_printerrln ("gst_buffer_map failed");
      return 1;
    }

    fp = fopen (location, "wb");
    if (!fp) {
      gst_printerrln ("fopen failed");
      return 1;
    }

    if (map.size != fwrite (map.data, 1, map.size, fp)) {
      gst_printerrln ("fwrite failed");
      return 1;
    }

    fclose (fp);
    gst_buffer_unmap (jpeg_buf, &map);
  }

  gst_println ("JPEG file is written to \"%s\"", location);

  /* Cleanup */
  g_free (location);
  gst_buffer_unref (app_data.buffer);
  gst_object_unref (app_data.cuda_ctx);
  g_mutex_clear (&app_data.lock);
  g_cond_clear (&app_data.cond);
  gst_object_unref (pipeline);
  gst_buffer_unref (converted_buf);
  gst_sample_unref (jpeg_sample);
  gst_sample_unref (sample);
  gst_caps_unref (caps);
  gst_caps_unref (jpeg_caps);

  /* Release CUDA resources; the wrapped GstCudaContext does not own the
   * underlying CUcontext, so the application destroys it here */
  CuCtxPushCurrent (cuda_ctx);
  CuModuleUnload (module);
  CuMemFree (render_target);
  CuMemFreeHost (host_mem);
  if (app_stream)
    CuStreamDestroy (app_stream);
  CuCtxPopCurrent (NULL);
  CuCtxDestroy (cuda_ctx);

  gst_deinit ();

  return 0;
}

View file

@ -0,0 +1,68 @@
# Example requires the GstCUDA library
if not gstcuda_dep.found()
  subdir_done()
endif

cuda_deps = []
cuda_inc_dir = []

if host_system == 'windows'
  # Locate CUDA toolkit headers via the CUDA_PATH environment variable set by
  # the toolkit installer. Default to '' so that an unset variable is detected
  # below (without the default, Python would print the literal string "None"
  # and include_directories() would fail the configure)
  cuda_path = run_command(python3, '-c', 'import os; print(os.environ.get("CUDA_PATH", ""))', check: false).stdout().strip()
  if cuda_path == ''
    subdir_done()
  endif
  cuda_inc_dir = include_directories(join_paths(cuda_path, 'include'))
  if not cc.has_header('cuda.h', include_directories: cuda_inc_dir)
    subdir_done()
  endif
else
  # NOTE: meson dependency('cuda', version: '>=10'...) will return CUDA runtime
  # library but we want to use CUDA driver API. Iterates over the known driver
  # pkg-config names, newest first, until one is found
  cuda_versions = [
    '12.4',
    '12.3',
    '12.2',
    '12.1',
    '12.0',
    '11.8',
    '11.7',
    '11.6',
    '11.5',
    '11.4',
    '11.3',
    '11.2',
    '11.1',
    '11.0',
    '10.2',
    '10.1',
    '10.0',
  ]
  cuda_dep = dependency('', required: false)
  foreach ver : cuda_versions
    cuda_dep = dependency('cuda-@0@'.format(ver), required: false)
    if cuda_dep.found()
      break
    endif
  endforeach
  if not cuda_dep.found()
    subdir_done()
  endif
  # Only headers and compile args are needed; libcuda is dlopen-ed at runtime
  cuda_deps = [cuda_dep.partial_dependency(includes : true, compile_args: true)]
endif

gl_header_dep = dependency('', fallback : ['gl-headers', 'gl_headers_dummy_dep'],
    required : false)
if gl_header_dep.type_name() == 'internal'
  # this will only contain the includes of headers that are not found
  compat_includes = subproject('gl-headers').get_variable('compatibility_includes')
else
  compat_includes = []
endif

executable('cudamemory-sync', 'cudamemory-sync.c',
  include_directories : [configinc] + compat_includes + cuda_inc_dir,
  dependencies: [gst_dep, gstvideo_dep, gstcuda_dep, gl_header_dep] + cuda_deps,
  c_args : gst_plugins_bad_args + ['-DGST_USE_UNSTABLE_API'],
  install: false)

View file

@ -3,6 +3,7 @@ subdir('avsamplesink')
subdir('camerabin2')
subdir('codecparsers')
subdir('codecs')
# CUDA context sharing / gst_cuda_memory_sync() example
subdir('cuda')
subdir('d3d11')
subdir('d3d12')
subdir('directfb')