From 6fb0c7b92833355cda4851be6e26fffbc4d0504d Mon Sep 17 00:00:00 2001 From: Seungha Yang Date: Wed, 15 May 2024 02:29:12 +0900 Subject: [PATCH] examples: cuda: Add CUDA memory synchronization example Add an example code for external CUDA context sharing and gst_cuda_memory_sync() Part-of: --- .../tests/examples/cuda/cudamemory-sync.c | 529 ++++++++++++++++++ .../tests/examples/cuda/meson.build | 68 +++ .../tests/examples/meson.build | 1 + 3 files changed, 598 insertions(+) create mode 100644 subprojects/gst-plugins-bad/tests/examples/cuda/cudamemory-sync.c create mode 100644 subprojects/gst-plugins-bad/tests/examples/cuda/meson.build diff --git a/subprojects/gst-plugins-bad/tests/examples/cuda/cudamemory-sync.c b/subprojects/gst-plugins-bad/tests/examples/cuda/cudamemory-sync.c new file mode 100644 index 0000000000..82abd26608 --- /dev/null +++ b/subprojects/gst-plugins-bad/tests/examples/cuda/cudamemory-sync.c @@ -0,0 +1,529 @@ +/* + * GStreamer + * Copyright (C) 2024 Seungha Yang + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. + */ + +/* This example demonstrates how to share application's CUDA context with + * GStreamer, and CUDA synchronization. + * + * In case that application wants to read CUDA device memory produced by + * GStreamer directly, buffer/memory map with GST_MAP_CUDA flag will return + * CUDA device memory instead of staging system memory. Also, GStreamer will not + * wait for pending CUDA operation associated with the device memory when + * GST_MAP_CUDA is specified. Thus, synchronization is user's responsibility. + * For the synchronization, app needs to use GStreamer's CUDA stream, or + * waits for possibly pending GPU operations queued by GStreamer. + * 1) Executes operations with GStreamer's CUDA stream: + * GstCudaMemory will hold associated CUDA stream. User can access the + * CUDA stream via gst_cuda_memory_get_stream() which returns GstCudaStream + * object. The GstCudaStream is a wrapper of CUstream, so that the native + * handle can be used as a refcounted manner. To get native CUstream handle, + * use gst_cuda_stream_get_handle(). Since GPU commands are serialized in + * the CUDA stream already, user-side CUDA operation using the shared + * CUDA stream will be automatically serialized. + * 2) Executes CUDA operation without GStreamer's CUDA stream: + * Since queued GPU commands may or may not be finished at the moment + * when application executes any CUDA operation using application's own + * CUDA stream, application should wait for GStreamer side CUDA operation. + * gst_cuda_memory_sync() will execute synchronization operation if needed + * and will block the calling CPU thread. + * + * This example consists of following steps + * - Prepares CUDA resources (context, memory, etc) + * - Launches GStreamer pipeline with shared CUDA context. + * The pipeline will produce GstCudaMemory rendered by cudaconvert element. + * - Exectues scale CUDA kernel function and downloads scaled frame to host memory + * - Encodes downloaded host memory to JPEG, write to a file. + * + * NOTE: In this example code, GStreamer's dlopen-ed CUDA functions + * (decleared in cuda-gst.h) will be used instead of ones in decleared + * in cuda.h. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include + +#ifdef G_OS_WIN32 +#include +#endif + +#include +#include +#include +#include +#include + +#define RENDER_TARGET_WIDTH 640 +#define RENDER_TARGET_HEIGHT 480 + +typedef struct +{ + GMutex lock; + GCond cond; + GstCudaContext *cuda_ctx; + GstBuffer *buffer; +} AppData; + +static void +on_handoff_cb (GstElement * sink, GstBuffer * buf, GstPad * pad, AppData * data) +{ + g_mutex_lock (&data->lock); + data->buffer = gst_buffer_ref (buf); + g_cond_signal (&data->cond); + g_mutex_unlock (&data->lock); +} + +static GstBusSyncReply +bus_sync_handler (GstBus * bus, GstMessage * msg, AppData * data) +{ + switch (GST_MESSAGE_TYPE (msg)) { + case GST_MESSAGE_NEED_CONTEXT: + { + const gchar *ctx_type; + gst_message_parse_context_type (msg, &ctx_type); + gst_println ("Got need-context %s", ctx_type); + if (g_strcmp0 (ctx_type, GST_CUDA_CONTEXT_TYPE) == 0) { + GstContext *gst_ctx = gst_context_new_cuda_context (data->cuda_ctx); + GstElement *src = GST_ELEMENT (msg->src); + gst_element_set_context (src, gst_ctx); + gst_context_unref (gst_ctx); + } + break; + } + default: + break; + } + + return GST_BUS_PASS; +} + +/* *INDENT-OFF* */ +static const gchar kernel_func_str[] = +"extern \"C\" {\n" +"__device__ inline unsigned char\n" +"scale_to_uchar (float val)\n" +"{\n" +" return (unsigned char) __float2int_rz (val * 255.0);\n" +"}\n" +"__global__ void\n" +"scale_func (cudaTextureObject_t tex, unsigned char * dst, size_t stride)\n" +"{\n" +" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n" +" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n" +" if (x_pos >= 640 || y_pos >= 480)" +" return;\n" +" float x = (float) x_pos / 640.0f;\n" +" float y = (float) y_pos / 480.0f;\n" +" float4 sample = tex2D(tex, x, y);\n" +" int dst_pos = (x_pos * 4) + (y_pos * stride);\n" +" dst[dst_pos] = scale_to_uchar (sample.x);\n" +" dst[dst_pos + 1] = scale_to_uchar (sample.y);\n" +" dst[dst_pos + 2] = scale_to_uchar (sample.z);\n" +" dst[dst_pos + 3] = scale_to_uchar (sample.w);\n" +"}\n" +"}\n"; +/* *INDENT-ON* */ + +gint +main (gint argc, gchar ** argv) +{ + gchar *location = NULL; + gboolean shared_stream = FALSE; + GOptionEntry options[] = { + {"location", 'l', 0, G_OPTION_ARG_STRING, &location, + "Output jpeg file location", NULL}, + {"shared-stream", 's', 0, G_OPTION_ARG_NONE, &shared_stream, + "Use GStreamer's CUDA stream", NULL}, + {NULL} + }; + GOptionContext *option_ctx; + gboolean ret; + GError *err = NULL; + CUresult cuda_ret; + CUcontext cuda_ctx; + CUdevice cuda_dev; + int dev_cnt = 0; + GstElement *pipeline; + CUdeviceptr render_target; + void *host_mem; + gsize mem_size; + size_t pitch; + GstElement *sink; + GstBus *bus; + AppData app_data; + gchar *cubin; + CUmodule module; + CUfunction kernel_func; + GstBuffer *converted_buf; + GstVideoInfo info; + GstCaps *caps; + GstSample *sample; + gsize offset[GST_VIDEO_MAX_PLANES] = { 0, }; + gint stride[GST_VIDEO_MAX_PLANES] = { 0, }; + GstSample *jpeg_sample; + GstCaps *jpeg_caps; + CUstream app_stream = NULL; + + option_ctx = g_option_context_new ("CUDA memory sync example"); + g_option_context_add_main_entries (option_ctx, options, NULL); + g_option_context_add_group (option_ctx, gst_init_get_option_group ()); + ret = g_option_context_parse (option_ctx, &argc, &argv, &err); + g_option_context_free (option_ctx); + + if (!ret) { + gst_printerrln ("option parsing failed: %s", err->message); + g_clear_error (&err); + return 1; + } + + if (!location) { + gst_printerrln ("File location must be specified"); + return 1; + } + + if (!gst_cuda_load_library ()) { + gst_printerrln ("Unable to initialize GstCUDA library"); + return 1; + } + + if (!gst_cuda_nvrtc_load_library ()) { + gst_printerrln ("Unable to load CUDA runtime compiler library"); + return 1; + } + + /* Initialize CUDA and create device */ + cuda_ret = CuInit (0); + if (cuda_ret != CUDA_SUCCESS) { + gst_printerrln ("cuInit failed"); + return 1; + } + + cuda_ret = CuDeviceGetCount (&dev_cnt); + if (cuda_ret != CUDA_SUCCESS || dev_cnt == 0) { + gst_printerrln ("No availiable CUDA device"); + return 1; + } + + cuda_ret = CuDeviceGet (&cuda_dev, 0); + if (cuda_ret != CUDA_SUCCESS) { + gst_printerrln ("Couldn't get CUDA device"); + return 1; + } + + cuda_ret = CuCtxCreate (&cuda_ctx, 0, cuda_dev); + if (cuda_ret != CUDA_SUCCESS) { + gst_printerrln ("Couldn't create CUDA context"); + return 1; + } + + if (!shared_stream) { + cuda_ret = CuStreamCreate (&app_stream, CU_STREAM_DEFAULT); + if (cuda_ret != CUDA_SUCCESS) { + gst_printerrln ("Couldn't create CUDA stream"); + return 1; + } + } + + /* Allocate render target device memory */ + cuda_ret = CuMemAllocPitch (&render_target, + &pitch, RENDER_TARGET_WIDTH * 4, RENDER_TARGET_HEIGHT, 16); + if (cuda_ret != CUDA_SUCCESS) { + gst_printerrln ("cuMemAllocPitch failed"); + return 1; + } + + mem_size = pitch * RENDER_TARGET_HEIGHT; + cuda_ret = CuMemAllocHost (&host_mem, mem_size); + if (cuda_ret != CUDA_SUCCESS) { + gst_printerrln ("cuMemAllocHost failed"); + return 1; + } + + /* We will download converted CUDA device memory to this system memory */ + converted_buf = gst_buffer_new_wrapped_full (0, + host_mem, mem_size, 0, mem_size, NULL, NULL); + + gst_video_info_set_format (&info, GST_VIDEO_FORMAT_RGBA, + RENDER_TARGET_WIDTH, RENDER_TARGET_HEIGHT); + stride[0] = pitch; + + /* Since we allocated system memory with the same size of CUDA device + * memory, need to attach video meta to signal memory layout. The pitch + * can be different from default stride */ + gst_buffer_add_video_meta_full (converted_buf, GST_VIDEO_FRAME_FLAG_NONE, + GST_VIDEO_FORMAT_RGBA, RENDER_TARGET_WIDTH, RENDER_TARGET_HEIGHT, + 1, offset, stride); + + cubin = gst_cuda_nvrtc_compile_cubin (kernel_func_str, (gint) cuda_dev); + if (!cubin) { + gst_printerrln ("Couldn't compile cubin"); + return 1; + } + + cuda_ret = CuModuleLoadData (&module, cubin); + g_free (cubin); + if (cuda_ret != CUDA_SUCCESS) { + gst_printerrln ("cuModuleLoadData failed"); + return 1; + } + + cuda_ret = CuModuleGetFunction (&kernel_func, module, "scale_func"); + if (cuda_ret != CUDA_SUCCESS) { + gst_printerrln ("cuModuleGetFunction failed"); + return 1; + } + + cuda_ret = CuCtxPopCurrent (NULL); + if (cuda_ret != CUDA_SUCCESS) { + gst_printerrln ("cuCtxPopCurrent failed"); + return 1; + } + + /* Create GstCudaContext wrapping our context */ + app_data.cuda_ctx = gst_cuda_context_new_wrapped (cuda_ctx, cuda_dev); + if (!app_data.cuda_ctx) { + gst_printerrln ("Couldn't create wrapped context"); + return 1; + } + + pipeline = gst_parse_launch ("videotestsrc num-buffers=1 ! " + "video/x-raw,format=NV12 ! cudaupload ! cudaconvert ! " + "video/x-raw(memory:CUDAMemory),format=RGBA ! " + "fakesink signal-handoffs=true name=sink", NULL); + if (!pipeline) { + gst_printerrln ("Couldn't create pipeline"); + return 1; + } + + g_mutex_init (&app_data.lock); + g_cond_init (&app_data.cond); + app_data.buffer = NULL; + + sink = gst_bin_get_by_name (GST_BIN (pipeline), "sink"); + g_assert (sink); + + /* Install handoff signal to get GstCudaMemory processed by cudaconvert */ + g_signal_connect (sink, "handoff", G_CALLBACK (on_handoff_cb), &app_data); + gst_object_unref (sink); + + /* Setup **SYNC** bus handler. In case that an application wants to + * shader its own CUDA context with GStreamer pipeline, GstContext + * should be configured using sync bus handler */ + bus = gst_element_get_bus (pipeline); + gst_bus_set_sync_handler (bus, (GstBusSyncHandler) bus_sync_handler, + &app_data, NULL); + gst_object_unref (bus); + + if (gst_element_set_state (pipeline, GST_STATE_PLAYING) == + GST_STATE_CHANGE_FAILURE) { + gst_printerrln ("State change failed"); + return 1; + } + + /* Wait for processed buffer */ + g_mutex_lock (&app_data.lock); + while (!app_data.buffer) + g_cond_wait (&app_data.cond, &app_data.lock); + g_mutex_unlock (&app_data.lock); + gst_element_set_state (pipeline, GST_STATE_NULL); + + /* Launch image scale kernel func and download to host memory */ + { + CUtexObject texture; + GstMemory *mem; + GstCudaMemory *cmem; + GstCudaStream *gst_stream; + CUstream stream; + CUDA_MEMCPY2D copy_params = { 0, }; + CUDA_RESOURCE_DESC resource_desc; + CUDA_TEXTURE_DESC texture_desc; + GstMapInfo src_map; + void *kernel_args[] = { &texture, &render_target, &pitch }; + + mem = gst_buffer_peek_memory (app_data.buffer, 0); + g_assert (gst_is_cuda_memory (mem)); + + if (!gst_memory_map (mem, &src_map, GST_MAP_READ | GST_MAP_CUDA)) { + gst_printerrln ("gst_memory_map failed"); + return 1; + } + + cmem = GST_CUDA_MEMORY_CAST (mem); + + /* In case of GST_MAP_CUDA, GStreamer will not wait for CUDA sync. + * Application can use CUDA stream attached in GstCudaMemory + * or need to call gst_cuda_memory_sync() to ensure synchronization */ + if (shared_stream) { + gst_stream = gst_cuda_memory_get_stream (cmem); + stream = gst_cuda_stream_get_handle (gst_stream); + } else { + gst_cuda_memory_sync (cmem); + stream = app_stream; + } + + /* Prepare texture resource */ + memset (&resource_desc, 0, sizeof (CUDA_RESOURCE_DESC)); + memset (&texture_desc, 0, sizeof (CUDA_TEXTURE_DESC)); + resource_desc.resType = CU_RESOURCE_TYPE_PITCH2D; + resource_desc.res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT8; + resource_desc.res.pitch2D.numChannels = 4; + resource_desc.res.pitch2D.width = cmem->info.width; + resource_desc.res.pitch2D.height = cmem->info.height; + resource_desc.res.pitch2D.pitchInBytes = cmem->info.stride[0]; + resource_desc.res.pitch2D.devPtr = (CUdeviceptr) src_map.data; + + texture_desc.filterMode = CU_TR_FILTER_MODE_LINEAR; + texture_desc.flags = CU_TRSF_NORMALIZED_COORDINATES; + texture_desc.addressMode[0] = CU_TR_ADDRESS_MODE_CLAMP; + texture_desc.addressMode[1] = CU_TR_ADDRESS_MODE_CLAMP; + texture_desc.addressMode[2] = CU_TR_ADDRESS_MODE_CLAMP; + + cuda_ret = CuCtxPushCurrent (cuda_ctx); + if (cuda_ret != CUDA_SUCCESS) { + gst_printerrln ("cuCtxPopCurrent failed"); + return 1; + } + + /* Create texture for sampling */ + cuda_ret = CuTexObjectCreate (&texture, + &resource_desc, &texture_desc, NULL); + if (cuda_ret != CUDA_SUCCESS) { + gst_printerrln ("cuTexObjectCreate failed"); + return 1; + } + + cuda_ret = CuLaunchKernel (kernel_func, + GST_ROUND_UP_16 (RENDER_TARGET_WIDTH) / 16, + GST_ROUND_UP_16 (RENDER_TARGET_HEIGHT) / 16, 1, 16, 16, 1, 0, + stream, kernel_args, NULL); + if (cuda_ret != CUDA_SUCCESS) { + gst_printerrln ("cuLaunchKernel failed"); + return 1; + } + + /* Download to system memory */ + copy_params.srcMemoryType = CU_MEMORYTYPE_DEVICE; + copy_params.srcDevice = render_target; + copy_params.srcPitch = pitch; + + copy_params.dstMemoryType = CU_MEMORYTYPE_HOST; + copy_params.dstHost = host_mem; + copy_params.dstPitch = pitch; + + copy_params.WidthInBytes = RENDER_TARGET_WIDTH * 4; + copy_params.Height = RENDER_TARGET_HEIGHT; + + cuda_ret = CuMemcpy2DAsync (©_params, stream); + if (cuda_ret != CUDA_SUCCESS) { + gst_printerrln ("cuMemcpy2DAsync failed"); + return 1; + } + + /* Wait for conversion and memory download */ + cuda_ret = CuStreamSynchronize (stream); + if (cuda_ret != CUDA_SUCCESS) { + gst_printerrln ("cuStreamSynchronize failed"); + return 1; + } + + cuda_ret = CuTexObjectDestroy (texture); + if (cuda_ret != CUDA_SUCCESS) { + gst_printerrln ("cuTexObjectDestroy failed"); + return 1; + } + + cuda_ret = CuCtxPopCurrent (NULL); + if (cuda_ret != CUDA_SUCCESS) { + gst_printerrln ("cuCtxPopCurrent failed"); + return 1; + } + + gst_memory_unmap (mem, &src_map); + } + + /* Create sample and convert it to jpeg image */ + caps = gst_video_info_to_caps (&info); + sample = gst_sample_new (converted_buf, caps, NULL, NULL); + + jpeg_caps = gst_caps_new_empty_simple ("image/jpeg"); + + jpeg_sample = gst_video_convert_sample (sample, + jpeg_caps, GST_CLOCK_TIME_NONE, NULL); + if (!jpeg_sample) { + gst_printerrln ("gst_video_convert_sample failed"); + return 1; + } + + { + GstBuffer *jpeg_buf = gst_sample_get_buffer (jpeg_sample); + GstMapInfo map; + + if (!gst_buffer_map (jpeg_buf, &map, GST_MAP_READ)) { + gst_printerrln ("gst_buffer_map failed"); + return 1; + } + + FILE *fp; + fp = fopen (location, "wb"); + if (!fp) { + gst_printerrln ("fopen failed"); + return 1; + } + + if (map.size != fwrite (map.data, 1, map.size, fp)) { + gst_printerrln ("fwrite failed"); + return 1; + } + + fclose (fp); + + gst_buffer_unmap (jpeg_buf, &map); + } + + gst_println ("JPEG file is written to \"%s\"", location); + + /* Cleanup */ + g_free (location); + gst_buffer_unref (app_data.buffer); + gst_object_unref (app_data.cuda_ctx); + g_mutex_clear (&app_data.lock); + g_cond_clear (&app_data.cond); + gst_object_unref (pipeline); + gst_buffer_unref (converted_buf); + gst_sample_unref (jpeg_sample); + gst_sample_unref (sample); + gst_caps_unref (caps); + gst_caps_unref (jpeg_caps); + + /* Release CUDA resources */ + CuCtxPushCurrent (cuda_ctx); + CuModuleUnload (module); + CuMemFree (render_target); + CuMemFreeHost (host_mem); + if (app_stream) + CuStreamDestroy (app_stream); + CuCtxPopCurrent (NULL); + CuCtxDestroy (cuda_ctx); + + gst_deinit (); + + return 0; +} diff --git a/subprojects/gst-plugins-bad/tests/examples/cuda/meson.build b/subprojects/gst-plugins-bad/tests/examples/cuda/meson.build new file mode 100644 index 0000000000..a40e753579 --- /dev/null +++ b/subprojects/gst-plugins-bad/tests/examples/cuda/meson.build @@ -0,0 +1,68 @@ +if not gstcuda_dep.found() + subdir_done() +endif + +cuda_deps = [] +cuda_inc_dir = [] +if host_system == 'windows' + cuda_path = run_command(python3, '-c', 'import os; print(os.environ.get("CUDA_PATH"))', check: false).stdout().strip() + if cuda_path in ['', 'None'] + subdir_done() + endif + + cuda_inc_dir = include_directories(join_paths(cuda_path, 'include')) + if not cc.has_header('cuda.h', include_directories: cuda_inc_dir) + subdir_done() + endif +else + # NOTE: meson dependency('cuda', version: '>=10'...) will return CUDA runtime + # library but we want to use CUDA driver API. Iterates + cuda_versions = [ + '12.4', + '12.3', + '12.2', + '12.1', + '12.0', + '11.8', + '11.8', + '11.7', + '11.6', + '11.5', + '11.4', + '11.2', + '11.1', + '11.0', + '10.2', + '10.1', + '10.0', + ] + + cuda_dep = dependency('', required: false) + foreach ver : cuda_versions + cuda_dep = dependency('cuda-@0@'.format(ver), required: false) + if cuda_dep.found() + break + endif + endforeach + + if not cuda_dep.found() + subdir_done() + endif + + cuda_deps = [cuda_dep.partial_dependency(includes : true, compile_args: true)] +endif + +gl_header_dep = dependency('', fallback : ['gl-headers', 'gl_headers_dummy_dep'], + required : false) +if gl_header_dep.type_name() == 'internal' + # this will only contain the includes of headers that are not found + compat_includes = subproject('gl-headers').get_variable('compatibility_includes') +else + compat_includes = [] +endif + +executable('cudamemory-sync', 'cudamemory-sync.c', + include_directories : [configinc] + compat_includes + cuda_inc_dir, + dependencies: [gst_dep, gstvideo_dep, gstcuda_dep, gl_header_dep] + cuda_deps, + c_args : gst_plugins_bad_args + ['-DGST_USE_UNSTABLE_API'], + install: false) diff --git a/subprojects/gst-plugins-bad/tests/examples/meson.build b/subprojects/gst-plugins-bad/tests/examples/meson.build index aec685bf65..c9053339e7 100644 --- a/subprojects/gst-plugins-bad/tests/examples/meson.build +++ b/subprojects/gst-plugins-bad/tests/examples/meson.build @@ -3,6 +3,7 @@ subdir('avsamplesink') subdir('camerabin2') subdir('codecparsers') subdir('codecs') +subdir('cuda') subdir('d3d11') subdir('d3d12') subdir('directfb')