mirror of
https://gitlab.freedesktop.org/gstreamer/gstreamer.git
synced 2025-01-25 08:38:21 +00:00
6fb0c7b928
Add an example code for external CUDA context sharing and gst_cuda_memory_sync() Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/6864>
529 lines
16 KiB
C
529 lines
16 KiB
C
/*
|
|
* GStreamer
|
|
* Copyright (C) 2024 Seungha Yang <seungha@centricular.com>
|
|
*
|
|
* This library is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Library General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2 of the License, or (at your option) any later version.
|
|
*
|
|
* This library is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Library General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Library General Public
|
|
* License along with this library; if not, write to the
|
|
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
|
|
* Boston, MA 02110-1301, USA.
|
|
*/
|
|
|
|
/* This example demonstrates how to share application's CUDA context with
|
|
* GStreamer, and CUDA synchronization.
|
|
*
|
|
* In case that application wants to read CUDA device memory produced by
|
|
* GStreamer directly, buffer/memory map with GST_MAP_CUDA flag will return
|
|
* CUDA device memory instead of staging system memory. Also, GStreamer will not
|
|
* wait for pending CUDA operation associated with the device memory when
|
|
* GST_MAP_CUDA is specified. Thus, synchronization is user's responsibility.
|
|
* For the synchronization, app needs to use GStreamer's CUDA stream, or
|
|
* waits for possibly pending GPU operations queued by GStreamer.
|
|
* 1) Executes operations with GStreamer's CUDA stream:
|
|
* GstCudaMemory will hold associated CUDA stream. User can access the
|
|
* CUDA stream via gst_cuda_memory_get_stream() which returns GstCudaStream
|
|
* object. The GstCudaStream is a wrapper of CUstream, so that the native
|
|
* handle can be used as a refcounted manner. To get native CUstream handle,
|
|
* use gst_cuda_stream_get_handle(). Since GPU commands are serialized in
|
|
* the CUDA stream already, user-side CUDA operation using the shared
|
|
* CUDA stream will be automatically serialized.
|
|
* 2) Executes CUDA operation without GStreamer's CUDA stream:
|
|
* Since queued GPU commands may or may not be finished at the moment
|
|
* when application executes any CUDA operation using application's own
|
|
* CUDA stream, application should wait for GStreamer side CUDA operation.
|
|
* gst_cuda_memory_sync() will execute synchronization operation if needed
|
|
* and will block the calling CPU thread.
|
|
*
|
|
* This example consists of following steps
|
|
* - Prepares CUDA resources (context, memory, etc)
|
|
* - Launches GStreamer pipeline with shared CUDA context.
|
|
* The pipeline will produce GstCudaMemory rendered by cudaconvert element.
|
|
* - Exectues scale CUDA kernel function and downloads scaled frame to host memory
|
|
* - Encodes downloaded host memory to JPEG, write to a file.
|
|
*
|
|
* NOTE: In this example code, GStreamer's dlopen-ed CUDA functions
|
|
* (decleared in cuda-gst.h) will be used instead of ones in decleared
|
|
* in cuda.h.
|
|
*/
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
#include "config.h"
|
|
#endif
|
|
|
|
#include <gst/gst.h>
|
|
|
|
#ifdef G_OS_WIN32
|
|
#include <windows.h>
|
|
#endif
|
|
|
|
#include <gst/video/video.h>
|
|
#include <gst/cuda/gstcuda.h>
|
|
#include <cuda.h>
|
|
#include <string.h>
|
|
#include <stdio.h>
|
|
|
|
#define RENDER_TARGET_WIDTH 640
|
|
#define RENDER_TARGET_HEIGHT 480
|
|
|
|
typedef struct
|
|
{
|
|
GMutex lock;
|
|
GCond cond;
|
|
GstCudaContext *cuda_ctx;
|
|
GstBuffer *buffer;
|
|
} AppData;
|
|
|
|
static void
|
|
on_handoff_cb (GstElement * sink, GstBuffer * buf, GstPad * pad, AppData * data)
|
|
{
|
|
g_mutex_lock (&data->lock);
|
|
data->buffer = gst_buffer_ref (buf);
|
|
g_cond_signal (&data->cond);
|
|
g_mutex_unlock (&data->lock);
|
|
}
|
|
|
|
static GstBusSyncReply
|
|
bus_sync_handler (GstBus * bus, GstMessage * msg, AppData * data)
|
|
{
|
|
switch (GST_MESSAGE_TYPE (msg)) {
|
|
case GST_MESSAGE_NEED_CONTEXT:
|
|
{
|
|
const gchar *ctx_type;
|
|
gst_message_parse_context_type (msg, &ctx_type);
|
|
gst_println ("Got need-context %s", ctx_type);
|
|
if (g_strcmp0 (ctx_type, GST_CUDA_CONTEXT_TYPE) == 0) {
|
|
GstContext *gst_ctx = gst_context_new_cuda_context (data->cuda_ctx);
|
|
GstElement *src = GST_ELEMENT (msg->src);
|
|
gst_element_set_context (src, gst_ctx);
|
|
gst_context_unref (gst_ctx);
|
|
}
|
|
break;
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return GST_BUS_PASS;
|
|
}
|
|
|
|
/* *INDENT-OFF* */
|
|
static const gchar kernel_func_str[] =
|
|
"extern \"C\" {\n"
|
|
"__device__ inline unsigned char\n"
|
|
"scale_to_uchar (float val)\n"
|
|
"{\n"
|
|
" return (unsigned char) __float2int_rz (val * 255.0);\n"
|
|
"}\n"
|
|
"__global__ void\n"
|
|
"scale_func (cudaTextureObject_t tex, unsigned char * dst, size_t stride)\n"
|
|
"{\n"
|
|
" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
|
|
" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
|
|
" if (x_pos >= 640 || y_pos >= 480)"
|
|
" return;\n"
|
|
" float x = (float) x_pos / 640.0f;\n"
|
|
" float y = (float) y_pos / 480.0f;\n"
|
|
" float4 sample = tex2D<float4>(tex, x, y);\n"
|
|
" int dst_pos = (x_pos * 4) + (y_pos * stride);\n"
|
|
" dst[dst_pos] = scale_to_uchar (sample.x);\n"
|
|
" dst[dst_pos + 1] = scale_to_uchar (sample.y);\n"
|
|
" dst[dst_pos + 2] = scale_to_uchar (sample.z);\n"
|
|
" dst[dst_pos + 3] = scale_to_uchar (sample.w);\n"
|
|
"}\n"
|
|
"}\n";
|
|
/* *INDENT-ON* */
|
|
|
|
gint
|
|
main (gint argc, gchar ** argv)
|
|
{
|
|
gchar *location = NULL;
|
|
gboolean shared_stream = FALSE;
|
|
GOptionEntry options[] = {
|
|
{"location", 'l', 0, G_OPTION_ARG_STRING, &location,
|
|
"Output jpeg file location", NULL},
|
|
{"shared-stream", 's', 0, G_OPTION_ARG_NONE, &shared_stream,
|
|
"Use GStreamer's CUDA stream", NULL},
|
|
{NULL}
|
|
};
|
|
GOptionContext *option_ctx;
|
|
gboolean ret;
|
|
GError *err = NULL;
|
|
CUresult cuda_ret;
|
|
CUcontext cuda_ctx;
|
|
CUdevice cuda_dev;
|
|
int dev_cnt = 0;
|
|
GstElement *pipeline;
|
|
CUdeviceptr render_target;
|
|
void *host_mem;
|
|
gsize mem_size;
|
|
size_t pitch;
|
|
GstElement *sink;
|
|
GstBus *bus;
|
|
AppData app_data;
|
|
gchar *cubin;
|
|
CUmodule module;
|
|
CUfunction kernel_func;
|
|
GstBuffer *converted_buf;
|
|
GstVideoInfo info;
|
|
GstCaps *caps;
|
|
GstSample *sample;
|
|
gsize offset[GST_VIDEO_MAX_PLANES] = { 0, };
|
|
gint stride[GST_VIDEO_MAX_PLANES] = { 0, };
|
|
GstSample *jpeg_sample;
|
|
GstCaps *jpeg_caps;
|
|
CUstream app_stream = NULL;
|
|
|
|
option_ctx = g_option_context_new ("CUDA memory sync example");
|
|
g_option_context_add_main_entries (option_ctx, options, NULL);
|
|
g_option_context_add_group (option_ctx, gst_init_get_option_group ());
|
|
ret = g_option_context_parse (option_ctx, &argc, &argv, &err);
|
|
g_option_context_free (option_ctx);
|
|
|
|
if (!ret) {
|
|
gst_printerrln ("option parsing failed: %s", err->message);
|
|
g_clear_error (&err);
|
|
return 1;
|
|
}
|
|
|
|
if (!location) {
|
|
gst_printerrln ("File location must be specified");
|
|
return 1;
|
|
}
|
|
|
|
if (!gst_cuda_load_library ()) {
|
|
gst_printerrln ("Unable to initialize GstCUDA library");
|
|
return 1;
|
|
}
|
|
|
|
if (!gst_cuda_nvrtc_load_library ()) {
|
|
gst_printerrln ("Unable to load CUDA runtime compiler library");
|
|
return 1;
|
|
}
|
|
|
|
/* Initialize CUDA and create device */
|
|
cuda_ret = CuInit (0);
|
|
if (cuda_ret != CUDA_SUCCESS) {
|
|
gst_printerrln ("cuInit failed");
|
|
return 1;
|
|
}
|
|
|
|
cuda_ret = CuDeviceGetCount (&dev_cnt);
|
|
if (cuda_ret != CUDA_SUCCESS || dev_cnt == 0) {
|
|
gst_printerrln ("No availiable CUDA device");
|
|
return 1;
|
|
}
|
|
|
|
cuda_ret = CuDeviceGet (&cuda_dev, 0);
|
|
if (cuda_ret != CUDA_SUCCESS) {
|
|
gst_printerrln ("Couldn't get CUDA device");
|
|
return 1;
|
|
}
|
|
|
|
cuda_ret = CuCtxCreate (&cuda_ctx, 0, cuda_dev);
|
|
if (cuda_ret != CUDA_SUCCESS) {
|
|
gst_printerrln ("Couldn't create CUDA context");
|
|
return 1;
|
|
}
|
|
|
|
if (!shared_stream) {
|
|
cuda_ret = CuStreamCreate (&app_stream, CU_STREAM_DEFAULT);
|
|
if (cuda_ret != CUDA_SUCCESS) {
|
|
gst_printerrln ("Couldn't create CUDA stream");
|
|
return 1;
|
|
}
|
|
}
|
|
|
|
/* Allocate render target device memory */
|
|
cuda_ret = CuMemAllocPitch (&render_target,
|
|
&pitch, RENDER_TARGET_WIDTH * 4, RENDER_TARGET_HEIGHT, 16);
|
|
if (cuda_ret != CUDA_SUCCESS) {
|
|
gst_printerrln ("cuMemAllocPitch failed");
|
|
return 1;
|
|
}
|
|
|
|
mem_size = pitch * RENDER_TARGET_HEIGHT;
|
|
cuda_ret = CuMemAllocHost (&host_mem, mem_size);
|
|
if (cuda_ret != CUDA_SUCCESS) {
|
|
gst_printerrln ("cuMemAllocHost failed");
|
|
return 1;
|
|
}
|
|
|
|
/* We will download converted CUDA device memory to this system memory */
|
|
converted_buf = gst_buffer_new_wrapped_full (0,
|
|
host_mem, mem_size, 0, mem_size, NULL, NULL);
|
|
|
|
gst_video_info_set_format (&info, GST_VIDEO_FORMAT_RGBA,
|
|
RENDER_TARGET_WIDTH, RENDER_TARGET_HEIGHT);
|
|
stride[0] = pitch;
|
|
|
|
/* Since we allocated system memory with the same size of CUDA device
|
|
* memory, need to attach video meta to signal memory layout. The pitch
|
|
* can be different from default stride */
|
|
gst_buffer_add_video_meta_full (converted_buf, GST_VIDEO_FRAME_FLAG_NONE,
|
|
GST_VIDEO_FORMAT_RGBA, RENDER_TARGET_WIDTH, RENDER_TARGET_HEIGHT,
|
|
1, offset, stride);
|
|
|
|
cubin = gst_cuda_nvrtc_compile_cubin (kernel_func_str, (gint) cuda_dev);
|
|
if (!cubin) {
|
|
gst_printerrln ("Couldn't compile cubin");
|
|
return 1;
|
|
}
|
|
|
|
cuda_ret = CuModuleLoadData (&module, cubin);
|
|
g_free (cubin);
|
|
if (cuda_ret != CUDA_SUCCESS) {
|
|
gst_printerrln ("cuModuleLoadData failed");
|
|
return 1;
|
|
}
|
|
|
|
cuda_ret = CuModuleGetFunction (&kernel_func, module, "scale_func");
|
|
if (cuda_ret != CUDA_SUCCESS) {
|
|
gst_printerrln ("cuModuleGetFunction failed");
|
|
return 1;
|
|
}
|
|
|
|
cuda_ret = CuCtxPopCurrent (NULL);
|
|
if (cuda_ret != CUDA_SUCCESS) {
|
|
gst_printerrln ("cuCtxPopCurrent failed");
|
|
return 1;
|
|
}
|
|
|
|
/* Create GstCudaContext wrapping our context */
|
|
app_data.cuda_ctx = gst_cuda_context_new_wrapped (cuda_ctx, cuda_dev);
|
|
if (!app_data.cuda_ctx) {
|
|
gst_printerrln ("Couldn't create wrapped context");
|
|
return 1;
|
|
}
|
|
|
|
pipeline = gst_parse_launch ("videotestsrc num-buffers=1 ! "
|
|
"video/x-raw,format=NV12 ! cudaupload ! cudaconvert ! "
|
|
"video/x-raw(memory:CUDAMemory),format=RGBA ! "
|
|
"fakesink signal-handoffs=true name=sink", NULL);
|
|
if (!pipeline) {
|
|
gst_printerrln ("Couldn't create pipeline");
|
|
return 1;
|
|
}
|
|
|
|
g_mutex_init (&app_data.lock);
|
|
g_cond_init (&app_data.cond);
|
|
app_data.buffer = NULL;
|
|
|
|
sink = gst_bin_get_by_name (GST_BIN (pipeline), "sink");
|
|
g_assert (sink);
|
|
|
|
/* Install handoff signal to get GstCudaMemory processed by cudaconvert */
|
|
g_signal_connect (sink, "handoff", G_CALLBACK (on_handoff_cb), &app_data);
|
|
gst_object_unref (sink);
|
|
|
|
/* Setup **SYNC** bus handler. In case that an application wants to
|
|
* shader its own CUDA context with GStreamer pipeline, GstContext
|
|
* should be configured using sync bus handler */
|
|
bus = gst_element_get_bus (pipeline);
|
|
gst_bus_set_sync_handler (bus, (GstBusSyncHandler) bus_sync_handler,
|
|
&app_data, NULL);
|
|
gst_object_unref (bus);
|
|
|
|
if (gst_element_set_state (pipeline, GST_STATE_PLAYING) ==
|
|
GST_STATE_CHANGE_FAILURE) {
|
|
gst_printerrln ("State change failed");
|
|
return 1;
|
|
}
|
|
|
|
/* Wait for processed buffer */
|
|
g_mutex_lock (&app_data.lock);
|
|
while (!app_data.buffer)
|
|
g_cond_wait (&app_data.cond, &app_data.lock);
|
|
g_mutex_unlock (&app_data.lock);
|
|
gst_element_set_state (pipeline, GST_STATE_NULL);
|
|
|
|
/* Launch image scale kernel func and download to host memory */
|
|
{
|
|
CUtexObject texture;
|
|
GstMemory *mem;
|
|
GstCudaMemory *cmem;
|
|
GstCudaStream *gst_stream;
|
|
CUstream stream;
|
|
CUDA_MEMCPY2D copy_params = { 0, };
|
|
CUDA_RESOURCE_DESC resource_desc;
|
|
CUDA_TEXTURE_DESC texture_desc;
|
|
GstMapInfo src_map;
|
|
void *kernel_args[] = { &texture, &render_target, &pitch };
|
|
|
|
mem = gst_buffer_peek_memory (app_data.buffer, 0);
|
|
g_assert (gst_is_cuda_memory (mem));
|
|
|
|
if (!gst_memory_map (mem, &src_map, GST_MAP_READ | GST_MAP_CUDA)) {
|
|
gst_printerrln ("gst_memory_map failed");
|
|
return 1;
|
|
}
|
|
|
|
cmem = GST_CUDA_MEMORY_CAST (mem);
|
|
|
|
/* In case of GST_MAP_CUDA, GStreamer will not wait for CUDA sync.
|
|
* Application can use CUDA stream attached in GstCudaMemory
|
|
* or need to call gst_cuda_memory_sync() to ensure synchronization */
|
|
if (shared_stream) {
|
|
gst_stream = gst_cuda_memory_get_stream (cmem);
|
|
stream = gst_cuda_stream_get_handle (gst_stream);
|
|
} else {
|
|
gst_cuda_memory_sync (cmem);
|
|
stream = app_stream;
|
|
}
|
|
|
|
/* Prepare texture resource */
|
|
memset (&resource_desc, 0, sizeof (CUDA_RESOURCE_DESC));
|
|
memset (&texture_desc, 0, sizeof (CUDA_TEXTURE_DESC));
|
|
resource_desc.resType = CU_RESOURCE_TYPE_PITCH2D;
|
|
resource_desc.res.pitch2D.format = CU_AD_FORMAT_UNSIGNED_INT8;
|
|
resource_desc.res.pitch2D.numChannels = 4;
|
|
resource_desc.res.pitch2D.width = cmem->info.width;
|
|
resource_desc.res.pitch2D.height = cmem->info.height;
|
|
resource_desc.res.pitch2D.pitchInBytes = cmem->info.stride[0];
|
|
resource_desc.res.pitch2D.devPtr = (CUdeviceptr) src_map.data;
|
|
|
|
texture_desc.filterMode = CU_TR_FILTER_MODE_LINEAR;
|
|
texture_desc.flags = CU_TRSF_NORMALIZED_COORDINATES;
|
|
texture_desc.addressMode[0] = CU_TR_ADDRESS_MODE_CLAMP;
|
|
texture_desc.addressMode[1] = CU_TR_ADDRESS_MODE_CLAMP;
|
|
texture_desc.addressMode[2] = CU_TR_ADDRESS_MODE_CLAMP;
|
|
|
|
cuda_ret = CuCtxPushCurrent (cuda_ctx);
|
|
if (cuda_ret != CUDA_SUCCESS) {
|
|
gst_printerrln ("cuCtxPopCurrent failed");
|
|
return 1;
|
|
}
|
|
|
|
/* Create texture for sampling */
|
|
cuda_ret = CuTexObjectCreate (&texture,
|
|
&resource_desc, &texture_desc, NULL);
|
|
if (cuda_ret != CUDA_SUCCESS) {
|
|
gst_printerrln ("cuTexObjectCreate failed");
|
|
return 1;
|
|
}
|
|
|
|
cuda_ret = CuLaunchKernel (kernel_func,
|
|
GST_ROUND_UP_16 (RENDER_TARGET_WIDTH) / 16,
|
|
GST_ROUND_UP_16 (RENDER_TARGET_HEIGHT) / 16, 1, 16, 16, 1, 0,
|
|
stream, kernel_args, NULL);
|
|
if (cuda_ret != CUDA_SUCCESS) {
|
|
gst_printerrln ("cuLaunchKernel failed");
|
|
return 1;
|
|
}
|
|
|
|
/* Download to system memory */
|
|
copy_params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
|
|
copy_params.srcDevice = render_target;
|
|
copy_params.srcPitch = pitch;
|
|
|
|
copy_params.dstMemoryType = CU_MEMORYTYPE_HOST;
|
|
copy_params.dstHost = host_mem;
|
|
copy_params.dstPitch = pitch;
|
|
|
|
copy_params.WidthInBytes = RENDER_TARGET_WIDTH * 4;
|
|
copy_params.Height = RENDER_TARGET_HEIGHT;
|
|
|
|
cuda_ret = CuMemcpy2DAsync (©_params, stream);
|
|
if (cuda_ret != CUDA_SUCCESS) {
|
|
gst_printerrln ("cuMemcpy2DAsync failed");
|
|
return 1;
|
|
}
|
|
|
|
/* Wait for conversion and memory download */
|
|
cuda_ret = CuStreamSynchronize (stream);
|
|
if (cuda_ret != CUDA_SUCCESS) {
|
|
gst_printerrln ("cuStreamSynchronize failed");
|
|
return 1;
|
|
}
|
|
|
|
cuda_ret = CuTexObjectDestroy (texture);
|
|
if (cuda_ret != CUDA_SUCCESS) {
|
|
gst_printerrln ("cuTexObjectDestroy failed");
|
|
return 1;
|
|
}
|
|
|
|
cuda_ret = CuCtxPopCurrent (NULL);
|
|
if (cuda_ret != CUDA_SUCCESS) {
|
|
gst_printerrln ("cuCtxPopCurrent failed");
|
|
return 1;
|
|
}
|
|
|
|
gst_memory_unmap (mem, &src_map);
|
|
}
|
|
|
|
/* Create sample and convert it to jpeg image */
|
|
caps = gst_video_info_to_caps (&info);
|
|
sample = gst_sample_new (converted_buf, caps, NULL, NULL);
|
|
|
|
jpeg_caps = gst_caps_new_empty_simple ("image/jpeg");
|
|
|
|
jpeg_sample = gst_video_convert_sample (sample,
|
|
jpeg_caps, GST_CLOCK_TIME_NONE, NULL);
|
|
if (!jpeg_sample) {
|
|
gst_printerrln ("gst_video_convert_sample failed");
|
|
return 1;
|
|
}
|
|
|
|
{
|
|
GstBuffer *jpeg_buf = gst_sample_get_buffer (jpeg_sample);
|
|
GstMapInfo map;
|
|
|
|
if (!gst_buffer_map (jpeg_buf, &map, GST_MAP_READ)) {
|
|
gst_printerrln ("gst_buffer_map failed");
|
|
return 1;
|
|
}
|
|
|
|
FILE *fp;
|
|
fp = fopen (location, "wb");
|
|
if (!fp) {
|
|
gst_printerrln ("fopen failed");
|
|
return 1;
|
|
}
|
|
|
|
if (map.size != fwrite (map.data, 1, map.size, fp)) {
|
|
gst_printerrln ("fwrite failed");
|
|
return 1;
|
|
}
|
|
|
|
fclose (fp);
|
|
|
|
gst_buffer_unmap (jpeg_buf, &map);
|
|
}
|
|
|
|
gst_println ("JPEG file is written to \"%s\"", location);
|
|
|
|
/* Cleanup */
|
|
g_free (location);
|
|
gst_buffer_unref (app_data.buffer);
|
|
gst_object_unref (app_data.cuda_ctx);
|
|
g_mutex_clear (&app_data.lock);
|
|
g_cond_clear (&app_data.cond);
|
|
gst_object_unref (pipeline);
|
|
gst_buffer_unref (converted_buf);
|
|
gst_sample_unref (jpeg_sample);
|
|
gst_sample_unref (sample);
|
|
gst_caps_unref (caps);
|
|
gst_caps_unref (jpeg_caps);
|
|
|
|
/* Release CUDA resources */
|
|
CuCtxPushCurrent (cuda_ctx);
|
|
CuModuleUnload (module);
|
|
CuMemFree (render_target);
|
|
CuMemFreeHost (host_mem);
|
|
if (app_stream)
|
|
CuStreamDestroy (app_stream);
|
|
CuCtxPopCurrent (NULL);
|
|
CuCtxDestroy (cuda_ctx);
|
|
|
|
gst_deinit ();
|
|
|
|
return 0;
|
|
}
|