nvencoder: Add extern-cuda-bufferpool property

Add new property to support application allocated GstCudaMemory.

CUDA memory alloc/free is a global device synchronization point,
as if a CUDA kernel were launched on the default CUDA stream. To avoid
the global synchronization, we added stream-ordered allocation support,
which allocates CUDA memory asynchronously.
However, NVENC does not allow registering stream-ordered
allocated memory. Thus the encoder was allocating normal CUDA
memory in case the input CUDA memory is of the stream-ordered type.

In this commit, the newly introduced property allows the application
to provide the encoder with a GstCudaBufferPool. The application can
preallocate a sufficient amount of CUDA memory in advance
to avoid global device synchronization while the pipeline is running.

For now, this pool is used only if the input CUDA memory was allocated
via stream-ordered allocation.

Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/8516>
This commit is contained in:
Seungha Yang 2025-02-19 17:46:34 +09:00 committed by GStreamer Marge Bot
parent 700e31d146
commit d17e8707c9
4 changed files with 181 additions and 29 deletions

View file

@ -234329,6 +234329,17 @@
"readable": true,
"type": "GstNvEncoderSeiInsertMode",
"writable": true
},
"extern-cuda-bufferpool": {
"blurb": "GstCudaBufferPool prepared by application",
"conditionally-available": false,
"construct": false,
"construct-only": false,
"controllable": false,
"mutable": "ready",
"readable": true,
"type": "GstObject",
"writable": true
}
}
},

View file

@ -722,8 +722,8 @@ GstNvEncObject::DeviceUnlock ()
}
NVENCSTATUS
GstNvEncObject::acquireResourceCuda (GstMemory * mem,
GstNvEncResource ** resource)
GstNvEncObject::acquireResourceCuda (GstMemory * mem, guint width, guint height,
guint stride, GstNvEncResource ** resource)
{
GstNvEncResource *res;
GstCudaMemory *cmem;
@ -732,11 +732,6 @@ GstNvEncObject::acquireResourceCuda (GstMemory * mem,
NVENCSTATUS status;
GstMapInfo info;
if (!gst_is_cuda_memory (mem)) {
GST_ERROR_ID (id_.c_str (), "Not a CUDA memory");
return NV_ENC_ERR_INVALID_CALL;
}
cmem = GST_CUDA_MEMORY_CAST (mem);
res = (GstNvEncResource *) gst_cuda_memory_get_token_data (cmem,
@ -761,9 +756,9 @@ GstNvEncObject::acquireResourceCuda (GstMemory * mem,
new_resource.version = gst_nvenc_get_register_resource_version ();
new_resource.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR;
new_resource.width = cmem->info.width;
new_resource.height = cmem->info.height;
new_resource.pitch = cmem->info.stride[0];
new_resource.width = width;
new_resource.height = height;
new_resource.pitch = stride;
new_resource.resourceToRegister = info.data;
new_resource.bufferFormat = buffer_format_;
@ -895,7 +890,17 @@ GstNvEncObject::AcquireResource (GstMemory * mem, GstNvEncResource ** resource)
} else
#endif
{
status = acquireResourceCuda (mem, resource);
if (!gst_is_cuda_memory (mem)) {
GST_ERROR_ID (id_.c_str (), "Not a CUDA memory");
return NV_ENC_ERR_INVALID_CALL;
}
auto cmem = GST_CUDA_MEMORY_CAST (mem);
auto width = cmem->info.width;
auto height = cmem->info.height;
auto stride = cmem->info.stride[0];
status = acquireResourceCuda (mem, width, height, stride, resource);
}
if (status == NV_ENC_SUCCESS) {
@ -908,6 +913,30 @@ GstNvEncObject::AcquireResource (GstMemory * mem, GstNvEncResource ** resource)
return status;
}
/* Acquires an NVENC input resource for @mem, registering it with the
 * caller-supplied @width, @height and @stride instead of the dimensions
 * recorded in the GstCudaMemory itself (used for buffers coming from an
 * application-provided pool whose layout may differ from the input caps).
 *
 * Returns NV_ENC_ERR_INVALID_CALL if @mem is not a GstCudaMemory,
 * otherwise the status of the underlying resource acquisition. */
NVENCSTATUS
GstNvEncObject::AcquireResourceWithSize (GstMemory * mem,
    guint width, guint height, guint stride, GstNvEncResource ** resource)
{
  /* Same locking discipline as AcquireResource(): the resource queues are
   * shared with the task/encode path, so hold the recursive lock. */
  std::lock_guard < std::recursive_mutex > guard (resource_lock_);

  if (!gst_is_cuda_memory (mem)) {
    GST_ERROR_ID (id_.c_str (), "Not a CUDA memory");
    return NV_ENC_ERR_INVALID_CALL;
  }

  auto ret = acquireResourceCuda (mem, width, height, stride, resource);
  if (ret != NV_ENC_SUCCESS)
    return ret;

  GST_TRACE_ID (id_.c_str (), "Returning resource %u, "
      "resource queue size %u (active %u)",
      (*resource)->seq_num, (guint) resource_queue_.size (),
      (guint) active_resource_queue_.size ());

  return ret;
}
GstFlowReturn
GstNvEncObject::AcquireTask (GstNvEncTask ** task, bool force)
{

View file

@ -187,6 +187,12 @@ public:
NVENCSTATUS AcquireResource (GstMemory * mem,
GstNvEncResource ** resource);
NVENCSTATUS AcquireResourceWithSize (GstMemory * mem,
guint width,
guint height,
guint stride,
GstNvEncResource ** resource);
GstFlowReturn AcquireTask (GstNvEncTask ** task,
bool force);
@ -208,6 +214,9 @@ private:
void releaseTaskUnlocked (GstNvEncTask * task);
NVENCSTATUS acquireResourceCuda (GstMemory * mem,
guint width,
guint height,
guint stride,
GstNvEncResource ** resource);
#ifdef G_OS_WIN32

View file

@ -63,6 +63,7 @@ enum
{
PROP_0,
PROP_CC_INSERT,
PROP_EXTERN_POOL,
};
#define DEFAULT_CC_INSERT GST_NV_ENCODER_SEI_INSERT
@ -75,6 +76,11 @@ struct _GstNvEncoderPrivate
memset (&config, 0, sizeof (NV_ENC_CONFIG));
}
~_GstNvEncoderPrivate ()
{
gst_clear_object (&extern_pool);
}
GstCudaContext *context = nullptr;
GstCudaStream *stream = nullptr;
@ -121,8 +127,11 @@ struct _GstNvEncoderPrivate
std::atomic < GstFlowReturn > last_flow;
GstVideoInfo extern_pool_info;
/* properties */
GstNvEncoderSeiInsertMode cc_insert = DEFAULT_CC_INSERT;
GstBufferPool *extern_pool = nullptr;
};
/**
@ -184,6 +193,25 @@ gst_nv_encoder_class_init (GstNvEncoderClass * klass)
GST_TYPE_NV_ENCODER_SEI_INSERT_MODE, DEFAULT_CC_INSERT,
(GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)));
/**
* GstNvEncoder:extern-cuda-bufferpool:
*
* GstCudaBufferPool prepared by application. Application can pass
* a buffer pool instance prepared in advance, to avoid
* global device synchronization caused by CUDA memory allocation.
*
* The buffer pool should be configured with stream-ordered-allocation disabled
*
* Since: 1.26
*/
g_object_class_install_property (object_class, PROP_EXTERN_POOL,
g_param_spec_object ("extern-cuda-bufferpool", "Extern CUDA Buffer Pool",
"GstCudaBufferPool prepared by application",
GST_TYPE_OBJECT,
(GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS |
GST_PARAM_MUTABLE_READY)));
element_class->set_context = GST_DEBUG_FUNCPTR (gst_nv_encoder_set_context);
videoenc_class->open = GST_DEBUG_FUNCPTR (gst_nv_encoder_open);
@ -247,6 +275,31 @@ gst_nv_encoder_set_property (GObject * object, guint prop_id,
case PROP_CC_INSERT:
priv->cc_insert = (GstNvEncoderSeiInsertMode) g_value_get_enum (value);
break;
case PROP_EXTERN_POOL:
gst_clear_object (&priv->extern_pool);
priv->extern_pool = (GstBufferPool *) g_value_dup_object (value);
if (priv->extern_pool) {
if (!GST_IS_CUDA_BUFFER_POOL (priv->extern_pool)) {
GST_ERROR_OBJECT (self, "Not a CUDA buffer pool");
gst_clear_object (&priv->extern_pool);
} else if (!gst_buffer_pool_set_active (priv->extern_pool, TRUE)) {
GST_ERROR_OBJECT (self, "Set active failed");
gst_clear_object (&priv->extern_pool);
} else {
auto config = gst_buffer_pool_get_config (priv->extern_pool);
GstCaps *caps;
gst_buffer_pool_config_get_params (config,
&caps, nullptr, nullptr, nullptr);
auto is_valid = gst_video_info_from_caps (&priv->extern_pool_info,
caps);
gst_structure_free (config);
if (!is_valid) {
GST_ERROR_OBJECT (self, "Invalid buffer pool");
gst_clear_object (&priv->extern_pool);
}
}
}
break;
default:
G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
break;
@ -264,6 +317,9 @@ gst_nv_encoder_get_property (GObject * object, guint prop_id, GValue * value,
case PROP_CC_INSERT:
g_value_set_enum (value, priv->cc_insert);
break;
case PROP_EXTERN_POOL:
g_value_set_object (value, priv->extern_pool);
break;
default:
G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
break;
@ -782,7 +838,8 @@ gst_nv_encoder_propose_allocation (GstVideoEncoder * encoder, GstQuery * query)
gst_buffer_pool_config_set_cuda_stream (config, priv->stream);
/* Encoder does not seem to support stream ordered allocation */
gst_buffer_pool_config_set_cuda_stream_ordered_alloc (config, FALSE);
if (!priv->extern_pool)
gst_buffer_pool_config_set_cuda_stream_ordered_alloc (config, FALSE);
}
if (!gst_buffer_pool_set_config (pool, config)) {
@ -1555,6 +1612,9 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self,
GstCudaStream *stream;
GstNvEncResource *resource = nullptr;
const GstVideoInfo *info = &priv->input_state->info;
gboolean sync_done = FALSE;
guint out_stride = 0;
gboolean is_extern_mem = FALSE;
mem = gst_buffer_peek_memory (buffer, 0);
@ -1648,21 +1708,20 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self,
return gst_nv_encoder_copy_system (self, info, buffer, task);
}
out_stride = cmem->info.stride[0];
if (gst_cuda_memory_is_stream_ordered (mem)) {
GstBuffer *copy = nullptr;
GstVideoFrame in_frame, out_frame;
GstVideoFrame in_frame;
CUDA_MEMCPY2D copy_params = { };
GstMemory *out_mem;
GstMapInfo out_map;
guint8 *out_data;
stream = gst_cuda_memory_get_stream (cmem);
GST_LOG_OBJECT (self, "Stream ordered allocation needs memory copy");
gst_buffer_pool_acquire_buffer (priv->internal_pool, &copy, nullptr);
if (!copy) {
GST_ERROR_OBJECT (self, "Couldn't allocate internal buffer");
return GST_FLOW_ERROR;
}
if (!gst_video_frame_map (&in_frame, info, buffer,
(GstMapFlags) (GST_MAP_READ | GST_MAP_CUDA))) {
GST_ERROR_OBJECT (self, "Couldn't map input buffer");
@ -1670,14 +1729,50 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self,
return GST_FLOW_ERROR;
}
if (!gst_video_frame_map (&out_frame, info, copy,
(GstMapFlags) (GST_MAP_WRITE | GST_MAP_CUDA))) {
if (priv->extern_pool) {
auto cuda_pool = GST_CUDA_BUFFER_POOL (priv->extern_pool);
if (cuda_pool->context == priv->context) {
gst_buffer_pool_acquire_buffer (priv->extern_pool, &copy, nullptr);
if (copy) {
auto copy_mem = gst_buffer_peek_memory (copy, 0);
if (gst_cuda_memory_is_stream_ordered (copy_mem)) {
GST_LOG_OBJECT (self, "External pool uses stream ordered alloc");
gst_clear_buffer (&copy);
} else if (gst_memory_get_sizes (mem, nullptr, nullptr) >
gst_memory_get_sizes (copy_mem, nullptr, nullptr)) {
GST_LOG_OBJECT (self, "Too small extern pool buffer");
gst_clear_buffer (&copy);
} else {
is_extern_mem = TRUE;
}
}
}
}
if (!copy)
gst_buffer_pool_acquire_buffer (priv->internal_pool, &copy, nullptr);
if (!copy) {
GST_ERROR_OBJECT (self, "Couldn't allocate internal buffer");
return GST_FLOW_ERROR;
}
out_mem = gst_buffer_peek_memory (copy, 0);
if (!gst_memory_map (out_mem,
&out_map, (GstMapFlags) (GST_MAP_WRITE | GST_MAP_CUDA))) {
GST_ERROR_OBJECT (self, "Couldn't map output buffer");
gst_video_frame_unmap (&in_frame);
gst_buffer_unref (copy);
return GST_FLOW_ERROR;
}
out_data = (guint8 *) out_map.data;
if (is_extern_mem)
out_stride = in_frame.info.stride[0];
else
out_stride = GST_CUDA_MEMORY_CAST (out_mem)->info.stride[0];
for (guint i = 0; i < GST_VIDEO_FRAME_N_PLANES (&in_frame); i++) {
copy_params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
copy_params.srcDevice = (CUdeviceptr)
@ -1685,9 +1780,8 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self,
copy_params.srcPitch = GST_VIDEO_FRAME_PLANE_STRIDE (&in_frame, i);
copy_params.dstMemoryType = CU_MEMORYTYPE_DEVICE;
copy_params.dstDevice = (CUdeviceptr)
GST_VIDEO_FRAME_PLANE_DATA (&out_frame, i);
copy_params.dstPitch = GST_VIDEO_FRAME_PLANE_STRIDE (&out_frame, i);
copy_params.dstDevice = (CUdeviceptr) out_data;
copy_params.dstPitch = out_stride;
copy_params.WidthInBytes = GST_VIDEO_INFO_COMP_WIDTH (info, i) *
GST_VIDEO_INFO_COMP_PSTRIDE (info, i);
@ -1698,18 +1792,22 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self,
if (!gst_cuda_result (cuda_ret)) {
GST_ERROR_OBJECT (self, "Copy failed");
gst_video_frame_unmap (&in_frame);
gst_video_frame_unmap (&out_frame);
gst_memory_unmap (out_mem, &out_map);
gst_buffer_unref (copy);
return GST_FLOW_ERROR;
}
out_data += GST_VIDEO_INFO_COMP_HEIGHT (info, i) * out_stride;
}
gst_video_frame_unmap (&in_frame);
gst_video_frame_unmap (&out_frame);
gst_memory_unmap (out_mem, &out_map);
if (stream && stream != priv->stream)
if (stream && stream != priv->stream) {
CuStreamSynchronize (gst_cuda_stream_get_handle (stream));
sync_done = TRUE;
}
buffer = copy;
mem = gst_buffer_peek_memory (copy, 0);
@ -1718,7 +1816,12 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self,
buffer = gst_buffer_ref (buffer);
}
status = object->AcquireResource (mem, &resource);
if (is_extern_mem) {
status = object->AcquireResourceWithSize (mem, info->width, info->height,
out_stride, &resource);
} else {
status = object->AcquireResource (mem, &resource);
}
if (status != NV_ENC_SUCCESS) {
GST_ERROR_OBJECT (self, "Failed to get resource, status %"
@ -1729,7 +1832,7 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self,
}
stream = gst_cuda_memory_get_stream (cmem);
if (stream != priv->stream) {
if (stream != priv->stream && !sync_done) {
/* different stream, needs sync */
gst_cuda_memory_sync (cmem);
}