From d17e8707c9fb438878e3b1261a268fe34e49e491 Mon Sep 17 00:00:00 2001 From: Seungha Yang Date: Wed, 19 Feb 2025 17:46:34 +0900 Subject: [PATCH] nvencoder: Add extern-cuda-bufferpool property Add a new property to support application-allocated GstCudaMemory. CUDA memory alloc/free is a global device synchronization point, as if launching a CUDA kernel on the default CUDA stream. To avoid the global synchronization, we added stream-ordered allocation support, which allocates CUDA memory asynchronously. However, NVENC does not allow registering stream-ordered allocated memory. Thus the encoder was allocating normal CUDA memory whenever the input CUDA memory was of the stream-ordered type. In this commit, the newly introduced property allows the application to provide the encoder with a GstCudaBufferPool. The application can preallocate a sufficient amount of CUDA memory in advance to avoid global device synchronization while the pipeline is running. For now, this pool is used only if the input CUDA memory is allocated via stream-ordered allocation. Part-of: --- .../docs/plugins/gst_plugins_cache.json | 11 ++ .../sys/nvcodec/gstnvencobject.cpp | 51 +++++-- .../sys/nvcodec/gstnvencobject.h | 9 ++ .../sys/nvcodec/gstnvencoder.cpp | 139 +++++++++++++++--- 4 files changed, 181 insertions(+), 29 deletions(-) diff --git a/subprojects/gst-plugins-bad/docs/plugins/gst_plugins_cache.json b/subprojects/gst-plugins-bad/docs/plugins/gst_plugins_cache.json index f3bc8f59a1..81b646a63e 100644 --- a/subprojects/gst-plugins-bad/docs/plugins/gst_plugins_cache.json +++ b/subprojects/gst-plugins-bad/docs/plugins/gst_plugins_cache.json @@ -234329,6 +234329,17 @@ "readable": true, "type": "GstNvEncoderSeiInsertMode", "writable": true + }, + "extern-cuda-bufferpool": { + "blurb": "GstCudaBufferPool prepared by application", + "conditionally-available": false, + "construct": false, + "construct-only": false, + "controllable": false, + "mutable": "ready", + "readable": true, + "type": "GstObject", + "writable": true } } }, diff --git 
a/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencobject.cpp b/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencobject.cpp index d98c0856a8..d2b2762239 100644 --- a/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencobject.cpp +++ b/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencobject.cpp @@ -722,8 +722,8 @@ GstNvEncObject::DeviceUnlock () } NVENCSTATUS -GstNvEncObject::acquireResourceCuda (GstMemory * mem, - GstNvEncResource ** resource) +GstNvEncObject::acquireResourceCuda (GstMemory * mem, guint width, guint height, + guint stride, GstNvEncResource ** resource) { GstNvEncResource *res; GstCudaMemory *cmem; @@ -732,11 +732,6 @@ GstNvEncObject::acquireResourceCuda (GstMemory * mem, NVENCSTATUS status; GstMapInfo info; - if (!gst_is_cuda_memory (mem)) { - GST_ERROR_ID (id_.c_str (), "Not a CUDA memory"); - return NV_ENC_ERR_INVALID_CALL; - } - cmem = GST_CUDA_MEMORY_CAST (mem); res = (GstNvEncResource *) gst_cuda_memory_get_token_data (cmem, @@ -761,9 +756,9 @@ GstNvEncObject::acquireResourceCuda (GstMemory * mem, new_resource.version = gst_nvenc_get_register_resource_version (); new_resource.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR; - new_resource.width = cmem->info.width; - new_resource.height = cmem->info.height; - new_resource.pitch = cmem->info.stride[0]; + new_resource.width = width; + new_resource.height = height; + new_resource.pitch = stride; new_resource.resourceToRegister = info.data; new_resource.bufferFormat = buffer_format_; @@ -895,7 +890,17 @@ GstNvEncObject::AcquireResource (GstMemory * mem, GstNvEncResource ** resource) } else #endif { - status = acquireResourceCuda (mem, resource); + if (!gst_is_cuda_memory (mem)) { + GST_ERROR_ID (id_.c_str (), "Not a CUDA memory"); + return NV_ENC_ERR_INVALID_CALL; + } + + auto cmem = GST_CUDA_MEMORY_CAST (mem); + auto width = cmem->info.width; + auto height = cmem->info.height; + auto stride = cmem->info.stride[0]; + + status = acquireResourceCuda (mem, width, height, stride, resource); } if 
(status == NV_ENC_SUCCESS) { @@ -908,6 +913,30 @@ GstNvEncObject::AcquireResource (GstMemory * mem, GstNvEncResource ** resource) return status; } +NVENCSTATUS +GstNvEncObject::AcquireResourceWithSize (GstMemory * mem, + guint width, guint height, guint stride, GstNvEncResource ** resource) +{ + NVENCSTATUS status; + std::lock_guard lk (resource_lock_); + + if (!gst_is_cuda_memory (mem)) { + GST_ERROR_ID (id_.c_str (), "Not a CUDA memory"); + return NV_ENC_ERR_INVALID_CALL; + } + + status = acquireResourceCuda (mem, width, height, stride, resource); + + if (status == NV_ENC_SUCCESS) { + GST_TRACE_ID (id_.c_str (), "Returning resource %u, " + "resource queue size %u (active %u)", + (*resource)->seq_num, (guint) resource_queue_.size (), + (guint) active_resource_queue_.size ()); + } + + return status; +} + GstFlowReturn GstNvEncObject::AcquireTask (GstNvEncTask ** task, bool force) { diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencobject.h b/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencobject.h index 248c689869..7291766a32 100644 --- a/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencobject.h +++ b/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencobject.h @@ -187,6 +187,12 @@ public: NVENCSTATUS AcquireResource (GstMemory * mem, GstNvEncResource ** resource); + NVENCSTATUS AcquireResourceWithSize (GstMemory * mem, + guint width, + guint height, + guint stride, + GstNvEncResource ** resource); + GstFlowReturn AcquireTask (GstNvEncTask ** task, bool force); @@ -208,6 +214,9 @@ private: void releaseTaskUnlocked (GstNvEncTask * task); NVENCSTATUS acquireResourceCuda (GstMemory * mem, + guint width, + guint height, + guint stride, GstNvEncResource ** resource); #ifdef G_OS_WIN32 diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencoder.cpp b/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencoder.cpp index 75a895bb16..67f1b9526d 100644 --- a/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencoder.cpp +++ 
b/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencoder.cpp @@ -63,6 +63,7 @@ enum { PROP_0, PROP_CC_INSERT, + PROP_EXTERN_POOL, }; #define DEFAULT_CC_INSERT GST_NV_ENCODER_SEI_INSERT @@ -75,6 +76,11 @@ struct _GstNvEncoderPrivate memset (&config, 0, sizeof (NV_ENC_CONFIG)); } + ~_GstNvEncoderPrivate () + { + gst_clear_object (&extern_pool); + } + GstCudaContext *context = nullptr; GstCudaStream *stream = nullptr; @@ -121,8 +127,11 @@ struct _GstNvEncoderPrivate std::atomic < GstFlowReturn > last_flow; + GstVideoInfo extern_pool_info; + /* properties */ GstNvEncoderSeiInsertMode cc_insert = DEFAULT_CC_INSERT; + GstBufferPool *extern_pool = nullptr; }; /** @@ -184,6 +193,25 @@ gst_nv_encoder_class_init (GstNvEncoderClass * klass) GST_TYPE_NV_ENCODER_SEI_INSERT_MODE, DEFAULT_CC_INSERT, (GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS))); + /** + * GstNvEncoder:extern-cuda-bufferpool: + * + * GstCudaBufferPool prepared by application. Application can pass + * a buffer pool instance prepared in advance, to avoid + * global device synchronization caused by CUDA memory allocation. 
+ * + * The buffer pool should be configured with stream-ordered-allocation disabled + * + * Since: 1.26 + */ + g_object_class_install_property (object_class, PROP_EXTERN_POOL, + g_param_spec_object ("extern-cuda-bufferpool", "Extern CUDA Buffer Pool", + "GstCudaBufferPool prepared by application", + GST_TYPE_OBJECT, + (GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS | + GST_PARAM_MUTABLE_READY))); + + element_class->set_context = GST_DEBUG_FUNCPTR (gst_nv_encoder_set_context); videoenc_class->open = GST_DEBUG_FUNCPTR (gst_nv_encoder_open); @@ -247,6 +275,31 @@ gst_nv_encoder_set_property (GObject * object, guint prop_id, case PROP_CC_INSERT: priv->cc_insert = (GstNvEncoderSeiInsertMode) g_value_get_enum (value); break; + case PROP_EXTERN_POOL: + gst_clear_object (&priv->extern_pool); + priv->extern_pool = (GstBufferPool *) g_value_dup_object (value); + if (priv->extern_pool) { + if (!GST_IS_CUDA_BUFFER_POOL (priv->extern_pool)) { + GST_ERROR_OBJECT (self, "Not a CUDA buffer pool"); + gst_clear_object (&priv->extern_pool); + } else if (!gst_buffer_pool_set_active (priv->extern_pool, TRUE)) { + GST_ERROR_OBJECT (self, "Set active failed"); + gst_clear_object (&priv->extern_pool); + } else { + auto config = gst_buffer_pool_get_config (priv->extern_pool); + GstCaps *caps; + gst_buffer_pool_config_get_params (config, + &caps, nullptr, nullptr, nullptr); + auto is_valid = gst_video_info_from_caps (&priv->extern_pool_info, + caps); + gst_structure_free (config); + if (!is_valid) { + GST_ERROR_OBJECT (self, "Invalid buffer pool"); + gst_clear_object (&priv->extern_pool); + } + } + } + break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec); break; @@ -264,6 +317,9 @@ gst_nv_encoder_get_property (GObject * object, guint prop_id, GValue * value, case PROP_CC_INSERT: g_value_set_enum (value, priv->cc_insert); break; + case PROP_EXTERN_POOL: + g_value_set_object (value, priv->extern_pool); + break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID 
(object, prop_id, pspec); break; @@ -782,7 +838,8 @@ gst_nv_encoder_propose_allocation (GstVideoEncoder * encoder, GstQuery * query) gst_buffer_pool_config_set_cuda_stream (config, priv->stream); /* Encoder does not seem to support stream ordered allocation */ - gst_buffer_pool_config_set_cuda_stream_ordered_alloc (config, FALSE); + if (!priv->extern_pool) + gst_buffer_pool_config_set_cuda_stream_ordered_alloc (config, FALSE); } if (!gst_buffer_pool_set_config (pool, config)) { @@ -1555,6 +1612,9 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self, GstCudaStream *stream; GstNvEncResource *resource = nullptr; const GstVideoInfo *info = &priv->input_state->info; + gboolean sync_done = FALSE; + guint out_stride = 0; + gboolean is_extern_mem = FALSE; mem = gst_buffer_peek_memory (buffer, 0); @@ -1648,21 +1708,20 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self, return gst_nv_encoder_copy_system (self, info, buffer, task); } + out_stride = cmem->info.stride[0]; + if (gst_cuda_memory_is_stream_ordered (mem)) { GstBuffer *copy = nullptr; - GstVideoFrame in_frame, out_frame; + GstVideoFrame in_frame; CUDA_MEMCPY2D copy_params = { }; + GstMemory *out_mem; + GstMapInfo out_map; + guint8 *out_data; stream = gst_cuda_memory_get_stream (cmem); GST_LOG_OBJECT (self, "Stream ordered allocation needs memory copy"); - gst_buffer_pool_acquire_buffer (priv->internal_pool, &copy, nullptr); - if (!copy) { - GST_ERROR_OBJECT (self, "Couldn't allocate internal buffer"); - return GST_FLOW_ERROR; - } - if (!gst_video_frame_map (&in_frame, info, buffer, (GstMapFlags) (GST_MAP_READ | GST_MAP_CUDA))) { GST_ERROR_OBJECT (self, "Couldn't map input buffer"); @@ -1670,14 +1729,50 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self, return GST_FLOW_ERROR; } - if (!gst_video_frame_map (&out_frame, info, copy, - (GstMapFlags) (GST_MAP_WRITE | GST_MAP_CUDA))) { + if (priv->extern_pool) { + auto cuda_pool = GST_CUDA_BUFFER_POOL (priv->extern_pool); + if 
(cuda_pool->context == priv->context) { + gst_buffer_pool_acquire_buffer (priv->extern_pool, &copy, nullptr); + if (copy) { + auto copy_mem = gst_buffer_peek_memory (copy, 0); + if (gst_cuda_memory_is_stream_ordered (copy_mem)) { + GST_LOG_OBJECT (self, "External pool uses stream ordered alloc"); + gst_clear_buffer (&copy); + } else if (gst_memory_get_sizes (mem, nullptr, nullptr) > + gst_memory_get_sizes (copy_mem, nullptr, nullptr)) { + GST_LOG_OBJECT (self, "Too small extern pool buffer"); + gst_clear_buffer (&copy); + } else { + is_extern_mem = TRUE; + } + } + } + } + + if (!copy) + gst_buffer_pool_acquire_buffer (priv->internal_pool, &copy, nullptr); + + if (!copy) { + GST_ERROR_OBJECT (self, "Couldn't allocate internal buffer"); + return GST_FLOW_ERROR; + } + + out_mem = gst_buffer_peek_memory (copy, 0); + + if (!gst_memory_map (out_mem, + &out_map, (GstMapFlags) (GST_MAP_WRITE | GST_MAP_CUDA))) { GST_ERROR_OBJECT (self, "Couldn't map output buffer"); gst_video_frame_unmap (&in_frame); gst_buffer_unref (copy); return GST_FLOW_ERROR; } + out_data = (guint8 *) out_map.data; + if (is_extern_mem) + out_stride = in_frame.info.stride[0]; + else + out_stride = GST_CUDA_MEMORY_CAST (out_mem)->info.stride[0]; + for (guint i = 0; i < GST_VIDEO_FRAME_N_PLANES (&in_frame); i++) { copy_params.srcMemoryType = CU_MEMORYTYPE_DEVICE; copy_params.srcDevice = (CUdeviceptr) @@ -1685,9 +1780,8 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self, copy_params.srcPitch = GST_VIDEO_FRAME_PLANE_STRIDE (&in_frame, i); copy_params.dstMemoryType = CU_MEMORYTYPE_DEVICE; - copy_params.dstDevice = (CUdeviceptr) - GST_VIDEO_FRAME_PLANE_DATA (&out_frame, i); - copy_params.dstPitch = GST_VIDEO_FRAME_PLANE_STRIDE (&out_frame, i); + copy_params.dstDevice = (CUdeviceptr) out_data; + copy_params.dstPitch = out_stride; copy_params.WidthInBytes = GST_VIDEO_INFO_COMP_WIDTH (info, i) * GST_VIDEO_INFO_COMP_PSTRIDE (info, i); @@ -1698,18 +1792,22 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * 
self, if (!gst_cuda_result (cuda_ret)) { GST_ERROR_OBJECT (self, "Copy failed"); gst_video_frame_unmap (&in_frame); - gst_video_frame_unmap (&out_frame); + gst_memory_unmap (out_mem, &out_map); gst_buffer_unref (copy); return GST_FLOW_ERROR; } + + out_data += GST_VIDEO_INFO_COMP_HEIGHT (info, i) * out_stride; } gst_video_frame_unmap (&in_frame); - gst_video_frame_unmap (&out_frame); + gst_memory_unmap (out_mem, &out_map); - if (stream && stream != priv->stream) + if (stream && stream != priv->stream) { CuStreamSynchronize (gst_cuda_stream_get_handle (stream)); + sync_done = TRUE; + } buffer = copy; mem = gst_buffer_peek_memory (copy, 0); @@ -1718,7 +1816,12 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self, buffer = gst_buffer_ref (buffer); } - status = object->AcquireResource (mem, &resource); + if (is_extern_mem) { + status = object->AcquireResourceWithSize (mem, info->width, info->height, + out_stride, &resource); + } else { + status = object->AcquireResource (mem, &resource); + } if (status != NV_ENC_SUCCESS) { GST_ERROR_OBJECT (self, "Failed to get resource, status %" @@ -1729,7 +1832,7 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self, } stream = gst_cuda_memory_get_stream (cmem); - if (stream != priv->stream) { + if (stream != priv->stream && !sync_done) { /* different stream, needs sync */ gst_cuda_memory_sync (cmem); }