nvencoder: Add extern-cuda-bufferpool property

Add new property to support application allocated GstCudaMemory.

CUDA memory alloc/free is a global device synchronization point,
as if a CUDA kernel were launched on the default CUDA stream. To avoid
the global synchronization, we added stream-ordered allocation support,
which allocates CUDA memory asynchronously.
However, NVENC does not allow registering stream-ordered
allocated memory. Thus the encoder was allocating normal CUDA
memory in case the input CUDA memory is of the stream-ordered type.

In this commit, the newly introduced property allows the application
to provide the encoder with a GstCudaBufferPool. The application can
preallocate a sufficient amount of CUDA memory in advance
to avoid global device synchronization while the pipeline is running.

For now, this pool is used only if the input CUDA memory was allocated
via stream-ordered allocation.

Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/8516>
This commit is contained in:
Seungha Yang 2025-02-19 17:46:34 +09:00 committed by GStreamer Marge Bot
parent 700e31d146
commit d17e8707c9
4 changed files with 181 additions and 29 deletions

View file

@ -234329,6 +234329,17 @@
"readable": true,
"type": "GstNvEncoderSeiInsertMode",
"writable": true
},
"extern-cuda-bufferpool": {
"blurb": "GstCudaBufferPool prepared by application",
"conditionally-available": false,
"construct": false,
"construct-only": false,
"controllable": false,
"mutable": "ready",
"readable": true,
"type": "GstObject",
"writable": true
}
}
},

View file

@ -722,8 +722,8 @@ GstNvEncObject::DeviceUnlock ()
}
NVENCSTATUS
GstNvEncObject::acquireResourceCuda (GstMemory * mem,
GstNvEncResource ** resource)
GstNvEncObject::acquireResourceCuda (GstMemory * mem, guint width, guint height,
guint stride, GstNvEncResource ** resource)
{
GstNvEncResource *res;
GstCudaMemory *cmem;
@ -732,11 +732,6 @@ GstNvEncObject::acquireResourceCuda (GstMemory * mem,
NVENCSTATUS status;
GstMapInfo info;
if (!gst_is_cuda_memory (mem)) {
GST_ERROR_ID (id_.c_str (), "Not a CUDA memory");
return NV_ENC_ERR_INVALID_CALL;
}
cmem = GST_CUDA_MEMORY_CAST (mem);
res = (GstNvEncResource *) gst_cuda_memory_get_token_data (cmem,
@ -761,9 +756,9 @@ GstNvEncObject::acquireResourceCuda (GstMemory * mem,
new_resource.version = gst_nvenc_get_register_resource_version ();
new_resource.resourceType = NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR;
new_resource.width = cmem->info.width;
new_resource.height = cmem->info.height;
new_resource.pitch = cmem->info.stride[0];
new_resource.width = width;
new_resource.height = height;
new_resource.pitch = stride;
new_resource.resourceToRegister = info.data;
new_resource.bufferFormat = buffer_format_;
@ -895,7 +890,17 @@ GstNvEncObject::AcquireResource (GstMemory * mem, GstNvEncResource ** resource)
} else
#endif
{
status = acquireResourceCuda (mem, resource);
if (!gst_is_cuda_memory (mem)) {
GST_ERROR_ID (id_.c_str (), "Not a CUDA memory");
return NV_ENC_ERR_INVALID_CALL;
}
auto cmem = GST_CUDA_MEMORY_CAST (mem);
auto width = cmem->info.width;
auto height = cmem->info.height;
auto stride = cmem->info.stride[0];
status = acquireResourceCuda (mem, width, height, stride, resource);
}
if (status == NV_ENC_SUCCESS) {
@ -908,6 +913,30 @@ GstNvEncObject::AcquireResource (GstMemory * mem, GstNvEncResource ** resource)
return status;
}
/* Acquires an NVENC input resource for @mem, registering it with the
 * caller-supplied @width, @height and @stride instead of the dimensions
 * recorded in the GstCudaMemory itself (used for buffers coming from an
 * application-provided pool whose layout may differ from the input caps).
 *
 * Returns NV_ENC_ERR_INVALID_CALL if @mem is not a GstCudaMemory,
 * otherwise the status of the underlying resource acquisition. */
NVENCSTATUS
GstNvEncObject::AcquireResourceWithSize (GstMemory * mem,
    guint width, guint height, guint stride, GstNvEncResource ** resource)
{
  /* Same locking discipline as AcquireResource(): the resource queues are
   * shared with the task/encode path, so hold the recursive lock. */
  std::lock_guard < std::recursive_mutex > guard (resource_lock_);

  if (!gst_is_cuda_memory (mem)) {
    GST_ERROR_ID (id_.c_str (), "Not a CUDA memory");
    return NV_ENC_ERR_INVALID_CALL;
  }

  auto ret = acquireResourceCuda (mem, width, height, stride, resource);
  if (ret != NV_ENC_SUCCESS)
    return ret;

  GST_TRACE_ID (id_.c_str (), "Returning resource %u, "
      "resource queue size %u (active %u)",
      (*resource)->seq_num, (guint) resource_queue_.size (),
      (guint) active_resource_queue_.size ());

  return ret;
}
GstFlowReturn
GstNvEncObject::AcquireTask (GstNvEncTask ** task, bool force)
{

View file

@ -187,6 +187,12 @@ public:
NVENCSTATUS AcquireResource (GstMemory * mem,
GstNvEncResource ** resource);
NVENCSTATUS AcquireResourceWithSize (GstMemory * mem,
guint width,
guint height,
guint stride,
GstNvEncResource ** resource);
GstFlowReturn AcquireTask (GstNvEncTask ** task,
bool force);
@ -208,6 +214,9 @@ private:
void releaseTaskUnlocked (GstNvEncTask * task);
NVENCSTATUS acquireResourceCuda (GstMemory * mem,
guint width,
guint height,
guint stride,
GstNvEncResource ** resource);
#ifdef G_OS_WIN32

View file

@ -63,6 +63,7 @@ enum
{
PROP_0,
PROP_CC_INSERT,
PROP_EXTERN_POOL,
};
#define DEFAULT_CC_INSERT GST_NV_ENCODER_SEI_INSERT
@ -75,6 +76,11 @@ struct _GstNvEncoderPrivate
memset (&config, 0, sizeof (NV_ENC_CONFIG));
}
~_GstNvEncoderPrivate ()
{
gst_clear_object (&extern_pool);
}
GstCudaContext *context = nullptr;
GstCudaStream *stream = nullptr;
@ -121,8 +127,11 @@ struct _GstNvEncoderPrivate
std::atomic < GstFlowReturn > last_flow;
GstVideoInfo extern_pool_info;
/* properties */
GstNvEncoderSeiInsertMode cc_insert = DEFAULT_CC_INSERT;
GstBufferPool *extern_pool = nullptr;
};
/**
@ -184,6 +193,25 @@ gst_nv_encoder_class_init (GstNvEncoderClass * klass)
GST_TYPE_NV_ENCODER_SEI_INSERT_MODE, DEFAULT_CC_INSERT,
(GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)));
/**
* GstNvEncoder:extern-cuda-bufferpool:
*
* GstCudaBufferPool prepared by application. Application can pass
* a buffer pool instance prepared in advance, to avoid
* global device synchronization caused by CUDA memory allocation.
*
* The buffer pool should be configured with stream-ordered-allocation disabled
*
* Since: 1.26
*/
g_object_class_install_property (object_class, PROP_EXTERN_POOL,
g_param_spec_object ("extern-cuda-bufferpool", "Extern CUDA Buffer Pool",
"GstCudaBufferPool prepared by application",
GST_TYPE_OBJECT,
(GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS |
GST_PARAM_MUTABLE_READY)));
element_class->set_context = GST_DEBUG_FUNCPTR (gst_nv_encoder_set_context);
videoenc_class->open = GST_DEBUG_FUNCPTR (gst_nv_encoder_open);
@ -247,6 +275,31 @@ gst_nv_encoder_set_property (GObject * object, guint prop_id,
case PROP_CC_INSERT:
priv->cc_insert = (GstNvEncoderSeiInsertMode) g_value_get_enum (value);
break;
case PROP_EXTERN_POOL:
gst_clear_object (&priv->extern_pool);
priv->extern_pool = (GstBufferPool *) g_value_dup_object (value);
if (priv->extern_pool) {
if (!GST_IS_CUDA_BUFFER_POOL (priv->extern_pool)) {
GST_ERROR_OBJECT (self, "Not a CUDA buffer pool");
gst_clear_object (&priv->extern_pool);
} else if (!gst_buffer_pool_set_active (priv->extern_pool, TRUE)) {
GST_ERROR_OBJECT (self, "Set active failed");
gst_clear_object (&priv->extern_pool);
} else {
auto config = gst_buffer_pool_get_config (priv->extern_pool);
GstCaps *caps;
gst_buffer_pool_config_get_params (config,
&caps, nullptr, nullptr, nullptr);
auto is_valid = gst_video_info_from_caps (&priv->extern_pool_info,
caps);
gst_structure_free (config);
if (!is_valid) {
GST_ERROR_OBJECT (self, "Invalid buffer pool");
gst_clear_object (&priv->extern_pool);
}
}
}
break;
default:
G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
break;
@ -264,6 +317,9 @@ gst_nv_encoder_get_property (GObject * object, guint prop_id, GValue * value,
case PROP_CC_INSERT:
g_value_set_enum (value, priv->cc_insert);
break;
case PROP_EXTERN_POOL:
g_value_set_object (value, priv->extern_pool);
break;
default:
G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
break;
@ -782,7 +838,8 @@ gst_nv_encoder_propose_allocation (GstVideoEncoder * encoder, GstQuery * query)
gst_buffer_pool_config_set_cuda_stream (config, priv->stream);
/* Encoder does not seem to support stream ordered allocation */
gst_buffer_pool_config_set_cuda_stream_ordered_alloc (config, FALSE);
if (!priv->extern_pool)
gst_buffer_pool_config_set_cuda_stream_ordered_alloc (config, FALSE);
}
if (!gst_buffer_pool_set_config (pool, config)) {
@ -1555,6 +1612,9 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self,
GstCudaStream *stream;
GstNvEncResource *resource = nullptr;
const GstVideoInfo *info = &priv->input_state->info;
gboolean sync_done = FALSE;
guint out_stride = 0;
gboolean is_extern_mem = FALSE;
mem = gst_buffer_peek_memory (buffer, 0);
@ -1648,21 +1708,20 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self,
return gst_nv_encoder_copy_system (self, info, buffer, task);
}
out_stride = cmem->info.stride[0];
if (gst_cuda_memory_is_stream_ordered (mem)) {
GstBuffer *copy = nullptr;
GstVideoFrame in_frame, out_frame;
GstVideoFrame in_frame;
CUDA_MEMCPY2D copy_params = { };
GstMemory *out_mem;
GstMapInfo out_map;
guint8 *out_data;
stream = gst_cuda_memory_get_stream (cmem);
GST_LOG_OBJECT (self, "Stream ordered allocation needs memory copy");
gst_buffer_pool_acquire_buffer (priv->internal_pool, &copy, nullptr);
if (!copy) {
GST_ERROR_OBJECT (self, "Couldn't allocate internal buffer");
return GST_FLOW_ERROR;
}
if (!gst_video_frame_map (&in_frame, info, buffer,
(GstMapFlags) (GST_MAP_READ | GST_MAP_CUDA))) {
GST_ERROR_OBJECT (self, "Couldn't map input buffer");
@ -1670,14 +1729,50 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self,
return GST_FLOW_ERROR;
}
if (!gst_video_frame_map (&out_frame, info, copy,
(GstMapFlags) (GST_MAP_WRITE | GST_MAP_CUDA))) {
if (priv->extern_pool) {
auto cuda_pool = GST_CUDA_BUFFER_POOL (priv->extern_pool);
if (cuda_pool->context == priv->context) {
gst_buffer_pool_acquire_buffer (priv->extern_pool, &copy, nullptr);
if (copy) {
auto copy_mem = gst_buffer_peek_memory (copy, 0);
if (gst_cuda_memory_is_stream_ordered (copy_mem)) {
GST_LOG_OBJECT (self, "External pool uses stream ordered alloc");
gst_clear_buffer (&copy);
} else if (gst_memory_get_sizes (mem, nullptr, nullptr) >
gst_memory_get_sizes (copy_mem, nullptr, nullptr)) {
GST_LOG_OBJECT (self, "Too small extern pool buffer");
gst_clear_buffer (&copy);
} else {
is_extern_mem = TRUE;
}
}
}
}
if (!copy)
gst_buffer_pool_acquire_buffer (priv->internal_pool, &copy, nullptr);
if (!copy) {
GST_ERROR_OBJECT (self, "Couldn't allocate internal buffer");
return GST_FLOW_ERROR;
}
out_mem = gst_buffer_peek_memory (copy, 0);
if (!gst_memory_map (out_mem,
&out_map, (GstMapFlags) (GST_MAP_WRITE | GST_MAP_CUDA))) {
GST_ERROR_OBJECT (self, "Couldn't map output buffer");
gst_video_frame_unmap (&in_frame);
gst_buffer_unref (copy);
return GST_FLOW_ERROR;
}
out_data = (guint8 *) out_map.data;
if (is_extern_mem)
out_stride = in_frame.info.stride[0];
else
out_stride = GST_CUDA_MEMORY_CAST (out_mem)->info.stride[0];
for (guint i = 0; i < GST_VIDEO_FRAME_N_PLANES (&in_frame); i++) {
copy_params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
copy_params.srcDevice = (CUdeviceptr)
@ -1685,9 +1780,8 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self,
copy_params.srcPitch = GST_VIDEO_FRAME_PLANE_STRIDE (&in_frame, i);
copy_params.dstMemoryType = CU_MEMORYTYPE_DEVICE;
copy_params.dstDevice = (CUdeviceptr)
GST_VIDEO_FRAME_PLANE_DATA (&out_frame, i);
copy_params.dstPitch = GST_VIDEO_FRAME_PLANE_STRIDE (&out_frame, i);
copy_params.dstDevice = (CUdeviceptr) out_data;
copy_params.dstPitch = out_stride;
copy_params.WidthInBytes = GST_VIDEO_INFO_COMP_WIDTH (info, i) *
GST_VIDEO_INFO_COMP_PSTRIDE (info, i);
@ -1698,18 +1792,22 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self,
if (!gst_cuda_result (cuda_ret)) {
GST_ERROR_OBJECT (self, "Copy failed");
gst_video_frame_unmap (&in_frame);
gst_video_frame_unmap (&out_frame);
gst_memory_unmap (out_mem, &out_map);
gst_buffer_unref (copy);
return GST_FLOW_ERROR;
}
out_data += GST_VIDEO_INFO_COMP_HEIGHT (info, i) * out_stride;
}
gst_video_frame_unmap (&in_frame);
gst_video_frame_unmap (&out_frame);
gst_memory_unmap (out_mem, &out_map);
if (stream && stream != priv->stream)
if (stream && stream != priv->stream) {
CuStreamSynchronize (gst_cuda_stream_get_handle (stream));
sync_done = TRUE;
}
buffer = copy;
mem = gst_buffer_peek_memory (copy, 0);
@ -1718,7 +1816,12 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self,
buffer = gst_buffer_ref (buffer);
}
status = object->AcquireResource (mem, &resource);
if (is_extern_mem) {
status = object->AcquireResourceWithSize (mem, info->width, info->height,
out_stride, &resource);
} else {
status = object->AcquireResource (mem, &resource);
}
if (status != NV_ENC_SUCCESS) {
GST_ERROR_OBJECT (self, "Failed to get resource, status %"
@ -1729,7 +1832,7 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self,
}
stream = gst_cuda_memory_get_stream (cmem);
if (stream != priv->stream) {
if (stream != priv->stream && !sync_done) {
/* different stream, needs sync */
gst_cuda_memory_sync (cmem);
}