cudaconvertscale: Add support for shared CUDA stream

If CUDA stream is shared by upstream/downstream, don't sync at convert element Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/3629>
2024-11-19 16:21:17 +00:00 · 2022-12-20 23:04:12 +09:00 · 2022-12-20 23:04:12 +09:00 · ddc5f1d425
commit ddc5f1d425
parent 1cb47d549b
1 changed files with 62 additions and 1 deletions
--- a/subprojects/gst-plugins-bad/sys/nvcodec/gstcudaconvertscale.c
+++ b/subprojects/gst-plugins-bad/sys/nvcodec/gstcudaconvertscale.c
@ -55,6 +55,7 @@ struct _GstCudaBaseConvert
  GstCudaBaseTransform parent;

  GstCudaConverter *converter;
+  GstCudaStream *other_stream;

  gint borders_h;
  gint borders_w;
@ -136,6 +137,7 @@ gst_cuda_base_convert_dispose (GObject * object)
 {
  GstCudaBaseConvert *self = GST_CUDA_BASE_CONVERT (object);

+  gst_clear_cuda_stream (&self->other_stream);
  gst_clear_object (&self->converter);

  G_OBJECT_CLASS (parent_class)->dispose (object);
@ -1099,6 +1101,7 @@ static gboolean
 gst_cuda_base_convert_propose_allocation (GstBaseTransform * trans,
    GstQuery * decide_query, GstQuery * query)
 {
+  GstCudaBaseConvert *self = GST_CUDA_BASE_CONVERT (trans);
  GstCudaBaseTransform *ctrans = GST_CUDA_BASE_TRANSFORM (trans);
  GstVideoInfo info;
  GstBufferPool *pool;
@ -1127,6 +1130,14 @@ gst_cuda_base_convert_propose_allocation (GstBaseTransform * trans,
    pool = gst_cuda_buffer_pool_new (ctrans->context);

    config = gst_buffer_pool_get_config (pool);
+    /* Forward downstream CUDA stream to upstream */
+    if (self->other_stream) {
+      GST_DEBUG_OBJECT (self, "Have downstream CUDA stream, forwarding");
+      gst_buffer_pool_config_set_cuda_stream (config, self->other_stream);
+    } else if (ctrans->stream) {
+      GST_DEBUG_OBJECT (self, "Set our stream to proposing buffer pool");
+      gst_buffer_pool_config_set_cuda_stream (config, ctrans->stream);
+    }

    gst_buffer_pool_config_add_option (config,
        GST_BUFFER_POOL_OPTION_VIDEO_META);
@ -1159,6 +1170,7 @@ static gboolean
 gst_cuda_base_convert_decide_allocation (GstBaseTransform * trans,
    GstQuery * query)
 {
+  GstCudaBaseConvert *self = GST_CUDA_BASE_CONVERT (trans);
  GstCudaBaseTransform *ctrans = GST_CUDA_BASE_TRANSFORM (trans);
  GstCaps *outcaps = NULL;
  GstBufferPool *pool = NULL;
@ -1202,6 +1214,15 @@ gst_cuda_base_convert_decide_allocation (GstBaseTransform * trans,
  config = gst_buffer_pool_get_config (pool);
  gst_buffer_pool_config_add_option (config, GST_BUFFER_POOL_OPTION_VIDEO_META);
  gst_buffer_pool_config_set_params (config, outcaps, size, min, max);
+  gst_clear_cuda_stream (&self->other_stream);
+  self->other_stream = gst_buffer_pool_config_get_cuda_stream (config);
+  if (self->other_stream) {
+    GST_DEBUG_OBJECT (self, "Downstream provided CUDA stream");
+  } else if (ctrans->stream) {
+    GST_DEBUG_OBJECT (self, "Set our stream to decided buffer pool");
+    gst_buffer_pool_config_set_cuda_stream (config, ctrans->stream);
+  }
+
  gst_buffer_pool_set_config (pool, config);

  /* Get updated size by cuda buffer pool */
@ -1343,6 +1364,10 @@ gst_cuda_base_convert_transform (GstBaseTransform * trans,
  GstVideoFrame in_frame, out_frame;
  GstFlowReturn ret = GST_FLOW_OK;
  GstMemory *mem;
+  GstCudaMemory *in_cmem, *out_cmem;
+  GstCudaStream *in_stream, *out_stream;
+  GstCudaStream *selected_stream = NULL;
+  gboolean sync_done = FALSE;

  if (gst_buffer_n_memory (inbuf) != 1) {
    GST_ERROR_OBJECT (self, "Invalid input buffer");
@ -1354,6 +1379,8 @@ gst_cuda_base_convert_transform (GstBaseTransform * trans,
    GST_ERROR_OBJECT (self, "Input buffer is not CUDA");
    return GST_FLOW_ERROR;
  }
+  in_cmem = GST_CUDA_MEMORY_CAST (mem);
+  in_stream = gst_cuda_memory_get_stream (in_cmem);

  if (gst_buffer_n_memory (outbuf) != 1) {
    GST_ERROR_OBJECT (self, "Invalid output buffer");
@ -1365,6 +1392,8 @@ gst_cuda_base_convert_transform (GstBaseTransform * trans,
    GST_ERROR_OBJECT (self, "Input buffer is not CUDA");
    return GST_FLOW_ERROR;
  }
+  out_cmem = GST_CUDA_MEMORY_CAST (mem);
+  out_stream = gst_cuda_memory_get_stream (out_cmem);

  if (!gst_video_frame_map (&in_frame, &btrans->in_info, inbuf,
          GST_MAP_READ | GST_MAP_CUDA)) {
@ -1379,12 +1408,44 @@ gst_cuda_base_convert_transform (GstBaseTransform * trans,
    return GST_FLOW_ERROR;
  }

+  /* If downstream does not aware of CUDA stream (i.e., using default stream) */
+  if (!out_stream) {
+    if (in_stream) {
+      GST_TRACE_OBJECT (self, "Use upstram CUDA stream");
+      selected_stream = in_stream;
+    } else if (btrans->stream) {
+      GST_TRACE_OBJECT (self, "Use our CUDA stream");
+      selected_stream = btrans->stream;
+    }
+  } else {
+    selected_stream = out_stream;
+    if (in_stream) {
+      if (in_stream == out_stream) {
+        GST_TRACE_OBJECT (self, "Same stream");
+      } else {
+        GST_TRACE_OBJECT (self, "Different CUDA stream");
+        gst_cuda_memory_sync (in_cmem);
+      }
+    }
+  }
+
  if (!gst_cuda_converter_convert_frame (self->converter, &in_frame, &out_frame,
-          gst_cuda_stream_get_handle (btrans->stream), NULL)) {
+          gst_cuda_stream_get_handle (selected_stream), &sync_done)) {
    GST_ERROR_OBJECT (self, "Failed to convert frame");
    ret = GST_FLOW_ERROR;
  }

+  if (sync_done) {
+    GST_TRACE_OBJECT (self, "Sync done by converter");
+    GST_MEMORY_FLAG_UNSET (out_cmem, GST_CUDA_MEMORY_TRANSFER_NEED_SYNC);
+  } else if (selected_stream != out_stream) {
+    GST_MEMORY_FLAG_UNSET (out_cmem, GST_CUDA_MEMORY_TRANSFER_NEED_SYNC);
+    GST_TRACE_OBJECT (self, "Waiting for convert sync");
+    gst_cuda_context_push (btrans->context);
+    CuStreamSynchronize (gst_cuda_stream_get_handle (selected_stream));
+    gst_cuda_context_pop (NULL);
+  }
+
  gst_video_frame_unmap (&out_frame);
  gst_video_frame_unmap (&in_frame);