From 7a8bb85523592588a8c698fed9abcedf1ccbbdb2 Mon Sep 17 00:00:00 2001
From: Seungha Yang <seungha@centricular.com>
Date: Thu, 22 Dec 2022 02:27:36 +0900
Subject: [PATCH] cudaupload, cudadownload: Update for shared CUDA stream

Use the CUDA stream attached to the memory if one exists; otherwise
fall back to the stream supplied by the caller.

Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/3629>
---
 .../gst-libs/gst/cuda/gstcuda-private.h       |  3 +-
 .../gst-libs/gst/cuda/gstcudautils.c          | 90 ++++++++++++++-----
 .../sys/nvcodec/gstcudamemorycopy.c           | 12 +--
 3 files changed, 77 insertions(+), 28 deletions(-)

diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcuda-private.h b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcuda-private.h
index 5c2cc1a9a8..be91a1e964 100644
--- a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcuda-private.h
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcuda-private.h
@@ -22,6 +22,7 @@
 #include <gst/cuda/cuda-prelude.h>
 #include <gst/cuda/cuda-gst.h>
 #include <gst/cuda/gstcudacontext.h>
+#include <gst/cuda/gstcudastream.h>
 
 #include <gst/video/video.h>
 
@@ -47,7 +48,7 @@ gboolean      gst_cuda_buffer_copy (GstBuffer * dst,
                                     GstCudaBufferCopyType src_type,
                                     const GstVideoInfo * src_info,
                                     GstCudaContext * context,
-                                    CUstream stream);
+                                    GstCudaStream * stream);
 
 G_END_DECLS
 
diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudautils.c b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudautils.c
index 647e23e049..a2b592d813 100644
--- a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudautils.c
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudautils.c
@@ -1490,14 +1490,17 @@ gboolean
 gst_cuda_buffer_copy (GstBuffer * dst, GstCudaBufferCopyType dst_type,
     const GstVideoInfo * dst_info, GstBuffer * src,
     GstCudaBufferCopyType src_type, const GstVideoInfo * src_info,
-    GstCudaContext * context, CUstream stream)
+    GstCudaContext * context, GstCudaStream * stream)
 {
   gboolean use_copy_2d = FALSE;
   GstMemory *dst_mem, *src_mem;
 #ifdef GST_CUDA_HAS_D3D
   D3D11_TEXTURE2D_DESC desc;
 #endif
-  GstCudaContext *cuda_context;
+  GstCudaContext *cuda_context = context;
+  GstCudaMemory *cmem = NULL;
+  GstCudaStream *mem_stream = NULL;
+  gboolean ret;
 
   g_return_val_if_fail (GST_IS_BUFFER (dst), FALSE);
   g_return_val_if_fail (dst_info != NULL, FALSE);
@@ -1538,31 +1541,46 @@ gst_cuda_buffer_copy (GstBuffer * dst, GstCudaBufferCopyType dst_type,
   if (src_type == GST_CUDA_BUFFER_COPY_GL && gst_is_gl_memory_pbo (src_mem)) {
     GstGLMemory *gl_mem = (GstGLMemory *) src_mem;
     GstGLContext *gl_context = gl_mem->mem.context;
-    GstCudaContext *cuda_context = context;
 
-    if (dst_type == GST_CUDA_BUFFER_COPY_CUDA && gst_is_cuda_memory (dst_mem))
-      cuda_context = GST_CUDA_MEMORY_CAST (dst_mem)->context;
+    if (dst_type == GST_CUDA_BUFFER_COPY_CUDA && gst_is_cuda_memory (dst_mem)) {
+      cmem = GST_CUDA_MEMORY_CAST (dst_mem);
+      cuda_context = cmem->context;
+      mem_stream = gst_cuda_memory_get_stream (cmem);
+      if (mem_stream)
+        stream = mem_stream;
+    }
 
     GST_TRACE_OBJECT (context, "GL -> %s",
         gst_cuda_buffer_copy_type_to_string (dst_type));
 
-    return cuda_copy_gl_interop (dst, dst_info, src, src_info, gl_context,
-        cuda_context, stream, TRUE, dst_type);
+    ret = cuda_copy_gl_interop (dst, dst_info, src, src_info, gl_context,
+        cuda_context, gst_cuda_stream_get_handle (stream), TRUE, dst_type);
+
+    if (cmem)
+      GST_MEMORY_FLAG_UNSET (cmem, GST_CUDA_MEMORY_TRANSFER_NEED_SYNC);
+
+    return ret;
   }
 
   if (dst_type == GST_CUDA_BUFFER_COPY_GL && gst_is_gl_memory_pbo (dst_mem)) {
     GstGLMemory *gl_mem = (GstGLMemory *) dst_mem;
     GstGLContext *gl_context = gl_mem->mem.context;
-    GstCudaContext *cuda_context = context;
 
-    if (src_type == GST_CUDA_BUFFER_COPY_CUDA && gst_is_cuda_memory (src_mem))
-      cuda_context = GST_CUDA_MEMORY_CAST (src_mem)->context;
+    if (src_type == GST_CUDA_BUFFER_COPY_CUDA && gst_is_cuda_memory (src_mem)) {
+      cmem = GST_CUDA_MEMORY_CAST (src_mem);
+      cuda_context = cmem->context;
+
+      /* Use memory's stream object if available */
+      mem_stream = gst_cuda_memory_get_stream (cmem);
+      if (mem_stream)
+        stream = mem_stream;
+    }
 
     GST_TRACE_OBJECT (context, "%s -> GL",
         gst_cuda_buffer_copy_type_to_string (src_type));
 
     return cuda_copy_gl_interop (dst, dst_info, src, src_info, gl_context,
-        cuda_context, stream, FALSE, src_type);
+        cuda_context, gst_cuda_stream_get_handle (stream), FALSE, src_type);
   }
 #endif
 
@@ -1572,16 +1590,24 @@ gst_cuda_buffer_copy (GstBuffer * dst, GstCudaBufferCopyType dst_type,
       && desc.Usage == D3D11_USAGE_DEFAULT && gst_is_cuda_memory (dst_mem)) {
     GstD3D11Memory *dmem = GST_D3D11_MEMORY_CAST (src_mem);
     GstD3D11Device *device = dmem->device;
-    GstCudaContext *cuda_context = GST_CUDA_MEMORY_CAST (dst_mem)->context;
-    gboolean ret;
+
+    cmem = GST_CUDA_MEMORY_CAST (dst_mem);
+    cuda_context = cmem->context;
+
+    /* Use memory's stream object if available */
+    mem_stream = gst_cuda_memory_get_stream (cmem);
+    if (mem_stream)
+      stream = mem_stream;
 
     GST_TRACE_OBJECT (context, "D3D11 -> CUDA");
 
     gst_d3d11_device_lock (device);
     ret = cuda_copy_d3d11_interop (dst, dst_info, src, src_info, device,
-        cuda_context, stream, TRUE);
+        cuda_context, gst_cuda_stream_get_handle (stream), TRUE);
     gst_d3d11_device_unlock (device);
 
+    GST_MEMORY_FLAG_UNSET (cmem, GST_CUDA_MEMORY_TRANSFER_NEED_SYNC);
+
     return ret;
   }
 
@@ -1590,14 +1616,20 @@ gst_cuda_buffer_copy (GstBuffer * dst, GstCudaBufferCopyType dst_type,
       && desc.Usage == D3D11_USAGE_DEFAULT && gst_is_cuda_memory (src_mem)) {
     GstD3D11Memory *dmem = GST_D3D11_MEMORY_CAST (dst_mem);
     GstD3D11Device *device = dmem->device;
-    GstCudaContext *cuda_context = GST_CUDA_MEMORY_CAST (src_mem)->context;
-    gboolean ret;
+
+    cmem = GST_CUDA_MEMORY_CAST (src_mem);
+    cuda_context = cmem->context;
+
+    /* Use memory's stream object if available */
+    mem_stream = gst_cuda_memory_get_stream (cmem);
+    if (mem_stream)
+      stream = mem_stream;
 
     GST_TRACE_OBJECT (context, "CUDA -> D3D11");
 
     gst_d3d11_device_lock (device);
     ret = cuda_copy_d3d11_interop (dst, dst_info, src, src_info, device,
-        cuda_context, stream, FALSE);
+        cuda_context, gst_cuda_stream_get_handle (stream), FALSE);
     gst_d3d11_device_unlock (device);
 
     return ret;
@@ -1605,17 +1637,31 @@ gst_cuda_buffer_copy (GstBuffer * dst, GstCudaBufferCopyType dst_type,
 #endif
 
   if (gst_is_cuda_memory (dst_mem)) {
-    cuda_context = GST_CUDA_MEMORY_CAST (dst_mem)->context;
+    cmem = GST_CUDA_MEMORY_CAST (dst_mem);
   } else if (gst_is_cuda_memory (src_mem)) {
-    cuda_context = GST_CUDA_MEMORY_CAST (src_mem)->context;
+    cmem = GST_CUDA_MEMORY_CAST (src_mem);
   } else {
-    cuda_context = context;
+    cmem = NULL;
+  }
+
+  if (cmem) {
+    context = cmem->context;
+    mem_stream = gst_cuda_memory_get_stream (cmem);
+    if (mem_stream)
+      stream = mem_stream;
   }
 
   GST_TRACE_OBJECT (context, "%s -> %s",
       gst_cuda_buffer_copy_type_to_string (src_type),
       gst_cuda_buffer_copy_type_to_string (dst_type));
 
-  return gst_cuda_buffer_copy_internal (dst, dst_type, dst_info,
-      src, src_type, src_info, cuda_context, stream);
+  ret = gst_cuda_buffer_copy_internal (dst, dst_type, dst_info,
+      src, src_type, src_info, cuda_context,
+      gst_cuda_stream_get_handle (stream));
+
+  /* Already synchronized */
+  if (gst_is_cuda_memory (src_mem))
+    GST_MEMORY_FLAG_UNSET (src_mem, GST_CUDA_MEMORY_TRANSFER_NEED_SYNC);
+
+  return ret;
 }
diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/gstcudamemorycopy.c b/subprojects/gst-plugins-bad/sys/nvcodec/gstcudamemorycopy.c
index 53604d30a2..2c008cce0a 100644
--- a/subprojects/gst-plugins-bad/sys/nvcodec/gstcudamemorycopy.c
+++ b/subprojects/gst-plugins-bad/sys/nvcodec/gstcudamemorycopy.c
@@ -485,6 +485,7 @@ gst_cuda_memory_copy_propose_allocation (GstBaseTransform * trans,
   GstBufferPool *pool = NULL;
   GstCaps *caps;
   guint size;
+  gboolean is_cuda = FALSE;
 
   if (!GST_BASE_TRANSFORM_CLASS (parent_class)->propose_allocation (trans,
           decide_query, query))
@@ -574,6 +575,8 @@ gst_cuda_memory_copy_propose_allocation (GstBaseTransform * trans,
 
     size = GST_VIDEO_INFO_SIZE (&info);
     gst_buffer_pool_config_set_params (config, caps, size, 0, 0);
+    if (is_cuda && ctrans->stream)
+      gst_buffer_pool_config_set_cuda_stream (config, ctrans->stream);
 
     if (!gst_buffer_pool_set_config (pool, config)) {
       GST_ERROR_OBJECT (ctrans, "failed to set config");
@@ -951,7 +954,7 @@ gst_cuda_memory_copy_transform (GstBaseTransform * trans, GstBuffer * inbuf,
     GST_TRACE_OBJECT (self, "Both in/out buffers are not CUDA");
     if (!gst_cuda_buffer_copy (outbuf, GST_CUDA_BUFFER_COPY_SYSTEM, out_info,
             inbuf, GST_CUDA_BUFFER_COPY_SYSTEM, in_info, ctrans->context,
-            gst_cuda_stream_get_handle (ctrans->stream))) {
+            ctrans->stream)) {
       return GST_FLOW_ERROR;
     }
 
@@ -959,7 +962,7 @@ gst_cuda_memory_copy_transform (GstBaseTransform * trans, GstBuffer * inbuf,
   }
 
   ret = gst_cuda_buffer_copy (outbuf, out_type, out_info, inbuf, in_type,
-      in_info, ctrans->context, gst_cuda_stream_get_handle (ctrans->stream));
+      in_info, ctrans->context, ctrans->stream);
 
   /* system memory <-> CUDA copy fallback if possible */
   if (!ret) {
@@ -1002,8 +1005,7 @@ gst_cuda_memory_copy_transform (GstBaseTransform * trans, GstBuffer * inbuf,
         gst_cuda_buffer_copy_type_to_string (fallback_out_type));
 
     ret = gst_cuda_buffer_copy (outbuf, fallback_out_type, out_info, inbuf,
-        fallback_in_type, in_info, ctrans->context,
-        gst_cuda_stream_get_handle (ctrans->stream));
+        fallback_in_type, in_info, ctrans->context, ctrans->stream);
   }
 
   if (ret)
@@ -1018,7 +1020,7 @@ gst_cuda_memory_copy_transform (GstBaseTransform * trans, GstBuffer * inbuf,
   /* final fallback using system memory */
   ret = gst_cuda_buffer_copy (outbuf, GST_CUDA_BUFFER_COPY_SYSTEM, out_info,
       inbuf, GST_CUDA_BUFFER_COPY_SYSTEM, in_info, ctrans->context,
-      gst_cuda_stream_get_handle (ctrans->stream));
+      ctrans->stream);
 
   if (ret)
     return GST_FLOW_OK;