nvdec: Don't use default CUDA stream

Async CUDA operations on the default stream (NULL CUstream) are not much more beneficial than blocking operations, since all CUDA operations that belong to the CUDA context will be synchronized with the default stream's operations. Note that a CUDA stream shares all resources of its corresponding CUDA context, but it enables parallel operation, similar to the relationship between a thread and a process.
2025-04-10 18:14:15 +00:00 · 2019-08-18 22:07:38 +09:00 · 2019-08-18 22:07:38 +09:00 · 5615e9258f
commit 5615e9258f
parent 20d8f54e63
5 changed files with 60 additions and 7 deletions
--- a/sys/nvcodec/gstcudaloader.c
+++ b/sys/nvcodec/gstcudaloader.c
@ -68,6 +68,8 @@ typedef struct _GstNvCodecCudaVTable
    CUresult (*CuMemcpy2D) (const CUDA_MEMCPY2D * pCopy);
    CUresult (*CuMemcpy2DAsync) (const CUDA_MEMCPY2D * pCopy, CUstream hStream);
    CUresult (*CuMemFree) (CUdeviceptr dptr);
+    CUresult (*CuStreamCreate) (CUstream * phStream, unsigned int Flags);
+    CUresult (*CuStreamDestroy) (CUstream hStream);
    CUresult (*CuStreamSynchronize) (CUstream hStream);

    CUresult (*CuDeviceGet) (CUdevice * device, int ordinal);
@ -125,6 +127,8 @@ gst_cuda_load_library (void)
  LOAD_SYMBOL (cuMemcpy2DAsync, CuMemcpy2DAsync);
  LOAD_SYMBOL (cuMemFree, CuMemFree);

+  LOAD_SYMBOL (cuStreamCreate, CuStreamCreate);
+  LOAD_SYMBOL (cuStreamDestroy, CuStreamDestroy);
  LOAD_SYMBOL (cuStreamSynchronize, CuStreamSynchronize);

  LOAD_SYMBOL (cuDeviceGet, CuDeviceGet);
@ -290,6 +294,22 @@ CuMemFree (CUdeviceptr dptr)
  return gst_cuda_vtable.CuMemFree (dptr);
 }

+CUresult
+CuStreamCreate (CUstream * phStream, unsigned int Flags)
+{
+  g_assert (gst_cuda_vtable.CuStreamCreate != NULL);
+
+  return gst_cuda_vtable.CuStreamCreate (phStream, Flags);
+}
+
+CUresult
+CuStreamDestroy (CUstream hStream)
+{
+  g_assert (gst_cuda_vtable.CuStreamDestroy != NULL);
+
+  return gst_cuda_vtable.CuStreamDestroy (hStream);
+}
+
 CUresult
 CuStreamSynchronize (CUstream hStream)
 {
--- a/sys/nvcodec/gstcudaloader.h
+++ b/sys/nvcodec/gstcudaloader.h
@ -98,6 +98,13 @@ CUresult CuMemcpy2DAsync    (const CUDA_MEMCPY2D *pCopy, CUstream hStream);
 G_GNUC_INTERNAL
 CUresult CuMemFree          (CUdeviceptr dptr);

+G_GNUC_INTERNAL
+CUresult CuStreamCreate     (CUstream *phStream,
+                             unsigned int Flags);
+
+G_GNUC_INTERNAL
+CUresult CuStreamDestroy    (CUstream hStream);
+
 G_GNUC_INTERNAL
 CUresult CuStreamSynchronize (CUstream hStream);

--- a/sys/nvcodec/gstnvdec.c
+++ b/sys/nvcodec/gstnvdec.c
@ -616,6 +616,7 @@ gst_nvdec_open (GstVideoDecoder * decoder)
 {
  GstNvDec *nvdec = GST_NVDEC (decoder);
  GstNvDecClass *klass = GST_NVDEC_GET_CLASS (nvdec);
+  CUresult cuda_ret;

  GST_DEBUG_OBJECT (nvdec, "creating CUDA context");

@ -624,6 +625,16 @@ gst_nvdec_open (GstVideoDecoder * decoder)
    GST_ERROR_OBJECT (nvdec, "failed to create CUDA context");
    return FALSE;
  }
+
+  if (gst_cuda_context_push (nvdec->cuda_ctx)) {
+    cuda_ret = CuStreamCreate (&nvdec->cuda_stream, CU_STREAM_NON_BLOCKING);
+    if (!gst_cuda_result (cuda_ret)) {
+      GST_WARNING_OBJECT (nvdec,
+          "Could not create cuda stream, will use default stream");
+      nvdec->cuda_stream = NULL;
+    }
+    gst_cuda_context_pop (NULL);
+  }
 #if HAVE_NVCODEC_GST_GL
  gst_gl_ensure_element_data (GST_ELEMENT (nvdec),
      &nvdec->gl_display, &nvdec->other_gl_context);
@ -723,7 +734,15 @@ gst_nvdec_close (GstVideoDecoder * decoder)
 {
  GstNvDec *nvdec = GST_NVDEC (decoder);

+  if (nvdec->cuda_ctx && nvdec->cuda_stream) {
+    if (gst_cuda_context_push (nvdec->cuda_ctx)) {
+      gst_cuda_result (CuStreamDestroy (nvdec->cuda_stream));
+      gst_cuda_context_pop (NULL);
+    }
+  }
+
  gst_clear_object (&nvdec->cuda_ctx);
+  nvdec->cuda_stream = NULL;

  return TRUE;
 }
@ -819,7 +838,7 @@ copy_video_frame_to_gl_textures (GstGLContext * context,
  }

  if (!gst_cuda_result (CuGraphicsMapResources (num_resources, resources,
-              NULL))) {
+              nvdec->cuda_stream))) {
    GST_WARNING_OBJECT (nvdec, "failed to map CUDA resources");
    data->ret = FALSE;
    goto unmap_video_frame;
@ -844,18 +863,18 @@ copy_video_frame_to_gl_textures (GstGLContext * context,
    mcpy2d.dstArray = array;
    mcpy2d.Height = GST_VIDEO_INFO_COMP_HEIGHT (info, i);

-    if (!gst_cuda_result (CuMemcpy2DAsync (&mcpy2d, 0))) {
+    if (!gst_cuda_result (CuMemcpy2DAsync (&mcpy2d, nvdec->cuda_stream))) {
      GST_WARNING_OBJECT (nvdec, "memcpy to mapped array failed");
      data->ret = FALSE;
    }
  }

-  gst_cuda_result (CuStreamSynchronize (0));
-
  if (!gst_cuda_result (CuGraphicsUnmapResources (num_resources, resources,
-              NULL)))
+              nvdec->cuda_stream)))
    GST_WARNING_OBJECT (nvdec, "failed to unmap CUDA resources");

+  gst_cuda_result (CuStreamSynchronize (nvdec->cuda_stream));
+
 unmap_video_frame:
  if (!gst_cuda_result (CuvidUnmapVideoFrame (nvdec->decoder, dptr)))
    GST_WARNING_OBJECT (nvdec, "failed to unmap CUDA video frame");
@ -943,7 +962,7 @@ gst_nvdec_copy_device_to_system (GstNvDec * nvdec,
    copy_params.dstPitch = GST_VIDEO_FRAME_PLANE_STRIDE (&video_frame, i);
    copy_params.Height = GST_VIDEO_FRAME_COMP_HEIGHT (&video_frame, i);

-    if (!gst_cuda_result (CuMemcpy2DAsync (&copy_params, 0))) {
+    if (!gst_cuda_result (CuMemcpy2DAsync (&copy_params, nvdec->cuda_stream))) {
      GST_ERROR_OBJECT (nvdec, "failed to copy %dth plane", i);
      CuvidUnmapVideoFrame (nvdec->decoder, dptr);
      gst_video_frame_unmap (&video_frame);
@ -952,7 +971,7 @@ gst_nvdec_copy_device_to_system (GstNvDec * nvdec,
    }
  }

-  gst_cuda_result (CuStreamSynchronize (0));
+  gst_cuda_result (CuStreamSynchronize (nvdec->cuda_stream));

  gst_video_frame_unmap (&video_frame);

--- a/sys/nvcodec/gstnvdec.h
+++ b/sys/nvcodec/gstnvdec.h
@ -77,6 +77,7 @@ struct _GstNvDec
  CUvideoparser parser;
  CUvideodecoder decoder;
  GstCudaContext *cuda_ctx;
+  CUstream cuda_stream;

  guint width;
  guint height;
--- a/sys/nvcodec/stub/cuda.h
+++ b/sys/nvcodec/stub/cuda.h
@ -56,6 +56,12 @@ typedef enum
  CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 2
 } CUgraphicsRegisterFlags;

+typedef enum
+{
+  CU_STREAM_DEFAULT = 0x0,
+  CU_STREAM_NON_BLOCKING = 0x1
+} CUstream_flags;
+
 typedef struct
 {
  gsize srcXInBytes;