From 5615e9258f2193a4e9d0acfae7f9eb89c7178564 Mon Sep 17 00:00:00 2001 From: Seungha Yang Date: Sun, 18 Aug 2019 22:07:38 +0900 Subject: [PATCH] nvdec: Don't use default CUDA stream An async CUDA operation on the default stream (NULL CUstream) is not much more beneficial than a blocking operation, since all CUDA operations that belong to the CUDA context will be synchronized with the default stream's operations. Note that a CUDA stream shares all resources of the corresponding CUDA context, but it can help parallel operation, similar to the relationship between a thread and a process. --- sys/nvcodec/gstcudaloader.c | 20 ++++++++++++++++++++ sys/nvcodec/gstcudaloader.h | 7 +++++++ sys/nvcodec/gstnvdec.c | 33 ++++++++++++++++++++++++++------- sys/nvcodec/gstnvdec.h | 1 + sys/nvcodec/stub/cuda.h | 6 ++++++ 5 files changed, 60 insertions(+), 7 deletions(-) diff --git a/sys/nvcodec/gstcudaloader.c b/sys/nvcodec/gstcudaloader.c index 8d2305bc6f..de9547f646 100644 --- a/sys/nvcodec/gstcudaloader.c +++ b/sys/nvcodec/gstcudaloader.c @@ -68,6 +68,8 @@ typedef struct _GstNvCodecCudaVTable CUresult (*CuMemcpy2D) (const CUDA_MEMCPY2D * pCopy); CUresult (*CuMemcpy2DAsync) (const CUDA_MEMCPY2D * pCopy, CUstream hStream); CUresult (*CuMemFree) (CUdeviceptr dptr); + CUresult (*CuStreamCreate) (CUstream * phStream, unsigned int Flags); + CUresult (*CuStreamDestroy) (CUstream hStream); CUresult (*CuStreamSynchronize) (CUstream hStream); CUresult (*CuDeviceGet) (CUdevice * device, int ordinal); @@ -125,6 +127,8 @@ gst_cuda_load_library (void) LOAD_SYMBOL (cuMemcpy2DAsync, CuMemcpy2DAsync); LOAD_SYMBOL (cuMemFree, CuMemFree); + LOAD_SYMBOL (cuStreamCreate, CuStreamCreate); + LOAD_SYMBOL (cuStreamDestroy, CuStreamDestroy); LOAD_SYMBOL (cuStreamSynchronize, CuStreamSynchronize); LOAD_SYMBOL (cuDeviceGet, CuDeviceGet); @@ -290,6 +294,22 @@ CuMemFree (CUdeviceptr dptr) return gst_cuda_vtable.CuMemFree (dptr); } +CUresult +CuStreamCreate (CUstream * phStream, unsigned int Flags) +{ + g_assert
(gst_cuda_vtable.CuStreamCreate != NULL); + + return gst_cuda_vtable.CuStreamCreate (phStream, Flags); +} + +CUresult +CuStreamDestroy (CUstream hStream) +{ + g_assert (gst_cuda_vtable.CuStreamDestroy != NULL); + + return gst_cuda_vtable.CuStreamDestroy (hStream); +} + CUresult CuStreamSynchronize (CUstream hStream) { diff --git a/sys/nvcodec/gstcudaloader.h b/sys/nvcodec/gstcudaloader.h index b3e56dccdf..62794228ec 100644 --- a/sys/nvcodec/gstcudaloader.h +++ b/sys/nvcodec/gstcudaloader.h @@ -98,6 +98,13 @@ CUresult CuMemcpy2DAsync (const CUDA_MEMCPY2D *pCopy, CUstream hStream); G_GNUC_INTERNAL CUresult CuMemFree (CUdeviceptr dptr); +G_GNUC_INTERNAL +CUresult CuStreamCreate (CUstream *phStream, + unsigned int Flags); + +G_GNUC_INTERNAL +CUresult CuStreamDestroy (CUstream hStream); + G_GNUC_INTERNAL CUresult CuStreamSynchronize (CUstream hStream); diff --git a/sys/nvcodec/gstnvdec.c b/sys/nvcodec/gstnvdec.c index 0b28e46c9f..cec0573f75 100644 --- a/sys/nvcodec/gstnvdec.c +++ b/sys/nvcodec/gstnvdec.c @@ -616,6 +616,7 @@ gst_nvdec_open (GstVideoDecoder * decoder) { GstNvDec *nvdec = GST_NVDEC (decoder); GstNvDecClass *klass = GST_NVDEC_GET_CLASS (nvdec); + CUresult cuda_ret; GST_DEBUG_OBJECT (nvdec, "creating CUDA context"); @@ -624,6 +625,16 @@ gst_nvdec_open (GstVideoDecoder * decoder) GST_ERROR_OBJECT (nvdec, "failed to create CUDA context"); return FALSE; } + + if (gst_cuda_context_push (nvdec->cuda_ctx)) { + cuda_ret = CuStreamCreate (&nvdec->cuda_stream, CU_STREAM_NON_BLOCKING); + if (!gst_cuda_result (cuda_ret)) { + GST_WARNING_OBJECT (nvdec, + "Could not create cuda stream, will use default stream"); + nvdec->cuda_stream = NULL; + } + gst_cuda_context_pop (NULL); + } #if HAVE_NVCODEC_GST_GL gst_gl_ensure_element_data (GST_ELEMENT (nvdec), &nvdec->gl_display, &nvdec->other_gl_context); @@ -723,7 +734,15 @@ gst_nvdec_close (GstVideoDecoder * decoder) { GstNvDec *nvdec = GST_NVDEC (decoder); + if (nvdec->cuda_ctx && nvdec->cuda_stream) { + if 
(gst_cuda_context_push (nvdec->cuda_ctx)) { + gst_cuda_result (CuStreamDestroy (nvdec->cuda_stream)); + gst_cuda_context_pop (NULL); + } + } + gst_clear_object (&nvdec->cuda_ctx); + nvdec->cuda_stream = NULL; return TRUE; } @@ -819,7 +838,7 @@ copy_video_frame_to_gl_textures (GstGLContext * context, } if (!gst_cuda_result (CuGraphicsMapResources (num_resources, resources, - NULL))) { + nvdec->cuda_stream))) { GST_WARNING_OBJECT (nvdec, "failed to map CUDA resources"); data->ret = FALSE; goto unmap_video_frame; @@ -844,18 +863,18 @@ copy_video_frame_to_gl_textures (GstGLContext * context, mcpy2d.dstArray = array; mcpy2d.Height = GST_VIDEO_INFO_COMP_HEIGHT (info, i); - if (!gst_cuda_result (CuMemcpy2DAsync (&mcpy2d, 0))) { + if (!gst_cuda_result (CuMemcpy2DAsync (&mcpy2d, nvdec->cuda_stream))) { GST_WARNING_OBJECT (nvdec, "memcpy to mapped array failed"); data->ret = FALSE; } } - gst_cuda_result (CuStreamSynchronize (0)); - if (!gst_cuda_result (CuGraphicsUnmapResources (num_resources, resources, - NULL))) + nvdec->cuda_stream))) GST_WARNING_OBJECT (nvdec, "failed to unmap CUDA resources"); + gst_cuda_result (CuStreamSynchronize (nvdec->cuda_stream)); + unmap_video_frame: if (!gst_cuda_result (CuvidUnmapVideoFrame (nvdec->decoder, dptr))) GST_WARNING_OBJECT (nvdec, "failed to unmap CUDA video frame"); @@ -943,7 +962,7 @@ gst_nvdec_copy_device_to_system (GstNvDec * nvdec, copy_params.dstPitch = GST_VIDEO_FRAME_PLANE_STRIDE (&video_frame, i); copy_params.Height = GST_VIDEO_FRAME_COMP_HEIGHT (&video_frame, i); - if (!gst_cuda_result (CuMemcpy2DAsync (&copy_params, 0))) { + if (!gst_cuda_result (CuMemcpy2DAsync (&copy_params, nvdec->cuda_stream))) { GST_ERROR_OBJECT (nvdec, "failed to copy %dth plane", i); CuvidUnmapVideoFrame (nvdec->decoder, dptr); gst_video_frame_unmap (&video_frame); @@ -952,7 +971,7 @@ gst_nvdec_copy_device_to_system (GstNvDec * nvdec, } } - gst_cuda_result (CuStreamSynchronize (0)); + gst_cuda_result (CuStreamSynchronize (nvdec->cuda_stream));
gst_video_frame_unmap (&video_frame); diff --git a/sys/nvcodec/gstnvdec.h b/sys/nvcodec/gstnvdec.h index fe9ddf8846..86f8872aed 100644 --- a/sys/nvcodec/gstnvdec.h +++ b/sys/nvcodec/gstnvdec.h @@ -77,6 +77,7 @@ struct _GstNvDec CUvideoparser parser; CUvideodecoder decoder; GstCudaContext *cuda_ctx; + CUstream cuda_stream; guint width; guint height; diff --git a/sys/nvcodec/stub/cuda.h b/sys/nvcodec/stub/cuda.h index cc0aecaf4a..1dff9bb777 100644 --- a/sys/nvcodec/stub/cuda.h +++ b/sys/nvcodec/stub/cuda.h @@ -56,6 +56,12 @@ typedef enum CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 2 } CUgraphicsRegisterFlags; +typedef enum +{ + CU_STREAM_DEFAULT = 0x0, + CU_STREAM_NON_BLOCKING = 0x1 +} CUstream_flags; + typedef struct { gsize srcXInBytes;