diff --git a/sys/nvcodec/gstcudaloader.c b/sys/nvcodec/gstcudaloader.c index 8d2305bc6f..de9547f646 100644 --- a/sys/nvcodec/gstcudaloader.c +++ b/sys/nvcodec/gstcudaloader.c @@ -68,6 +68,8 @@ typedef struct _GstNvCodecCudaVTable CUresult (*CuMemcpy2D) (const CUDA_MEMCPY2D * pCopy); CUresult (*CuMemcpy2DAsync) (const CUDA_MEMCPY2D * pCopy, CUstream hStream); CUresult (*CuMemFree) (CUdeviceptr dptr); + CUresult (*CuStreamCreate) (CUstream * phStream, unsigned int Flags); + CUresult (*CuStreamDestroy) (CUstream hStream); CUresult (*CuStreamSynchronize) (CUstream hStream); CUresult (*CuDeviceGet) (CUdevice * device, int ordinal); @@ -125,6 +127,8 @@ gst_cuda_load_library (void) LOAD_SYMBOL (cuMemcpy2DAsync, CuMemcpy2DAsync); LOAD_SYMBOL (cuMemFree, CuMemFree); + LOAD_SYMBOL (cuStreamCreate, CuStreamCreate); + LOAD_SYMBOL (cuStreamDestroy, CuStreamDestroy); LOAD_SYMBOL (cuStreamSynchronize, CuStreamSynchronize); LOAD_SYMBOL (cuDeviceGet, CuDeviceGet); @@ -290,6 +294,22 @@ CuMemFree (CUdeviceptr dptr) return gst_cuda_vtable.CuMemFree (dptr); } +CUresult +CuStreamCreate (CUstream * phStream, unsigned int Flags) +{ + g_assert (gst_cuda_vtable.CuStreamCreate != NULL); + + return gst_cuda_vtable.CuStreamCreate (phStream, Flags); +} + +CUresult +CuStreamDestroy (CUstream hStream) +{ + g_assert (gst_cuda_vtable.CuStreamDestroy != NULL); + + return gst_cuda_vtable.CuStreamDestroy (hStream); +} + CUresult CuStreamSynchronize (CUstream hStream) { diff --git a/sys/nvcodec/gstcudaloader.h b/sys/nvcodec/gstcudaloader.h index b3e56dccdf..62794228ec 100644 --- a/sys/nvcodec/gstcudaloader.h +++ b/sys/nvcodec/gstcudaloader.h @@ -98,6 +98,13 @@ CUresult CuMemcpy2DAsync (const CUDA_MEMCPY2D *pCopy, CUstream hStream); G_GNUC_INTERNAL CUresult CuMemFree (CUdeviceptr dptr); +G_GNUC_INTERNAL +CUresult CuStreamCreate (CUstream *phStream, + unsigned int Flags); + +G_GNUC_INTERNAL +CUresult CuStreamDestroy (CUstream hStream); + G_GNUC_INTERNAL CUresult CuStreamSynchronize (CUstream hStream); 
diff --git a/sys/nvcodec/gstnvdec.c b/sys/nvcodec/gstnvdec.c index 0b28e46c9f..cec0573f75 100644 --- a/sys/nvcodec/gstnvdec.c +++ b/sys/nvcodec/gstnvdec.c @@ -616,6 +616,7 @@ gst_nvdec_open (GstVideoDecoder * decoder) { GstNvDec *nvdec = GST_NVDEC (decoder); GstNvDecClass *klass = GST_NVDEC_GET_CLASS (nvdec); + CUresult cuda_ret; GST_DEBUG_OBJECT (nvdec, "creating CUDA context"); @@ -624,6 +625,16 @@ gst_nvdec_open (GstVideoDecoder * decoder) GST_ERROR_OBJECT (nvdec, "failed to create CUDA context"); return FALSE; } + + if (gst_cuda_context_push (nvdec->cuda_ctx)) { + cuda_ret = CuStreamCreate (&nvdec->cuda_stream, CU_STREAM_NON_BLOCKING); + if (!gst_cuda_result (cuda_ret)) { + GST_WARNING_OBJECT (nvdec, + "Could not create cuda stream, will use default stream"); + nvdec->cuda_stream = NULL; + } + gst_cuda_context_pop (NULL); + } #if HAVE_NVCODEC_GST_GL gst_gl_ensure_element_data (GST_ELEMENT (nvdec), &nvdec->gl_display, &nvdec->other_gl_context); @@ -723,7 +734,15 @@ gst_nvdec_close (GstVideoDecoder * decoder) { GstNvDec *nvdec = GST_NVDEC (decoder); + if (nvdec->cuda_ctx && nvdec->cuda_stream) { + if (gst_cuda_context_push (nvdec->cuda_ctx)) { + gst_cuda_result (CuStreamDestroy (nvdec->cuda_stream)); + gst_cuda_context_pop (NULL); + } + } + gst_clear_object (&nvdec->cuda_ctx); + nvdec->cuda_stream = NULL; return TRUE; } @@ -819,7 +838,7 @@ copy_video_frame_to_gl_textures (GstGLContext * context, } if (!gst_cuda_result (CuGraphicsMapResources (num_resources, resources, - NULL))) { + nvdec->cuda_stream))) { GST_WARNING_OBJECT (nvdec, "failed to map CUDA resources"); data->ret = FALSE; goto unmap_video_frame; @@ -844,18 +863,18 @@ copy_video_frame_to_gl_textures (GstGLContext * context, mcpy2d.dstArray = array; mcpy2d.Height = GST_VIDEO_INFO_COMP_HEIGHT (info, i); - if (!gst_cuda_result (CuMemcpy2DAsync (&mcpy2d, 0))) { + if (!gst_cuda_result (CuMemcpy2DAsync (&mcpy2d, nvdec->cuda_stream))) { GST_WARNING_OBJECT (nvdec, "memcpy to mapped array failed"); data->ret = 
FALSE; } } - gst_cuda_result (CuStreamSynchronize (0)); - if (!gst_cuda_result (CuGraphicsUnmapResources (num_resources, resources, - NULL))) + nvdec->cuda_stream))) GST_WARNING_OBJECT (nvdec, "failed to unmap CUDA resources"); + gst_cuda_result (CuStreamSynchronize (nvdec->cuda_stream)); + unmap_video_frame: if (!gst_cuda_result (CuvidUnmapVideoFrame (nvdec->decoder, dptr))) GST_WARNING_OBJECT (nvdec, "failed to unmap CUDA video frame"); @@ -943,7 +962,7 @@ gst_nvdec_copy_device_to_system (GstNvDec * nvdec, copy_params.dstPitch = GST_VIDEO_FRAME_PLANE_STRIDE (&video_frame, i); copy_params.Height = GST_VIDEO_FRAME_COMP_HEIGHT (&video_frame, i); - if (!gst_cuda_result (CuMemcpy2DAsync (&copy_params, 0))) { + if (!gst_cuda_result (CuMemcpy2DAsync (&copy_params, nvdec->cuda_stream))) { GST_ERROR_OBJECT (nvdec, "failed to copy %dth plane", i); CuvidUnmapVideoFrame (nvdec->decoder, dptr); gst_video_frame_unmap (&video_frame); @@ -952,7 +971,7 @@ gst_nvdec_copy_device_to_system (GstNvDec * nvdec, } } - gst_cuda_result (CuStreamSynchronize (0)); + gst_cuda_result (CuStreamSynchronize (nvdec->cuda_stream)); gst_video_frame_unmap (&video_frame); diff --git a/sys/nvcodec/gstnvdec.h b/sys/nvcodec/gstnvdec.h index fe9ddf8846..86f8872aed 100644 --- a/sys/nvcodec/gstnvdec.h +++ b/sys/nvcodec/gstnvdec.h @@ -77,6 +77,7 @@ struct _GstNvDec CUvideoparser parser; CUvideodecoder decoder; GstCudaContext *cuda_ctx; + CUstream cuda_stream; guint width; guint height; diff --git a/sys/nvcodec/stub/cuda.h b/sys/nvcodec/stub/cuda.h index cc0aecaf4a..1dff9bb777 100644 --- a/sys/nvcodec/stub/cuda.h +++ b/sys/nvcodec/stub/cuda.h @@ -56,6 +56,12 @@ typedef enum CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 2 } CUgraphicsRegisterFlags; +typedef enum +{ + CU_STREAM_DEFAULT = 0x0, + CU_STREAM_NON_BLOCKING = 0x1 +} CUstream_flags; + typedef struct { gsize srcXInBytes;