nvdec: Don't use default CUDA stream

Async CUDA operations on the default stream (NULL CUstream) are not much more
beneficial than blocking operations, since all CUDA operations which belong
to the CUDA context will be synchronized with the default stream's operations.
Note that a CUDA stream shares all resources of the corresponding CUDA context,
but it can help operations run in parallel, similar to the relation between a thread and a process.
This commit is contained in:
Seungha Yang 2019-08-18 22:07:38 +09:00 committed by Matthew Waters
parent 20d8f54e63
commit 5615e9258f
5 changed files with 60 additions and 7 deletions

View file

@ -68,6 +68,8 @@ typedef struct _GstNvCodecCudaVTable
CUresult (*CuMemcpy2D) (const CUDA_MEMCPY2D * pCopy);
CUresult (*CuMemcpy2DAsync) (const CUDA_MEMCPY2D * pCopy, CUstream hStream);
CUresult (*CuMemFree) (CUdeviceptr dptr);
CUresult (*CuStreamCreate) (CUstream * phStream, unsigned int Flags);
CUresult (*CuStreamDestroy) (CUstream hStream);
CUresult (*CuStreamSynchronize) (CUstream hStream);
CUresult (*CuDeviceGet) (CUdevice * device, int ordinal);
@ -125,6 +127,8 @@ gst_cuda_load_library (void)
LOAD_SYMBOL (cuMemcpy2DAsync, CuMemcpy2DAsync);
LOAD_SYMBOL (cuMemFree, CuMemFree);
LOAD_SYMBOL (cuStreamCreate, CuStreamCreate);
LOAD_SYMBOL (cuStreamDestroy, CuStreamDestroy);
LOAD_SYMBOL (cuStreamSynchronize, CuStreamSynchronize);
LOAD_SYMBOL (cuDeviceGet, CuDeviceGet);
@ -290,6 +294,22 @@ CuMemFree (CUdeviceptr dptr)
return gst_cuda_vtable.CuMemFree (dptr);
}
/* Forwards to the CuStreamCreate entry of gst_cuda_vtable.
 *
 * phStream and Flags are passed through unchanged, and the CUresult of the
 * underlying call is returned as-is. The vtable entry must already be
 * non-NULL; this is checked with g_assert. */
CUresult
CuStreamCreate (CUstream * phStream, unsigned int Flags)
{
  CUresult ret;

  g_assert (gst_cuda_vtable.CuStreamCreate != NULL);
  ret = gst_cuda_vtable.CuStreamCreate (phStream, Flags);

  return ret;
}
/* Forwards to the CuStreamDestroy entry of gst_cuda_vtable.
 *
 * hStream is passed through unchanged, and the CUresult of the underlying
 * call is returned as-is. The vtable entry must already be non-NULL; this
 * is checked with g_assert. */
CUresult
CuStreamDestroy (CUstream hStream)
{
  CUresult ret;

  g_assert (gst_cuda_vtable.CuStreamDestroy != NULL);
  ret = gst_cuda_vtable.CuStreamDestroy (hStream);

  return ret;
}
CUresult
CuStreamSynchronize (CUstream hStream)
{

View file

@ -98,6 +98,13 @@ CUresult CuMemcpy2DAsync (const CUDA_MEMCPY2D *pCopy, CUstream hStream);
G_GNUC_INTERNAL
CUresult CuMemFree (CUdeviceptr dptr);
/* Wrapper that calls the cuStreamCreate symbol resolved by
 * gst_cuda_load_library and returns its CUresult unchanged. */
G_GNUC_INTERNAL
CUresult CuStreamCreate (CUstream *phStream,
unsigned int Flags);
/* Wrapper that calls the cuStreamDestroy symbol resolved by
 * gst_cuda_load_library and returns its CUresult unchanged. */
G_GNUC_INTERNAL
CUresult CuStreamDestroy (CUstream hStream);
G_GNUC_INTERNAL
CUresult CuStreamSynchronize (CUstream hStream);

View file

@ -616,6 +616,7 @@ gst_nvdec_open (GstVideoDecoder * decoder)
{
GstNvDec *nvdec = GST_NVDEC (decoder);
GstNvDecClass *klass = GST_NVDEC_GET_CLASS (nvdec);
CUresult cuda_ret;
GST_DEBUG_OBJECT (nvdec, "creating CUDA context");
@ -624,6 +625,16 @@ gst_nvdec_open (GstVideoDecoder * decoder)
GST_ERROR_OBJECT (nvdec, "failed to create CUDA context");
return FALSE;
}
if (gst_cuda_context_push (nvdec->cuda_ctx)) {
cuda_ret = CuStreamCreate (&nvdec->cuda_stream, CU_STREAM_NON_BLOCKING);
if (!gst_cuda_result (cuda_ret)) {
GST_WARNING_OBJECT (nvdec,
"Could not create cuda stream, will use default stream");
nvdec->cuda_stream = NULL;
}
gst_cuda_context_pop (NULL);
}
#if HAVE_NVCODEC_GST_GL
gst_gl_ensure_element_data (GST_ELEMENT (nvdec),
&nvdec->gl_display, &nvdec->other_gl_context);
@ -723,7 +734,15 @@ gst_nvdec_close (GstVideoDecoder * decoder)
{
GstNvDec *nvdec = GST_NVDEC (decoder);
if (nvdec->cuda_ctx && nvdec->cuda_stream) {
if (gst_cuda_context_push (nvdec->cuda_ctx)) {
gst_cuda_result (CuStreamDestroy (nvdec->cuda_stream));
gst_cuda_context_pop (NULL);
}
}
gst_clear_object (&nvdec->cuda_ctx);
nvdec->cuda_stream = NULL;
return TRUE;
}
@ -819,7 +838,7 @@ copy_video_frame_to_gl_textures (GstGLContext * context,
}
if (!gst_cuda_result (CuGraphicsMapResources (num_resources, resources,
NULL))) {
nvdec->cuda_stream))) {
GST_WARNING_OBJECT (nvdec, "failed to map CUDA resources");
data->ret = FALSE;
goto unmap_video_frame;
@ -844,18 +863,18 @@ copy_video_frame_to_gl_textures (GstGLContext * context,
mcpy2d.dstArray = array;
mcpy2d.Height = GST_VIDEO_INFO_COMP_HEIGHT (info, i);
if (!gst_cuda_result (CuMemcpy2DAsync (&mcpy2d, 0))) {
if (!gst_cuda_result (CuMemcpy2DAsync (&mcpy2d, nvdec->cuda_stream))) {
GST_WARNING_OBJECT (nvdec, "memcpy to mapped array failed");
data->ret = FALSE;
}
}
gst_cuda_result (CuStreamSynchronize (0));
if (!gst_cuda_result (CuGraphicsUnmapResources (num_resources, resources,
NULL)))
nvdec->cuda_stream)))
GST_WARNING_OBJECT (nvdec, "failed to unmap CUDA resources");
gst_cuda_result (CuStreamSynchronize (nvdec->cuda_stream));
unmap_video_frame:
if (!gst_cuda_result (CuvidUnmapVideoFrame (nvdec->decoder, dptr)))
GST_WARNING_OBJECT (nvdec, "failed to unmap CUDA video frame");
@ -943,7 +962,7 @@ gst_nvdec_copy_device_to_system (GstNvDec * nvdec,
copy_params.dstPitch = GST_VIDEO_FRAME_PLANE_STRIDE (&video_frame, i);
copy_params.Height = GST_VIDEO_FRAME_COMP_HEIGHT (&video_frame, i);
if (!gst_cuda_result (CuMemcpy2DAsync (&copy_params, 0))) {
if (!gst_cuda_result (CuMemcpy2DAsync (&copy_params, nvdec->cuda_stream))) {
GST_ERROR_OBJECT (nvdec, "failed to copy %dth plane", i);
CuvidUnmapVideoFrame (nvdec->decoder, dptr);
gst_video_frame_unmap (&video_frame);
@ -952,7 +971,7 @@ gst_nvdec_copy_device_to_system (GstNvDec * nvdec,
}
}
gst_cuda_result (CuStreamSynchronize (0));
gst_cuda_result (CuStreamSynchronize (nvdec->cuda_stream));
gst_video_frame_unmap (&video_frame);

View file

@ -77,6 +77,7 @@ struct _GstNvDec
CUvideoparser parser;
CUvideodecoder decoder;
GstCudaContext *cuda_ctx;
CUstream cuda_stream;
guint width;
guint height;

View file

@ -56,6 +56,12 @@ typedef enum
CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 2
} CUgraphicsRegisterFlags;
/* Flag values for the Flags argument of CuStreamCreate.
 *
 * NOTE(review): these are expected to match the CUDA driver API constants
 * of the same names; confirm against cuda.h before changing them. */
typedef enum
{
  CU_STREAM_DEFAULT = 0,
  CU_STREAM_NON_BLOCKING = 1
} CUstream_flags;
typedef struct
{
gsize srcXInBytes;