From 84ec16c67a1f05c0b07fda205fb2e65dc3a8b33b Mon Sep 17 00:00:00 2001 From: Seungha Yang Date: Sat, 18 Feb 2023 02:50:31 +0900 Subject: [PATCH] nvencoder: Add support for GL memory preparation to deprecate old NVENC elements Part-of: --- .../sys/nvcodec/gstnvencoder.cpp | 355 +++++++++++++++++- .../sys/nvcodec/gstnvencoder.h | 4 + .../sys/nvcodec/gstnvh264encoder.cpp | 13 + .../sys/nvcodec/gstnvh265encoder.cpp | 13 + 4 files changed, 369 insertions(+), 16 deletions(-) diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencoder.cpp b/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencoder.cpp index 6e35838687..134d6a145b 100644 --- a/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencoder.cpp +++ b/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencoder.cpp @@ -35,10 +35,6 @@ #include #include "gstnvencobject.h" -#ifdef G_OS_WIN32 -#include -#endif - #ifdef G_OS_WIN32 #include @@ -47,6 +43,10 @@ using namespace Microsoft::WRL; /* *INDENT-ON* */ #endif +#ifdef HAVE_CUDA_GST_GL +#define SUPPORTED_GL_APIS GST_GL_API_OPENGL3 +#endif + GST_DEBUG_CATEGORY (gst_nv_encoder_debug); #define GST_CAT_DEFAULT gst_nv_encoder_debug @@ -69,6 +69,14 @@ struct _GstNvEncoderPrivate GstD3D11Fence *fence = nullptr; #endif +#ifdef HAVE_CUDA_GST_GL + GstGLDisplay *gl_display = nullptr; + GstGLContext *gl_context = nullptr; + GstGLContext *other_gl_context = nullptr; + + gboolean gl_interop = FALSE; +#endif + std::shared_ptr < GstNvEncObject > object; GstNvEncoderDeviceMode subclass_device_mode; @@ -195,6 +203,13 @@ gst_nv_encoder_set_context (GstElement * element, GstContext * context) case GST_NV_ENCODER_DEVICE_CUDA: gst_cuda_handle_set_context (element, context, priv->cuda_device_id, &priv->context); +#ifdef HAVE_CUDA_GST_GL + if (gst_gl_handle_set_context (element, context, &priv->gl_display, + &priv->other_gl_context)) { + if (priv->gl_display) + gst_gl_display_filter_gl_api (priv->gl_display, SUPPORTED_GL_APIS); + } +#endif break; default: break; @@ -376,6 +391,11 @@ gst_nv_encoder_close (GstVideoEncoder * encoder) gst_clear_d3d11_fence (&priv->fence); gst_clear_object (&priv->device); #endif +#ifdef HAVE_CUDA_GST_GL + gst_clear_object (&priv->gl_display); + gst_clear_object (&priv->gl_context); + gst_clear_object (&priv->other_gl_context); +#endif return TRUE; } @@ -422,6 +442,56 @@ gst_nv_encoder_sink_event (GstVideoEncoder * encoder, GstEvent * event) return GST_VIDEO_ENCODER_CLASS (parent_class)->sink_event (encoder, event); } +#ifdef HAVE_CUDA_GST_GL +static void +gst_nv_encoder_check_cuda_device_from_gl_context (GstGLContext * context, + gboolean * ret) +{ + guint device_count = 0; + CUdevice device_list[1] = { 0, }; + CUresult cuda_ret; + + *ret = FALSE; + + cuda_ret = CuGLGetDevices (&device_count, + device_list, 1, CU_GL_DEVICE_LIST_ALL); + + if (!gst_cuda_result (cuda_ret) || device_count == 0) + return; + + *ret = TRUE; +} + +static gboolean +gst_nv_encoder_ensure_gl_context (GstNvEncoder * self) +{ + GstNvEncoderPrivate *priv = self->priv; + gboolean ret = FALSE; + + std::unique_lock < std::recursive_mutex > lk (priv->context_lock); + + if (!gst_gl_ensure_element_data (GST_ELEMENT (self), &priv->gl_display, + &priv->other_gl_context)) { + GST_DEBUG_OBJECT (self, "Couldn't get GL display"); + return FALSE; + } + + gst_gl_display_filter_gl_api (priv->gl_display, SUPPORTED_GL_APIS); + + if (!gst_gl_display_ensure_context (priv->gl_display, priv->other_gl_context, + &priv->gl_context, nullptr)) { + GST_DEBUG_OBJECT (self, "Couldn't get GL context"); + return FALSE; + } + + gst_gl_context_thread_add (priv->gl_context, + (GstGLContextThreadFunc) gst_nv_encoder_check_cuda_device_from_gl_context, + &ret); + + return ret; +} +#endif + static gboolean gst_nv_encoder_handle_context_query (GstNvEncoder * self, GstQuery * query) { @@ -438,6 +508,32 @@ gst_nv_encoder_handle_context_query (GstNvEncoder * self, GstQuery * query) break; #endif case GST_NV_ENCODER_DEVICE_CUDA: +#ifdef HAVE_CUDA_GST_GL + { + GstGLDisplay *display = nullptr; + GstGLContext *other = nullptr; + GstGLContext *local = nullptr; + + if (priv->gl_display) + display = (GstGLDisplay *) gst_object_ref (priv->gl_display); + if (priv->gl_context) + local = (GstGLContext *) gst_object_ref (priv->gl_context); + if (priv->other_gl_context) + other = (GstGLContext *) gst_object_ref (priv->other_gl_context); + + lk.unlock (); + ret = gst_gl_handle_context_query (GST_ELEMENT (self), query, + display, local, other); + lk.lock (); + + gst_clear_object (&display); + gst_clear_object (&other); + gst_clear_object (&local); + + if (ret) + return ret; + } +#endif ret = gst_cuda_handle_context_query (GST_ELEMENT (self), query, priv->context); break; @@ -445,7 +541,6 @@ gst_nv_encoder_handle_context_query (GstNvEncoder * self, GstQuery * query) break; } - return ret; } @@ -544,6 +639,25 @@ gst_nv_encoder_propose_allocation (GstVideoEncoder * encoder, GstQuery * query) break; #endif case GST_NV_ENCODER_DEVICE_CUDA: +#ifdef HAVE_CUDA_GST_GL + if (features && gst_caps_features_contains (features, + GST_CAPS_FEATURE_MEMORY_GL_MEMORY)) { + GST_DEBUG_OBJECT (self, "upstream support GL memory"); + if (!gst_nv_encoder_ensure_gl_context (self)) { + GST_WARNING_OBJECT (self, "Couldn't get GL context"); + priv->gl_interop = FALSE; + gst_query_add_allocation_meta (query, + GST_VIDEO_META_API_TYPE, nullptr); + gst_query_add_allocation_pool (query, + nullptr, info.size, min_buffers, 0); + return TRUE; + } + + pool = gst_gl_buffer_pool_new (priv->gl_context); + break; + } +#endif + if (features && gst_caps_features_contains (features, GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY)) { GST_DEBUG_OBJECT (self, "upstream support CUDA memory"); @@ -562,7 +676,6 @@ gst_nv_encoder_propose_allocation (GstVideoEncoder * encoder, GstQuery * query) config = gst_buffer_pool_get_config (pool); gst_buffer_pool_config_add_option (config, GST_BUFFER_POOL_OPTION_VIDEO_META); - size = GST_VIDEO_INFO_SIZE (&info); gst_buffer_pool_config_set_params (config, caps, size, min_buffers, 0); if (use_cuda_pool && priv->stream) { @@ -1076,6 +1189,18 @@ gst_nv_encoder_set_format (GstVideoEncoder * encoder, priv->input_state = gst_video_codec_state_ref (state); priv->last_flow = GST_FLOW_OK; +#ifdef HAVE_CUDA_GST_GL + { + GstCapsFeatures *features = gst_caps_get_features (state->caps, 0); + if (gst_caps_features_contains (features, + GST_CAPS_FEATURE_MEMORY_GL_MEMORY)) { + priv->gl_interop = TRUE; + } else { + priv->gl_interop = FALSE; + } + } +#endif + /* select device again on next buffer */ if (priv->subclass_device_mode == GST_NV_ENCODER_DEVICE_AUTO_SELECT) priv->selected_device_mode = GST_NV_ENCODER_DEVICE_AUTO_SELECT; @@ -1134,9 +1259,174 @@ gst_nv_encoder_copy_system (GstNvEncoder * self, const GstVideoInfo * info, return GST_FLOW_OK; } +#ifdef HAVE_CUDA_GST_GL +static GstCudaGraphicsResource * +gst_nv_encoder_ensure_gl_cuda_resource (GstNvEncoder * self, GstMemory * mem) +{ + GQuark quark; + GstNvEncoderPrivate *priv = self->priv; + GstCudaGraphicsResource *resource; + + if (!gst_is_gl_memory_pbo (mem)) { + GST_WARNING_OBJECT (self, "memory is not GL PBO memory, %s", + mem->allocator->mem_type); + return nullptr; + } + + quark = gst_cuda_quark_from_id (GST_CUDA_QUARK_GRAPHICS_RESOURCE); + resource = (GstCudaGraphicsResource *) + gst_mini_object_get_qdata (GST_MINI_OBJECT (mem), quark); + + if (!resource) { + GstMapInfo map_info; + GstGLMemoryPBO *pbo = (GstGLMemoryPBO *) mem; + GstGLBuffer *gl_buf = pbo->pbo; + gboolean ret; + + if (!gst_memory_map (mem, &map_info, + (GstMapFlags) (GST_MAP_READ | GST_MAP_GL))) { + GST_ERROR_OBJECT (self, "Couldn't map gl memory"); + return nullptr; + } + + resource = gst_cuda_graphics_resource_new (priv->context, + GST_OBJECT (GST_GL_BASE_MEMORY_CAST (mem)->context), + GST_CUDA_GRAPHICS_RESOURCE_GL_BUFFER); + + GST_LOG_OBJECT (self, "registering gl buffer %d to CUDA", gl_buf->id); + ret = gst_cuda_graphics_resource_register_gl_buffer (resource, gl_buf->id, + CU_GRAPHICS_REGISTER_FLAGS_NONE); + gst_memory_unmap (mem, &map_info); + + if (!ret) { + GST_ERROR_OBJECT (self, "Couldn't register gl buffer %d", gl_buf->id); + gst_cuda_graphics_resource_free (resource); + return nullptr; + } + + gst_mini_object_set_qdata (GST_MINI_OBJECT (mem), quark, resource, + (GDestroyNotify) gst_cuda_graphics_resource_free); + } + + return resource; +} + +struct GstNvEncGLInteropData +{ + GstNvEncoder *self = nullptr; + GstBuffer *in_buf = nullptr; + GstBuffer *out_buf = nullptr; +}; + +static void +gst_nv_encoder_upload_gl (GstGLContext * context, GstNvEncGLInteropData * data) +{ + GstNvEncoder *self = data->self; + GstNvEncoderPrivate *priv = self->priv; + CUDA_MEMCPY2D copy_param; + GstCudaGraphicsResource *gst_res[GST_VIDEO_MAX_PLANES] = { nullptr, }; + CUgraphicsResource cuda_res[GST_VIDEO_MAX_PLANES] = { nullptr, }; + CUdeviceptr src_devptr[GST_VIDEO_MAX_PLANES] = { 0, }; + const GstVideoInfo *info = &priv->input_state->info; + CUstream stream = gst_cuda_stream_get_handle (priv->stream); + GstCudaMemory *cmem; + GstMapInfo map_info; + CUresult cuda_ret; + gboolean ret = FALSE; + + gst_cuda_context_push (priv->context); + + for (guint i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) { + GstMemory *mem = gst_buffer_peek_memory (data->in_buf, i); + GstGLMemoryPBO *pbo = (GstGLMemoryPBO *) mem; + gsize src_size; + + if (!gst_is_gl_memory_pbo (mem)) { + GST_ERROR_OBJECT (self, "Not a GL PBO memory"); + goto out; + } + + gst_res[i] = gst_nv_encoder_ensure_gl_cuda_resource (self, mem); + if (!gst_res[i]) { + GST_ERROR_OBJECT (self, "Couldn't get resource %d", i); + goto out; + } + + gst_gl_memory_pbo_upload_transfer (pbo); + gst_gl_memory_pbo_download_transfer (pbo); + + cuda_res[i] = gst_cuda_graphics_resource_map (gst_res[i], stream, + CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY); + if (!cuda_res[i]) { + GST_ERROR_OBJECT (self, "Couldn't map resource"); + goto out; + } + + cuda_ret = CuGraphicsResourceGetMappedPointer (&src_devptr[i], + &src_size, cuda_res[i]); + if (!gst_cuda_result (cuda_ret)) { + GST_ERROR_OBJECT (self, "Couldn't get mapped device pointer"); + goto out; + } + } + + if (gst_buffer_pool_acquire_buffer (priv->internal_pool, + &data->out_buf, nullptr) != GST_FLOW_OK) { + GST_ERROR_OBJECT (self, "Couldn't acquire fallback buffer"); + goto out; + } + + cmem = (GstCudaMemory *) gst_buffer_peek_memory (data->out_buf, 0); + if (!gst_memory_map (GST_MEMORY_CAST (cmem), &map_info, + (GstMapFlags) (GST_MAP_WRITE | GST_MAP_CUDA))) { + GST_ERROR_OBJECT (self, "Couldn't map fallback memory"); + goto out; + } + + memset (©_param, 0, sizeof (CUDA_MEMCPY2D)); + + for (guint i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) { + copy_param.srcMemoryType = CU_MEMORYTYPE_DEVICE; + copy_param.srcDevice = src_devptr[i]; + copy_param.srcPitch = GST_VIDEO_INFO_PLANE_STRIDE (info, i); + + copy_param.dstMemoryType = CU_MEMORYTYPE_DEVICE; + copy_param.dstDevice = ((CUdeviceptr) map_info.data) + cmem->info.offset[i]; + copy_param.dstPitch = cmem->info.stride[0]; + + copy_param.WidthInBytes = GST_VIDEO_INFO_COMP_WIDTH (info, i) * + GST_VIDEO_INFO_COMP_PSTRIDE (info, i); + copy_param.Height = GST_VIDEO_INFO_COMP_HEIGHT (info, i); + + if (!gst_cuda_result (CuMemcpy2DAsync (©_param, stream))) { + gst_memory_unmap (GST_MEMORY_CAST (cmem), &map_info); + GST_ERROR_OBJECT (self, "Couldn't copy plane %d", i); + goto out; + } + } + + gst_memory_unmap (GST_MEMORY_CAST (cmem), &map_info); + + ret = TRUE; + +out: + for (guint i = 0; i < gst_buffer_n_memory (data->in_buf); i++) { + if (!gst_res[i]) + break; + + gst_cuda_graphics_resource_unmap (gst_res[i], stream); + } + + CuStreamSynchronize (stream); + gst_cuda_context_pop (nullptr); + if (!ret) + gst_clear_buffer (&data->out_buf); +} +#endif /* HAVE_CUDA_GST_GL */ + static GstFlowReturn gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self, - const GstVideoInfo * info, GstBuffer * buffer, GstNvEncTask * task) + GstBuffer * buffer, GstNvEncTask * task) { GstNvEncoderPrivate *priv = self->priv; std::shared_ptr < GstNvEncObject > object = priv->object; @@ -1145,8 +1435,43 @@ gst_nv_encoder_prepare_task_input_cuda (GstNvEncoder * self, NVENCSTATUS status; GstCudaStream *stream; GstNvEncResource *resource = nullptr; + const GstVideoInfo *info = &priv->input_state->info; mem = gst_buffer_peek_memory (buffer, 0); + +#ifdef HAVE_CUDA_GST_GL + if (priv->gl_interop) { + if (gst_is_gl_memory (mem) && gst_buffer_n_memory (buffer) == + GST_VIDEO_INFO_N_PLANES (info)) { + GstNvEncGLInteropData gl_data; + GstGLMemory *gl_mem = (GstGLMemory *) mem; + gl_data.self = self; + gl_data.in_buf = buffer; + gl_data.out_buf = nullptr; + gst_gl_context_thread_add (gl_mem->mem.context, + (GstGLContextThreadFunc) gst_nv_encoder_upload_gl, &gl_data); + if (gl_data.out_buf) { + mem = gst_buffer_peek_memory (gl_data.out_buf, 0); + status = object->AcquireResource (mem, &resource); + + if (status != NV_ENC_SUCCESS) { + gst_buffer_unref (gl_data.out_buf); + GST_ERROR_OBJECT (self, "Failed to get resource, status %" + GST_NVENC_STATUS_FORMAT, GST_NVENC_STATUS_ARGS (status)); + return GST_FLOW_ERROR; + } + + gst_nv_enc_task_set_resource (task, gl_data.out_buf, resource); + + return GST_FLOW_OK; + } else { + GST_WARNING_OBJECT (self, "GL interop failed"); + priv->gl_interop = FALSE; + } + } + } +#endif + if (!gst_is_cuda_memory (mem)) { GST_LOG_OBJECT (self, "Not a CUDA buffer, system copy"); return gst_nv_encoder_copy_system (self, info, buffer, task); @@ -1338,8 +1663,7 @@ gst_nv_encoder_upload_d3d11_frame (GstNvEncoder * self, static GstFlowReturn gst_nv_encoder_prepare_task_input_d3d11 (GstNvEncoder * self, - const GstVideoInfo * info, GstBuffer * buffer, - GstBufferPool * pool, GstNvEncTask * task) + GstBuffer * buffer, GstNvEncTask * task) { GstNvEncoderPrivate *priv = self->priv; std::shared_ptr < GstNvEncObject > object = priv->object; @@ -1348,6 +1672,8 @@ gst_nv_encoder_prepare_task_input_d3d11 (GstNvEncoder * self, NVENCSTATUS status; GstBuffer *upload_buffer = nullptr; GstNvEncResource *resource = nullptr; + const GstVideoInfo *info = &priv->input_state->info; + GstBufferPool *pool = priv->internal_pool; if (gst_buffer_n_memory (buffer) > 1) { GST_LOG_OBJECT (self, "Not a native DXGI format, system copy"); @@ -1401,8 +1727,7 @@ gst_nv_encoder_prepare_task_input_d3d11 (GstNvEncoder * self, static GstFlowReturn gst_nv_encoder_prepare_task_input (GstNvEncoder * self, - const GstVideoInfo * info, GstBuffer * buffer, - GstBufferPool * pool, GstNvEncTask * task) + GstBuffer * buffer, GstNvEncTask * task) { GstNvEncoderPrivate *priv = self->priv; GstFlowReturn ret = GST_FLOW_ERROR; @@ -1410,12 +1735,11 @@ gst_nv_encoder_prepare_task_input (GstNvEncoder * self, switch (priv->selected_device_mode) { #ifdef G_OS_WIN32 case GST_NV_ENCODER_DEVICE_D3D11: - ret = gst_nv_encoder_prepare_task_input_d3d11 (self, info, buffer, - pool, task); + ret = gst_nv_encoder_prepare_task_input_d3d11 (self, buffer, task); break; #endif case GST_NV_ENCODER_DEVICE_CUDA: - ret = gst_nv_encoder_prepare_task_input_cuda (self, info, buffer, task); + ret = gst_nv_encoder_prepare_task_input_cuda (self, buffer, task); break; default: g_assert_not_reached (); @@ -1496,8 +1820,7 @@ gst_nv_encoder_handle_frame (GstVideoEncoder * encoder, } gst_nv_encoder_device_lock (self); - ret = gst_nv_encoder_prepare_task_input (self, &priv->input_state->info, - in_buf, priv->internal_pool, task); + ret = gst_nv_encoder_prepare_task_input (self, in_buf, task); gst_nv_encoder_device_unlock (self); if (ret != GST_FLOW_OK) { diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencoder.h b/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencoder.h index 43f89ad0be..bf96044fcb 100644 --- a/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencoder.h +++ b/subprojects/gst-plugins-bad/sys/nvcodec/gstnvencoder.h @@ -26,6 +26,10 @@ #include #endif +#ifdef HAVE_CUDA_GST_GL +#include +#endif + #include #include diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/gstnvh264encoder.cpp b/subprojects/gst-plugins-bad/sys/nvcodec/gstnvh264encoder.cpp index 7858fb8085..f8475e2fc4 100644 --- a/subprojects/gst-plugins-bad/sys/nvcodec/gstnvh264encoder.cpp +++ b/subprojects/gst-plugins-bad/sys/nvcodec/gstnvh264encoder.cpp @@ -1901,6 +1901,12 @@ gst_nv_h264_encoder_create_class_data (GstObject * device, gpointer session, if (device_mode == GST_NV_ENCODER_DEVICE_CUDA) { gst_caps_set_features (sink_caps, 0, gst_caps_features_new (GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY, nullptr)); +#ifdef HAVE_CUDA_GST_GL + GstCaps *gl_caps = gst_caps_copy (system_caps); + gst_caps_set_features (gl_caps, 0, + gst_caps_features_new (GST_CAPS_FEATURE_MEMORY_GL_MEMORY, nullptr)); + gst_caps_append (sink_caps, gl_caps); +#endif } gst_caps_append (sink_caps, system_caps); @@ -2224,6 +2230,13 @@ gst_nv_h264_encoder_register_auto_select (GstPlugin * plugin, } #endif +#ifdef HAVE_CUDA_GST_GL + GstCaps *gl_caps = gst_caps_copy (system_caps); + gst_caps_set_features (gl_caps, 0, + gst_caps_features_new (GST_CAPS_FEATURE_MEMORY_GL_MEMORY, nullptr)); + gst_caps_append (sink_caps, gl_caps); +#endif + gst_caps_append (sink_caps, system_caps); cdata = gst_nv_encoder_class_data_new (); diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/gstnvh265encoder.cpp b/subprojects/gst-plugins-bad/sys/nvcodec/gstnvh265encoder.cpp index f680ec3862..ebbafd9e8a 100644 --- a/subprojects/gst-plugins-bad/sys/nvcodec/gstnvh265encoder.cpp +++ b/subprojects/gst-plugins-bad/sys/nvcodec/gstnvh265encoder.cpp @@ -1912,6 +1912,12 @@ gst_nv_h265_encoder_create_class_data (GstObject * device, gpointer session, if (device_mode == GST_NV_ENCODER_DEVICE_CUDA) { gst_caps_set_features (sink_caps, 0, gst_caps_features_new (GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY, nullptr)); +#ifdef HAVE_CUDA_GST_GL + GstCaps *gl_caps = gst_caps_copy (system_caps); + gst_caps_set_features (gl_caps, 0, + gst_caps_features_new (GST_CAPS_FEATURE_MEMORY_GL_MEMORY, nullptr)); + gst_caps_append (sink_caps, gl_caps); +#endif } gst_caps_append (sink_caps, system_caps); @@ -2230,6 +2236,13 @@ gst_nv_h265_encoder_register_auto_select (GstPlugin * plugin, } #endif +#ifdef HAVE_CUDA_GST_GL + GstCaps *gl_caps = gst_caps_copy (system_caps); + gst_caps_set_features (gl_caps, 0, + gst_caps_features_new (GST_CAPS_FEATURE_MEMORY_GL_MEMORY, nullptr)); + gst_caps_append (sink_caps, gl_caps); +#endif + gst_caps_append (sink_caps, system_caps); cdata = gst_nv_encoder_class_data_new ();