From abe1f5044d1782571055eb342d4932cb575771bf Mon Sep 17 00:00:00 2001
From: Seungha Yang
Date: Sat, 11 Nov 2023 00:37:02 +0900
Subject: [PATCH] cuda: Prefer CUBIN over PTX

The system-installed NVRTC library might be a newer version than the
driver, in which case the generated PTX can be incompatible with the
driver. Instead of the intermediate PTX code, use the actual assembly
code (CUBIN) directly.

Fixes: https://gitlab.freedesktop.org/gstreamer/gstreamer/-/issues/3108
Part-of:
---
 .../gst-libs/gst/cuda/gstcudanvrtc.cpp        | 113 ++++++++++++++++++
 .../gst-libs/gst/cuda/gstcudanvrtc.h          |   4 +
 .../sys/nvcodec/gstcudaconverter.c            |  19 ++-
 3 files changed, 131 insertions(+), 5 deletions(-)

diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudanvrtc.cpp b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudanvrtc.cpp
index 8287a2e0c9..c3851414d3 100644
--- a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudanvrtc.cpp
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudanvrtc.cpp
@@ -27,6 +27,7 @@
 #include
 #include
 #include "gstcuda-private.h"
+#include <string>
 
 GST_DEBUG_CATEGORY_STATIC (gst_cuda_nvrtc_debug);
 #define GST_CAT_DEFAULT gst_cuda_nvrtc_debug
@@ -60,6 +61,8 @@ typedef struct _GstCudaNvrtcVTable
   nvrtcResult (*NvrtcGetProgramLog) (nvrtcProgram prog, char *log);
   nvrtcResult (*NvrtcGetProgramLogSize) (nvrtcProgram prog,
       size_t * logSizeRet);
+  nvrtcResult (*NvrtcGetCUBINSize) (nvrtcProgram prog, size_t *cubinSizeRet);
+  nvrtcResult (*NvrtcGetCUBIN) (nvrtcProgram prog, char *cubin);
 } GstCudaNvrtcVTable;
 /* *INDENT-ON* */
 
@@ -159,6 +162,8 @@ gst_cuda_nvrtc_load_library_once (void)
   LOAD_SYMBOL (nvrtcGetPTXSize, NvrtcGetPTXSize);
   LOAD_SYMBOL (nvrtcGetProgramLog, NvrtcGetProgramLog);
   LOAD_SYMBOL (nvrtcGetProgramLogSize, NvrtcGetProgramLogSize);
+  LOAD_SYMBOL (nvrtcGetCUBINSize, NvrtcGetCUBINSize);
+  LOAD_SYMBOL (nvrtcGetCUBIN, NvrtcGetCUBIN);
 
   vtable->loaded = TRUE;
 
@@ -251,6 +256,22 @@ NvrtcGetProgramLogSize (nvrtcProgram prog, size_t *logSizeRet)
 
   return
gst_cuda_nvrtc_vtable.NvrtcGetProgramLogSize (prog, logSizeRet);
 }
+
+static nvrtcResult
+NvrtcGetCUBINSize (nvrtcProgram prog, size_t *cubinSizeRet)
+{
+  g_assert (gst_cuda_nvrtc_vtable.NvrtcGetCUBINSize != nullptr);
+
+  return gst_cuda_nvrtc_vtable.NvrtcGetCUBINSize (prog, cubinSizeRet);
+}
+
+static nvrtcResult
+NvrtcGetCUBIN (nvrtcProgram prog, char *cubin)
+{
+  g_assert (gst_cuda_nvrtc_vtable.NvrtcGetCUBIN != nullptr);
+
+  return gst_cuda_nvrtc_vtable.NvrtcGetCUBIN (prog, cubin);
+}
 /* *INDENT-ON* */
 
 /**
@@ -340,3 +361,95 @@ error:
 
   return nullptr;
 }
+
+/**
+ * gst_cuda_nvrtc_compile_cubin:
+ * @source: Source code to compile
+ * @device: CUDA device whose compute capability selects the sm_XY target
+ *
+ * Returns: (transfer full): Compiled CUDA assembly code (CUBIN) for @device
+ * if successful, otherwise %NULL. Free with g_free() after use
+ *
+ * Since: 1.24
+ */
+gchar *
+gst_cuda_nvrtc_compile_cubin (const gchar * source, gint device)
+{
+  nvrtcProgram prog;
+  nvrtcResult ret;
+  CUresult curet;
+  gsize cubin_size;
+  gchar *cubin = nullptr;
+  gint major, minor;
+
+  g_return_val_if_fail (source != nullptr, nullptr);
+
+  if (!gst_cuda_nvrtc_load_library ())
+    return nullptr;
+
+  GST_TRACE ("CUDA kernel source \n%s", source);
+
+  curet = CuDeviceGetAttribute (&major,
+      CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device);
+  if (curet != CUDA_SUCCESS) {
+    GST_ERROR ("Unknown major compute caps");
+    return nullptr;
+  }
+
+  curet = CuDeviceGetAttribute (&minor,
+      CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, device);
+  if (curet != CUDA_SUCCESS) {
+    GST_ERROR ("Unknown minor compute caps");
+    return nullptr;
+  }
+
+  std::string opt_str = "--gpu-architecture=sm_" +
+      std::to_string (major) + std::to_string (minor);
+
+  ret = NvrtcCreateProgram (&prog, source, nullptr, 0, nullptr, nullptr);
+  if (ret != NVRTC_SUCCESS) {
+    GST_ERROR ("couldn't create nvrtc program, ret %d", ret);
+    return nullptr;
+  }
+
+  const char *opts[1] = { opt_str.c_str () };
+
+  ret = NvrtcCompileProgram (prog, 1, opts);
+  if (ret != NVRTC_SUCCESS) {
+    gsize log_size;
+
+    GST_ERROR ("couldn't compile nvrtc program, ret %d", ret);
+    if (NvrtcGetProgramLogSize (prog, &log_size) == NVRTC_SUCCESS &&
+        log_size > 0) {
+      gchar *compile_log = (gchar *) g_malloc (log_size);
+      if (NvrtcGetProgramLog (prog, compile_log) == NVRTC_SUCCESS)
+        GST_ERROR ("nvrtc compile log %s", compile_log);
+      g_free (compile_log);
+    }
+
+    goto error;
+  }
+
+  ret = NvrtcGetCUBINSize (prog, &cubin_size);
+  if (ret != NVRTC_SUCCESS) {
+    GST_ERROR ("unknown cubin size, ret %d", ret);
+    goto error;
+  }
+
+  cubin = (gchar *) g_malloc0 (cubin_size);
+  ret = NvrtcGetCUBIN (prog, cubin);
+  if (ret != NVRTC_SUCCESS) {
+    GST_ERROR ("couldn't get cubin, ret %d", ret);
+    g_free (cubin);
+    goto error;
+  }
+
+  NvrtcDestroyProgram (&prog);
+
+  return cubin;
+
+error:
+  NvrtcDestroyProgram (&prog);
+
+  return nullptr;
+}
diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudanvrtc.h b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudanvrtc.h
index 5fc4bf7bae..5af9dfea11 100644
--- a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudanvrtc.h
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudanvrtc.h
@@ -30,5 +30,9 @@ gboolean gst_cuda_nvrtc_load_library (void);
 GST_CUDA_API
 gchar * gst_cuda_nvrtc_compile (const gchar * source);
 
+GST_CUDA_API
+gchar * gst_cuda_nvrtc_compile_cubin (const gchar * source,
+                                       gint device);
+
 G_END_DECLS
 
diff --git a/subprojects/gst-plugins-bad/sys/nvcodec/gstcudaconverter.c b/subprojects/gst-plugins-bad/sys/nvcodec/gstcudaconverter.c
index a6661fd692..ca336c0af6 100644
--- a/subprojects/gst-plugins-bad/sys/nvcodec/gstcudaconverter.c
+++ b/subprojects/gst-plugins-bad/sys/nvcodec/gstcudaconverter.c
@@ -1708,7 +1708,7 @@ gst_cuda_converter_setup (GstCudaConverter * self)
   const GstVideoColorimetry *in_color;
   const GstVideoColorimetry *out_color;
   gchar *str;
-  gchar *ptx;
+  gchar *program = NULL;
   CUresult ret;
 
   in_info = &priv->in_info;
@@ -2071,10 +2071,16 @@ gst_cuda_converter_setup (GstCudaConverter * self)
       write_func);
 
   GST_LOG_OBJECT (self, "kernel code:\n%s\n",
str); - ptx = gst_cuda_nvrtc_compile (str); + gint cuda_device; + g_object_get (self->context, "cuda-device-id", &cuda_device, NULL); + program = gst_cuda_nvrtc_compile_cubin (str, cuda_device); + if (!program) { + GST_WARNING_OBJECT (self, "Couldn't compile to cubin, trying ptx"); + program = gst_cuda_nvrtc_compile (str); + } g_free (str); - if (!ptx) { + if (!program) { GST_ERROR_OBJECT (self, "Could not compile code"); return FALSE; } @@ -2093,6 +2099,7 @@ gst_cuda_converter_setup (GstCudaConverter * self) if (!gst_cuda_context_push (self->context)) { GST_ERROR_OBJECT (self, "Couldn't push context"); + g_free (program); return FALSE; } @@ -2138,8 +2145,8 @@ gst_cuda_converter_setup (GstCudaConverter * self) priv->unpack_buffer.texture = texture; } - ret = CuModuleLoadData (&priv->module, ptx); - g_free (ptx); + ret = CuModuleLoadData (&priv->module, program); + g_clear_pointer (&program, g_free); if (!gst_cuda_result (ret)) { GST_ERROR_OBJECT (self, "Could not load module"); priv->module = NULL; @@ -2168,6 +2175,8 @@ gst_cuda_converter_setup (GstCudaConverter * self) error: gst_cuda_context_pop (NULL); + g_free (program); + return FALSE; }