cuda: Introduce GST_CUDA_CRITICAL_ERRORS env to abort on critical error

Add the GST_CUDA_CRITICAL_ERRORS environment variable so that the program can be terminated on an unrecoverable error. Example: GST_CUDA_CRITICAL_ERRORS=2,700 gst-launch-1.0 ... In this example, CUDA_ERROR_OUT_OF_MEMORY (2) and CUDA_ERROR_ILLEGAL_ADDRESS (700) are registered as critical errors, and the program will be aborted when either of them occurs. Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/4729>
2024-11-25 11:11:08 +00:00 · 2023-05-29 21:53:52 +09:00 · 2023-05-29 21:53:52 +09:00 · de749fa356
commit de749fa356
parent 58b166453d
3 changed files with 62 additions and 1 deletions
--- a/girs/GstCuda-1.0.gir
+++ b/girs/GstCuda-1.0.gir
@ -1226,7 +1226,7 @@ Retrieves the #GstCudaContext in @context and places the result in @cuda_ctx.</d
        </parameter>
      </parameters>
    </function>
-    <function-macro name="cuda_result" c:identifier="gst_cuda_result" introspectable="0">
+    <function-macro name="cuda_result" c:identifier="gst_cuda_result" version="1.22" introspectable="0">
      <source-position filename="../subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudautils.h"/>
      <parameters>
        <parameter name="result">
--- a/subprojects/gst-plugins-bad/docs/libs/cuda/index.md
+++ b/subprojects/gst-plugins-bad/docs/libs/cuda/index.md
@ -5,3 +5,21 @@ gstreamer-cuda-{{ gst_api_version.md }}.pc

 > NOTE: This library API is considered *unstable*

+## Environment variables
+
+The GStreamer CUDA library inspects the following environment variables:
+
+**`GST_CUDA_CRITICAL_ERRORS`. (Since: 1.24)**
+
+This environment variable can be set to a comma-separated list of CUresult
+values (see the CUDA driver API documentation). The GStreamer CUDA library
+will abort the process when one of the registered errors is detected. This is
+useful for unrecoverable CUDA errors, where in-process error recovery
+(e.g., launching a new pipeline) is not expected to work and the process
+has to be relaunched instead.
+
+Example: `GST_CUDA_CRITICAL_ERRORS=2,700`
+
+With the above example, the process is aborted if the GStreamer CUDA library
+detects a `CUDA_ERROR_OUT_OF_MEMORY` (2) or `CUDA_ERROR_ILLEGAL_ADDRESS` (700)
+error.
--- a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudautils.cpp
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudautils.cpp
@ -25,6 +25,8 @@
 #include "gstcudacontext.h"
 #include "gstcuda-private.h"
 #include <atomic>
+#include <set>
+#include <string>

 #ifdef HAVE_CUDA_GST_GL
 #include <gst/gl/gl.h>
@ -1672,6 +1674,43 @@ gst_cuda_create_user_token (void)
  return user_token.fetch_add (1);
 }

+static gboolean                /* TRUE when @result is one of the codes listed in GST_CUDA_CRITICAL_ERRORS */
+_abort_on_error (CUresult result)
+{
+  static std::set < CUresult > abort_list;      /* registered critical codes; kept across calls */
+  GST_CUDA_CALL_ONCE_BEGIN {    /* NOTE(review): presumably a run-once block (macro defined outside this view) */
+    const gchar *env = g_getenv ("GST_CUDA_CRITICAL_ERRORS");
+    if (!env)
+      return;                   /* variable unset: abort_list stays empty */
+
+    gchar **split = g_strsplit (env, ",", 0);   /* max_tokens 0: split at every comma */
+    gchar **iter;
+    for (iter = split; *iter; iter++) {
+      int error_code = 0;
+      try {
+        error_code = std::stoi (*iter);
+      } catch ( ...) {          /* std::stoi throws on non-numeric or out-of-range text */
+        GST_WARNING ("Invalid argument \"%s\"", *iter);
+        continue;
+      };
+
+      if (error_code > 0)       /* zero and negative values are never registered */
+        abort_list.insert ((CUresult) error_code);
+    }
+
+    g_strfreev (split);
+  }
+  GST_CUDA_CALL_ONCE_END;
+
+  if (abort_list.empty ())
+    return FALSE;
+
+  if (abort_list.find (result) != abort_list.end ())
+    return TRUE;
+
+  return FALSE;
+}
+
 /**
 * _gst_cuda_debug:
 * @result: CUDA result code
@ -1696,6 +1735,10 @@ _gst_cuda_debug (CUresult result, GstDebugCategory * cat,
    gst_debug_log (cat, GST_LEVEL_WARNING, file, function, line,
        NULL, "CUDA call failed: %s, %s", _error_name, _error_text);
 #endif
+    if (_abort_on_error (result)) {
+      GST_ERROR ("Critical error %d, abort", (gint) result);
+      g_abort ();
+    }

    return FALSE;
  }