From dbf4915abd962cd8bb753492da08629d4436c2bd Mon Sep 17 00:00:00 2001
From: Matthew Waters <matthew@centricular.com>
Date: Mon, 16 Dec 2024 17:32:20 +1100
Subject: [PATCH] cuda/context: add gpu stack size property

Adds a "default-gpu-stack-size" property that allows reducing the
initial stack size of GPU threads.  CUDA should automatically increase
this value if a kernel requires a larger stack.

This can save roughly 40 MB of GPU memory for a single nvh264enc
instance.

Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/8158>
---
 girs/GstCuda-1.0.gir                          |  4 ++
 .../gst-libs/gst/cuda/gstcudacontext.cpp      | 50 +++++++++++++++++++
 2 files changed, 54 insertions(+)
diff --git a/girs/GstCuda-1.0.gir b/girs/GstCuda-1.0.gir
index eda536c5a3..52489448de 100644
--- a/girs/GstCuda-1.0.gir
+++ b/girs/GstCuda-1.0.gir
@@ -509,6 +509,10 @@ so all CUDA functions that operate on the current context are affected.</doc>
       <property name="cuda-device-id" writable="1" construct-only="1" transfer-ownership="none" default-value="0">
         <type name="guint" c:type="guint"/>
       </property>
+      <property name="default-gpu-stack-size" version="1.26" writable="1" transfer-ownership="none" default-value="1024">
+        <doc xml:space="preserve" filename="../subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudacontext.cpp">The default stack size for each GPU thread.</doc>
+        <type name="guint" c:type="guint"/>
+      </property>
       <property name="external-resource-interop" version="1.26" transfer-ownership="none" default-value="FALSE">
         <doc xml:space="preserve" filename="../subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudacontext.cpp">External resource interop API support</doc>
         <type name="gboolean" c:type="gboolean"/>
diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudacontext.cpp b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudacontext.cpp
index b4bca0b524..776ab1c7b2 100644
--- a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudacontext.cpp
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudacontext.cpp
@@ -57,6 +57,7 @@ enum
   PROP_STREAM_ORDERED_ALLOC,
   PROP_PREFER_STREAM_ORDERED_ALLLOC,
   PROP_EXT_INTEROP,
+  PROP_DEFAULT_GPU_STACK_SIZE,
 };
 
 struct _GstCudaContextPrivate
@@ -70,6 +71,7 @@ struct _GstCudaContextPrivate
   gboolean stream_ordered_alloc_supported;
   gboolean prefer_stream_ordered_alloc;
   gboolean ext_interop_supported;
+  guint default_gpu_stack_size;
 
   gint tex_align;
 
@@ -182,6 +184,19 @@ gst_cuda_context_class_init (GstCudaContextClass * klass)
           "External resource interop API support", FALSE,
           (GParamFlags) (G_PARAM_READABLE | G_PARAM_STATIC_STRINGS)));
 
+  /**
+   * GstCudaContext:default-gpu-stack-size:
+   *
+   * The default stack size for each GPU thread.
+   *
+   * Since: 1.26
+   */
+  g_object_class_install_property (gobject_class, PROP_DEFAULT_GPU_STACK_SIZE,
+      g_param_spec_uint ("default-gpu-stack-size",
+          "Default GPU stack size",
+          "The initial stack size for GPU threads", 0, G_MAXUINT, 1024,
+          (GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)));
+
   gst_cuda_memory_init_once ();
 }
 
@@ -214,6 +229,27 @@ gst_cuda_context_set_property (GObject * object, guint prop_id,
       priv->prefer_stream_ordered_alloc = g_value_get_boolean (value);
       g_mutex_unlock (&priv->lock);
       break;
+    case PROP_DEFAULT_GPU_STACK_SIZE:{
+      guint new_stack_limit = g_value_get_uint (value);
+
+      g_mutex_lock (&priv->lock);
+      if (new_stack_limit != priv->default_gpu_stack_size) {
+        size_t set_value = 0;
+        gst_cuda_context_push (context);
+        if (CuCtxSetLimit (CU_LIMIT_STACK_SIZE,
+                (size_t) new_stack_limit) == CUDA_SUCCESS) {
+          if (CuCtxGetLimit (&set_value, CU_LIMIT_STACK_SIZE) == CUDA_SUCCESS) {
+            priv->default_gpu_stack_size = (guint) set_value;
+            GST_INFO_OBJECT (context,
+                "set default stack size to %" G_GUINT64_FORMAT,
+                (guint64) set_value);
+          }
+        }
+        gst_cuda_context_pop (nullptr);
+      }
+      g_mutex_unlock (&priv->lock);
+      break;
+    }
     default:
       G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
       break;
@@ -251,6 +287,9 @@ gst_cuda_context_get_property (GObject * object, guint prop_id,
     case PROP_EXT_INTEROP:
       g_value_set_boolean (value, priv->ext_interop_supported);
       break;
+    case PROP_DEFAULT_GPU_STACK_SIZE:
+      g_value_set_uint (value, priv->default_gpu_stack_size);
+      break;
     default:
       G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
       break;
@@ -635,6 +674,7 @@ gst_cuda_context_new_wrapped (CUcontext handler, CUdevice device)
   GList *iter;
   gint tex_align = 0;
   GstCudaContext *self;
+  size_t default_gpu_stack_size;
 
   g_return_val_if_fail (handler, nullptr);
   g_return_val_if_fail (device >= 0, nullptr);
@@ -654,6 +694,16 @@ gst_cuda_context_new_wrapped (CUcontext handler, CUdevice device)
   self->priv->context = handler;
   self->priv->device = device;
   self->priv->tex_align = tex_align;
+
+  gst_cuda_context_push (self);
+  if (CuCtxGetLimit (&default_gpu_stack_size,
+          CU_LIMIT_STACK_SIZE) == CUDA_SUCCESS) {
+    self->priv->default_gpu_stack_size = (guint) default_gpu_stack_size;
+    GST_DEBUG ("cuda default stack size %" G_GUINT64_FORMAT,
+        (guint64) default_gpu_stack_size);
+  }
+  gst_cuda_context_pop (nullptr);
+
   gst_object_ref_sink (self);
 
 #ifdef G_OS_WIN32