cuda/context: add gpu stack size property

Allows reducing the initial stack size of GPU threads. CUDA should
automatically increase this value if a kernel requires a larger stack.

This can save roughly 40 MB of GPU memory for a single nvh264enc instance.

Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/8158>
Author: Matthew Waters
Date: 2024-12-16 17:32:20 +11:00
Committed by: GStreamer Marge Bot
Parent: d6563016ca
Commit: dbf4915abd
2 changed files with 54 additions and 0 deletions
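
For context (not part of this commit), a hedged usage sketch of the new
property from application code. The property name and gst_cuda_context_new()
come from the patched library; the device id 0, the 512-byte value, and the
umbrella header are assumptions for illustration only.

#include <gst/gst.h>
#include <gst/cuda/gstcuda.h>

/* Illustrative sketch: create a context for device 0 and request a smaller
 * default per-thread GPU stack.  CUDA may round the requested value, so
 * reading the property back returns the size the driver actually applied.
 * Assumes gst_init() has already been called. */
static void
shrink_gpu_stack (void)
{
  GstCudaContext *ctx = gst_cuda_context_new (0);
  guint applied = 0;

  if (!ctx)
    return;

  g_object_set (ctx, "default-gpu-stack-size", 512, NULL);
  g_object_get (ctx, "default-gpu-stack-size", &applied, NULL);
  g_print ("default GPU stack size is now %u bytes\n", applied);

  gst_object_unref (ctx);
}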

girs/GstCuda-1.0.gir

@@ -509,6 +509,10 @@ so all CUDA functions that operate on the current context are affected.</doc>
<property name="cuda-device-id" writable="1" construct-only="1" transfer-ownership="none" default-value="0">
<type name="guint" c:type="guint"/>
</property>
<property name="default-gpu-stack-size" version="1.26" writable="1" transfer-ownership="none" default-value="1024">
<doc xml:space="preserve" filename="../subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudacontext.cpp">The default stack size for each GPU thread.</doc>
<type name="guint" c:type="guint"/>
</property>
<property name="external-resource-interop" version="1.26" transfer-ownership="none" default-value="FALSE">
<doc xml:space="preserve" filename="../subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudacontext.cpp">External resource interop API support</doc>
<type name="gboolean" c:type="gboolean"/>

subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudacontext.cpp

@@ -57,6 +57,7 @@ enum
PROP_STREAM_ORDERED_ALLOC,
PROP_PREFER_STREAM_ORDERED_ALLLOC,
PROP_EXT_INTEROP,
PROP_DEFAULT_GPU_STACK_SIZE,
};
struct _GstCudaContextPrivate
@@ -70,6 +71,7 @@ struct _GstCudaContextPrivate
gboolean stream_ordered_alloc_supported;
gboolean prefer_stream_ordered_alloc;
gboolean ext_interop_supported;
guint default_gpu_stack_size;
gint tex_align;
@@ -182,6 +184,19 @@ gst_cuda_context_class_init (GstCudaContextClass * klass)
"External resource interop API support", FALSE,
(GParamFlags) (G_PARAM_READABLE | G_PARAM_STATIC_STRINGS)));
/**
* GstCudaContext:default-gpu-stack-size:
*
* The default stack size for each GPU thread.
*
* Since: 1.26
*/
g_object_class_install_property (gobject_class, PROP_DEFAULT_GPU_STACK_SIZE,
g_param_spec_uint ("default-gpu-stack-size",
"Default GPU stack size",
"The initial stack size for GPU threads", 0, G_MAXUINT, 1024,
(GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)));
gst_cuda_memory_init_once ();
}
@@ -214,6 +229,27 @@ gst_cuda_context_set_property (GObject * object, guint prop_id,
priv->prefer_stream_ordered_alloc = g_value_get_boolean (value);
g_mutex_unlock (&priv->lock);
break;
case PROP_DEFAULT_GPU_STACK_SIZE:{
guint new_stack_limit = g_value_get_uint (value);
g_mutex_lock (&priv->lock);
if (new_stack_limit != priv->default_gpu_stack_size) {
size_t set_value = 0;
gst_cuda_context_push (context);
if (CuCtxSetLimit (CU_LIMIT_STACK_SIZE,
(size_t) new_stack_limit) == CUDA_SUCCESS) {
if (CuCtxGetLimit (&set_value, CU_LIMIT_STACK_SIZE) == CUDA_SUCCESS) {
priv->default_gpu_stack_size = (guint) set_value;
GST_INFO_OBJECT (context,
"set default stack size to %" G_GUINT64_FORMAT,
(guint64) set_value);
}
}
gst_cuda_context_pop (nullptr);
}
g_mutex_unlock (&priv->lock);
break;
}
default:
G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
break;
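
As background (a sketch, not code from this patch), the setter above follows
the driver's set-then-read-back pattern. Using the raw CUDA driver API rather
than GStreamer's CuCtx* wrappers, and assuming a context is already current on
the calling thread, the equivalent sequence looks roughly like this:

#include <cuda.h>
#include <stdio.h>

/* Request a per-thread stack limit, then query the value the driver actually
 * applied, since the request may be rounded.  A CUDA context must already be
 * current on the calling thread. */
static void
set_gpu_stack_limit (size_t requested)
{
  size_t applied = 0;

  if (cuCtxSetLimit (CU_LIMIT_STACK_SIZE, requested) == CUDA_SUCCESS &&
      cuCtxGetLimit (&applied, CU_LIMIT_STACK_SIZE) == CUDA_SUCCESS)
    printf ("GPU stack limit is now %zu bytes\n", applied);
}
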
@@ -251,6 +287,9 @@ gst_cuda_context_get_property (GObject * object, guint prop_id,
case PROP_EXT_INTEROP:
g_value_set_boolean (value, priv->ext_interop_supported);
break;
case PROP_DEFAULT_GPU_STACK_SIZE:
g_value_set_uint (value, priv->default_gpu_stack_size);
break;
default:
G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
break;
@@ -635,6 +674,7 @@ gst_cuda_context_new_wrapped (CUcontext handler, CUdevice device)
GList *iter;
gint tex_align = 0;
GstCudaContext *self;
size_t default_gpu_stack_size;
g_return_val_if_fail (handler, nullptr);
g_return_val_if_fail (device >= 0, nullptr);
@@ -654,6 +694,16 @@ gst_cuda_context_new_wrapped (CUcontext handler, CUdevice device)
self->priv->context = handler;
self->priv->device = device;
self->priv->tex_align = tex_align;
gst_cuda_context_push (self);
if (CuCtxGetLimit (&default_gpu_stack_size,
CU_LIMIT_STACK_SIZE) == CUDA_SUCCESS) {
self->priv->default_gpu_stack_size = (guint) default_gpu_stack_size;
GST_DEBUG ("cuda default stack size %" G_GUINT64_FORMAT,
(guint64) default_gpu_stack_size);
}
gst_cuda_context_pop (nullptr);
gst_object_ref_sink (self);
#ifdef G_OS_WIN32