From 174c9bfaa58a3cfba985b6af29ed1f63fb0de65c Mon Sep 17 00:00:00 2001
From: Seungha Yang <seungha@centricular.com>
Date: Thu, 29 Aug 2024 18:23:37 +0900
Subject: [PATCH] cuda: Load stream ordered allocation related symbols

Required to support async memory allocation

Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/7427>
---
 .../gst-libs/gst/cuda/cuda-gst.h              |  32 ++++++
 .../gst-libs/gst/cuda/gstcuda-private.h       |   2 -
 .../gst-libs/gst/cuda/gstcudacontext.cpp      |  27 ++++-
 .../gst-libs/gst/cuda/gstcudaloader-private.h |  30 +++++
 .../gst-libs/gst/cuda/gstcudaloader.cpp       | 107 ++++++++++++++++++
 .../gst-libs/gst/cuda/stub/cuda.h             |  24 ++++
 6 files changed, 219 insertions(+), 3 deletions(-)
 create mode 100644 subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudaloader-private.h

diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/cuda-gst.h b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/cuda-gst.h
index c2a4ec5592..fd83158e82 100644
--- a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/cuda-gst.h
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/cuda-gst.h
@@ -307,6 +307,38 @@ GST_CUDA_API
 CUresult CUDAAPI CuMemRetainAllocationHandle (CUmemGenericAllocationHandle *handle,
                                               void *addr);
 
+GST_CUDA_API /* cuMemAllocAsync; CUDA_ERROR_NOT_SUPPORTED if unavailable */
+CUresult CUDAAPI CuMemAllocAsync (CUdeviceptr *dptr,
+                                  size_t bytesize,
+                                  CUstream hStream);
+
+GST_CUDA_API /* cuMemAllocFromPoolAsync; same fallback as above */
+CUresult CUDAAPI CuMemAllocFromPoolAsync (CUdeviceptr *dptr,
+                                          size_t bytesize,
+                                          CUmemoryPool pool,
+                                          CUstream hStream);
+
+GST_CUDA_API /* cuMemFreeAsync; same fallback as above */
+CUresult CUDAAPI CuMemFreeAsync (CUdeviceptr dptr,
+                                 CUstream hStream);
+
+GST_CUDA_API /* cuMemPoolCreate; same fallback as above */
+CUresult CUDAAPI CuMemPoolCreate (CUmemoryPool *pool,
+                                  const CUmemPoolProps *poolProps);
+
+GST_CUDA_API /* cuMemPoolDestroy; same fallback as above */
+CUresult CUDAAPI CuMemPoolDestroy (CUmemoryPool pool);
+
+GST_CUDA_API /* cuMemPoolSetAttribute; same fallback as above */
+CUresult CUDAAPI CuMemPoolSetAttribute (CUmemoryPool pool,
+                                        CUmemPool_attribute attr,
+                                        void *value);
+
+GST_CUDA_API /* cuMemPoolGetAttribute; same fallback as above */
+CUresult CUDAAPI CuMemPoolGetAttribute (CUmemoryPool pool,
+                                        CUmemPool_attribute attr,
+                                        void *value);
+
 /* cudaGL.h */
 GST_CUDA_API
 CUresult CUDAAPI CuGraphicsGLRegisterImage  (CUgraphicsResource * pCudaResource,
diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcuda-private.h b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcuda-private.h
index be3b8e00db..f2d8761518 100644
--- a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcuda-private.h
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcuda-private.h
@@ -56,8 +56,6 @@ void          gst_cuda_memory_set_from_fixed_pool (GstMemory * mem);
 GST_CUDA_API
 gboolean      gst_cuda_memory_is_from_fixed_pool (GstMemory * mem);
 
-gboolean      gst_cuda_virtual_memory_symbol_loaded (void);
-
 gpointer      gst_cuda_get_win32_handle_metadata (void);
 
 G_END_DECLS
diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudacontext.cpp b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudacontext.cpp
index efc7d50157..ab6e0afa09 100644
--- a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudacontext.cpp
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudacontext.cpp
@@ -26,6 +26,7 @@
 #include "gstcudautils.h"
 #include "gstcudamemory.h"
 #include "gstcuda-private.h"
+#include "gstcudaloader-private.h"
 
 #ifdef G_OS_WIN32
 #include <gst/d3d11/gstd3d11.h>
@@ -53,6 +54,7 @@ enum
   PROP_DXGI_ADAPTER_LUID,
   PROP_VIRTUAL_MEMORY,
   PROP_OS_HANDLE,
+  PROP_STREAM_ORDERED_ALLOC,
 };
 
 struct _GstCudaContextPrivate
@@ -63,6 +65,7 @@ struct _GstCudaContextPrivate
   gint64 dxgi_adapter_luid;
   gboolean virtual_memory_supported;
   gboolean os_handle_supported;
+  gboolean stream_ordered_alloc_supported;
 
   gint tex_align;
 
@@ -139,6 +142,16 @@ gst_cuda_context_class_init (GstCudaContextClass * klass)
           "Whether OS specific handle is supported via virtual memory", FALSE,
           (GParamFlags) (G_PARAM_READABLE | G_PARAM_STATIC_STRINGS)));
 
+  /**
+   * GstCudaContext:stream-ordered-alloc:
+   *
+   * Since: 1.26
+   */
+  g_object_class_install_property (gobject_class, PROP_STREAM_ORDERED_ALLOC,
+      g_param_spec_boolean ("stream-ordered-alloc", "Stream Ordered Alloc",
+          "Device supports stream ordered allocation", FALSE,
+          (GParamFlags) (G_PARAM_READABLE | G_PARAM_STATIC_STRINGS)));
+
   gst_cuda_memory_init_once ();
 }
 
@@ -190,6 +203,9 @@ gst_cuda_context_get_property (GObject * object, guint prop_id,
     case PROP_OS_HANDLE:
       g_value_set_boolean (value, priv->os_handle_supported);
       break;
+    case PROP_STREAM_ORDERED_ALLOC:
+      g_value_set_boolean (value, priv->stream_ordered_alloc_supported);
+      break;
     default:
       G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
       break;
@@ -571,7 +587,6 @@ gst_cuda_context_new_wrapped (CUcontext handler, CUdevice device)
 {
   GList *iter;
   gint tex_align = 0;
-
   GstCudaContext *self;
 
   g_return_val_if_fail (handler, nullptr);
@@ -619,6 +634,16 @@ gst_cuda_context_new_wrapped (CUcontext handler, CUdevice device)
       self->priv->os_handle_supported = TRUE;
   }
 
+  if (gst_cuda_stream_ordered_symbol_loaded ()) {
+    CUresult ret;
+    int supported = 0;
+
+    ret = CuDeviceGetAttribute (&supported,
+        CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, device);
+    if (ret == CUDA_SUCCESS && supported)
+      self->priv->stream_ordered_alloc_supported = TRUE;
+  }
+
   std::lock_guard < std::mutex > lk (list_lock);
   g_object_weak_ref (G_OBJECT (self),
       (GWeakNotify) gst_cuda_context_weak_ref_notify, nullptr);
diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudaloader-private.h b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudaloader-private.h
new file mode 100644
index 0000000000..6e33874b3d
--- /dev/null
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudaloader-private.h
@@ -0,0 +1,30 @@
+/* GStreamer
+ * Copyright (C) 2024 Seungha Yang <seungha@centricular.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#pragma once
+
+#include <gst/gst.h>
+
+G_BEGIN_DECLS
+/* Returns the loader's have_virtual_alloc flag */
+gboolean      gst_cuda_virtual_memory_symbol_loaded (void);
+/* Calls gst_cuda_load_library (), then returns have_stream_ordered_alloc */
+gboolean      gst_cuda_stream_ordered_symbol_loaded (void);
+
+G_END_DECLS
diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudaloader.cpp b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudaloader.cpp
index 75b1a11021..7fcb6162d0 100644
--- a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudaloader.cpp
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudaloader.cpp
@@ -25,6 +25,7 @@
 #include "gstcudaloader.h"
 #include <gmodule.h>
 #include "gstcuda-private.h"
+#include "gstcudaloader-private.h"
 
 #ifdef HAVE_CUDA_GST_GL
 #include <gst/gl/gstglconfig.h>
@@ -62,6 +63,7 @@ typedef struct _GstNvCodecCudaVTable
 {
   gboolean loaded;
   gboolean have_virtual_alloc;
+  gboolean have_stream_ordered_alloc;
 
   CUresult (CUDAAPI * CuInit) (unsigned int Flags);
   CUresult (CUDAAPI * CuGetErrorName) (CUresult error, const char **pStr);
@@ -211,6 +213,19 @@ typedef struct _GstNvCodecCudaVTable
   CUresult (CUDAAPI * CuMemUnmap) (CUdeviceptr ptr, size_t size);
   CUresult (CUDAAPI * CuMemRetainAllocationHandle)
       (CUmemGenericAllocationHandle *handle, void *addr);
+
+  CUresult (CUDAAPI * CuMemAllocAsync) (CUdeviceptr *dptr, size_t bytesize,
+      CUstream hStream);
+  CUresult (CUDAAPI * CuMemAllocFromPoolAsync) (CUdeviceptr *dptr,
+      size_t bytesize, CUmemoryPool pool, CUstream hStream);
+  CUresult (CUDAAPI * CuMemFreeAsync) (CUdeviceptr dptr, CUstream hStream);
+  CUresult (CUDAAPI * CuMemPoolCreate) (CUmemoryPool *pool,
+      const CUmemPoolProps *poolProps);
+  CUresult (CUDAAPI * CuMemPoolDestroy) (CUmemoryPool pool);
+  CUresult (CUDAAPI * CuMemPoolSetAttribute) (CUmemoryPool pool,
+      CUmemPool_attribute attr, void *value);
+  CUresult (CUDAAPI * CuMemPoolGetAttribute) (CUmemoryPool pool,
+      CUmemPool_attribute attr, void *value);
 } GstNvCodecCudaVTable;
 /* *INDENT-ON* */
 
@@ -245,6 +260,24 @@ gst_cuda_load_optional_symbols (GModule * module)
   vtable->have_virtual_alloc = TRUE;
 }
 
+static void
+gst_cuda_load_stream_ordered_alloc_symbols (GModule * module)
+{
+  GstNvCodecCudaVTable *vtable = &gst_cuda_vtable;
+  /* Optional driver entry points: stream ordered allocation, memory pools */
+  LOAD_OPTIONAL_SYMBOL (cuMemAllocAsync, CuMemAllocAsync);
+  LOAD_OPTIONAL_SYMBOL (cuMemAllocFromPoolAsync, CuMemAllocFromPoolAsync);
+  LOAD_OPTIONAL_SYMBOL (cuMemFreeAsync, CuMemFreeAsync);
+  LOAD_OPTIONAL_SYMBOL (cuMemPoolCreate, CuMemPoolCreate);
+  LOAD_OPTIONAL_SYMBOL (cuMemPoolDestroy, CuMemPoolDestroy);
+  LOAD_OPTIONAL_SYMBOL (cuMemPoolSetAttribute, CuMemPoolSetAttribute);
+  LOAD_OPTIONAL_SYMBOL (cuMemPoolGetAttribute, CuMemPoolGetAttribute);
+  /* NOTE(review): presumably the macro returns early on failure - confirm */
+  GST_INFO ("Stream ordered alloc symbols are loaded");
+
+  vtable->have_stream_ordered_alloc = TRUE;
+}
+
 static void
 gst_cuda_load_library_once_func (void)
 {
@@ -353,6 +386,7 @@ gst_cuda_load_library_once_func (void)
   vtable->loaded = TRUE;
 
   gst_cuda_load_optional_symbols (module);
+  gst_cuda_load_stream_ordered_alloc_symbols (module);
 }
 
 /**
@@ -382,6 +416,14 @@ gst_cuda_virtual_memory_symbol_loaded (void)
   return gst_cuda_vtable.have_virtual_alloc;
 }
 
+gboolean
+gst_cuda_stream_ordered_symbol_loaded (void)
+{
+  gst_cuda_load_library ();
+  /* Set to TRUE at the end of gst_cuda_load_stream_ordered_alloc_symbols () */
+  return gst_cuda_vtable.have_stream_ordered_alloc;
+}
+
 CUresult CUDAAPI
 CuInit (unsigned int Flags)
 {
@@ -966,6 +1008,71 @@ CuMemRetainAllocationHandle (CUmemGenericAllocationHandle * handle, void *addr)
   return gst_cuda_vtable.CuMemRetainAllocationHandle (handle, addr);
 }
 
+/* Calls the loaded cuMemAllocAsync, or reports it as unsupported if absent */
+CUresult CUDAAPI
+CuMemAllocAsync (CUdeviceptr * dptr, size_t bytesize, CUstream hStream)
+{
+  return gst_cuda_vtable.CuMemAllocAsync ?
+      gst_cuda_vtable.CuMemAllocAsync (dptr, bytesize, hStream) :
+      CUDA_ERROR_NOT_SUPPORTED;
+}
+
+/* Calls the loaded cuMemAllocFromPoolAsync; when that vtable entry is unset
+ * the request is answered with CUDA_ERROR_NOT_SUPPORTED instead */
+CUresult CUDAAPI
+CuMemAllocFromPoolAsync (CUdeviceptr * dptr, size_t bytesize, CUmemoryPool pool,
+    CUstream hStream)
+{
+  return gst_cuda_vtable.CuMemAllocFromPoolAsync ?
+      gst_cuda_vtable.CuMemAllocFromPoolAsync (dptr, bytesize, pool, hStream) :
+      CUDA_ERROR_NOT_SUPPORTED;
+}
+
+/* Calls the loaded cuMemFreeAsync; CUDA_ERROR_NOT_SUPPORTED if it is unset */
+CUresult CUDAAPI
+CuMemFreeAsync (CUdeviceptr dptr, CUstream hStream)
+{
+  return gst_cuda_vtable.CuMemFreeAsync ?
+      gst_cuda_vtable.CuMemFreeAsync (dptr, hStream) :
+      CUDA_ERROR_NOT_SUPPORTED;
+}
+
+/* Calls the loaded cuMemPoolCreate; CUDA_ERROR_NOT_SUPPORTED if it is unset */
+CUresult CUDAAPI
+CuMemPoolCreate (CUmemoryPool * pool, const CUmemPoolProps * poolProps)
+{
+  return gst_cuda_vtable.CuMemPoolCreate ?
+      gst_cuda_vtable.CuMemPoolCreate (pool, poolProps) :
+      CUDA_ERROR_NOT_SUPPORTED;
+}
+
+/* Calls the loaded cuMemPoolDestroy; CUDA_ERROR_NOT_SUPPORTED if it is unset */
+CUresult CUDAAPI
+CuMemPoolDestroy (CUmemoryPool pool)
+{
+  return gst_cuda_vtable.CuMemPoolDestroy ?
+      gst_cuda_vtable.CuMemPoolDestroy (pool) :
+      CUDA_ERROR_NOT_SUPPORTED;
+}
+
+/* Calls the loaded cuMemPoolSetAttribute, or reports it as unsupported */
+CUresult CUDAAPI
+CuMemPoolSetAttribute (CUmemoryPool pool, CUmemPool_attribute attr, void *value)
+{
+  return gst_cuda_vtable.CuMemPoolSetAttribute ?
+      gst_cuda_vtable.CuMemPoolSetAttribute (pool, attr, value) :
+      CUDA_ERROR_NOT_SUPPORTED;
+}
+
+/* Calls the loaded cuMemPoolGetAttribute, or reports it as unsupported */
+CUresult CUDAAPI
+CuMemPoolGetAttribute (CUmemoryPool pool, CUmemPool_attribute attr, void *value)
+{
+  return gst_cuda_vtable.CuMemPoolGetAttribute ?
+      gst_cuda_vtable.CuMemPoolGetAttribute (pool, attr, value) :
+      CUDA_ERROR_NOT_SUPPORTED;
+}
+
 /* cudaGL.h */
 CUresult CUDAAPI
 CuGraphicsGLRegisterImage (CUgraphicsResource * pCudaResource,
diff --git a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/stub/cuda.h b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/stub/cuda.h
index 0382301ad9..f6a590b600 100644
--- a/subprojects/gst-plugins-bad/gst-libs/gst/cuda/stub/cuda.h
+++ b/subprojects/gst-plugins-bad/gst-libs/gst/cuda/stub/cuda.h
@@ -31,6 +31,7 @@ typedef gpointer CUmodule;
 typedef gpointer CUfunction;
 typedef gpointer CUmipmappedArray;
 typedef gpointer CUevent;
+typedef gpointer CUmemoryPool;
 
 typedef guint64  CUtexObject;
 typedef guintptr CUdeviceptr;
@@ -62,6 +63,7 @@ typedef enum
   CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103,
   CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104,
   CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105,
+  CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115,
 } CUdevice_attribute;
 
 typedef enum
@@ -292,6 +294,28 @@ typedef struct
   CUmemAccess_flags flags;
 } CUmemAccessDesc;
 
+typedef struct                  /* poolProps argument of CuMemPoolCreate () */
+{
+  CUmemAllocationType allocType;
+  CUmemAllocationHandleType handleTypes;
+  CUmemLocation location;
+  void *win32SecurityAttributes;
+  size_t maxSize;
+  unsigned char reserved[56];   /* TODO confirm: pads to the driver's size */
+} CUmemPoolProps;
+
+typedef enum                    /* attr of CuMemPool{Set,Get}Attribute () */
+{
+  CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES = 1,  /* next: 2..8 */
+  CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC,
+  CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES,
+  CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
+  CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT,
+  CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH,
+  CU_MEMPOOL_ATTR_USED_MEM_CURRENT,
+  CU_MEMPOOL_ATTR_USED_MEM_HIGH,
+} CUmemPool_attribute;
+
 #define CUDA_VERSION 10000
 
 #ifdef _WIN32