nvcodec: Add CUDA specific memory and bufferpool

Introducing CUDA buffer pool with generic CUDA memory support.
Like GL memory, any element that can access CUDA device
memory directly can map this CUDA memory without upload/download
overhead via the "GST_MAP_CUDA" map flag.
The usual GstMemory map/unmap is also possible, via internal staging memory.

For staging, CUDA Host allocated memory is used (see CuMemAllocHost API).
The memory allows system access but has lower overhead
during GPU upload/download than normal system memory.

Part-of: <https://gitlab.freedesktop.org/gstreamer/gst-plugins-bad/-/merge_requests/1633>
This commit is contained in:
Seungha Yang 2019-08-19 18:02:56 +09:00 committed by GStreamer Merge Bot
parent 11353b3f6e
commit cf5ef5635f
8 changed files with 981 additions and 0 deletions

View file

@ -0,0 +1,259 @@
/* GStreamer
* Copyright (C) <2018-2019> Seungha Yang <seungha.yang@navercorp.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301, USA.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "gstcudabufferpool.h"
#include "gstcudacontext.h"
#include "gstcudamemory.h"
GST_DEBUG_CATEGORY_STATIC (gst_cuda_buffer_pool_debug);
#define GST_CAT_DEFAULT gst_cuda_buffer_pool_debug
struct _GstCudaBufferPoolPrivate
{
GstCudaContext *context;
GstAllocator *allocator;
GstVideoInfo info;
gboolean add_videometa;
gboolean need_alignment;
GstCudaAllocationParams params;
};
#define gst_cuda_buffer_pool_parent_class parent_class
G_DEFINE_TYPE_WITH_PRIVATE (GstCudaBufferPool, gst_cuda_buffer_pool,
GST_TYPE_BUFFER_POOL);
/* Report the pool options this pool understands: video meta and
 * video alignment. The returned array is static and NULL-terminated. */
static const gchar **
gst_cuda_buffer_pool_get_options (GstBufferPool * pool)
{
  static const gchar *supported_options[] = {
    GST_BUFFER_POOL_OPTION_VIDEO_META,
    GST_BUFFER_POOL_OPTION_VIDEO_ALIGNMENT,
    NULL
  };

  return supported_options;
}
/* Validate and apply a pool configuration.
 *
 * Parses caps into priv->params.info, (re)creates or adopts the allocator,
 * applies video alignment when both the videometa and alignment options are
 * requested, and finally chains up to the parent set_config with the size
 * recomputed from the (possibly aligned) video info.
 *
 * Returns: TRUE if the config was accepted. */
static gboolean
gst_cuda_buffer_pool_set_config (GstBufferPool * pool, GstStructure * config)
{
  GstCudaBufferPool *cuda_pool = GST_CUDA_BUFFER_POOL_CAST (pool);
  GstCudaBufferPoolPrivate *priv = cuda_pool->priv;
  GstCaps *caps = NULL;
  guint size, min_buffers, max_buffers;
  guint max_align, n;
  GstAllocator *allocator = NULL;
  /* GstCudaAllocationParams embeds GstAllocationParams as its first member,
   * so this cast lets generic config helpers fill the parent part in place */
  GstAllocationParams *params = (GstAllocationParams *) & priv->params;
  GstVideoInfo *info = &priv->params.info;

  if (!gst_buffer_pool_config_get_params (config, &caps, &size, &min_buffers,
          &max_buffers))
    goto wrong_config;

  if (caps == NULL)
    goto no_caps;

  if (!gst_buffer_pool_config_get_allocator (config, &allocator, params))
    goto wrong_config;

  /* now parse the caps from the config */
  if (!gst_video_info_from_caps (info, caps))
    goto wrong_caps;

  GST_LOG_OBJECT (pool, "%dx%d, caps %" GST_PTR_FORMAT,
      GST_VIDEO_INFO_WIDTH (info), GST_VIDEO_INFO_HEIGHT (info), caps);

  /* drop any allocator from a previous configuration */
  gst_clear_object (&priv->allocator);

  if (allocator) {
    /* a caller-provided allocator must be CUDA-aware */
    if (!GST_IS_CUDA_ALLOCATOR (allocator)) {
      goto wrong_allocator;
    } else {
      priv->allocator = gst_object_ref (allocator);
    }
  } else {
    /* otherwise create our own allocator bound to the pool's context */
    allocator = priv->allocator = gst_cuda_allocator_new (priv->context);
    if (G_UNLIKELY (priv->allocator == NULL))
      goto no_allocator;
  }

  priv->add_videometa = gst_buffer_pool_config_has_option (config,
      GST_BUFFER_POOL_OPTION_VIDEO_META);
  priv->need_alignment = gst_buffer_pool_config_has_option (config,
      GST_BUFFER_POOL_OPTION_VIDEO_ALIGNMENT);

  max_align = params->align;

  /* do memory align */
  if (priv->need_alignment && priv->add_videometa) {
    GstVideoAlignment valign;

    gst_buffer_pool_config_get_video_alignment (config, &valign);

    /* OR all per-plane stride alignment masks together, then apply the
     * combined mask to every plane so all planes share one alignment */
    for (n = 0; n < GST_VIDEO_MAX_PLANES; ++n)
      max_align |= valign.stride_align[n];

    for (n = 0; n < GST_VIDEO_MAX_PLANES; ++n)
      valign.stride_align[n] = max_align;

    if (!gst_video_info_align (info, &valign))
      goto failed_to_align;

    gst_buffer_pool_config_set_video_alignment (config, &valign);
  }

  /* promote the allocation-params alignment to the video stride alignment
   * so memory alignment never undercuts the stride requirement */
  if (params->align < max_align) {
    GST_WARNING_OBJECT (pool, "allocation params alignment %u is smaller "
        "than the max specified video stride alignment %u, fixing",
        (guint) params->align, max_align);

    params->align = max_align;
    gst_buffer_pool_config_set_allocator (config, allocator, params);
  }

  /* the configured size is replaced by the size derived from the
   * (possibly re-aligned) video info */
  gst_buffer_pool_config_set_params (config, caps, GST_VIDEO_INFO_SIZE (info),
      min_buffers, max_buffers);

  return GST_BUFFER_POOL_CLASS (parent_class)->set_config (pool, config);

  /* ERRORS */
wrong_config:
  {
    GST_WARNING_OBJECT (pool, "invalid config");
    return FALSE;
  }
no_caps:
  {
    GST_WARNING_OBJECT (pool, "no caps in config");
    return FALSE;
  }
wrong_caps:
  {
    GST_WARNING_OBJECT (pool,
        "failed getting geometry from caps %" GST_PTR_FORMAT, caps);
    return FALSE;
  }
no_allocator:
  {
    GST_WARNING_OBJECT (pool, "Could not create new CUDA allocator");
    return FALSE;
  }
wrong_allocator:
  {
    GST_WARNING_OBJECT (pool, "Incorrect allocator type for this pool");
    return FALSE;
  }
failed_to_align:
  {
    GST_WARNING_OBJECT (pool, "Failed to align");
    return FALSE;
  }
}
/* Allocate one buffer backed by a single GstCudaMemory sized from the
 * configured video info; optionally attach a GstVideoMeta. */
static GstFlowReturn
gst_cuda_buffer_pool_alloc (GstBufferPool * pool, GstBuffer ** buffer,
    GstBufferPoolAcquireParams * params)
{
  GstCudaBufferPool *self = GST_CUDA_BUFFER_POOL_CAST (pool);
  GstCudaBufferPoolPrivate *priv = self->priv;
  GstVideoInfo *vinfo = &priv->params.info;
  GstBuffer *buf;
  GstMemory *cmem;

  buf = gst_buffer_new ();
  cmem = gst_cuda_allocator_alloc (GST_ALLOCATOR_CAST (priv->allocator),
      GST_VIDEO_INFO_SIZE (vinfo), &priv->params);

  if (!cmem) {
    gst_buffer_unref (buf);
    GST_WARNING_OBJECT (pool, "Cannot create CUDA memory");
    return GST_FLOW_ERROR;
  }

  gst_buffer_append_memory (buf, cmem);

  if (priv->add_videometa) {
    GST_DEBUG_OBJECT (pool, "adding GstVideoMeta");
    gst_buffer_add_video_meta_full (buf, GST_VIDEO_FRAME_FLAG_NONE,
        GST_VIDEO_INFO_FORMAT (vinfo), GST_VIDEO_INFO_WIDTH (vinfo),
        GST_VIDEO_INFO_HEIGHT (vinfo), GST_VIDEO_INFO_N_PLANES (vinfo),
        vinfo->offset, vinfo->stride);
  }

  *buffer = buf;

  return GST_FLOW_OK;
}
/* gst_cuda_buffer_pool_new:
 * @context: (transfer none): a #GstCudaContext the pool will hold a ref on
 *
 * Creates a new buffer pool that allocates CUDA device memory.
 *
 * Returns: (transfer full): a new #GstBufferPool, or %NULL on invalid input */
GstBufferPool *
gst_cuda_buffer_pool_new (GstCudaContext * context)
{
  GstCudaBufferPool *pool;

  /* validate up front, consistent with gst_cuda_allocator_new(); previously
   * a NULL/invalid context was only discovered later via gst_object_ref() */
  g_return_val_if_fail (GST_IS_CUDA_CONTEXT (context), NULL);

  pool = g_object_new (GST_TYPE_CUDA_BUFFER_POOL, NULL);
  /* take ownership of the initially-floating reference */
  gst_object_ref_sink (pool);

  pool->priv->context = gst_object_ref (context);

  GST_LOG_OBJECT (pool, "new CUDA buffer pool %p", pool);

  return GST_BUFFER_POOL_CAST (pool);
}
/* GObject dispose: drop the allocator and context references.
 * gst_clear_object() is NULL-safe, so repeated dispose calls are fine. */
static void
gst_cuda_buffer_pool_dispose (GObject * object)
{
  GstCudaBufferPool *pool = GST_CUDA_BUFFER_POOL_CAST (object);
  GstCudaBufferPoolPrivate *priv = pool->priv;

  /* fixed: this is dispose, not finalize — log message said "finalize" */
  GST_LOG_OBJECT (pool, "dispose CUDA buffer pool %p", pool);

  gst_clear_object (&priv->allocator);
  gst_clear_object (&priv->context);

  G_OBJECT_CLASS (parent_class)->dispose (object);
}
/* Class init: hook up dispose and the buffer-pool vfuncs, and register the
 * debug category used by this file. */
static void
gst_cuda_buffer_pool_class_init (GstCudaBufferPoolClass * klass)
{
  GObjectClass *object_class = G_OBJECT_CLASS (klass);
  GstBufferPoolClass *pool_class = GST_BUFFER_POOL_CLASS (klass);

  object_class->dispose = gst_cuda_buffer_pool_dispose;

  pool_class->get_options = gst_cuda_buffer_pool_get_options;
  pool_class->set_config = gst_cuda_buffer_pool_set_config;
  pool_class->alloc_buffer = gst_cuda_buffer_pool_alloc;

  GST_DEBUG_CATEGORY_INIT (gst_cuda_buffer_pool_debug, "cudabufferpool", 0,
      "CUDA Buffer Pool");
}
/* Instance init: wire up the private data area created by
 * G_DEFINE_TYPE_WITH_PRIVATE. All fields start zeroed; the context is
 * set later by gst_cuda_buffer_pool_new(). */
static void
gst_cuda_buffer_pool_init (GstCudaBufferPool * pool)
{
  pool->priv = gst_cuda_buffer_pool_get_instance_private (pool);
}

View file

@ -0,0 +1,66 @@
/* GStreamer
* Copyright (C) <2018-2019> Seungha Yang <seungha.yang@navercorp.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301, USA.
*/
#ifndef __GST_CUDA_BUFFER_POOL_H__
#define __GST_CUDA_BUFFER_POOL_H__
#include <gst/video/gstvideometa.h>
#include <gst/video/gstvideopool.h>
#include "gstcudamemory.h"
G_BEGIN_DECLS
#define GST_TYPE_CUDA_BUFFER_POOL (gst_cuda_buffer_pool_get_type ())
#define GST_CUDA_BUFFER_POOL(obj) (G_TYPE_CHECK_INSTANCE_CAST ((obj),GST_TYPE_CUDA_BUFFER_POOL,GstCudaBufferPool))
#define GST_CUDA_BUFFER_POOL_CLASS(klass) (G_TYPE_CHECK_CLASS_CAST ((klass), GST_TYPE_CUDA_BUFFER_POOL,GstCudaBufferPoolClass))
#define GST_CUDA_BUFFER_POOL_GET_CLASS(obj) (G_TYPE_INSTANCE_GET_CLASS((obj), GST_TYPE_CUDA_BUFFER_POOL,GstCudaBufferPoolClass))
#define GST_IS_CUDA_BUFFER_POOL(obj) (G_TYPE_CHECK_INSTANCE_TYPE ((obj),GST_TYPE_CUDA_BUFFER_POOL))
#define GST_IS_CUDA_BUFFER_POOL_CLASS(klass) (G_TYPE_CHECK_CLASS_TYPE ((klass), GST_TYPE_CUDA_BUFFER_POOL))
#define GST_CUDA_BUFFER_POOL_CAST(obj) ((GstCudaBufferPool*)(obj))
typedef struct _GstCudaBufferPool GstCudaBufferPool;
typedef struct _GstCudaBufferPoolClass GstCudaBufferPoolClass;
typedef struct _GstCudaBufferPoolPrivate GstCudaBufferPoolPrivate;

/*
 * GstCudaBufferPool:
 *
 * A #GstBufferPool subclass whose buffers are backed by CUDA device
 * memory allocated through #GstCudaAllocator.
 */
struct _GstCudaBufferPool
{
  GstBufferPool parent;

  /* private implementation data (context, allocator, video info) */
  GstCudaBufferPoolPrivate *priv;
};

/*
 * GstCudaBufferPoolClass:
 */
struct _GstCudaBufferPoolClass
{
  GstBufferPoolClass parent_class;
};

GType gst_cuda_buffer_pool_get_type (void);

/* Creates a pool bound to @context; returns (transfer full) or NULL */
GstBufferPool * gst_cuda_buffer_pool_new (GstCudaContext * context);
G_END_DECLS
#endif /* __GST_CUDA_BUFFER_POOL_H__ */

View file

@ -69,10 +69,14 @@ typedef struct _GstNvCodecCudaVTable
CUresult (CUDAAPI * CuMemAlloc) (CUdeviceptr * dptr, unsigned int bytesize);
CUresult (CUDAAPI * CuMemAllocPitch) (CUdeviceptr * dptr, size_t * pPitch,
size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes);
CUresult (CUDAAPI * CuMemAllocHost) (void **pp, unsigned int bytesize);
CUresult (CUDAAPI * CuMemcpy2D) (const CUDA_MEMCPY2D * pCopy);
CUresult (CUDAAPI * CuMemcpy2DAsync) (const CUDA_MEMCPY2D * pCopy,
CUstream hStream);
CUresult (CUDAAPI * CuMemFree) (CUdeviceptr dptr);
CUresult (CUDAAPI * CuMemFreeHost) (void *p);
CUresult (CUDAAPI * CuStreamCreate) (CUstream * phStream,
unsigned int Flags);
CUresult (CUDAAPI * CuStreamDestroy) (CUstream hStream);
@ -136,9 +140,12 @@ gst_cuda_load_library (void)
LOAD_SYMBOL (cuMemAlloc, CuMemAlloc);
LOAD_SYMBOL (cuMemAllocPitch, CuMemAllocPitch);
LOAD_SYMBOL (cuMemAllocHost, CuMemAllocHost);
LOAD_SYMBOL (cuMemcpy2D, CuMemcpy2D);
LOAD_SYMBOL (cuMemcpy2DAsync, CuMemcpy2DAsync);
LOAD_SYMBOL (cuMemFree, CuMemFree);
LOAD_SYMBOL (cuMemFreeHost, CuMemFreeHost);
LOAD_SYMBOL (cuStreamCreate, CuStreamCreate);
LOAD_SYMBOL (cuStreamDestroy, CuStreamDestroy);
@ -285,6 +292,14 @@ CuMemAllocPitch (CUdeviceptr * dptr, size_t * pPitch, size_t WidthInBytes,
ElementSizeBytes);
}
/* Trampoline into the dynamically loaded cuMemAllocHost() (page-locked
 * host memory allocation). gst_cuda_load_library() must have resolved the
 * symbol first, hence the assert.
 *
 * NOTE(review): the driver's cuMemAllocHost_v2 takes a size_t bytesize;
 * declaring it as unsigned int would truncate requests >= 4GiB on LP64
 * platforms — TODO confirm against cuda.h and widen if needed. */
CUresult CUDAAPI
CuMemAllocHost (void **pp, unsigned int bytesize)
{
  g_assert (gst_cuda_vtable.CuMemAllocHost != NULL);

  return gst_cuda_vtable.CuMemAllocHost (pp, bytesize);
}
CUresult CUDAAPI
CuMemcpy2D (const CUDA_MEMCPY2D * pCopy)
{
@ -309,6 +324,14 @@ CuMemFree (CUdeviceptr dptr)
return gst_cuda_vtable.CuMemFree (dptr);
}
/* Trampoline into the dynamically loaded cuMemFreeHost(); frees memory
 * obtained via CuMemAllocHost(). The symbol must have been resolved by
 * gst_cuda_load_library() before this is called. */
CUresult CUDAAPI
CuMemFreeHost (void *p)
{
  g_assert (gst_cuda_vtable.CuMemFreeHost != NULL);

  return gst_cuda_vtable.CuMemFreeHost (p);
}
CUresult CUDAAPI
CuStreamCreate (CUstream * phStream, unsigned int Flags)
{

View file

@ -90,6 +90,10 @@ CUresult CUDAAPI CuMemAllocPitch (CUdeviceptr * dptr,
size_t Height,
unsigned int ElementSizeBytes);
G_GNUC_INTERNAL
CUresult CUDAAPI CuMemAllocHost (void **pp,
unsigned int bytesize);
G_GNUC_INTERNAL
CUresult CUDAAPI CuMemcpy2D (const CUDA_MEMCPY2D * pCopy);
@ -99,6 +103,9 @@ CUresult CUDAAPI CuMemcpy2DAsync (const CUDA_MEMCPY2D *pCopy, CUstream hStrea
G_GNUC_INTERNAL
CUresult CUDAAPI CuMemFree (CUdeviceptr dptr);
G_GNUC_INTERNAL
CUresult CUDAAPI CuMemFreeHost (void *p);
G_GNUC_INTERNAL
CUresult CUDAAPI CuStreamCreate (CUstream *phStream,
unsigned int Flags);

485
sys/nvcodec/gstcudamemory.c Normal file
View file

@ -0,0 +1,485 @@
/* GStreamer
* Copyright (C) <2018-2019> Seungha Yang <seungha.yang@navercorp.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301, USA.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "gstcudamemory.h"
#include "gstcudautils.h"
#include <string.h>
GST_DEBUG_CATEGORY_STATIC (cudaallocator_debug);
#define GST_CAT_DEFAULT cudaallocator_debug
GST_DEBUG_CATEGORY_STATIC (GST_CAT_MEMORY);
#define gst_cuda_allocator_parent_class parent_class
G_DEFINE_TYPE (GstCudaAllocator, gst_cuda_allocator, GST_TYPE_ALLOCATOR);
static void gst_cuda_allocator_dispose (GObject * object);
static void gst_cuda_allocator_free (GstAllocator * allocator,
GstMemory * memory);
static gpointer cuda_mem_map (GstCudaMemory * mem, gsize maxsize,
GstMapFlags flags);
static void cuda_mem_unmap_full (GstCudaMemory * mem, GstMapInfo * info);
static GstMemory *cuda_mem_copy (GstMemory * mem, gssize offset, gssize size);
/* Deliberately unreachable GstAllocator::alloc implementation: CUDA memory
 * must be allocated via gst_cuda_allocator_alloc(), which takes the extra
 * GstCudaAllocationParams (video info) this vfunc cannot receive. */
static GstMemory *
gst_cuda_allocator_dummy_alloc (GstAllocator * allocator, gsize size,
    GstAllocationParams * params)
{
  g_return_val_if_reached (NULL);
}
/* Class init: install dispose and the allocator vfuncs, and set up the
 * debug categories used by this file. */
static void
gst_cuda_allocator_class_init (GstCudaAllocatorClass * klass)
{
  GObjectClass *object_class = G_OBJECT_CLASS (klass);
  GstAllocatorClass *alloc_class = GST_ALLOCATOR_CLASS (klass);

  object_class->dispose = gst_cuda_allocator_dispose;

  alloc_class->alloc = GST_DEBUG_FUNCPTR (gst_cuda_allocator_dummy_alloc);
  alloc_class->free = GST_DEBUG_FUNCPTR (gst_cuda_allocator_free);

  GST_DEBUG_CATEGORY_INIT (cudaallocator_debug, "cudaallocator", 0,
      "CUDA Allocator");
  GST_DEBUG_CATEGORY_GET (GST_CAT_MEMORY, "GST_MEMORY");
}
/* Instance init: advertise the CUDA memory type and install the custom
 * map/unmap/copy functions; mark the allocator as custom-alloc so generic
 * gst_allocator_alloc() callers are rejected. */
static void
gst_cuda_allocator_init (GstCudaAllocator * allocator)
{
  GstAllocator *base = GST_ALLOCATOR_CAST (allocator);

  GST_DEBUG_OBJECT (allocator, "init");

  base->mem_type = GST_CUDA_MEMORY_TYPE_NAME;

  base->mem_map = (GstMemoryMapFunction) cuda_mem_map;
  base->mem_unmap_full = (GstMemoryUnmapFullFunction) cuda_mem_unmap_full;
  base->mem_copy = (GstMemoryCopyFunction) cuda_mem_copy;

  GST_OBJECT_FLAG_SET (allocator, GST_ALLOCATOR_FLAG_CUSTOM_ALLOC);
}
/* GObject dispose: release the CUDA context reference.
 * gst_clear_object() is NULL-safe, so repeated dispose calls are fine. */
static void
gst_cuda_allocator_dispose (GObject * object)
{
  GstCudaAllocator *self = GST_CUDA_ALLOCATOR_CAST (object);

  GST_DEBUG_OBJECT (self, "dispose");

  gst_clear_object (&self->context);
  G_OBJECT_CLASS (parent_class)->dispose (object);
}
/* gst_cuda_allocator_alloc:
 * @allocator: a #GstCudaAllocator
 * @size: reported usable size of the memory
 * @params: CUDA allocation params holding the #GstVideoInfo to allocate for
 *
 * Allocates pitched CUDA device memory large enough for every plane of the
 * video format described by @params->info, stacked vertically so all planes
 * share one pitch. The CUDA context is pushed only for the duration of the
 * device allocation.
 *
 * Returns: a new #GstCudaMemory, or %NULL on failure */
GstMemory *
gst_cuda_allocator_alloc (GstAllocator * allocator, gsize size,
    GstCudaAllocationParams * params)
{
  GstCudaAllocator *self = GST_CUDA_ALLOCATOR_CAST (allocator);
  gsize maxsize = size + params->parent.prefix + params->parent.padding;
  gsize align = params->parent.align;
  gsize offset = params->parent.prefix;
  GstMemoryFlags flags = params->parent.flags;
  CUdeviceptr data;
  gboolean ret = FALSE;
  GstCudaMemory *mem;
  GstVideoInfo *info = &params->info;
  gint i;
  guint width, height;
  gsize stride, plane_offset;

  if (!gst_cuda_context_push (self->context))
    return NULL;

  /* ensure configured alignment */
  align |= gst_memory_alignment;
  /* allocate more to compensate for alignment */
  maxsize += align;

  GST_CAT_DEBUG_OBJECT (GST_CAT_MEMORY, self, "allocate new cuda memory");

  /* width in bytes of the widest (first) component; the per-plane heights
   * are summed so a single pitched allocation holds all planes stacked */
  width = GST_VIDEO_INFO_COMP_WIDTH (info, 0) *
      GST_VIDEO_INFO_COMP_PSTRIDE (info, 0);
  height = 0;
  for (i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++)
    height += GST_VIDEO_INFO_COMP_HEIGHT (info, i);

  /* 16-byte element size lets the driver pick an efficient pitch */
  ret = gst_cuda_result (CuMemAllocPitch (&data, &stride, width, height, 16));
  gst_cuda_context_pop (NULL);

  if (G_UNLIKELY (!ret)) {
    GST_CAT_ERROR_OBJECT (GST_CAT_MEMORY, self, "CUDA allocation failure");
    return NULL;
  }

  mem = g_new0 (GstCudaMemory, 1);
  g_mutex_init (&mem->lock);
  mem->data = data;
  mem->alloc_params = *params;
  mem->stride = stride;

  /* per-plane byte offsets into the single pitched allocation */
  plane_offset = 0;
  for (i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) {
    mem->offset[i] = plane_offset;
    plane_offset += stride * GST_VIDEO_INFO_COMP_HEIGHT (info, i);
  }

  mem->context = gst_object_ref (self->context);

  gst_memory_init (GST_MEMORY_CAST (mem),
      flags, GST_ALLOCATOR_CAST (self), NULL, maxsize, align, offset, size);

  return GST_MEMORY_CAST (mem);
}
/* GstAllocator::free — releases the device memory, the staging host memory
 * (if any was mapped), the context reference and the GstCudaMemory itself.
 *
 * NOTE(review): the gst_cuda_context_push() result is not checked here; if
 * the push fails the CuMemFree/CuMemFreeHost calls likely fail too and the
 * device memory would leak — TODO confirm whether this is acceptable. */
static void
gst_cuda_allocator_free (GstAllocator * allocator, GstMemory * memory)
{
  GstCudaAllocator *self = GST_CUDA_ALLOCATOR_CAST (allocator);
  GstCudaMemory *mem = GST_CUDA_MEMORY_CAST (memory);

  GST_CAT_DEBUG_OBJECT (GST_CAT_MEMORY, allocator, "free cuda memory");

  /* no one can hold the lock at this point; safe to destroy */
  g_mutex_clear (&mem->lock);

  gst_cuda_context_push (self->context);
  if (mem->data)
    gst_cuda_result (CuMemFree (mem->data));

  /* staging host memory allocated lazily by the first system-memory map */
  if (mem->map_alloc_data)
    gst_cuda_result (CuMemFreeHost (mem->map_alloc_data));

  gst_cuda_context_pop (NULL);

  gst_object_unref (mem->context);

  g_free (mem);
}
/* Copy every plane from the staging (host) memory into the CUDA device
 * memory. Called with mem->lock held and the CUDA context pushed.
 * Uses the default stream and synchronizes before returning, so the
 * device memory is valid when this returns TRUE. */
static gboolean
gst_cuda_memory_upload_transfer (GstCudaMemory * mem)
{
  gint i;
  GstVideoInfo *info = &mem->alloc_params.info;
  gboolean ret = TRUE;

  if (!mem->map_data) {
    GST_CAT_ERROR (GST_CAT_MEMORY, "no staging memory to upload");
    return FALSE;
  }

  for (i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) {
    CUDA_MEMCPY2D param = { 0, };

    /* host side uses the video-info layout, device side the pitched layout */
    param.srcMemoryType = CU_MEMORYTYPE_HOST;
    param.srcHost =
        (guint8 *) mem->map_data + GST_VIDEO_INFO_PLANE_OFFSET (info, i);
    param.srcPitch = GST_VIDEO_INFO_PLANE_STRIDE (info, i);

    param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    param.dstDevice = mem->data + mem->offset[i];
    param.dstPitch = mem->stride;
    param.WidthInBytes = GST_VIDEO_INFO_COMP_WIDTH (info, i) *
        GST_VIDEO_INFO_COMP_PSTRIDE (info, i);
    param.Height = GST_VIDEO_INFO_COMP_HEIGHT (info, i);

    if (!gst_cuda_result (CuMemcpy2DAsync (&param, NULL))) {
      GST_CAT_ERROR (GST_CAT_MEMORY, "Failed to copy %dth plane", i);
      ret = FALSE;
      break;
    }
  }

  /* wait for the async copies on the default stream to finish */
  gst_cuda_result (CuStreamSynchronize (NULL));

  return ret;
}
/* called with lock */
/* Copy every plane from the CUDA device memory into the staging (host)
 * memory. Called with mem->lock held and the CUDA context pushed.
 * On a failed plane copy the staging memory is freed and all staging
 * pointers are reset, so the function returns FALSE in that case.
 *
 * Returns: TRUE if the staging memory holds valid data on return */
static gboolean
gst_cuda_memory_download_transfer (GstCudaMemory * mem)
{
  gint i;
  GstVideoInfo *info = &mem->alloc_params.info;

  if (!mem->map_data) {
    /* fixed copy-pasted message that said "upload" in the download path */
    GST_CAT_ERROR (GST_CAT_MEMORY, "no staging memory to download into");
    return FALSE;
  }

  for (i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) {
    CUDA_MEMCPY2D param = { 0, };

    /* device side uses the pitched layout, host side the video-info layout */
    param.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    param.srcDevice = mem->data + mem->offset[i];
    param.srcPitch = mem->stride;

    param.dstMemoryType = CU_MEMORYTYPE_HOST;
    param.dstHost =
        (guint8 *) mem->map_data + GST_VIDEO_INFO_PLANE_OFFSET (info, i);
    param.dstPitch = GST_VIDEO_INFO_PLANE_STRIDE (info, i);
    param.WidthInBytes = GST_VIDEO_INFO_COMP_WIDTH (info, i) *
        GST_VIDEO_INFO_COMP_PSTRIDE (info, i);
    param.Height = GST_VIDEO_INFO_COMP_HEIGHT (info, i);

    if (!gst_cuda_result (CuMemcpy2DAsync (&param, NULL))) {
      GST_CAT_ERROR (GST_CAT_MEMORY, "Failed to copy %dth plane", i);
      /* staging content is now inconsistent: drop it entirely */
      CuMemFreeHost (mem->map_alloc_data);
      mem->map_alloc_data = mem->map_data = mem->align_data = NULL;
      break;
    }
  }

  /* wait for the async copies on the default stream to finish */
  gst_cuda_result (CuStreamSynchronize (NULL));

  return ! !mem->map_data;
}
/* Map the memory for system (CPU) access: lazily allocates aligned,
 * page-locked staging host memory on first use and downloads the device
 * contents into it when the NEED_DOWNLOAD flag is set.
 * Called with mem->lock held. Returns the staging pointer or NULL. */
static gpointer
gst_cuda_memory_device_memory_map (GstCudaMemory * mem)
{
  GstMemory *memory = GST_MEMORY_CAST (mem);
  gpointer data;
  gsize aoffset;
  gsize align = memory->align;

  /* already mapped: reuse the existing staging pointer */
  if (mem->map_data) {
    return mem->map_data;
  }

  GST_CAT_DEBUG (GST_CAT_MEMORY, "alloc host memory for map");

  if (!mem->map_alloc_data) {
    gsize maxsize;
    guint8 *align_data;

    /* over-allocate so the returned pointer can be aligned manually
     * (NOTE(review): CuMemAllocHost typically returns page-aligned memory
     * already, so this is belt-and-braces — confirm) */
    maxsize = memory->maxsize + align;
    if (!gst_cuda_context_push (mem->context)) {
      GST_CAT_ERROR (GST_CAT_MEMORY, "cannot push cuda context");

      return NULL;
    }

    if (!gst_cuda_result (CuMemAllocHost (&data, maxsize))) {
      GST_CAT_ERROR (GST_CAT_MEMORY, "cannot alloc host memory");
      gst_cuda_context_pop (NULL);

      return NULL;
    }

    if (!gst_cuda_context_pop (NULL)) {
      GST_CAT_WARNING (GST_CAT_MEMORY, "cannot pop cuda context");
    }

    mem->map_alloc_data = data;
    align_data = data;

    /* do align: 'align' is a mask (alignment - 1), GStreamer convention */
    if ((aoffset = ((guintptr) align_data & align))) {
      aoffset = (align + 1) - aoffset;
      align_data += aoffset;
    }
    mem->align_data = align_data;

    /* first memory, always need download to staging */
    GST_MINI_OBJECT_FLAG_SET (mem, GST_CUDA_MEMORY_TRANSFER_NEED_DOWNLOAD);
  }

  mem->map_data = mem->align_data;

  if (GST_MEMORY_FLAG_IS_SET (mem, GST_CUDA_MEMORY_TRANSFER_NEED_DOWNLOAD)) {
    if (!gst_cuda_context_push (mem->context)) {
      GST_CAT_ERROR (GST_CAT_MEMORY, "cannot push cuda context");
      return NULL;
    }

    /* may reset map_data on failure; the return below reflects that */
    gst_cuda_memory_download_transfer (mem);

    if (!gst_cuda_context_pop (NULL)) {
      GST_CAT_WARNING (GST_CAT_MEMORY, "cannot pop cuda context");
    }
  }

  return mem->map_data;
}
/* GstMemory map function. With GST_MAP_CUDA the raw device pointer is
 * returned (after uploading pending staging writes); otherwise the memory
 * is mapped through the page-locked staging host memory.
 *
 * Fixes over the original: on every failure path the map_count increment
 * is rolled back, and the pushed CUDA context is popped when the upload
 * transfer fails (both were previously leaked). */
static gpointer
cuda_mem_map (GstCudaMemory * mem, gsize maxsize, GstMapFlags flags)
{
  gpointer ret = NULL;

  g_mutex_lock (&mem->lock);
  mem->map_count++;

  if ((flags & GST_MAP_CUDA) == GST_MAP_CUDA) {
    /* upload from staging to device memory if necessary */
    if (GST_MEMORY_FLAG_IS_SET (mem, GST_CUDA_MEMORY_TRANSFER_NEED_UPLOAD)) {
      if (!gst_cuda_context_push (mem->context)) {
        GST_CAT_ERROR (GST_CAT_MEMORY, "cannot push cuda context");
        mem->map_count--;
        g_mutex_unlock (&mem->lock);

        return NULL;
      }

      if (!gst_cuda_memory_upload_transfer (mem)) {
        /* pop the context pushed above before bailing out */
        gst_cuda_context_pop (NULL);
        mem->map_count--;
        g_mutex_unlock (&mem->lock);

        return NULL;
      }

      gst_cuda_context_pop (NULL);
    }

    GST_MEMORY_FLAG_UNSET (mem, GST_CUDA_MEMORY_TRANSFER_NEED_UPLOAD);

    /* a CUDA-side write invalidates the staging copy */
    if ((flags & GST_MAP_WRITE) == GST_MAP_WRITE)
      GST_MINI_OBJECT_FLAG_SET (mem, GST_CUDA_MEMORY_TRANSFER_NEED_DOWNLOAD);

    g_mutex_unlock (&mem->lock);

    return (gpointer) mem->data;
  }

  /* system-memory map: download into staging if necessary */
  ret = gst_cuda_memory_device_memory_map (mem);
  if (ret == NULL) {
    mem->map_count--;
    g_mutex_unlock (&mem->lock);

    return NULL;
  }

  /* a CPU-side write invalidates the device copy */
  if ((flags & GST_MAP_WRITE) == GST_MAP_WRITE)
    GST_MINI_OBJECT_FLAG_SET (mem, GST_CUDA_MEMORY_TRANSFER_NEED_UPLOAD);

  GST_MEMORY_FLAG_UNSET (mem, GST_CUDA_MEMORY_TRANSFER_NEED_DOWNLOAD);

  g_mutex_unlock (&mem->lock);

  return ret;
}
/* GstMemory unmap function. Marks the pending transfer direction based on
 * how the memory was mapped, and clears map_data once the last concurrent
 * map is released (the staging allocation itself is kept for reuse and
 * only freed in gst_cuda_allocator_free()). */
static void
cuda_mem_unmap_full (GstCudaMemory * mem, GstMapInfo * info)
{
  g_mutex_lock (&mem->lock);
  mem->map_count--;
  GST_CAT_TRACE (GST_CAT_MEMORY,
      "unmap CUDA memory %p, map count %d, have map_data %s",
      mem, mem->map_count, mem->map_data ? "true" : "false");

  if ((info->flags & GST_MAP_CUDA) == GST_MAP_CUDA) {
    /* device memory was written: staging must be refreshed on next CPU map */
    if ((info->flags & GST_MAP_WRITE) == GST_MAP_WRITE)
      GST_MINI_OBJECT_FLAG_SET (mem, GST_CUDA_MEMORY_TRANSFER_NEED_DOWNLOAD);

    g_mutex_unlock (&mem->lock);
    return;
  }

  /* staging memory was written: device must be refreshed on next CUDA map */
  if ((info->flags & GST_MAP_WRITE))
    GST_MINI_OBJECT_FLAG_SET (mem, GST_CUDA_MEMORY_TRANSFER_NEED_UPLOAD);

  /* still mapped elsewhere, or never mapped to system memory */
  if (mem->map_count > 0 || !mem->map_data) {
    g_mutex_unlock (&mem->lock);
    return;
  }

  mem->map_data = NULL;
  g_mutex_unlock (&mem->lock);

  return;
}
/* GstMemory copy function: allocates a new CUDA memory with the same
 * allocation params and performs a device-to-device copy of every plane.
 * @offset and @size are ignored — the whole memory is always copied.
 *
 * Fix over the original: the result of gst_cuda_allocator_alloc() is now
 * NULL-checked before use (it was previously dereferenced unconditionally). */
static GstMemory *
cuda_mem_copy (GstMemory * mem, gssize offset, gssize size)
{
  GstMemory *copy;
  GstCudaMemory *src_mem = GST_CUDA_MEMORY_CAST (mem);
  GstCudaMemory *dst_mem;
  GstCudaContext *ctx = GST_CUDA_ALLOCATOR_CAST (mem->allocator)->context;
  gint i;
  GstVideoInfo *info;

  /* offset and size are ignored */
  copy = gst_cuda_allocator_alloc (mem->allocator, mem->size,
      &src_mem->alloc_params);

  if (G_UNLIKELY (copy == NULL)) {
    GST_CAT_ERROR (GST_CAT_MEMORY, "cannot allocate memory for copying");
    return NULL;
  }

  dst_mem = GST_CUDA_MEMORY_CAST (copy);

  info = &src_mem->alloc_params.info;

  if (!gst_cuda_context_push (ctx)) {
    GST_CAT_ERROR (GST_CAT_MEMORY, "cannot push cuda context");
    gst_cuda_allocator_free (mem->allocator, copy);

    return NULL;
  }

  for (i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) {
    CUDA_MEMCPY2D param = { 0, };

    param.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    param.srcDevice = src_mem->data + src_mem->offset[i];
    param.srcPitch = src_mem->stride;

    param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    param.dstDevice = dst_mem->data + dst_mem->offset[i];
    param.dstPitch = dst_mem->stride;
    param.WidthInBytes = GST_VIDEO_INFO_COMP_WIDTH (info, i) *
        GST_VIDEO_INFO_COMP_PSTRIDE (info, i);
    param.Height = GST_VIDEO_INFO_COMP_HEIGHT (info, i);

    if (!gst_cuda_result (CuMemcpy2DAsync (&param, NULL))) {
      GST_CAT_ERROR_OBJECT (GST_CAT_MEMORY,
          mem->allocator, "Failed to copy %dth plane", i);
      gst_cuda_context_pop (NULL);
      gst_cuda_allocator_free (mem->allocator, copy);

      return NULL;
    }
  }

  /* wait for the async copies on the default stream to finish */
  gst_cuda_result (CuStreamSynchronize (NULL));

  if (!gst_cuda_context_pop (NULL)) {
    GST_CAT_WARNING (GST_CAT_MEMORY, "cannot pop cuda context");
  }

  return copy;
}
/* gst_cuda_allocator_new:
 * @context: (transfer none): a #GstCudaContext the allocator will ref
 *
 * Returns: (transfer full): a new #GstAllocator, or %NULL on invalid input */
GstAllocator *
gst_cuda_allocator_new (GstCudaContext * context)
{
  GstCudaAllocator *self;

  g_return_val_if_fail (GST_IS_CUDA_CONTEXT (context), NULL);

  self = g_object_new (GST_TYPE_CUDA_ALLOCATOR, NULL);
  self->context = gst_object_ref (context);

  return GST_ALLOCATOR_CAST (self);
}
/* Returns TRUE if @mem is non-NULL and was allocated by a #GstCudaAllocator */
gboolean
gst_is_cuda_memory (GstMemory * mem)
{
  if (mem == NULL || mem->allocator == NULL)
    return FALSE;

  return GST_IS_CUDA_ALLOCATOR (mem->allocator);
}

138
sys/nvcodec/gstcudamemory.h Normal file
View file

@ -0,0 +1,138 @@
/* GStreamer
* Copyright (C) <2018-2019> Seungha Yang <seungha.yang@navercorp.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301, USA.
*/
#ifndef __GST_CUDA_MEMORY_H__
#define __GST_CUDA_MEMORY_H__
#include <gst/gst.h>
#include <gst/gstallocator.h>
#include <gst/video/video.h>
#include "gstcudaloader.h"
#include "gstcudacontext.h"
G_BEGIN_DECLS
#define GST_TYPE_CUDA_ALLOCATOR (gst_cuda_allocator_get_type())
#define GST_CUDA_ALLOCATOR(obj) (G_TYPE_CHECK_INSTANCE_CAST((obj),GST_TYPE_CUDA_ALLOCATOR,GstCudaAllocator))
#define GST_CUDA_ALLOCATOR_CLASS(klass) (G_TYPE_CHECK_CLASS_CAST((klass), GST_TYPE_CUDA_ALLOCATOR,GstCudaAllocatorClass))
#define GST_CUDA_ALLOCATOR_GET_CLASS(obj) (G_TYPE_INSTANCE_GET_CLASS((obj), GST_TYPE_CUDA_ALLOCATOR,GstCudaAllocatorClass))
#define GST_IS_CUDA_ALLOCATOR(obj) (G_TYPE_CHECK_INSTANCE_TYPE((obj),GST_TYPE_CUDA_ALLOCATOR))
#define GST_IS_CUDA_ALLOCATOR_CLASS(klass) (G_TYPE_CHECK_CLASS_TYPE((klass), GST_TYPE_CUDA_ALLOCATOR))
#define GST_CUDA_ALLOCATOR_CAST(obj) ((GstCudaAllocator *)(obj))
#define GST_CUDA_MEMORY_CAST(mem) ((GstCudaMemory *) (mem))
typedef struct _GstCudaAllocationParams GstCudaAllocationParams;
typedef struct _GstCudaAllocator GstCudaAllocator;
typedef struct _GstCudaAllocatorClass GstCudaAllocatorClass;
typedef struct _GstCudaMemory GstCudaMemory;
/**
 * GST_MAP_CUDA:
 *
 * Flag indicating that we should map the CUDA device memory
 * instead of system memory.
 *
 * Combining #GST_MAP_CUDA with #GST_MAP_WRITE has the same semantics as
 * writing to CUDA device/host memory. Conversely, combining #GST_MAP_CUDA
 * with #GST_MAP_READ has the same semantics as reading from CUDA
 * device/host memory.
 */
#define GST_MAP_CUDA (GST_MAP_FLAG_LAST << 1)
#define GST_CUDA_MEMORY_TYPE_NAME "gst.cuda.memory"
/**
* GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY:
*
* Name of the caps feature for indicating the use of #GstCudaMemory
*/
#define GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY "memory:CUDAMemory"
/* Allocation parameters for gst_cuda_allocator_alloc(): extends the
 * generic GstAllocationParams with the video layout to allocate for */
struct _GstCudaAllocationParams
{
  GstAllocationParams parent;
  GstVideoInfo info;
};

struct _GstCudaAllocator
{
  GstAllocator parent;
  /* CUDA context every allocation/free is performed in */
  GstCudaContext *context;
};

struct _GstCudaAllocatorClass
{
  GstAllocatorClass parent_class;
};

GType gst_cuda_allocator_get_type (void);

GstAllocator * gst_cuda_allocator_new (GstCudaContext * context);

GstMemory * gst_cuda_allocator_alloc (GstAllocator * allocator,
    gsize size,
    GstCudaAllocationParams * params);

/**
 * GstCudaMemoryTransfer:
 * @GST_CUDA_MEMORY_TRANSFER_NEED_DOWNLOAD: the device memory needs downloading
 * to the staging memory
 * @GST_CUDA_MEMORY_TRANSFER_NEED_UPLOAD: the staging memory needs uploading
 * to the device memory
 */
typedef enum
{
  GST_CUDA_MEMORY_TRANSFER_NEED_DOWNLOAD = (GST_MEMORY_FLAG_LAST << 0),
  GST_CUDA_MEMORY_TRANSFER_NEED_UPLOAD = (GST_MEMORY_FLAG_LAST << 1)
} GstCudaMemoryTransfer;

struct _GstCudaMemory
{
  GstMemory mem;

  GstCudaContext *context;
  /* device pointer to the pitched allocation holding all planes */
  CUdeviceptr data;

  GstCudaAllocationParams alloc_params;

  /* offset and stride of CUDA device memory */
  gsize offset[GST_VIDEO_MAX_PLANES];
  gint stride;

  /* allocated CUDA Host memory */
  gpointer map_alloc_data;

  /* aligned CUDA Host memory */
  guint8 *align_data;

  /* pointing align_data if the memory is mapped */
  gpointer map_data;

  /* number of outstanding maps; protected by @lock */
  gint map_count;

  /* protects map_count, map_data and the transfer flags */
  GMutex lock;
};

gboolean gst_is_cuda_memory (GstMemory * mem);
G_END_DECLS
#endif /* __GST_CUDA_MEMORY_H__ */

View file

@ -12,6 +12,8 @@ nvcodec_sources = [
'gstnvdecoder.c',
'gstnvh264dec.c',
'gstnvh265dec.c',
'gstcudamemory.c',
'gstcudabufferpool.c',
]
if get_option('nvcodec').disabled()

View file

@ -114,6 +114,7 @@ typedef enum
#define cuMemAlloc cuMemAlloc_v2
#define cuMemAllocPitch cuMemAllocPitch_v2
#define cuMemAllocHost cuMemAllocHost_v2
#define cuMemcpy2D cuMemcpy2D_v2
#define cuMemcpy2DAsync cuMemcpy2DAsync_v2
#define cuMemFree cuMemFree_v2