gstreamer/subprojects/gst-plugins-bad/gst-libs/gst/cuda/gstcudautils.c

/* GStreamer
 * Copyright (C) <2018-2019> Seungha Yang <seungha.yang@navercorp.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "gstcudautils.h"
#include "gstcudacontext.h"
#include "gstcuda-private.h"

#ifdef HAVE_NVCODEC_GST_GL
#include <gst/gl/gl.h>
#include <gst/gl/gstglfuncs.h>
#endif

#ifdef GST_CUDA_HAS_D3D
#include <gst/d3d11/gstd3d11.h>
#endif

#ifdef HAVE_NVCODEC_NVMM
#include "gstcudanvmm.h"
#endif

#include "gstcudamemory.h"

GST_DEBUG_CATEGORY_STATIC (gst_cuda_utils_debug);
#define GST_CAT_DEFAULT gst_cuda_utils_debug
GST_DEBUG_CATEGORY_STATIC (GST_CAT_CONTEXT);

/* Sets up the debug categories used in this file. Safe to call repeatedly:
 * the body only runs for the first caller that enters the once-guard. */
static void
_init_debug (void)
{
  static gsize debug_once = 0;

  if (!g_once_init_enter (&debug_once))
    return;

  GST_DEBUG_CATEGORY_INIT (gst_cuda_utils_debug, "cudautils", 0,
      "CUDA utils");
  GST_DEBUG_CATEGORY_GET (GST_CAT_CONTEXT, "GST_CONTEXT");

  g_once_init_leave (&debug_once, 1);
}

/* Fold callback used by run_query(): sends the query passed in @user_data to
 * the peer of the pad held by @item.
 *
 * Returns FALSE (stop folding) after storing TRUE in @value as soon as one
 * peer answers the query, TRUE (keep folding) otherwise. */
static gboolean
pad_query (const GValue * item, GValue * value, gpointer user_data)
{
  GstQuery *query = user_data;
  GstPad *pad = g_value_get_object (item);

  if (!gst_pad_peer_query (pad, query)) {
    GST_CAT_INFO_OBJECT (GST_CAT_CONTEXT, pad, "pad peer query failed");
    return TRUE;
  }

  g_value_set_boolean (value, TRUE);
  return FALSE;
}

/* Sends @query to the peers of all pads of @element in the given @direction
 * (source pads for GST_PAD_SRC, sink pads otherwise).
 *
 * Returns TRUE if at least one peer handled the query. */
static gboolean
run_query (GstElement * element, GstQuery * query, GstPadDirection direction)
{
  GValue answered = { 0 };
  GstIterator *pads;

  g_value_init (&answered, G_TYPE_BOOLEAN);
  g_value_set_boolean (&answered, FALSE);

  /* Ask neighbor */
  pads = (direction == GST_PAD_SRC) ?
      gst_element_iterate_src_pads (element) :
      gst_element_iterate_sink_pads (element);

  /* Restart the fold whenever the pad list changed underneath us */
  for (;;) {
    if (gst_iterator_fold (pads, pad_query, &answered, query) !=
        GST_ITERATOR_RESYNC)
      break;

    gst_iterator_resync (pads);
  }

  gst_iterator_free (pads);

  return g_value_get_boolean (&answered);
}

/* Tries to obtain a CUDA context for @element from its surroundings:
 *
 *  1) Run a GST_QUERY_CONTEXT query through the source pads (downstream)
 *     and hand any returned context to the element.
 *  2) If *@cuda_ctx is still unset, do the same through the sink pads
 *     (upstream).
 *  3) If *@cuda_ctx is still unset, post a need-context message on the bus.
 *
 * This function never writes *@cuda_ctx itself; it only hands contexts to
 * the element via gst_element_set_context(). */
static void
find_cuda_context (GstElement * element, GstCudaContext ** cuda_ctx)
{
  GstContext *found;
  GstQuery *ctx_query = gst_query_new_context (GST_CUDA_CONTEXT_TYPE);

  if (run_query (element, ctx_query, GST_PAD_SRC)) {
    gst_query_parse_context (ctx_query, &found);
    if (found != NULL) {
      GST_CAT_INFO_OBJECT (GST_CAT_CONTEXT, element,
          "found context (%p) in downstream query", found);
      gst_element_set_context (element, found);
    }
  }

  /* A context may have been found above without the element accepting it;
   * in that case look in the other direction as well */
  if (!*cuda_ctx && run_query (element, ctx_query, GST_PAD_SINK)) {
    gst_query_parse_context (ctx_query, &found);
    if (found != NULL) {
      GST_CAT_INFO_OBJECT (GST_CAT_CONTEXT, element,
          "found context (%p) in upstream query", found);
      gst_element_set_context (element, found);
    }
  }

  if (!*cuda_ctx) {
    /* Ask parent bins and the application through the bus. Whoever answers
     * the need-context message performs a GstElement::set_context() with the
     * required context, in which the element is required to update its
     * cuda_ctx or call gst_cuda_handle_set_context(). */
    GST_CAT_INFO_OBJECT (GST_CAT_CONTEXT, element,
        "posting need context message");
    gst_element_post_message (element,
        gst_message_new_need_context (GST_OBJECT_CAST (element),
            GST_CUDA_CONTEXT_TYPE));
  }

  gst_query_unref (ctx_query);
}

/* Stores @cuda_ctx and its "cuda-device-id" property value in the structure
 * of @context. @context must be writable. */
static void
context_set_cuda_context (GstContext * context, GstCudaContext * cuda_ctx)
{
  guint device_id;
  GstStructure *structure;

  g_return_if_fail (context != NULL);

  g_object_get (G_OBJECT (cuda_ctx), "cuda-device-id", &device_id, NULL);

  GST_CAT_LOG (GST_CAT_CONTEXT,
      "setting GstCudaContext(%" GST_PTR_FORMAT
      ") with cuda-device-id %d on context(%" GST_PTR_FORMAT ")",
      cuda_ctx, device_id, context);

  structure = gst_context_writable_structure (context);
  gst_structure_set (structure,
      GST_CUDA_CONTEXT_TYPE, GST_TYPE_CUDA_CONTEXT, cuda_ctx,
      "cuda-device-id", G_TYPE_UINT, device_id, NULL);
}

/**
 * gst_cuda_ensure_element_context:
 * @element: the #GstElement running the query
 * @device_id: preferred device-id used when a new context has to be created.
 *             Values less than or equal to zero select device 0.
 * @cuda_ctx: (inout): the resulting #GstCudaContext
 *
 * Makes sure @cuda_ctx holds a #GstCudaContext. If it is already set, nothing
 * is done. Otherwise the surrounding elements and the application are asked
 * through the #GstContext mechanism, and if that yields nothing a new
 * #GstCudaContext is created, set on @element and announced with a
 * have-context message.
 *
 * The whole sequence runs under a process-wide recursive lock.
 *
 * Returns: whether a #GstCudaContext exists in @cuda_ctx
 *
 * Since: 1.22
 */
gboolean
gst_cuda_ensure_element_context (GstElement * element, gint device_id,
    GstCudaContext ** cuda_ctx)
{
  static GRecMutex lock;
  static gsize init_lock_once = 0;
  gboolean ret = TRUE;

  g_return_val_if_fail (element != NULL, FALSE);
  g_return_val_if_fail (cuda_ctx != NULL, FALSE);

  _init_debug ();
  if (g_once_init_enter (&init_lock_once)) {
    g_rec_mutex_init (&lock);
    g_once_init_leave (&init_lock_once, 1);
  }

  g_rec_mutex_lock (&lock);

  /* Only query the pipeline when the caller has no context yet */
  if (*cuda_ctx == NULL)
    find_cuda_context (element, cuda_ctx);

  if (*cuda_ctx == NULL) {
    guint target_device_id = (device_id > 0) ? (guint) device_id : 0;

    /* No available CUDA context in pipeline, create new one here */
    *cuda_ctx = gst_cuda_context_new (target_device_id);

    if (*cuda_ctx != NULL) {
      GstContext *new_context;

      /* Propagate new CUDA context */
      new_context = gst_context_new (GST_CUDA_CONTEXT_TYPE, TRUE);
      context_set_cuda_context (new_context, *cuda_ctx);

      gst_element_set_context (element, new_context);

      GST_CAT_INFO_OBJECT (GST_CAT_CONTEXT, element,
          "posting have context (%p) message with CUDA context (%p)",
          new_context, *cuda_ctx);
      gst_element_post_message (GST_ELEMENT_CAST (element),
          gst_message_new_have_context (GST_OBJECT_CAST (element),
              new_context));
    } else {
      GST_CAT_ERROR_OBJECT (GST_CAT_CONTEXT, element,
          "Failed to create CUDA context with device-id %d", device_id);
      ret = FALSE;
    }
  }

  g_rec_mutex_unlock (&lock);

  return ret;
}

/**
 * gst_cuda_handle_set_context:
 * @element: a #GstElement
 * @context: a #GstContext
 * @device_id: required device-id of the context, or -1 to accept a context
 *             for any device
 * @cuda_ctx: (inout) (transfer full): location of a #GstCudaContext
 *
 * Helper function for implementing #GstElementClass.set_context() in
 * CUDA capable elements.
 *
 * If @context is a CUDA context and @cuda_ctx is still unset, the
 * #GstCudaContext stored in @context is placed in @cuda_ctx, provided its
 * device-id is acceptable. An already set @cuda_ctx is never replaced.
 *
 * Returns: whether the @cuda_ctx could be set successfully
 *
 * Since: 1.22
 */
gboolean
gst_cuda_handle_set_context (GstElement * element,
    GstContext * context, gint device_id, GstCudaContext ** cuda_ctx)
{
  const gchar *context_type;
  const GstStructure *structure;
  GstCudaContext *candidate = NULL;
  guint candidate_device_id = 0;

  g_return_val_if_fail (element != NULL, FALSE);
  g_return_val_if_fail (cuda_ctx != NULL, FALSE);

  _init_debug ();

  if (!context)
    return FALSE;

  context_type = gst_context_get_context_type (context);
  if (g_strcmp0 (context_type, GST_CUDA_CONTEXT_TYPE) != 0)
    return FALSE;

  /* If we had context already, will not replace it */
  if (*cuda_ctx)
    return TRUE;

  structure = gst_context_get_structure (context);
  if (!gst_structure_get (structure, GST_CUDA_CONTEXT_TYPE,
          GST_TYPE_CUDA_CONTEXT, &candidate, NULL))
    return FALSE;

  g_object_get (candidate, "cuda-device-id", &candidate_device_id, NULL);

  if (device_id != -1 && candidate_device_id != device_id) {
    /* Context belongs to a different device, drop the reference we got */
    gst_object_unref (candidate);
    return FALSE;
  }

  GST_CAT_DEBUG_OBJECT (GST_CAT_CONTEXT, element, "Found CUDA context");
  *cuda_ctx = candidate;

  return TRUE;
}

/**
 * gst_cuda_handle_context_query:
 * @element: a #GstElement
 * @query: a #GstQuery of type %GST_QUERY_CONTEXT
 * @cuda_ctx: (transfer none) (nullable): a #GstCudaContext
 *
 * Answers @query with @cuda_ctx when the query asks for a CUDA context and
 * @cuda_ctx is not %NULL. A context already present in @query is copied and
 * extended rather than discarded.
 *
 * Returns: Whether the @query was successfully responded to from the passed
 *          @cuda_ctx.
 *
 * Since: 1.22
 */
gboolean
gst_cuda_handle_context_query (GstElement * element,
    GstQuery * query, GstCudaContext * cuda_ctx)
{
  const gchar *requested_type;
  GstContext *reply, *existing;

  g_return_val_if_fail (GST_IS_ELEMENT (element), FALSE);
  g_return_val_if_fail (GST_IS_QUERY (query), FALSE);
  g_return_val_if_fail (cuda_ctx == NULL
      || GST_IS_CUDA_CONTEXT (cuda_ctx), FALSE);

  _init_debug ();

  GST_CAT_LOG_OBJECT (GST_CAT_CONTEXT, element,
      "handle context query %" GST_PTR_FORMAT, query);
  gst_query_parse_context_type (query, &requested_type);

  if (cuda_ctx == NULL
      || g_strcmp0 (requested_type, GST_CUDA_CONTEXT_TYPE) != 0)
    return FALSE;

  gst_query_parse_context (query, &existing);
  reply = existing ? gst_context_copy (existing) :
      gst_context_new (GST_CUDA_CONTEXT_TYPE, TRUE);

  context_set_cuda_context (reply, cuda_ctx);
  gst_query_set_context (query, reply);
  gst_context_unref (reply);

  GST_CAT_DEBUG_OBJECT (GST_CAT_CONTEXT, element,
      "successfully set %" GST_PTR_FORMAT " on %" GST_PTR_FORMAT, cuda_ctx,
      query);

  return TRUE;
}

/**
 * gst_context_new_cuda_context:
 * @cuda_ctx: (transfer none): a #GstCudaContext
 *
 * Creates a persistent #GstContext of type %GST_CUDA_CONTEXT_TYPE that
 * carries @cuda_ctx and its device-id.
 *
 * Returns: (transfer full): a new #GstContext embedding the @cuda_ctx
 *
 * Since: 1.22
 */
GstContext *
gst_context_new_cuda_context (GstCudaContext * cuda_ctx)
{
  GstContext *wrapped;

  g_return_val_if_fail (GST_IS_CUDA_CONTEXT (cuda_ctx), NULL);

  _init_debug ();

  wrapped = gst_context_new (GST_CUDA_CONTEXT_TYPE, TRUE);
  context_set_cuda_context (wrapped, cuda_ctx);

  return wrapped;
}

/* Quark names, indexed by GstCudaQuarkId */
static const gchar *gst_cuda_quark_strings[] = {
  "GstCudaQuarkGraphicsResource"
};

/* Quarks resolved from gst_cuda_quark_strings by init_cuda_quark_once() */
static GQuark gst_cuda_quark_table[GST_CUDA_QUARK_MAX];

/* Fills gst_cuda_quark_table; only the first call does any work */
static void
init_cuda_quark_once (void)
{
  static gsize quark_once = 0;
  gint idx;

  if (!g_once_init_enter (&quark_once))
    return;

  for (idx = 0; idx < GST_CUDA_QUARK_MAX; idx++) {
    const gchar *name = gst_cuda_quark_strings[idx];

    gst_cuda_quark_table[idx] = g_quark_from_static_string (name);
  }

  g_once_init_leave (&quark_once, 1);
}

/**
 * gst_cuda_quark_from_id: (skip)
 * @id: a #GstCudaQuarkId
 *
 * Looks up the #GQuark registered for @id, initializing the quark table on
 * first use.
 *
 * Returns: the GQuark for given @id or 0 if @id is unknown value
 *
 * Since: 1.22
 */
GQuark
gst_cuda_quark_from_id (GstCudaQuarkId id)
{
  GQuark quark;

  g_return_val_if_fail (id < GST_CUDA_QUARK_MAX, 0);

  init_cuda_quark_once ();
  _init_debug ();

  quark = gst_cuda_quark_table[id];

  return quark;
}

/**
 * gst_cuda_graphics_resource_new: (skip)
 * @context: (transfer none): a #GstCudaContext
 * @graphics_context: (transfer none) (nullable): a graphics API specific context object
 * @type: a #GstCudaGraphicsResourceType of resource registration
 *
 * Create new #GstCudaGraphicsResource with given @context and @type.
 * The returned resource holds its own references to @context and, when
 * given, @graphics_context. It is not registered with CUDA yet; use one of
 * the register functions for that.
 *
 * Returns: a new #GstCudaGraphicsResource.
 * Free with gst_cuda_graphics_resource_free
 *
 * Since: 1.22
 */
GstCudaGraphicsResource *
gst_cuda_graphics_resource_new (GstCudaContext *
    context, GstObject * graphics_context, GstCudaGraphicsResourceType type)
{
  GstCudaGraphicsResource *resource;

  g_return_val_if_fail (GST_IS_CUDA_CONTEXT (context), NULL);

  _init_debug ();

  /* Zero-initialized: not registered, not mapped */
  resource = g_new0 (GstCudaGraphicsResource, 1);
  resource->cuda_context = gst_object_ref (context);
  if (graphics_context)
    resource->graphics_context = gst_object_ref (graphics_context);

  /* Remember the requested type instead of silently dropping the argument.
   * The register functions overwrite it with the actually registered type. */
  resource->type = type;

  return resource;
}

/**
 * gst_cuda_graphics_resource_register_gl_buffer: (skip)
 * @resource: a #GstCudaGraphicsResource
 * @buffer: a GL buffer object
 * @flags: a `CUgraphicsRegisterFlags`
 *
 * Register the @buffer for access by CUDA. @resource must not be registered
 * already.
 * Must be called from the gl context thread while the cuda context is
 * pushed on the current thread
 *
 * Returns: whether @buffer was registered or not
 *
 * Since: 1.22
 */
gboolean
gst_cuda_graphics_resource_register_gl_buffer (GstCudaGraphicsResource *
    resource, guint buffer, CUgraphicsRegisterFlags flags)
{
  CUresult status;

  g_return_val_if_fail (resource != NULL, FALSE);
  g_return_val_if_fail (resource->registered == FALSE, FALSE);

  _init_debug ();

  status = CuGraphicsGLRegisterBuffer (&resource->resource, buffer, flags);
  if (gst_cuda_result (status)) {
    /* Record what was registered so it can be unregistered properly later */
    resource->registered = TRUE;
    resource->type = GST_CUDA_GRAPHICS_RESOURCE_GL_BUFFER;
    resource->flags = flags;

    return TRUE;
  }

  return FALSE;
}

#ifdef G_OS_WIN32
/**
 * gst_cuda_graphics_resource_register_d3d11_resource: (skip)
 * @resource: a #GstCudaGraphicsResource
 * @d3d11_resource: a ID3D11Resource
 * @flags: a CUgraphicsRegisterFlags
 *
 * Register the @d3d11_resource for accessing by CUDA. @resource must not be
 * registered already.
 * Must be called with d3d11 device lock held while the cuda context is
 * pushed on the current thread
 *
 * Returns: whether @d3d11_resource was registered or not
 *
 * Since: 1.22
 */
gboolean
gst_cuda_graphics_resource_register_d3d11_resource (GstCudaGraphicsResource *
    resource, ID3D11Resource * d3d11_resource, CUgraphicsRegisterFlags flags)
{
  CUresult status;

  g_return_val_if_fail (resource != NULL, FALSE);
  g_return_val_if_fail (resource->registered == FALSE, FALSE);

  _init_debug ();

  status = CuGraphicsD3D11RegisterResource (&resource->resource,
      d3d11_resource, flags);
  if (gst_cuda_result (status)) {
    /* Record what was registered so it can be unregistered properly later */
    resource->registered = TRUE;
    resource->type = GST_CUDA_GRAPHICS_RESOURCE_D3D11_RESOURCE;
    resource->flags = flags;

    return TRUE;
  }

  return FALSE;
}
#endif

/**
 * gst_cuda_graphics_resource_unregister: (skip)
 * @resource: a #GstCudaGraphicsResource
 *
 * Unregister previously registered resource. Does nothing when @resource
 * is not registered.
 * For GL resource, this method must be called from gl context thread.
 * Also, current cuda context should be pushed on the current thread
 * before calling this method.
 *
 * Since: 1.22
 */
void
gst_cuda_graphics_resource_unregister (GstCudaGraphicsResource * resource)
{
  g_return_if_fail (resource != NULL);

  _init_debug ();

  if (resource->registered) {
    /* A failure is only reported through gst_cuda_result(); the handle is
     * dropped either way */
    gst_cuda_result (CuGraphicsUnregisterResource (resource->resource));
    resource->resource = NULL;
    resource->registered = FALSE;
  }
}

/**
 * gst_cuda_graphics_resource_map: (skip)
 * @resource: a #GstCudaGraphicsResource
 * @stream: a CUstream
 * @flags: a CUgraphicsMapResourceFlags
 *
 * Map previously registered resource with map flags. The map flags are
 * applied first, then the resource is mapped on @stream.
 *
 * Returns: (nullable): the `CUgraphicsResource` if successful or %NULL when failed
 *
 * Since: 1.22
 */
CUgraphicsResource
gst_cuda_graphics_resource_map (GstCudaGraphicsResource * resource,
    CUstream stream, CUgraphicsMapResourceFlags flags)
{
  g_return_val_if_fail (resource != NULL, NULL);
  g_return_val_if_fail (resource->registered != FALSE, NULL);

  _init_debug ();

  if (!gst_cuda_result (CuGraphicsResourceSetMapFlags (resource->resource,
              flags)))
    return NULL;

  if (!gst_cuda_result (CuGraphicsMapResources (1, &resource->resource,
              stream)))
    return NULL;

  resource->mapped = TRUE;

  return resource->resource;
}

/**
 * gst_cuda_graphics_resource_unmap: (skip)
 * @resource: a #GstCudaGraphicsResource
 * @stream: a `CUstream`
 *
 * Unmap previously mapped resource. Does nothing when @resource is not
 * currently mapped.
 *
 * Since: 1.22
 */
void
gst_cuda_graphics_resource_unmap (GstCudaGraphicsResource * resource,
    CUstream stream)
{
  g_return_if_fail (resource != NULL);
  g_return_if_fail (resource->registered != FALSE);

  _init_debug ();

  if (resource->mapped) {
    gst_cuda_result (CuGraphicsUnmapResources (1, &resource->resource,
            stream));
    resource->mapped = FALSE;
  }
}

#ifdef HAVE_NVCODEC_GST_GL
/* GL thread callback: unregisters @resource with its CUDA context pushed.
 * If the context cannot be pushed the resource stays registered. */
static void
unregister_resource_from_gl_thread (GstGLContext * gl_context,
    GstCudaGraphicsResource * resource)
{
  GstCudaContext *ctx = resource->cuda_context;

  if (gst_cuda_context_push (ctx)) {
    gst_cuda_graphics_resource_unregister (resource);

    if (!gst_cuda_context_pop (NULL))
      GST_WARNING_OBJECT (ctx, "failed to pop CUDA context");
  } else {
    GST_WARNING_OBJECT (ctx, "failed to push CUDA context");
  }
}
#endif

#ifdef GST_CUDA_HAS_D3D
/* Unregisters a D3D11 backed @resource with its CUDA context pushed and the
 * D3D11 device lock held. If the context cannot be pushed the resource stays
 * registered. */
static void
unregister_d3d11_resource (GstCudaGraphicsResource * resource)
{
  GstD3D11Device *d3d11_device = GST_D3D11_DEVICE (resource->graphics_context);
  GstCudaContext *ctx = resource->cuda_context;

  if (gst_cuda_context_push (ctx)) {
    gst_d3d11_device_lock (d3d11_device);
    gst_cuda_graphics_resource_unregister (resource);
    gst_d3d11_device_unlock (d3d11_device);

    if (!gst_cuda_context_pop (NULL))
      GST_WARNING_OBJECT (ctx, "failed to pop CUDA context");
  } else {
    GST_WARNING_OBJECT (ctx, "failed to push CUDA context");
  }
}
#endif

/**
 * gst_cuda_graphics_resource_free: (skip)
 * @resource: a #GstCudaGraphicsResource
 *
 * Free @resource. A still registered resource is unregistered first, using
 * the path matching its type; afterwards the references held on the CUDA
 * context and the optional graphics context are dropped.
 *
 * Since: 1.22
 */
void
gst_cuda_graphics_resource_free (GstCudaGraphicsResource * resource)
{
  g_return_if_fail (resource != NULL);

  if (resource->registered) {
    /* The branches below form one if/else-if/else chain whose members are
     * compiled in depending on the available graphics APIs */
#ifdef HAVE_NVCODEC_GST_GL
    if (resource->type == GST_CUDA_GRAPHICS_RESOURCE_GL_BUFFER) {
      /* GL resources are unregistered via the GL context thread.
       * NOTE(review): resource is freed right after this call, so this
       * relies on gst_gl_context_thread_add() having run the callback before
       * it returns - confirm against the GstGLContext documentation */
      gst_gl_context_thread_add ((GstGLContext *) resource->graphics_context,
          (GstGLContextThreadFunc) unregister_resource_from_gl_thread,
          resource);
    } else
#endif
#ifdef GST_CUDA_HAS_D3D
    if (resource->type == GST_CUDA_GRAPHICS_RESOURCE_D3D11_RESOURCE) {
      unregister_d3d11_resource (resource);
    } else
#endif
    {
      /* FIXME: currently only opengl & d3d11 */
      g_assert_not_reached ();
    }
  }

  gst_object_unref (resource->cuda_context);
  if (resource->graphics_context)
    gst_object_unref (resource->graphics_context);
  g_free (resource);
}

/* Returns a static, human readable name for @type. Unknown values trip an
 * assertion and, when assertions are compiled out, yield "UNKNOWN". */
const gchar *
gst_cuda_buffer_copy_type_to_string (GstCudaBufferCopyType type)
{
  const gchar *name = "UNKNOWN";

  switch (type) {
    case GST_CUDA_BUFFER_COPY_SYSTEM:
      name = "SYSTEM";
      break;
    case GST_CUDA_BUFFER_COPY_CUDA:
      name = "CUDA";
      break;
    case GST_CUDA_BUFFER_COPY_GL:
      name = "GL";
      break;
    case GST_CUDA_BUFFER_COPY_D3D11:
      name = "D3D11";
      break;
    case GST_CUDA_BUFFER_COPY_NVMM:
      name = "NVMM";
      break;
    default:
      g_assert_not_reached ();
      break;
  }

  return name;
}

/* Copies the video data of @src into @dst by mapping both buffers as
 * video frames and copying every plane row by row with memcpy().
 *
 * The number of planes is taken from the destination frame. Source and
 * destination may differ in resolution; for each plane only the smaller
 * row width (in bytes) and the smaller row count are copied.
 *
 * Returns FALSE if either buffer could not be mapped. */
static gboolean
gst_cuda_buffer_fallback_copy (GstBuffer * dst, const GstVideoInfo * dst_info,
    GstBuffer * src, const GstVideoInfo * src_info)
{
  GstVideoFrame out_frame, in_frame;
  guint plane;

  if (!gst_video_frame_map (&out_frame, dst_info, dst, GST_MAP_WRITE)) {
    GST_ERROR ("Failed to map dst buffer");
    return FALSE;
  }

  if (!gst_video_frame_map (&in_frame, src_info, src, GST_MAP_READ)) {
    gst_video_frame_unmap (&out_frame);
    GST_ERROR ("Failed to map src buffer");
    return FALSE;
  }

  for (plane = 0; plane < GST_VIDEO_FRAME_N_PLANES (&out_frame); plane++) {
    guint8 *out_row = GST_VIDEO_FRAME_PLANE_DATA (&out_frame, plane);
    guint8 *in_row = GST_VIDEO_FRAME_PLANE_DATA (&in_frame, plane);
    guint out_stride = GST_VIDEO_FRAME_PLANE_STRIDE (&out_frame, plane);
    guint in_stride = GST_VIDEO_FRAME_PLANE_STRIDE (&in_frame, plane);
    guint out_row_bytes = GST_VIDEO_FRAME_COMP_WIDTH (&out_frame, plane) *
        GST_VIDEO_FRAME_COMP_PSTRIDE (&out_frame, plane);
    guint in_row_bytes = GST_VIDEO_FRAME_COMP_WIDTH (&in_frame, plane) *
        GST_VIDEO_FRAME_COMP_PSTRIDE (&in_frame, plane);
    guint out_rows = GST_VIDEO_FRAME_COMP_HEIGHT (&out_frame, plane);
    guint in_rows = GST_VIDEO_FRAME_COMP_HEIGHT (&in_frame, plane);
    /* Clamp to the area both frames have in common */
    guint row_bytes = MIN (out_row_bytes, in_row_bytes);
    guint rows = MIN (out_rows, in_rows);
    guint row;

    for (row = 0; row < rows; row++) {
      memcpy (out_row, in_row, row_bytes);
      out_row += out_stride;
      in_row += in_stride;
    }
  }

  gst_video_frame_unmap (&in_frame);
  gst_video_frame_unmap (&out_frame);

  return TRUE;
}

/* Maps @buf and fills in one side (source when @is_src is TRUE, destination
 * otherwise) of the per-plane CUDA_MEMCPY2D descriptors in @copy_params.
 *
 * - For GST_CUDA_BUFFER_COPY_NVMM (only when built with HAVE_NVCODEC_NVMM)
 *   the buffer is mapped with gst_buffer_map() into @map_info and its data
 *   is interpreted as an NvBufSurface; @frame is left untouched.
 * - For every other copy type the buffer is mapped as a video frame into
 *   @frame (with GST_MAP_CUDA added for GST_CUDA_BUFFER_COPY_CUDA);
 *   @map_info is left untouched.
 *
 * WidthInBytes and Height of each descriptor are only lowered (or set when
 * still zero), so after calling this once for the destination and once for
 * the source on a zero-initialized @copy_params they hold the smaller of
 * both sizes.
 *
 * On success the mapping stays alive and must be released by the caller,
 * see unmap_buffer_or_frame(). On failure FALSE is returned and anything
 * mapped here has been unmapped again. */
static gboolean
map_buffer_and_fill_copy2d (GstBuffer * buf, const GstVideoInfo * info,
    GstCudaBufferCopyType copy_type, GstVideoFrame * frame,
    GstMapInfo * map_info, gboolean is_src,
    CUDA_MEMCPY2D copy_params[GST_VIDEO_MAX_PLANES])
{
  /* Tracks whether the gst_buffer_map() of the NVMM path needs undoing */
  gboolean buffer_mapped = FALSE;
  guint i;

#ifdef HAVE_NVCODEC_NVMM
  if (copy_type == GST_CUDA_BUFFER_COPY_NVMM) {
    NvBufSurface *surface;
    NvBufSurfaceParams *surface_params;
    NvBufSurfacePlaneParams *plane_params;

    /* NOTE(review): GST_MAP_READ is used here regardless of is_src */
    if (!gst_buffer_map (buf, map_info, GST_MAP_READ)) {
      GST_ERROR ("Failed to map input NVMM buffer");
      memset (map_info, 0, sizeof (GstMapInfo));
      return FALSE;
    }

    surface = (NvBufSurface *) map_info->data;

    GST_TRACE ("batch-size %d, num-filled %d, memType %d",
        surface->batchSize, surface->numFilled, surface->memType);

    /* Only the first entry of the surface list is consulted */
    surface_params = surface->surfaceList;
    buffer_mapped = TRUE;
    if (!surface_params) {
      GST_ERROR ("NVMM memory doesn't hold buffer");
      goto error;
    }

    plane_params = &surface_params->planeParams;
    if (plane_params->num_planes != GST_VIDEO_INFO_N_PLANES (info)) {
      GST_ERROR ("num_planes mismatch, %d / %d",
          plane_params->num_planes, GST_VIDEO_INFO_N_PLANES (info));
      goto error;
    }

    /* Translate the NVMM memory type into the matching CUDA memory type and
     * pointer field; the plane address is dataPtr plus the plane offset */
    switch (surface->memType) {
        /* TODO: NVBUF_MEM_DEFAULT on jetson is SURFACE_ARRAY */
      case NVBUF_MEM_DEFAULT:
      case NVBUF_MEM_CUDA_DEVICE:
      {
        for (i = 0; i < plane_params->num_planes; i++) {
          if (is_src) {
            copy_params[i].srcMemoryType = CU_MEMORYTYPE_DEVICE;
            copy_params[i].srcDevice = (CUdeviceptr)
                ((guint8 *) surface_params->dataPtr + plane_params->offset[i]);
            copy_params[i].srcPitch = plane_params->pitch[i];
          } else {
            copy_params[i].dstMemoryType = CU_MEMORYTYPE_DEVICE;
            copy_params[i].dstDevice = (CUdeviceptr)
                ((guint8 *) surface_params->dataPtr + plane_params->offset[i]);
            copy_params[i].dstPitch = plane_params->pitch[i];
          }
        }
        break;
      }
      case NVBUF_MEM_CUDA_PINNED:
      {
        for (i = 0; i < plane_params->num_planes; i++) {
          if (is_src) {
            copy_params[i].srcMemoryType = CU_MEMORYTYPE_HOST;
            copy_params[i].srcHost =
                ((guint8 *) surface_params->dataPtr + plane_params->offset[i]);
            copy_params[i].srcPitch = plane_params->pitch[i];
          } else {
            copy_params[i].dstMemoryType = CU_MEMORYTYPE_HOST;
            copy_params[i].dstHost =
                ((guint8 *) surface_params->dataPtr + plane_params->offset[i]);
            copy_params[i].dstPitch = plane_params->pitch[i];
          }
        }
        break;
      }
      case NVBUF_MEM_CUDA_UNIFIED:
      {
        for (i = 0; i < plane_params->num_planes; i++) {
          if (is_src) {
            copy_params[i].srcMemoryType = CU_MEMORYTYPE_UNIFIED;
            copy_params[i].srcDevice = (CUdeviceptr)
                ((guint8 *) surface_params->dataPtr + plane_params->offset[i]);
            copy_params[i].srcPitch = plane_params->pitch[i];
          } else {
            copy_params[i].dstMemoryType = CU_MEMORYTYPE_UNIFIED;
            copy_params[i].dstDevice = (CUdeviceptr)
                ((guint8 *) surface_params->dataPtr + plane_params->offset[i]);
            copy_params[i].dstPitch = plane_params->pitch[i];
          }
        }
        break;
      }
      default:
        GST_ERROR ("Unexpected NVMM memory type %d", surface->memType);
        goto error;
    }

    /* Shrink the copy area to this surface's plane size if it is smaller
     * than what is already recorded (zero means "not set yet") */
    for (i = 0; i < plane_params->num_planes; i++) {
      gsize width_in_bytes, height;

      width_in_bytes = plane_params->width[i] * plane_params->bytesPerPix[i];
      height = plane_params->height[i];

      if (copy_params[i].WidthInBytes == 0 ||
          width_in_bytes < copy_params[i].WidthInBytes) {
        copy_params[i].WidthInBytes = width_in_bytes;
      }

      if (copy_params[i].Height == 0 || height < copy_params[i].Height) {
        copy_params[i].Height = height;
      }
    }
  } else
#endif
  {
    GstMapFlags map_flags;

    if (is_src)
      map_flags = GST_MAP_READ;
    else
      map_flags = GST_MAP_WRITE;

    /* Ask for CUDA memory access instead of a system memory mapping */
    if (copy_type == GST_CUDA_BUFFER_COPY_CUDA)
      map_flags |= GST_MAP_CUDA;

    if (!gst_video_frame_map (frame, info, buf, map_flags)) {
      GST_ERROR ("Failed to map buffer");
      goto error;
    }

    for (i = 0; i < GST_VIDEO_FRAME_N_PLANES (frame); i++) {
      gsize width_in_bytes, height;

      /* With a CUDA mapping the plane pointer is used as a device pointer,
       * otherwise as a host pointer */
      if (is_src) {
        if (copy_type == GST_CUDA_BUFFER_COPY_CUDA) {
          copy_params[i].srcMemoryType = CU_MEMORYTYPE_DEVICE;
          copy_params[i].srcDevice =
              (CUdeviceptr) GST_VIDEO_FRAME_PLANE_DATA (frame, i);
        } else {
          copy_params[i].srcMemoryType = CU_MEMORYTYPE_HOST;
          copy_params[i].srcHost = GST_VIDEO_FRAME_PLANE_DATA (frame, i);
        }
        copy_params[i].srcPitch = GST_VIDEO_FRAME_PLANE_STRIDE (frame, i);
      } else {
        if (copy_type == GST_CUDA_BUFFER_COPY_CUDA) {
          copy_params[i].dstMemoryType = CU_MEMORYTYPE_DEVICE;
          copy_params[i].dstDevice =
              (CUdeviceptr) GST_VIDEO_FRAME_PLANE_DATA (frame, i);
        } else {
          copy_params[i].dstMemoryType = CU_MEMORYTYPE_HOST;
          copy_params[i].dstHost = GST_VIDEO_FRAME_PLANE_DATA (frame, i);
        }
        copy_params[i].dstPitch = GST_VIDEO_FRAME_PLANE_STRIDE (frame, i);
      }

      width_in_bytes = GST_VIDEO_FRAME_COMP_WIDTH (frame, i) *
          GST_VIDEO_FRAME_COMP_PSTRIDE (frame, i);
      height = GST_VIDEO_FRAME_COMP_HEIGHT (frame, i);

      /* Same clamping as in the NVMM path above */
      if (copy_params[i].WidthInBytes == 0 ||
          width_in_bytes < copy_params[i].WidthInBytes) {
        copy_params[i].WidthInBytes = width_in_bytes;
      }

      if (copy_params[i].Height == 0 || height < copy_params[i].Height) {
        copy_params[i].Height = height;
      }
    }
  }

  return TRUE;

error:
  /* Only the NVMM path sets buffer_mapped; clearing map_info afterwards
   * keeps unmap_buffer_or_frame() from unmapping it a second time */
  if (buffer_mapped) {
    gst_buffer_unmap (buf, map_info);
    memset (map_info, 0, sizeof (GstMapInfo));
  }

  return FALSE;
}

/* Releases whatever map_buffer_and_fill_copy2d() mapped for @buf: the video
 * frame if @frame holds a buffer, and the plain buffer mapping if @map_info
 * holds data. Zero-initialized @frame / @map_info are skipped. */
static void
unmap_buffer_or_frame (GstBuffer * buf, GstVideoFrame * frame,
    GstMapInfo * map_info)
{
  if (frame->buffer != NULL) {
    gst_video_frame_unmap (frame);
  }

  if (map_info->data != NULL) {
    gst_buffer_unmap (buf, map_info);
  }
}

/* Copies the video data of @src_buf into @dst_buf plane by plane with
 * CuMemcpy2DAsync() on @stream, then waits for @stream to finish.
 *
 * Both buffers are mapped through map_buffer_and_fill_copy2d() according to
 * their copy type (@dst_type / @src_type); the copied area of each plane is
 * the smaller of the source and destination plane sizes. The number of
 * planes copied is taken from @dst_info.
 *
 * @context is pushed for the duration of the CUDA calls.
 *
 * Returns TRUE if every plane copy was issued successfully, FALSE if mapping,
 * pushing the context or any plane copy failed. */
static gboolean
gst_cuda_buffer_copy_internal (GstBuffer * dst_buf,
    GstCudaBufferCopyType dst_type, const GstVideoInfo * dst_info,
    GstBuffer * src_buf, GstCudaBufferCopyType src_type,
    const GstVideoInfo * src_info, GstCudaContext * context, CUstream stream)
{
  GstVideoFrame dst_frame, src_frame;
  gboolean ret = FALSE;
  GstMapInfo dst_map, src_map;
  guint i;
  CUDA_MEMCPY2D copy_params[GST_VIDEO_MAX_PLANES];

  /* Everything starts zeroed: map_buffer_and_fill_copy2d() treats zero
   * sizes as "unset" and unmap_buffer_or_frame() skips zeroed frames/maps */
  memset (copy_params, 0, sizeof (copy_params));
  memset (&dst_frame, 0, sizeof (GstVideoFrame));
  memset (&src_frame, 0, sizeof (GstVideoFrame));
  memset (&dst_map, 0, sizeof (GstMapInfo));
  memset (&src_map, 0, sizeof (GstMapInfo));

  if (!map_buffer_and_fill_copy2d (dst_buf, dst_info,
          dst_type, &dst_frame, &dst_map, FALSE, copy_params)) {
    GST_ERROR_OBJECT (context, "Failed to map output buffer");
    return FALSE;
  }

  if (!map_buffer_and_fill_copy2d (src_buf, src_info,
          src_type, &src_frame, &src_map, TRUE, copy_params)) {
    GST_ERROR_OBJECT (context, "Failed to map input buffer");
    unmap_buffer_or_frame (dst_buf, &dst_frame, &dst_map);
    return FALSE;
  }

  if (!gst_cuda_context_push (context)) {
    GST_ERROR_OBJECT (context, "Failed to push our context");
    goto unmap_and_out;
  }

  for (i = 0; i < GST_VIDEO_INFO_N_PLANES (dst_info); i++) {
    ret = gst_cuda_result (CuMemcpy2DAsync (&copy_params[i], stream));
    if (!ret) {
      GST_ERROR_OBJECT (context, "Failed to copy plane %d", i);
      break;
    }
  }

  /* Wait even after a failed plane so already queued copies are finished
   * before the buffers are unmapped */
  gst_cuda_result (CuStreamSynchronize (stream));
  gst_cuda_context_pop (NULL);

unmap_and_out:
  /* Each buffer must be released together with the frame/map it was mapped
   * into */
  unmap_buffer_or_frame (src_buf, &src_frame, &src_map);
  unmap_buffer_or_frame (dst_buf, &dst_frame, &dst_map);

  return ret;
}

#ifdef HAVE_NVCODEC_GST_GL
/* Asks CUDA for devices associated with the GL context that is current on
 * the calling thread. Returns TRUE if the call succeeds and reports at least
 * one device. */
static gboolean
ensure_gl_interop (void)
{
  CUdevice devices[1] = { 0, };
  guint n_devices = 0;

  if (CuGLGetDevices (&n_devices, devices, 1,
          CU_GL_DEVICE_LIST_ALL) != CUDA_SUCCESS)
    return FALSE;

  return n_devices != 0;
}

/* Arguments and result of a GL <-> CUDA copy that is executed by
 * gl_copy_thread_func() */
typedef struct _GLCopyData
{
  /* Source buffer and its video info */
  GstBuffer *src_buf;
  const GstVideoInfo *src_info;
  /* Destination buffer and its video info */
  GstBuffer *dst_buf;
  const GstVideoInfo *dst_info;

  /* TRUE: src_buf is the GL buffer and dst_buf the CUDA side buffer;
   * FALSE: the other way around */
  gboolean pbo_to_cuda;
  /* Copy type used to map the non-GL buffer */
  GstCudaBufferCopyType copy_type;
  /* CUDA context pushed during the copy */
  GstCudaContext *context;
  CUstream stream;
  /* Outcome of the copy; reset to FALSE when the copy function starts */
  gboolean ret;
} GLCopyData;

/* Returns the CUDA graphics resource attached to the GL PBO memory @mem,
 * creating and registering it on first use.
 *
 * The resource is stored as qdata on @mem, with
 * gst_cuda_graphics_resource_free() as its destroy notify, so the caller
 * does not own the returned pointer.
 *
 * Returns NULL if @mem is not GL PBO memory, cannot be mapped, or its PBO
 * cannot be registered with CUDA. */
static GstCudaGraphicsResource *
ensure_cuda_gl_graphics_resource (GstCudaContext * context, GstMemory * mem)
{
  GstCudaGraphicsResource *resource;
  GstMapInfo map;
  GQuark quark;
  gboolean registered;

  if (!gst_is_gl_memory_pbo (mem)) {
    GST_WARNING_OBJECT (context, "memory is not GL PBO memory, %s",
        mem->allocator->mem_type);
    return NULL;
  }

  quark = gst_cuda_quark_from_id (GST_CUDA_QUARK_GRAPHICS_RESOURCE);
  resource = (GstCudaGraphicsResource *)
      gst_mini_object_get_qdata (GST_MINI_OBJECT (mem), quark);

  /* Already registered on an earlier call */
  if (resource != NULL)
    return resource;

  resource = gst_cuda_graphics_resource_new (context,
      GST_OBJECT (GST_GL_BASE_MEMORY_CAST (mem)->context),
      GST_CUDA_GRAPHICS_RESOURCE_GL_BUFFER);

  if (!gst_memory_map (mem, &map, (GstMapFlags) (GST_MAP_READ | GST_MAP_GL))) {
    GST_ERROR_OBJECT (context, "Failed to map gl memory");
    gst_cuda_graphics_resource_free (resource);
    return NULL;
  }

  /* Register the PBO's GL buffer id while the memory is mapped */
  registered = gst_cuda_graphics_resource_register_gl_buffer (resource,
      ((GstGLMemoryPBO *) mem)->pbo->id, CU_GRAPHICS_REGISTER_FLAGS_NONE);

  if (!registered) {
    GST_ERROR_OBJECT (context, "Failed to register gl buffer");
    gst_memory_unmap (mem, &map);
    gst_cuda_graphics_resource_free (resource);

    return NULL;
  }

  gst_memory_unmap (mem, &map);

  gst_mini_object_set_qdata (GST_MINI_OBJECT (mem), quark, resource,
      (GDestroyNotify) gst_cuda_graphics_resource_free);

  return resource;
}

/* Worker scheduled by cuda_copy_gl_interop() through
 * gst_gl_context_thread_add(): copies every plane between the PBOs of a GL
 * buffer and a CUDA (or NVMM) buffer.
 *
 * @gl_context: not used by this function
 * @data: copy arguments. data->pbo_to_cuda selects the direction: when TRUE
 *   data->src_buf is the GL buffer, otherwise data->dst_buf is.
 *
 * The outcome is reported through data->ret, which is TRUE only when all
 * planes were copied and FALSE on every failure path.
 */
static void
gl_copy_thread_func (GstGLContext * gl_context, GLCopyData * data)
{
  GstCudaGraphicsResource *resources[GST_VIDEO_MAX_PLANES];
  guint num_resources;
  guint num_planes;
  GstBuffer *gl_buf, *cuda_buf;
  GstVideoFrame cuda_frame;
  GstMapInfo cuda_map_info;
  CUDA_MEMCPY2D copy_params[GST_VIDEO_MAX_PLANES];
  guint i;
  GstCudaContext *context = data->context;
  CUstream stream = data->stream;

  memset (copy_params, 0, sizeof (copy_params));
  memset (&cuda_frame, 0, sizeof (GstVideoFrame));
  memset (&cuda_map_info, 0, sizeof (GstMapInfo));

  data->ret = FALSE;

  /* Incompatible gl context */
  if (!ensure_gl_interop ())
    return;

  /* Map the CUDA-side buffer; this also fills the CUDA half of copy_params */
  if (data->pbo_to_cuda) {
    gl_buf = data->src_buf;
    cuda_buf = data->dst_buf;

    if (!map_buffer_and_fill_copy2d (cuda_buf,
            data->dst_info, data->copy_type, &cuda_frame, &cuda_map_info,
            FALSE, copy_params)) {
      GST_ERROR_OBJECT (context, "Failed to map output CUDA buffer");
      return;
    }
  } else {
    gl_buf = data->dst_buf;
    cuda_buf = data->src_buf;

    if (!map_buffer_and_fill_copy2d (cuda_buf,
            data->src_info, data->copy_type, &cuda_frame, &cuda_map_info,
            TRUE, copy_params)) {
      GST_ERROR_OBJECT (context, "Failed to map input CUDA buffer");
      return;
    }
  }

  /* One GL memory per plane is required. This is checked at runtime instead
   * of with g_assert() so that the guard is still in place when assertions
   * are compiled out; otherwise the per-plane loop below would ask the buffer
   * for a memory index it does not have. */
  num_planes = GST_VIDEO_INFO_N_PLANES (data->src_info);
  num_resources = gst_buffer_n_memory (gl_buf);
  if (num_resources < num_planes) {
    GST_ERROR_OBJECT (context,
        "GL buffer has %u memory objects, but %u planes are required",
        num_resources, num_planes);
    unmap_buffer_or_frame (cuda_buf, &cuda_frame, &cuda_map_info);
    return;
  }

  if (!gst_cuda_context_push (context)) {
    GST_ERROR_OBJECT (context, "Failed to push context");
    unmap_buffer_or_frame (cuda_buf, &cuda_frame, &cuda_map_info);
    return;
  }

  /* First pass: make sure every plane has a registered graphics resource and
   * prepare the PBO transfer state for the chosen direction */
  for (i = 0; i < num_planes; i++) {
    GstMemory *mem = gst_buffer_peek_memory (gl_buf, i);
    GstGLMemoryPBO *pbo;

    resources[i] = ensure_cuda_gl_graphics_resource (context, mem);
    if (!resources[i])
      goto out;

    pbo = (GstGLMemoryPBO *) mem;
    if (!data->pbo_to_cuda) {
      /* Need PBO -> texture */
      GST_MINI_OBJECT_FLAG_SET (mem, GST_GL_BASE_MEMORY_TRANSFER_NEED_UPLOAD);

      /* PBO -> sysmem */
      GST_MINI_OBJECT_FLAG_SET (pbo->pbo,
          GST_GL_BASE_MEMORY_TRANSFER_NEED_DOWNLOAD);
    } else {
      /* get the texture into the PBO */
      gst_gl_memory_pbo_upload_transfer (pbo);
      gst_gl_memory_pbo_download_transfer (pbo);
    }
  }

  /* Second pass: map each resource, complete the GL half of copy_params and
   * issue the asynchronous 2D copy */
  for (i = 0; i < num_planes; i++) {
    CUgraphicsResource cuda_resource;
    CUdeviceptr dev_ptr;
    size_t size;
    gboolean copy_ret;
    gsize width_in_bytes, height;

    if (data->pbo_to_cuda) {
      cuda_resource =
          gst_cuda_graphics_resource_map (resources[i], stream,
          CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY);
    } else {
      cuda_resource =
          gst_cuda_graphics_resource_map (resources[i], stream,
          CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD);
    }

    if (!cuda_resource) {
      GST_ERROR_OBJECT (context, "Failed to map graphics resource %u", i);
      goto out;
    }

    if (!gst_cuda_result (CuGraphicsResourceGetMappedPointer (&dev_ptr, &size,
                cuda_resource))) {
      gst_cuda_graphics_resource_unmap (resources[i], stream);
      GST_ERROR_OBJECT (context, "Failed to get mapped pointer");
      goto out;
    }

    if (data->pbo_to_cuda) {
      copy_params[i].srcMemoryType = CU_MEMORYTYPE_DEVICE;
      copy_params[i].srcDevice = dev_ptr;
      copy_params[i].srcPitch = GST_VIDEO_INFO_PLANE_STRIDE (data->src_info, i);

      width_in_bytes = GST_VIDEO_INFO_COMP_WIDTH (data->src_info, i) *
          GST_VIDEO_INFO_COMP_PSTRIDE (data->src_info, i);
      height = GST_VIDEO_INFO_COMP_HEIGHT (data->src_info, i);
    } else {
      copy_params[i].dstMemoryType = CU_MEMORYTYPE_DEVICE;
      copy_params[i].dstDevice = dev_ptr;
      copy_params[i].dstPitch = GST_VIDEO_INFO_PLANE_STRIDE (data->dst_info, i);

      width_in_bytes = GST_VIDEO_INFO_COMP_WIDTH (data->dst_info, i) *
          GST_VIDEO_INFO_COMP_PSTRIDE (data->dst_info, i);
      height = GST_VIDEO_INFO_COMP_HEIGHT (data->dst_info, i);
    }

    /* Never copy more than the GL plane holds: shrink the extent that was
     * set up for the CUDA side if the GL side is smaller */
    if (width_in_bytes < copy_params[i].WidthInBytes)
      copy_params[i].WidthInBytes = width_in_bytes;

    if (height < copy_params[i].Height)
      copy_params[i].Height = height;

    copy_ret = gst_cuda_result (CuMemcpy2DAsync (&copy_params[i], stream));
    gst_cuda_graphics_resource_unmap (resources[i], stream);

    if (!copy_ret) {
      GST_ERROR_OBJECT (context, "Failed to copy plane %u", i);
      goto out;
    }
  }

  data->ret = TRUE;

out:
  /* Wait for the queued copies before the CUDA buffer gets unmapped */
  gst_cuda_result (CuStreamSynchronize (stream));
  gst_cuda_context_pop (NULL);
  unmap_buffer_or_frame (cuda_buf, &cuda_frame, &cuda_map_info);
}

/* Copies between a GL PBO buffer and a CUDA/NVMM buffer by running
 * gl_copy_thread_func() via gst_gl_context_thread_add() on @gl_context.
 *
 * @pbo_to_cuda: TRUE if @src_buf is the GL buffer, FALSE if @dst_buf is
 * @copy_type: copy type of the non-GL buffer; must be
 *   GST_CUDA_BUFFER_COPY_CUDA or GST_CUDA_BUFFER_COPY_NVMM
 *
 * The arguments live on this stack frame, so this relies on
 * gst_gl_context_thread_add() having run the callback by the time it returns.
 *
 * Returns: the result stored by gl_copy_thread_func(), FALSE if it never
 * stored one
 */
static gboolean
cuda_copy_gl_interop (GstBuffer * dst_buf, const GstVideoInfo * dst_info,
    GstBuffer * src_buf, const GstVideoInfo * src_info,
    GstGLContext * gl_context, GstCudaContext * context, CUstream stream,
    gboolean pbo_to_cuda, GstCudaBufferCopyType copy_type)
{
  GLCopyData copy_data;

  g_assert (copy_type == GST_CUDA_BUFFER_COPY_CUDA ||
      copy_type == GST_CUDA_BUFFER_COPY_NVMM);

  /* Start out as failed, only the worker may flip this */
  copy_data.ret = FALSE;

  copy_data.context = context;
  copy_data.stream = stream;
  copy_data.copy_type = copy_type;
  copy_data.pbo_to_cuda = pbo_to_cuda;

  copy_data.dst_buf = dst_buf;
  copy_data.dst_info = dst_info;
  copy_data.src_buf = src_buf;
  copy_data.src_info = src_info;

  gst_gl_context_thread_add (gl_context,
      (GstGLContextThreadFunc) gl_copy_thread_func, &copy_data);

  return copy_data.ret;
}
#endif

#ifdef GST_CUDA_HAS_D3D
/* Checks whether @device can interoperate with @context: queries one CUDA
 * device for the D3D11 device handle and compares it against the
 * "cuda-device-id" property of @context.
 *
 * Returns: TRUE if the query succeeded, reported at least one device and that
 * device equals the context's device id, FALSE otherwise
 */
static gboolean
ensure_d3d11_interop (GstCudaContext * context, GstD3D11Device * device)
{
  CUdevice devices[1] = { 0, };
  guint num_devices = 0;
  guint context_device_id;
  CUresult result;

  g_object_get (context, "cuda-device-id", &context_device_id, NULL);

  result = CuD3D11GetDevices (&num_devices, devices, 1,
      gst_d3d11_device_get_device_handle (device), CU_D3D11_DEVICE_LIST_ALL);

  return result == CUDA_SUCCESS && num_devices != 0 &&
      devices[0] == (CUdevice) context_device_id;
}

/* Looks up the CUDA graphics resource stored as qdata on @mem and creates one
 * on first use by registering the D3D11 resource of the memory.
 *
 * A newly created resource is attached to @mem as qdata with
 * gst_cuda_graphics_resource_free() as destroy notify, so the caller must not
 * free the returned pointer.
 *
 * Returns: the graphics resource, or NULL if @mem is not D3D11 memory or the
 * registration failed
 */
static GstCudaGraphicsResource *
ensure_cuda_d3d11_graphics_resource (GstCudaContext * context, GstMemory * mem)
{
  GQuark resource_quark;
  GstCudaGraphicsResource *resource;
  GstD3D11Memory *d3d11_mem;

  if (!gst_is_d3d11_memory (mem)) {
    GST_WARNING_OBJECT (context, "memory is not D3D11 memory, %s",
        mem->allocator->mem_type);
    return NULL;
  }

  resource_quark = gst_cuda_quark_from_id (GST_CUDA_QUARK_GRAPHICS_RESOURCE);
  resource = (GstCudaGraphicsResource *)
      gst_mini_object_get_qdata (GST_MINI_OBJECT (mem), resource_quark);

  /* Registered by an earlier call, nothing more to do */
  if (resource)
    return resource;

  d3d11_mem = GST_D3D11_MEMORY_CAST (mem);
  resource = gst_cuda_graphics_resource_new (context,
      GST_OBJECT (d3d11_mem->device),
      GST_CUDA_GRAPHICS_RESOURCE_D3D11_RESOURCE);

  if (!gst_cuda_graphics_resource_register_d3d11_resource (resource,
          gst_d3d11_memory_get_resource_handle (d3d11_mem),
          CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LOAD_STORE)) {
    GST_ERROR_OBJECT (context, "failed to register d3d11 resource");
    gst_cuda_graphics_resource_free (resource);

    return NULL;
  }

  gst_mini_object_set_qdata (GST_MINI_OBJECT (mem), resource_quark, resource,
      (GDestroyNotify) gst_cuda_graphics_resource_free);

  return resource;
}

/* Copies every plane between a D3D11 buffer and a CUDA buffer through CUDA
 * graphics interop.
 *
 * @dst_buf, @dst_info: destination buffer and its video info
 * @src_buf, @src_info: source buffer and its video info
 * @device: D3D11 device, checked against @context with
 *   ensure_d3d11_interop()
 * @context: CUDA context pushed while the copy runs
 * @stream: stream used for resource mapping, the async copies and the final
 *   synchronize
 * @d3d11_to_cuda: TRUE if @src_buf is the D3D11 buffer, FALSE if @dst_buf is
 *
 * Returns: TRUE if all planes were copied, FALSE on any failure
 */
static gboolean
cuda_copy_d3d11_interop (GstBuffer * dst_buf, const GstVideoInfo * dst_info,
    GstBuffer * src_buf, const GstVideoInfo * src_info, GstD3D11Device * device,
    GstCudaContext * context, CUstream stream, gboolean d3d11_to_cuda)
{
  GstCudaGraphicsResource *resources[GST_VIDEO_MAX_PLANES];
  D3D11_TEXTURE2D_DESC desc[GST_VIDEO_MAX_PLANES];
  guint num_resources;
  guint num_planes;
  GstBuffer *d3d11_buf, *cuda_buf;
  GstVideoFrame d3d11_frame, cuda_frame;
  GstMapInfo cuda_map_info;
  CUDA_MEMCPY2D copy_params[GST_VIDEO_MAX_PLANES];
  guint i;
  gboolean ret = FALSE;

  memset (copy_params, 0, sizeof (copy_params));
  memset (&cuda_frame, 0, sizeof (GstVideoFrame));
  memset (&cuda_map_info, 0, sizeof (GstMapInfo));

  /* Incompatible d3d11 device */
  if (!ensure_d3d11_interop (context, device))
    return FALSE;

  /* Map both buffers; mapping the CUDA buffer also fills the CUDA half of
   * copy_params */
  if (d3d11_to_cuda) {
    d3d11_buf = src_buf;
    cuda_buf = dst_buf;
    if (!gst_video_frame_map (&d3d11_frame, src_info, d3d11_buf,
            GST_MAP_READ | GST_MAP_D3D11)) {
      GST_ERROR_OBJECT (context, "Failed to map input D3D11 buffer");
      return FALSE;
    }
    if (!map_buffer_and_fill_copy2d (cuda_buf,
            dst_info, GST_CUDA_BUFFER_COPY_CUDA, &cuda_frame, &cuda_map_info,
            FALSE, copy_params)) {
      GST_ERROR_OBJECT (context, "Failed to map output CUDA buffer");
      gst_video_frame_unmap (&d3d11_frame);
      return FALSE;
    }
  } else {
    d3d11_buf = dst_buf;
    cuda_buf = src_buf;
    if (!gst_video_frame_map (&d3d11_frame, dst_info, d3d11_buf,
            GST_MAP_WRITE | GST_MAP_D3D11)) {
      GST_ERROR_OBJECT (context, "Failed to map output D3D11 buffer");
      return FALSE;
    }
    if (!map_buffer_and_fill_copy2d (cuda_buf,
            src_info, GST_CUDA_BUFFER_COPY_CUDA, &cuda_frame, &cuda_map_info,
            TRUE, copy_params)) {
      GST_ERROR_OBJECT (context, "Failed to map input CUDA buffer");
      gst_video_frame_unmap (&d3d11_frame);
      return FALSE;
    }
  }

  /* One D3D11 memory per plane is required. This is checked at runtime
   * instead of with g_assert() so that the guard is still in place when
   * assertions are compiled out; otherwise the per-plane loop below would ask
   * the buffer for a memory index it does not have. */
  num_planes = GST_VIDEO_FRAME_N_PLANES (&d3d11_frame);
  num_resources = gst_buffer_n_memory (d3d11_buf);
  if (num_resources < num_planes) {
    GST_ERROR_OBJECT (context,
        "D3D11 buffer has %u memory objects, but %u planes are required",
        num_resources, num_planes);
    gst_video_frame_unmap (&d3d11_frame);
    unmap_buffer_or_frame (cuda_buf, &cuda_frame, &cuda_map_info);
    return FALSE;
  }

  if (!gst_cuda_context_push (context)) {
    GST_ERROR_OBJECT (context, "Failed to push context");
    gst_video_frame_unmap (&d3d11_frame);
    unmap_buffer_or_frame (cuda_buf, &cuda_frame, &cuda_map_info);
    return FALSE;
  }

  /* First pass: make sure every plane has a registered graphics resource and
   * fetch its texture description */
  for (i = 0; i < num_planes; i++) {
    GstMemory *mem = gst_buffer_peek_memory (d3d11_buf, i);

    resources[i] = ensure_cuda_d3d11_graphics_resource (context, mem);
    if (!resources[i]
        || !gst_d3d11_memory_get_texture_desc (GST_D3D11_MEMORY_CAST (mem),
            &desc[i]))
      goto out;
  }

  /* Second pass: map each resource, complete the D3D11 half of copy_params
   * and issue the asynchronous 2D copy */
  for (i = 0; i < num_planes; i++) {
    CUgraphicsResource cuda_resource;
    CUarray d3d11_array;
    gboolean copy_ret;

    if (d3d11_to_cuda) {
      cuda_resource =
          gst_cuda_graphics_resource_map (resources[i], stream,
          CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY);
    } else {
      cuda_resource =
          gst_cuda_graphics_resource_map (resources[i], stream,
          CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD);
    }

    if (!cuda_resource) {
      GST_ERROR_OBJECT (context, "Failed to map graphics resource %u", i);
      goto out;
    }

    if (!gst_cuda_result (CuGraphicsSubResourceGetMappedArray (&d3d11_array,
                cuda_resource, 0, 0))) {
      gst_cuda_graphics_resource_unmap (resources[i], stream);
      GST_ERROR_OBJECT (context, "Failed to get mapped array");
      goto out;
    }

    if (d3d11_to_cuda) {
      copy_params[i].srcMemoryType = CU_MEMORYTYPE_ARRAY;
      copy_params[i].srcArray = d3d11_array;
      copy_params[i].srcPitch =
          desc[i].Width * GST_VIDEO_FRAME_COMP_PSTRIDE (&d3d11_frame, i);
    } else {
      copy_params[i].dstMemoryType = CU_MEMORYTYPE_ARRAY;
      copy_params[i].dstArray = d3d11_array;
      copy_params[i].dstPitch =
          desc[i].Width * GST_VIDEO_FRAME_COMP_PSTRIDE (&d3d11_frame, i);
    }

    copy_ret = gst_cuda_result (CuMemcpy2DAsync (&copy_params[i], stream));
    gst_cuda_graphics_resource_unmap (resources[i], stream);

    if (!copy_ret) {
      GST_ERROR_OBJECT (context, "Failed to copy plane %u", i);
      goto out;
    }
  }

  ret = TRUE;

out:
  /* Wait for the queued copies before the buffers get unmapped */
  gst_cuda_result (CuStreamSynchronize (stream));
  gst_cuda_context_pop (NULL);
  gst_video_frame_unmap (&d3d11_frame);
  unmap_buffer_or_frame (cuda_buf, &cuda_frame, &cuda_map_info);

  return ret;
}
#endif

/* Copies the video frame stored in @src into @dst, choosing the transfer path
 * from the given copy types and from the first memory of each buffer:
 *
 *  - neither side is CUDA or NVMM: gst_cuda_buffer_fallback_copy()
 *  - GL PBO memory on the GL side (HAVE_NVCODEC_GST_GL builds):
 *    cuda_copy_gl_interop()
 *  - D3D11 texture with default usage paired with CUDA memory
 *    (GST_CUDA_HAS_D3D builds): cuda_copy_d3d11_interop(), run with the
 *    D3D11 device lock held
 *  - anything else: gst_cuda_buffer_copy_internal()
 *
 * NVMM -> NVMM copies and copies between different video formats are
 * rejected.
 *
 * @context is always used as the logging object. For the copy itself the
 * context of the CUDA memory found in the buffers is used where available,
 * with @context as the fallback.
 *
 * Returns: TRUE if the copy succeeded
 */
gboolean
gst_cuda_buffer_copy (GstBuffer * dst, GstCudaBufferCopyType dst_type,
    const GstVideoInfo * dst_info, GstBuffer * src,
    GstCudaBufferCopyType src_type, const GstVideoInfo * src_info,
    GstCudaContext * context, CUstream stream)
{
  gboolean involves_device_mem;
  GstMemory *dst_mem, *src_mem;
#ifdef GST_CUDA_HAS_D3D
  D3D11_TEXTURE2D_DESC desc;
#endif
  GstCudaContext *cuda_context;

  g_return_val_if_fail (GST_IS_BUFFER (dst), FALSE);
  g_return_val_if_fail (dst_info != NULL, FALSE);
  g_return_val_if_fail (GST_IS_BUFFER (src), FALSE);
  g_return_val_if_fail (src_info != NULL, FALSE);
  g_return_val_if_fail (GST_IS_CUDA_CONTEXT (context), FALSE);

  _init_debug ();

  if (dst_type == GST_CUDA_BUFFER_COPY_NVMM &&
      src_type == GST_CUDA_BUFFER_COPY_NVMM) {
    GST_ERROR_OBJECT (context, "Not supported copy NVMM -> NVMM");
    return FALSE;
  }

  if (GST_VIDEO_INFO_FORMAT (dst_info) != GST_VIDEO_INFO_FORMAT (src_info)) {
    GST_ERROR_OBJECT (context,
        "Copy between different format is not supported");
    return FALSE;
  }

  /* A 2D device copy only makes sense if at least one side is CUDA or NVMM */
  involves_device_mem = (dst_type == GST_CUDA_BUFFER_COPY_CUDA ||
      dst_type == GST_CUDA_BUFFER_COPY_NVMM ||
      src_type == GST_CUDA_BUFFER_COPY_CUDA ||
      src_type == GST_CUDA_BUFFER_COPY_NVMM);

  if (!involves_device_mem) {
    GST_TRACE_OBJECT (context, "Not a device memory, use system memory copy");
    return gst_cuda_buffer_fallback_copy (dst, dst_info, src, src_info);
  }

  dst_mem = gst_buffer_peek_memory (dst, 0);
  src_mem = gst_buffer_peek_memory (src, 0);

#ifdef HAVE_NVCODEC_GST_GL
  /* GL PBO -> CUDA/NVMM */
  if (src_type == GST_CUDA_BUFFER_COPY_GL && gst_is_gl_memory_pbo (src_mem)) {
    GstGLContext *gl_context = ((GstGLMemory *) src_mem)->mem.context;

    cuda_context = context;
    if (dst_type == GST_CUDA_BUFFER_COPY_CUDA && gst_is_cuda_memory (dst_mem))
      cuda_context = GST_CUDA_MEMORY_CAST (dst_mem)->context;

    GST_TRACE_OBJECT (context, "GL -> %s",
        gst_cuda_buffer_copy_type_to_string (dst_type));

    return cuda_copy_gl_interop (dst, dst_info, src, src_info, gl_context,
        cuda_context, stream, TRUE, dst_type);
  }

  /* CUDA/NVMM -> GL PBO */
  if (dst_type == GST_CUDA_BUFFER_COPY_GL && gst_is_gl_memory_pbo (dst_mem)) {
    GstGLContext *gl_context = ((GstGLMemory *) dst_mem)->mem.context;

    cuda_context = context;
    if (src_type == GST_CUDA_BUFFER_COPY_CUDA && gst_is_cuda_memory (src_mem))
      cuda_context = GST_CUDA_MEMORY_CAST (src_mem)->context;

    GST_TRACE_OBJECT (context, "%s -> GL",
        gst_cuda_buffer_copy_type_to_string (src_type));

    return cuda_copy_gl_interop (dst, dst_info, src, src_info, gl_context,
        cuda_context, stream, FALSE, src_type);
  }
#endif

#ifdef GST_CUDA_HAS_D3D
  /* D3D11 -> CUDA, only for textures with default usage */
  if (src_type == GST_CUDA_BUFFER_COPY_D3D11 && gst_is_d3d11_memory (src_mem) &&
      gst_d3d11_memory_get_texture_desc (GST_D3D11_MEMORY_CAST (src_mem), &desc)
      && desc.Usage == D3D11_USAGE_DEFAULT && gst_is_cuda_memory (dst_mem)) {
    GstD3D11Device *device = GST_D3D11_MEMORY_CAST (src_mem)->device;
    gboolean copied;

    cuda_context = GST_CUDA_MEMORY_CAST (dst_mem)->context;

    GST_TRACE_OBJECT (context, "D3D11 -> CUDA");

    gst_d3d11_device_lock (device);
    copied = cuda_copy_d3d11_interop (dst, dst_info, src, src_info, device,
        cuda_context, stream, TRUE);
    gst_d3d11_device_unlock (device);

    return copied;
  }

  /* CUDA -> D3D11, only for textures with default usage */
  if (dst_type == GST_CUDA_BUFFER_COPY_D3D11 && gst_is_d3d11_memory (dst_mem) &&
      gst_d3d11_memory_get_texture_desc (GST_D3D11_MEMORY_CAST (dst_mem), &desc)
      && desc.Usage == D3D11_USAGE_DEFAULT && gst_is_cuda_memory (src_mem)) {
    GstD3D11Device *device = GST_D3D11_MEMORY_CAST (dst_mem)->device;
    gboolean copied;

    cuda_context = GST_CUDA_MEMORY_CAST (src_mem)->context;

    GST_TRACE_OBJECT (context, "CUDA -> D3D11");

    gst_d3d11_device_lock (device);
    copied = cuda_copy_d3d11_interop (dst, dst_info, src, src_info, device,
        cuda_context, stream, FALSE);
    gst_d3d11_device_unlock (device);

    return copied;
  }
#endif

  /* Generic path: prefer the context owning the destination memory, then the
   * one owning the source memory, and only then the caller's context */
  cuda_context = context;
  if (gst_is_cuda_memory (dst_mem))
    cuda_context = GST_CUDA_MEMORY_CAST (dst_mem)->context;
  else if (gst_is_cuda_memory (src_mem))
    cuda_context = GST_CUDA_MEMORY_CAST (src_mem)->context;

  GST_TRACE_OBJECT (context, "%s -> %s",
      gst_cuda_buffer_copy_type_to_string (src_type),
      gst_cuda_buffer_copy_type_to_string (dst_type));

  return gst_cuda_buffer_copy_internal (dst, dst_type, dst_info,
      src, src_type, src_info, cuda_context, stream);
}