nvcodec: Refactor GstCudaMemory abstraction

* Hide GstCudaMemory member variables
* Make GstCudaAllocator object GstCudaContext independent
* Set offset/stride of memory correctly via video meta
* Drop GST_BUFFER_POOL_OPTION_VIDEO_ALIGNMENT support.
  This implementation does not actually support custom alignment,
  because we allocate device memory via cuMemAllocPitch,
  whose alignment is essentially uncontrollable

Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/1834>
This commit is contained in:
Seungha Yang 2022-03-03 03:25:47 +09:00
parent ad0e7fca14
commit 111b2c3f53
12 changed files with 580 additions and 759 deletions

View file

@ -229,7 +229,7 @@ static const gchar templ_YUV_TO_YUV[] =
GST_CUDA_KERNEL_FUNC
"(cudaTextureObject_t tex0, cudaTextureObject_t tex1, cudaTextureObject_t tex2,\n"
" unsigned char *dst0, unsigned char *dst1, unsigned char *dst2,\n"
" int stride)\n"
" int stride, int uv_stride)\n"
"{\n"
" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
@ -265,7 +265,7 @@ GST_CUDA_KERNEL_FUNC
" v = tmp;\n"
" }\n"
" write_chroma (dst1,\n"
" dst2, u, v, x_pos, y_pos, CHROMA_PSTRIDE, stride, MASK);\n"
" dst2, u, v, x_pos, y_pos, CHROMA_PSTRIDE, uv_stride, MASK);\n"
" }\n"
"}\n"
"\n"
@ -589,7 +589,7 @@ GST_CUDA_KERNEL_FUNC_TO_Y444
GST_CUDA_KERNEL_FUNC_Y444_TO_YUV
"(cudaTextureObject_t tex0, cudaTextureObject_t tex1, cudaTextureObject_t tex2,\n"
" unsigned char *dst0, unsigned char *dst1, unsigned char *dst2,\n"
" int stride)\n"
" int stride, int uv_stride)\n"
"{\n"
" int x_pos = blockIdx.x * blockDim.x + threadIdx.x;\n"
" int y_pos = blockIdx.y * blockDim.y + threadIdx.y;\n"
@ -626,7 +626,7 @@ GST_CUDA_KERNEL_FUNC_Y444_TO_YUV
" v = tmp;\n"
" }\n"
" write_chroma (dst1,\n"
" dst2, u, v, x_pos, y_pos, CHROMA_PSTRIDE, stride, MASK);\n"
" dst2, u, v, x_pos, y_pos, CHROMA_PSTRIDE, uv_stride, MASK);\n"
" }\n"
"}\n"
"\n"
@ -745,9 +745,10 @@ struct _GstCudaConverter
gchar *ptx;
GstCudaStageBuffer fallback_buffer[GST_VIDEO_MAX_PLANES];
gboolean (*convert) (GstCudaConverter * convert, const GstCudaMemory * src,
GstVideoInfo * in_info, GstCudaMemory * dst, GstVideoInfo * out_info,
CUstream cuda_stream);
/* *INDENT-OFF* */
gboolean (*convert) (GstCudaConverter * convert, GstVideoFrame * src_frame,
GstVideoFrame * dst_frame, CUstream cuda_stream);
/* *INDENT-ON* */
const CUdeviceptr src;
GstVideoInfo *cur_in_info;
@ -893,67 +894,25 @@ gst_cuda_converter_free (GstCudaConverter * convert)
g_free (convert);
}
/**
* gst_cuda_converter_frame:
* @convert: a #GstCudaConverter
* @src: a #GstCudaMemory
* @in_info: a #GstVideoInfo representing @src
* @dst: a #GstCudaMemory
* @out_info: a #GstVideoInfo representing @dst
* @cuda_stream: a #CUstream
*
* Convert the pixels of @src into @dest using @convert.
* Called without gst_cuda_context_push() and gst_cuda_context_pop() by caller
*/
gboolean
gst_cuda_converter_frame (GstCudaConverter * convert, const GstCudaMemory * src,
GstVideoInfo * in_info, GstCudaMemory * dst, GstVideoInfo * out_info,
CUstream cuda_stream)
gst_cuda_converter_convert_frame (GstCudaConverter * convert,
GstVideoFrame * src_frame, GstVideoFrame * dst_frame, CUstream cuda_stream)
{
gboolean ret;
g_return_val_if_fail (convert, FALSE);
g_return_val_if_fail (src, FALSE);
g_return_val_if_fail (in_info, FALSE);
g_return_val_if_fail (dst, FALSE);
g_return_val_if_fail (out_info, FALSE);
g_return_val_if_fail (src_frame, FALSE);
g_return_val_if_fail (dst_frame, FALSE);
gst_cuda_context_push (convert->cuda_ctx);
ret = gst_cuda_converter_frame_unlocked (convert,
src, in_info, dst, out_info, cuda_stream);
ret = convert->convert (convert, src_frame, dst_frame, cuda_stream);
gst_cuda_context_pop (NULL);
return ret;
}
/**
* gst_cuda_converter_frame_unlocked:
* @convert: a #GstCudaConverter
* @src: a #GstCudaMemory
* @in_info: a #GstVideoInfo representing @src
* @dst: a #GstCudaMemory
* @out_info: a #GstVideoInfo representing @dest
* @cuda_stream: a #CUstream
*
* Convert the pixels of @src into @dest using @convert.
* Caller should call this method after gst_cuda_context_push()
*/
gboolean
gst_cuda_converter_frame_unlocked (GstCudaConverter * convert,
const GstCudaMemory * src, GstVideoInfo * in_info, GstCudaMemory * dst,
GstVideoInfo * out_info, CUstream cuda_stream)
{
g_return_val_if_fail (convert, FALSE);
g_return_val_if_fail (src, FALSE);
g_return_val_if_fail (in_info, FALSE);
g_return_val_if_fail (dst, FALSE);
g_return_val_if_fail (out_info, FALSE);
return convert->convert (convert, src, in_info, dst, out_info, cuda_stream);
}
/* allocate fallback memory for texture alignment requirement */
static gboolean
convert_ensure_fallback_memory (GstCudaConverter * convert,
@ -1020,8 +979,8 @@ convert_create_texture_unchecked (const CUdeviceptr src, gint width,
}
static CUtexObject
convert_create_texture (GstCudaConverter * convert, const GstCudaMemory * src,
GstVideoInfo * info, guint plane, CUstream cuda_stream)
convert_create_texture (GstCudaConverter * convert, GstVideoFrame * src_frame,
guint plane, CUstream cuda_stream)
{
CUarray_format format = CU_AD_FORMAT_UNSIGNED_INT8;
guint channels = 1;
@ -1030,22 +989,23 @@ convert_create_texture (GstCudaConverter * convert, const GstCudaMemory * src,
CUresult cuda_ret;
CUfilter_mode mode;
if (GST_VIDEO_INFO_COMP_DEPTH (info, plane) > 8)
if (GST_VIDEO_FRAME_COMP_DEPTH (src_frame, plane) > 8)
format = CU_AD_FORMAT_UNSIGNED_INT16;
/* FIXME: more graceful method ? */
if (plane != 0 &&
GST_VIDEO_INFO_N_PLANES (info) != GST_VIDEO_INFO_N_COMPONENTS (info)) {
GST_VIDEO_FRAME_N_PLANES (src_frame) !=
GST_VIDEO_FRAME_N_COMPONENTS (src_frame)) {
channels = 2;
}
src_ptr = src->data + src->offset[plane];
stride = src->stride;
src_ptr = (CUdeviceptr) GST_VIDEO_FRAME_PLANE_DATA (src_frame, plane);
stride = GST_VIDEO_FRAME_PLANE_STRIDE (src_frame, plane);
if (convert->texture_alignment && (src_ptr % convert->texture_alignment)) {
CUDA_MEMCPY2D copy_params = { 0, };
if (!convert_ensure_fallback_memory (convert, info, plane))
if (!convert_ensure_fallback_memory (convert, &src_frame->info, plane))
return 0;
GST_LOG ("device memory was not aligned, copy to fallback memory");
@ -1057,9 +1017,9 @@ convert_create_texture (GstCudaConverter * convert, const GstCudaMemory * src,
copy_params.dstMemoryType = CU_MEMORYTYPE_DEVICE;
copy_params.dstPitch = convert->fallback_buffer[plane].cuda_stride;
copy_params.dstDevice = convert->fallback_buffer[plane].device_ptr;
copy_params.WidthInBytes = GST_VIDEO_INFO_COMP_WIDTH (info, plane)
* GST_VIDEO_INFO_COMP_PSTRIDE (info, plane);
copy_params.Height = GST_VIDEO_INFO_COMP_HEIGHT (info, plane);
copy_params.WidthInBytes = GST_VIDEO_FRAME_COMP_WIDTH (src_frame, plane)
* GST_VIDEO_FRAME_COMP_PSTRIDE (src_frame, plane);
copy_params.Height = GST_VIDEO_FRAME_COMP_HEIGHT (src_frame, plane);
cuda_ret = CuMemcpy2DAsync (&copy_params, cuda_stream);
if (!gst_cuda_result (cuda_ret)) {
@ -1079,27 +1039,26 @@ convert_create_texture (GstCudaConverter * convert, const GstCudaMemory * src,
mode = CU_TR_FILTER_MODE_LINEAR;
return convert_create_texture_unchecked (src_ptr,
GST_VIDEO_INFO_COMP_WIDTH (info, plane),
GST_VIDEO_INFO_COMP_HEIGHT (info, plane), channels, stride, format, mode,
cuda_stream);
GST_VIDEO_FRAME_COMP_WIDTH (src_frame, plane),
GST_VIDEO_FRAME_COMP_HEIGHT (src_frame, plane), channels, stride, format,
mode, cuda_stream);
}
/* main conversion function for YUV to YUV conversion */
static gboolean
convert_YUV_TO_YUV (GstCudaConverter * convert,
const GstCudaMemory * src, GstVideoInfo * in_info, GstCudaMemory * dst,
GstVideoInfo * out_info, CUstream cuda_stream)
convert_YUV_TO_YUV (GstCudaConverter * convert, GstVideoFrame * src_frame,
GstVideoFrame * dst_frame, CUstream cuda_stream)
{
CUtexObject texture[GST_VIDEO_MAX_PLANES] = { 0, };
CUresult cuda_ret;
gboolean ret = FALSE;
CUdeviceptr dst_ptr[GST_VIDEO_MAX_PLANES] = { 0, };
gint dst_stride;
gint dst_stride, dst_uv_stride;
gint width, height;
gint i;
gpointer kernel_args[] = { &texture[0], &texture[1], &texture[2],
&dst_ptr[0], &dst_ptr[1], &dst_ptr[2], &dst_stride
&dst_ptr[0], &dst_ptr[1], &dst_ptr[2], &dst_stride, &dst_uv_stride
};
/* conversion step
@ -1110,21 +1069,23 @@ convert_YUV_TO_YUV (GstCudaConverter * convert,
*/
/* map CUDA device memory to CUDA texture object */
for (i = 0; i < GST_VIDEO_INFO_N_PLANES (in_info); i++) {
texture[i] = convert_create_texture (convert, src, in_info, i, cuda_stream);
for (i = 0; i < GST_VIDEO_FRAME_N_PLANES (src_frame); i++) {
texture[i] = convert_create_texture (convert, src_frame, i, cuda_stream);
if (!texture[i]) {
GST_ERROR ("couldn't create texture for %d th plane", i);
goto done;
}
}
for (i = 0; i < GST_VIDEO_INFO_N_PLANES (out_info); i++)
dst_ptr[i] = dst->data + dst->offset[i];
for (i = 0; i < GST_VIDEO_FRAME_N_PLANES (dst_frame); i++) {
dst_ptr[i] = (CUdeviceptr) GST_VIDEO_FRAME_PLANE_DATA (dst_frame, i);
}
dst_stride = dst->stride;
dst_stride = GST_VIDEO_FRAME_PLANE_STRIDE (dst_frame, 0);
dst_uv_stride = GST_VIDEO_FRAME_PLANE_STRIDE (dst_frame, 1);
width = GST_VIDEO_INFO_WIDTH (out_info);
height = GST_VIDEO_INFO_HEIGHT (out_info);
width = GST_VIDEO_FRAME_WIDTH (dst_frame);
height = GST_VIDEO_FRAME_HEIGHT (dst_frame);
cuda_ret =
CuLaunchKernel (convert->kernel_func[0], DIV_UP (width, CUDA_BLOCK_X),
@ -1140,7 +1101,7 @@ convert_YUV_TO_YUV (GstCudaConverter * convert,
gst_cuda_result (CuStreamSynchronize (cuda_stream));
done:
for (i = 0; i < GST_VIDEO_INFO_N_PLANES (in_info); i++) {
for (i = 0; i < GST_VIDEO_FRAME_N_PLANES (src_frame); i++) {
if (texture[i])
gst_cuda_result (CuTexObjectDestroy (texture[i]));
}
@ -1150,9 +1111,8 @@ done:
/* main conversion function for YUV to RGB conversion */
static gboolean
convert_YUV_TO_RGB (GstCudaConverter * convert,
const GstCudaMemory * src, GstVideoInfo * in_info, GstCudaMemory * dst,
GstVideoInfo * out_info, CUstream cuda_stream)
convert_YUV_TO_RGB (GstCudaConverter * convert, GstVideoFrame * src_frame,
GstVideoFrame * dst_frame, CUstream cuda_stream)
{
CUtexObject texture[GST_VIDEO_MAX_PLANES] = { 0, };
CUresult cuda_ret;
@ -1174,19 +1134,19 @@ convert_YUV_TO_RGB (GstCudaConverter * convert,
*/
/* map CUDA device memory to CUDA texture object */
for (i = 0; i < GST_VIDEO_INFO_N_PLANES (in_info); i++) {
texture[i] = convert_create_texture (convert, src, in_info, i, cuda_stream);
for (i = 0; i < GST_VIDEO_FRAME_N_PLANES (src_frame); i++) {
texture[i] = convert_create_texture (convert, src_frame, i, cuda_stream);
if (!texture[i]) {
GST_ERROR ("couldn't create texture for %d th plane", i);
goto done;
}
}
dstRGB = dst->data;
dst_stride = dst->stride;
dstRGB = (CUdeviceptr) GST_VIDEO_FRAME_PLANE_DATA (dst_frame, 0);
dst_stride = GST_VIDEO_FRAME_PLANE_STRIDE (dst_frame, 0);
width = GST_VIDEO_INFO_WIDTH (out_info);
height = GST_VIDEO_INFO_HEIGHT (out_info);
width = GST_VIDEO_FRAME_WIDTH (dst_frame);
height = GST_VIDEO_FRAME_HEIGHT (dst_frame);
cuda_ret =
CuLaunchKernel (convert->kernel_func[0], DIV_UP (width, CUDA_BLOCK_X),
@ -1202,7 +1162,7 @@ convert_YUV_TO_RGB (GstCudaConverter * convert,
gst_cuda_result (CuStreamSynchronize (cuda_stream));
done:
for (i = 0; i < GST_VIDEO_INFO_N_PLANES (in_info); i++) {
for (i = 0; i < GST_VIDEO_FRAME_N_PLANES (src_frame); i++) {
if (texture[i])
gst_cuda_result (CuTexObjectDestroy (texture[i]));
}
@ -1212,7 +1172,7 @@ done:
static gboolean
convert_UNPACK_RGB (GstCudaConverter * convert, CUfunction kernel_func,
CUstream cuda_stream, const GstCudaMemory * src, GstVideoInfo * in_info,
CUstream cuda_stream, GstVideoFrame * src_frame,
CUdeviceptr dst, gint dst_stride, GstCudaRGBOrder * rgb_order)
{
CUdeviceptr srcRGB = 0;
@ -1227,12 +1187,12 @@ convert_UNPACK_RGB (GstCudaConverter * convert, CUfunction kernel_func,
&convert->in_rgb_order.B, &convert->in_rgb_order.A,
};
srcRGB = src->data;
src_stride = src->stride;
srcRGB = (CUdeviceptr) GST_VIDEO_FRAME_PLANE_DATA (src_frame, 0);
src_stride = GST_VIDEO_FRAME_PLANE_STRIDE (src_frame, 0);
width = GST_VIDEO_INFO_WIDTH (in_info);
height = GST_VIDEO_INFO_HEIGHT (in_info);
src_pstride = GST_VIDEO_INFO_COMP_PSTRIDE (in_info, 0);
width = GST_VIDEO_FRAME_WIDTH (src_frame);
height = GST_VIDEO_FRAME_HEIGHT (src_frame);
src_pstride = GST_VIDEO_FRAME_COMP_PSTRIDE (src_frame, 0);
cuda_ret =
CuLaunchKernel (kernel_func, DIV_UP (width, CUDA_BLOCK_X),
@ -1274,9 +1234,8 @@ convert_TO_Y444 (GstCudaConverter * convert, CUfunction kernel_func,
/* main conversion function for RGB to YUV conversion */
static gboolean
convert_RGB_TO_YUV (GstCudaConverter * convert,
const GstCudaMemory * src, GstVideoInfo * in_info, GstCudaMemory * dst,
GstVideoInfo * out_info, CUstream cuda_stream)
convert_RGB_TO_YUV (GstCudaConverter * convert, GstVideoFrame * src_frame,
GstVideoFrame * dst_frame, CUstream cuda_stream)
{
CUtexObject texture = 0;
CUtexObject yuv_texture[3] = { 0, };
@ -1285,7 +1244,7 @@ convert_RGB_TO_YUV (GstCudaConverter * convert,
gboolean ret = FALSE;
gint in_width, in_height;
gint out_width, out_height;
gint dst_stride;
gint dst_stride, dst_uv_stride;
CUarray_format format = CU_AD_FORMAT_UNSIGNED_INT8;
CUfilter_mode mode = CU_TR_FILTER_MODE_POINT;
gint pstride = 1;
@ -1293,7 +1252,7 @@ convert_RGB_TO_YUV (GstCudaConverter * convert,
gint i;
gpointer kernel_args[] = { &yuv_texture[0], &yuv_texture[1], &yuv_texture[2],
&dst_ptr[0], &dst_ptr[1], &dst_ptr[2], &dst_stride
&dst_ptr[0], &dst_ptr[1], &dst_ptr[2], &dst_stride, &dst_uv_stride
};
/* conversion step
@ -1304,21 +1263,22 @@ convert_RGB_TO_YUV (GstCudaConverter * convert,
* the CUDA kernel function
*/
if (!convert_UNPACK_RGB (convert, convert->kernel_func[0], cuda_stream,
src, in_info, convert->unpack_surface.device_ptr,
src_frame, convert->unpack_surface.device_ptr,
convert->unpack_surface.cuda_stride, &convert->in_rgb_order)) {
GST_ERROR ("could not unpack input rgb");
goto done;
}
in_width = GST_VIDEO_INFO_WIDTH (in_info);
in_height = GST_VIDEO_INFO_HEIGHT (in_info);
in_width = GST_VIDEO_FRAME_WIDTH (src_frame);
in_height = GST_VIDEO_FRAME_HEIGHT (src_frame);
out_width = GST_VIDEO_INFO_WIDTH (out_info);
out_height = GST_VIDEO_INFO_HEIGHT (out_info);
dst_stride = dst->stride;
out_width = GST_VIDEO_FRAME_WIDTH (dst_frame);
out_height = GST_VIDEO_FRAME_HEIGHT (dst_frame);
dst_stride = GST_VIDEO_FRAME_PLANE_STRIDE (dst_frame, 0);
dst_uv_stride = GST_VIDEO_FRAME_PLANE_STRIDE (dst_frame, 1);
if (GST_VIDEO_INFO_COMP_DEPTH (in_info, 0) > 8) {
if (GST_VIDEO_FRAME_COMP_DEPTH (src_frame, 0) > 8) {
pstride = 2;
bitdepth = 16;
format = CU_AD_FORMAT_UNSIGNED_INT16;
@ -1365,8 +1325,8 @@ convert_RGB_TO_YUV (GstCudaConverter * convert,
}
}
for (i = 0; i < GST_VIDEO_INFO_N_PLANES (out_info); i++)
dst_ptr[i] = dst->data + dst->offset[i];
for (i = 0; i < GST_VIDEO_FRAME_N_PLANES (dst_frame); i++)
dst_ptr[i] = (CUdeviceptr) GST_VIDEO_FRAME_PLANE_DATA (dst_frame, i);
cuda_ret =
CuLaunchKernel (convert->kernel_func[2], DIV_UP (out_width, CUDA_BLOCK_X),
@ -1394,9 +1354,8 @@ done:
/* main conversion function for RGB to RGB conversion */
static gboolean
convert_RGB_TO_RGB (GstCudaConverter * convert,
const GstCudaMemory * src, GstVideoInfo * in_info, GstCudaMemory * dst,
GstVideoInfo * out_info, CUstream cuda_stream)
convert_RGB_TO_RGB (GstCudaConverter * convert, GstVideoFrame * src_frame,
GstVideoFrame * dst_frame, CUstream cuda_stream)
{
CUtexObject texture = 0;
CUresult cuda_ret;
@ -1418,23 +1377,23 @@ convert_RGB_TO_RGB (GstCudaConverter * convert,
*/
if (!convert_UNPACK_RGB (convert, convert->kernel_func[0], cuda_stream,
src, in_info, convert->unpack_surface.device_ptr,
src_frame, convert->unpack_surface.device_ptr,
convert->unpack_surface.cuda_stride, &convert->in_rgb_order)) {
GST_ERROR ("could not unpack input rgb");
goto done;
}
in_width = GST_VIDEO_INFO_WIDTH (in_info);
in_height = GST_VIDEO_INFO_HEIGHT (in_info);
in_width = GST_VIDEO_FRAME_WIDTH (src_frame);
in_height = GST_VIDEO_FRAME_HEIGHT (src_frame);
out_width = GST_VIDEO_INFO_WIDTH (out_info);
out_height = GST_VIDEO_INFO_HEIGHT (out_info);
out_width = GST_VIDEO_FRAME_WIDTH (dst_frame);
out_height = GST_VIDEO_FRAME_HEIGHT (dst_frame);
dstRGB = dst->data;
dst_stride = dst->stride;
dstRGB = (CUdeviceptr) GST_VIDEO_FRAME_PLANE_DATA (dst_frame, 0);
dst_stride = GST_VIDEO_FRAME_PLANE_STRIDE (dst_frame, 0);
if (GST_VIDEO_INFO_COMP_DEPTH (in_info, 0) > 8)
if (GST_VIDEO_FRAME_COMP_DEPTH (src_frame, 0) > 8)
format = CU_AD_FORMAT_UNSIGNED_INT16;
/* Use h/w linear interpolation only when resize is required.

View file

@ -34,21 +34,11 @@ GstCudaConverter * gst_cuda_converter_new (GstVideoInfo * in_info,
void gst_cuda_converter_free (GstCudaConverter * convert);
gboolean gst_cuda_converter_frame (GstCudaConverter * convert,
const GstCudaMemory * src,
GstVideoInfo * in_info,
GstCudaMemory * dst,
GstVideoInfo * out_info,
gboolean gst_cuda_converter_convert_frame (GstCudaConverter * convert,
GstVideoFrame * src_frame,
GstVideoFrame * dst_frame,
CUstream cuda_stream);
gboolean gst_cuda_converter_frame_unlocked (GstCudaConverter * convert,
const GstCudaMemory * src,
GstVideoInfo * in_info,
GstCudaMemory * dst,
GstVideoInfo * out_info,
CUstream cuda_stream);
G_END_DECLS
#endif /* __GST_CUDA_CONVERTER_H__ */

View file

@ -168,47 +168,30 @@ gst_cuda_base_filter_propose_allocation (GstBaseTransform * trans,
if (gst_query_get_n_allocation_pools (query) == 0) {
GstStructure *config;
GstVideoAlignment align;
GstAllocationParams params = { 0, 31, 0, 0, };
GstAllocator *allocator = NULL;
gint i;
pool = gst_cuda_buffer_pool_new (ctrans->context);
config = gst_buffer_pool_get_config (pool);
gst_video_alignment_reset (&align);
for (i = 0; i < GST_VIDEO_INFO_N_PLANES (&info); i++) {
align.stride_align[i] = 31;
}
gst_video_info_align (&info, &align);
gst_buffer_pool_config_add_option (config,
GST_BUFFER_POOL_OPTION_VIDEO_META);
gst_buffer_pool_config_add_option (config,
GST_BUFFER_POOL_OPTION_VIDEO_ALIGNMENT);
gst_buffer_pool_config_set_video_alignment (config, &align);
size = GST_VIDEO_INFO_SIZE (&info);
gst_buffer_pool_config_set_params (config, caps, size, 0, 0);
gst_query_add_allocation_meta (query, GST_VIDEO_META_API_TYPE, NULL);
gst_query_add_allocation_pool (query, pool, size, 0, 0);
if (gst_buffer_pool_config_get_allocator (config, &allocator, &params)) {
if (params.align < 31)
params.align = 31;
gst_query_add_allocation_param (query, allocator, &params);
gst_buffer_pool_config_set_allocator (config, allocator, &params);
}
if (!gst_buffer_pool_set_config (pool, config)) {
GST_ERROR_OBJECT (ctrans, "failed to set config");
gst_object_unref (pool);
return FALSE;
}
/* Get updated size by cuda buffer pool */
config = gst_buffer_pool_get_config (pool);
gst_buffer_pool_config_get_params (config, NULL, &size, NULL, NULL);
gst_structure_free (config);
gst_query_add_allocation_pool (query, pool, size, 0, 0);
gst_object_unref (pool);
}
@ -265,6 +248,12 @@ gst_cuda_base_filter_decide_allocation (GstBaseTransform * trans,
gst_buffer_pool_config_add_option (config, GST_BUFFER_POOL_OPTION_VIDEO_META);
gst_buffer_pool_config_set_params (config, outcaps, size, min, max);
gst_buffer_pool_set_config (pool, config);
/* Get updated size by cuda buffer pool */
config = gst_buffer_pool_get_config (pool);
gst_buffer_pool_config_get_params (config, NULL, &size, NULL, NULL);
gst_structure_free (config);
if (update_pool)
gst_query_set_nth_allocation_pool (query, 0, pool, size, min, max);
else
@ -285,8 +274,6 @@ gst_cuda_base_filter_transform (GstBaseTransform * trans,
GstVideoFrame in_frame, out_frame;
GstFlowReturn ret = GST_FLOW_OK;
GstMemory *mem;
GstCudaMemory *in_cuda_mem = NULL;
GstCudaMemory *out_cuda_mem = NULL;
if (gst_buffer_n_memory (inbuf) != 1) {
GST_ERROR_OBJECT (self, "Invalid input buffer");
@ -299,8 +286,6 @@ gst_cuda_base_filter_transform (GstBaseTransform * trans,
return GST_FLOW_ERROR;
}
in_cuda_mem = GST_CUDA_MEMORY_CAST (mem);
if (gst_buffer_n_memory (outbuf) != 1) {
GST_ERROR_OBJECT (self, "Invalid output buffer");
return GST_FLOW_ERROR;
@ -312,8 +297,6 @@ gst_cuda_base_filter_transform (GstBaseTransform * trans,
return GST_FLOW_ERROR;
}
out_cuda_mem = GST_CUDA_MEMORY_CAST (mem);
if (!gst_video_frame_map (&in_frame, &ctrans->in_info, inbuf,
GST_MAP_READ | GST_MAP_CUDA)) {
GST_ERROR_OBJECT (self, "Failed to map input buffer");
@ -327,9 +310,8 @@ gst_cuda_base_filter_transform (GstBaseTransform * trans,
return GST_FLOW_ERROR;
}
if (!gst_cuda_converter_frame (self->converter,
in_cuda_mem, &ctrans->in_info,
out_cuda_mem, &ctrans->out_info, ctrans->cuda_stream)) {
if (!gst_cuda_converter_convert_frame (self->converter, &in_frame, &out_frame,
ctrans->cuda_stream)) {
GST_ERROR_OBJECT (self, "Failed to convert frame");
ret = GST_FLOW_ERROR;
}

View file

@ -30,11 +30,8 @@ GST_DEBUG_CATEGORY_STATIC (gst_cuda_buffer_pool_debug);
struct _GstCudaBufferPoolPrivate
{
GstAllocator *allocator;
GstCudaAllocator *allocator;
GstVideoInfo info;
gboolean add_videometa;
gboolean need_alignment;
GstCudaAllocationParams params;
};
#define gst_cuda_buffer_pool_parent_class parent_class
@ -44,8 +41,7 @@ G_DEFINE_TYPE_WITH_PRIVATE (GstCudaBufferPool, gst_cuda_buffer_pool,
static const gchar **
gst_cuda_buffer_pool_get_options (GstBufferPool * pool)
{
static const gchar *options[] = { GST_BUFFER_POOL_OPTION_VIDEO_META,
GST_BUFFER_POOL_OPTION_VIDEO_ALIGNMENT, NULL
static const gchar *options[] = { GST_BUFFER_POOL_OPTION_VIDEO_META, NULL
};
return options;
@ -54,153 +50,86 @@ gst_cuda_buffer_pool_get_options (GstBufferPool * pool)
static gboolean
gst_cuda_buffer_pool_set_config (GstBufferPool * pool, GstStructure * config)
{
GstCudaBufferPool *cuda_pool = GST_CUDA_BUFFER_POOL_CAST (pool);
GstCudaBufferPoolPrivate *priv = cuda_pool->priv;
GstCudaBufferPool *self = GST_CUDA_BUFFER_POOL (pool);
GstCudaBufferPoolPrivate *priv = self->priv;
GstCaps *caps = NULL;
guint size, min_buffers, max_buffers;
guint max_align, n;
GstAllocator *allocator = NULL;
GstAllocationParams *params = (GstAllocationParams *) & priv->params;
GstVideoInfo *info = &priv->params.info;
GstVideoInfo info;
GstMemory *mem;
GstCudaMemory *cmem;
if (!gst_buffer_pool_config_get_params (config, &caps, &size, &min_buffers,
&max_buffers))
goto wrong_config;
if (caps == NULL)
goto no_caps;
if (!gst_buffer_pool_config_get_allocator (config, &allocator, params))
goto wrong_config;
/* now parse the caps from the config */
if (!gst_video_info_from_caps (info, caps))
goto wrong_caps;
GST_LOG_OBJECT (pool, "%dx%d, caps %" GST_PTR_FORMAT,
GST_VIDEO_INFO_WIDTH (info), GST_VIDEO_INFO_HEIGHT (info), caps);
gst_clear_object (&priv->allocator);
if (allocator) {
if (!GST_IS_CUDA_ALLOCATOR (allocator)) {
goto wrong_allocator;
} else {
priv->allocator = gst_object_ref (allocator);
}
} else {
allocator = priv->allocator = gst_cuda_allocator_new (cuda_pool->context);
if (G_UNLIKELY (priv->allocator == NULL))
goto no_allocator;
}
priv->add_videometa = gst_buffer_pool_config_has_option (config,
GST_BUFFER_POOL_OPTION_VIDEO_META);
priv->need_alignment = gst_buffer_pool_config_has_option (config,
GST_BUFFER_POOL_OPTION_VIDEO_ALIGNMENT);
max_align = params->align;
/* do memory align */
if (priv->need_alignment && priv->add_videometa) {
GstVideoAlignment valign;
gst_buffer_pool_config_get_video_alignment (config, &valign);
for (n = 0; n < GST_VIDEO_MAX_PLANES; ++n)
max_align |= valign.stride_align[n];
for (n = 0; n < GST_VIDEO_MAX_PLANES; ++n)
valign.stride_align[n] = max_align;
if (!gst_video_info_align (info, &valign))
goto failed_to_align;
gst_buffer_pool_config_set_video_alignment (config, &valign);
}
if (params->align < max_align) {
GST_WARNING_OBJECT (pool, "allocation params alignment %u is smaller "
"than the max specified video stride alignment %u, fixing",
(guint) params->align, max_align);
params->align = max_align;
gst_buffer_pool_config_set_allocator (config, allocator, params);
}
gst_buffer_pool_config_set_params (config, caps, GST_VIDEO_INFO_SIZE (info),
min_buffers, max_buffers);
return GST_BUFFER_POOL_CLASS (parent_class)->set_config (pool, config);
/* ERRORS */
wrong_config:
{
GST_WARNING_OBJECT (pool, "invalid config");
&max_buffers)) {
GST_WARNING_OBJECT (self, "invalid config");
return FALSE;
}
no_caps:
{
if (!caps) {
GST_WARNING_OBJECT (pool, "no caps in config");
return FALSE;
}
wrong_caps:
{
GST_WARNING_OBJECT (pool,
"failed getting geometry from caps %" GST_PTR_FORMAT, caps);
if (!gst_video_info_from_caps (&info, caps)) {
GST_WARNING_OBJECT (self, "Failed to convert caps to video-info");
return FALSE;
}
no_allocator:
{
GST_WARNING_OBJECT (pool, "Could not create new CUDA allocator");
gst_clear_object (&priv->allocator);
priv->allocator = (GstCudaAllocator *)
gst_allocator_find (GST_CUDA_MEMORY_TYPE_NAME);
if (!priv->allocator) {
GST_WARNING_OBJECT (self, "CudaAllocator is unavailable");
return FALSE;
}
wrong_allocator:
{
GST_WARNING_OBJECT (pool, "Incorrect allocator type for this pool");
return FALSE;
}
failed_to_align:
{
GST_WARNING_OBJECT (pool, "Failed to align");
mem = gst_cuda_allocator_alloc (priv->allocator, self->context, &info);
if (!mem) {
GST_WARNING_OBJECT (self, "Failed to allocate memory");
return FALSE;
}
cmem = GST_CUDA_MEMORY_CAST (mem);
gst_buffer_pool_config_set_params (config, caps,
GST_VIDEO_INFO_SIZE (&cmem->info), min_buffers, max_buffers);
priv->info = info;
gst_memory_unref (mem);
return GST_BUFFER_POOL_CLASS (parent_class)->set_config (pool, config);
}
static GstFlowReturn
gst_cuda_buffer_pool_alloc (GstBufferPool * pool, GstBuffer ** buffer,
GstBufferPoolAcquireParams * params)
{
GstCudaBufferPool *cuda_pool = GST_CUDA_BUFFER_POOL_CAST (pool);
GstCudaBufferPoolPrivate *priv = cuda_pool->priv;
GstVideoInfo *info;
GstBuffer *cuda;
GstCudaBufferPool *self = GST_CUDA_BUFFER_POOL_CAST (pool);
GstCudaBufferPoolPrivate *priv = self->priv;
GstVideoInfo *info = &priv->info;
GstBuffer *buf;
GstMemory *mem;
GstCudaMemory *cmem;
info = &priv->params.info;
cuda = gst_buffer_new ();
mem = gst_cuda_allocator_alloc (GST_ALLOCATOR_CAST (priv->allocator),
GST_VIDEO_INFO_SIZE (info), &priv->params);
if (mem == NULL) {
gst_buffer_unref (cuda);
mem = gst_cuda_allocator_alloc (priv->allocator, self->context, &priv->info);
if (!mem) {
GST_WARNING_OBJECT (pool, "Cannot create CUDA memory");
return GST_FLOW_ERROR;
}
gst_buffer_append_memory (cuda, mem);
if (priv->add_videometa) {
GST_DEBUG_OBJECT (pool, "adding GstVideoMeta");
gst_buffer_add_video_meta_full (cuda, GST_VIDEO_FRAME_FLAG_NONE,
GST_VIDEO_INFO_FORMAT (info), GST_VIDEO_INFO_WIDTH (info),
GST_VIDEO_INFO_HEIGHT (info), GST_VIDEO_INFO_N_PLANES (info),
info->offset, info->stride);
}
cmem = GST_CUDA_MEMORY_CAST (mem);
*buffer = cuda;
buf = gst_buffer_new ();
gst_buffer_append_memory (buf, mem);
GST_DEBUG_OBJECT (pool, "adding GstVideoMeta");
gst_buffer_add_video_meta_full (buf, GST_VIDEO_FRAME_FLAG_NONE,
GST_VIDEO_INFO_FORMAT (info), GST_VIDEO_INFO_WIDTH (info),
GST_VIDEO_INFO_HEIGHT (info), GST_VIDEO_INFO_N_PLANES (info),
cmem->info.offset, cmem->info.stride);
*buffer = buf;
return GST_FLOW_OK;
}
@ -208,44 +137,41 @@ gst_cuda_buffer_pool_alloc (GstBufferPool * pool, GstBuffer ** buffer,
GstBufferPool *
gst_cuda_buffer_pool_new (GstCudaContext * context)
{
GstCudaBufferPool *pool;
GstCudaBufferPool *self;
pool = g_object_new (GST_TYPE_CUDA_BUFFER_POOL, NULL);
gst_object_ref_sink (pool);
g_return_val_if_fail (GST_IS_CUDA_CONTEXT (context), NULL);
pool->context = gst_object_ref (context);
self = g_object_new (GST_TYPE_CUDA_BUFFER_POOL, NULL);
gst_object_ref_sink (self);
GST_LOG_OBJECT (pool, "new CUDA buffer pool %p", pool);
self->context = gst_object_ref (context);
return GST_BUFFER_POOL_CAST (pool);
return GST_BUFFER_POOL_CAST (self);
}
static void
gst_cuda_buffer_pool_dispose (GObject * object)
{
GstCudaBufferPool *pool = GST_CUDA_BUFFER_POOL_CAST (object);
GstCudaBufferPoolPrivate *priv = pool->priv;
GST_LOG_OBJECT (pool, "finalize CUDA buffer pool %p", pool);
GstCudaBufferPool *self = GST_CUDA_BUFFER_POOL_CAST (object);
GstCudaBufferPoolPrivate *priv = self->priv;
gst_clear_object (&priv->allocator);
gst_clear_object (&pool->context);
gst_clear_object (&self->context);
G_OBJECT_CLASS (parent_class)->dispose (object);
}
static void
gst_cuda_buffer_pool_class_init (GstCudaBufferPoolClass * klass)
{
GObjectClass *gobject_class = (GObjectClass *) klass;
GstBufferPoolClass *gstbufferpool_class = (GstBufferPoolClass *) klass;
GstBufferPoolClass *bufferpool_class = (GstBufferPoolClass *) klass;
gobject_class->dispose = gst_cuda_buffer_pool_dispose;
gstbufferpool_class->get_options = gst_cuda_buffer_pool_get_options;
gstbufferpool_class->set_config = gst_cuda_buffer_pool_set_config;
gstbufferpool_class->alloc_buffer = gst_cuda_buffer_pool_alloc;
bufferpool_class->get_options = gst_cuda_buffer_pool_get_options;
bufferpool_class->set_config = gst_cuda_buffer_pool_set_config;
bufferpool_class->alloc_buffer = gst_cuda_buffer_pool_alloc;
GST_DEBUG_CATEGORY_INIT (gst_cuda_buffer_pool_debug, "cudabufferpool", 0,
"CUDA Buffer Pool");

View file

@ -180,10 +180,6 @@ gst_cuda_download_propose_allocation (GstBaseTransform * trans,
if (gst_query_get_n_allocation_pools (query) == 0) {
GstCapsFeatures *features;
GstStructure *config;
GstVideoAlignment align;
GstAllocationParams params = { 0, 31, 0, 0, };
GstAllocator *allocator = NULL;
gint i;
features = gst_caps_get_features (caps, 0);
@ -197,38 +193,25 @@ gst_cuda_download_propose_allocation (GstBaseTransform * trans,
config = gst_buffer_pool_get_config (pool);
gst_video_alignment_reset (&align);
for (i = 0; i < GST_VIDEO_INFO_N_PLANES (&info); i++) {
align.stride_align[i] = 31;
}
gst_video_info_align (&info, &align);
gst_buffer_pool_config_add_option (config,
GST_BUFFER_POOL_OPTION_VIDEO_META);
gst_buffer_pool_config_add_option (config,
GST_BUFFER_POOL_OPTION_VIDEO_ALIGNMENT);
gst_buffer_pool_config_set_video_alignment (config, &align);
size = GST_VIDEO_INFO_SIZE (&info);
gst_buffer_pool_config_set_params (config, caps, size, 0, 0);
gst_query_add_allocation_meta (query, GST_VIDEO_META_API_TYPE, NULL);
gst_query_add_allocation_pool (query, pool, size, 0, 0);
if (gst_buffer_pool_config_get_allocator (config, &allocator, &params)) {
if (params.align < 31)
params.align = 31;
gst_query_add_allocation_param (query, allocator, &params);
gst_buffer_pool_config_set_allocator (config, allocator, &params);
}
if (!gst_buffer_pool_set_config (pool, config)) {
GST_ERROR_OBJECT (ctrans, "failed to set config");
gst_object_unref (pool);
return FALSE;
}
/* Get updated size by cuda buffer pool */
config = gst_buffer_pool_get_config (pool);
gst_buffer_pool_config_get_params (config, NULL, &size, NULL, NULL);
gst_structure_free (config);
gst_query_add_allocation_pool (query, pool, size, 0, 0);
gst_object_unref (pool);
}
@ -295,6 +278,12 @@ gst_cuda_download_decide_allocation (GstBaseTransform * trans, GstQuery * query)
gst_buffer_pool_config_add_option (config, GST_BUFFER_POOL_OPTION_VIDEO_META);
gst_buffer_pool_config_set_params (config, outcaps, size, min, max);
gst_buffer_pool_set_config (pool, config);
/* Get updated size by cuda buffer pool */
config = gst_buffer_pool_get_config (pool);
gst_buffer_pool_config_get_params (config, NULL, &size, NULL, NULL);
gst_structure_free (config);
if (update_pool)
gst_query_set_nth_allocation_pool (query, 0, pool, size, min, max);
else

View file

@ -26,20 +26,33 @@
#include <string.h>
GST_DEBUG_CATEGORY_STATIC (cudaallocator_debug);
#define GST_CAT_DEFAULT cudaallocator_debug
GST_DEBUG_CATEGORY_STATIC (GST_CAT_MEMORY);
GST_DEBUG_CATEGORY_STATIC (cuda_allocator_debug);
#define GST_CAT_DEFAULT cuda_allocator_debug
static GstAllocator *_gst_cuda_allocator = NULL;
struct _GstCudaMemoryPrivate
{
CUdeviceptr data;
void *staging;
/* params used for cuMemAllocPitch */
gsize pitch;
guint width_in_bytes;
guint height;
GMutex lock;
};
#define gst_cuda_allocator_parent_class parent_class
G_DEFINE_TYPE (GstCudaAllocator, gst_cuda_allocator, GST_TYPE_ALLOCATOR);
static void gst_cuda_allocator_dispose (GObject * object);
static void gst_cuda_allocator_free (GstAllocator * allocator,
GstMemory * memory);
static gpointer cuda_mem_map (GstCudaMemory * mem, gsize maxsize,
static gpointer cuda_mem_map (GstMemory * mem, gsize maxsize,
GstMapFlags flags);
static void cuda_mem_unmap_full (GstCudaMemory * mem, GstMapInfo * info);
static void cuda_mem_unmap_full (GstMemory * mem, GstMapInfo * info);
static GstMemory *cuda_mem_copy (GstMemory * mem, gssize offset, gssize size);
static GstMemory *
@ -52,17 +65,13 @@ gst_cuda_allocator_dummy_alloc (GstAllocator * allocator, gsize size,
static void
gst_cuda_allocator_class_init (GstCudaAllocatorClass * klass)
{
GObjectClass *gobject_class = G_OBJECT_CLASS (klass);
GstAllocatorClass *allocator_class = GST_ALLOCATOR_CLASS (klass);
gobject_class->dispose = gst_cuda_allocator_dispose;
allocator_class->alloc = GST_DEBUG_FUNCPTR (gst_cuda_allocator_dummy_alloc);
allocator_class->free = GST_DEBUG_FUNCPTR (gst_cuda_allocator_free);
GST_DEBUG_CATEGORY_INIT (cudaallocator_debug, "cudaallocator", 0,
GST_DEBUG_CATEGORY_INIT (cuda_allocator_debug, "cudaallocator", 0,
"CUDA Allocator");
GST_DEBUG_CATEGORY_GET (GST_CAT_MEMORY, "GST_MEMORY");
}
static void
@ -74,337 +83,273 @@ gst_cuda_allocator_init (GstCudaAllocator * allocator)
alloc->mem_type = GST_CUDA_MEMORY_TYPE_NAME;
alloc->mem_map = (GstMemoryMapFunction) cuda_mem_map;
alloc->mem_unmap_full = (GstMemoryUnmapFullFunction) cuda_mem_unmap_full;
alloc->mem_copy = (GstMemoryCopyFunction) cuda_mem_copy;
alloc->mem_map = cuda_mem_map;
alloc->mem_unmap_full = cuda_mem_unmap_full;
alloc->mem_copy = cuda_mem_copy;
GST_OBJECT_FLAG_SET (allocator, GST_ALLOCATOR_FLAG_CUSTOM_ALLOC);
}
static void
gst_cuda_allocator_dispose (GObject * object)
static GstMemory *
gst_cuda_allocator_alloc_internal (GstCudaAllocator * self,
GstCudaContext * context, const GstVideoInfo * info,
guint width_in_bytes, guint alloc_height)
{
GstCudaAllocator *self = GST_CUDA_ALLOCATOR_CAST (object);
GST_DEBUG_OBJECT (self, "dispose");
gst_clear_object (&self->context);
G_OBJECT_CLASS (parent_class)->dispose (object);
}
GstMemory *
gst_cuda_allocator_alloc (GstAllocator * allocator, gsize size,
GstCudaAllocationParams * params)
{
GstCudaAllocator *self = GST_CUDA_ALLOCATOR_CAST (allocator);
gsize maxsize = size + params->parent.prefix + params->parent.padding;
gsize align = params->parent.align;
gsize offset = params->parent.prefix;
GstMemoryFlags flags = params->parent.flags;
GstCudaMemoryPrivate *priv;
GstCudaMemory *mem;
CUdeviceptr data;
gboolean ret = FALSE;
GstCudaMemory *mem;
GstVideoInfo *info = &params->info;
gint i;
guint width, height;
gsize stride, plane_offset;
gsize pitch;
guint height = GST_VIDEO_INFO_HEIGHT (info);
GstVideoInfo *alloc_info;
if (!gst_cuda_context_push (self->context))
if (!gst_cuda_context_push (context))
return NULL;
/* ensure configured alignment */
align |= gst_memory_alignment;
/* allocate more to compensate for alignment */
maxsize += align;
GST_CAT_DEBUG_OBJECT (GST_CAT_MEMORY, self, "allocate new cuda memory");
width = GST_VIDEO_INFO_COMP_WIDTH (info, 0) *
GST_VIDEO_INFO_COMP_PSTRIDE (info, 0);
height = 0;
for (i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++)
height += GST_VIDEO_INFO_COMP_HEIGHT (info, i);
ret = gst_cuda_result (CuMemAllocPitch (&data, &stride, width, height, 16));
ret = gst_cuda_result (CuMemAllocPitch (&data, &pitch, width_in_bytes,
alloc_height, 16));
gst_cuda_context_pop (NULL);
if (G_UNLIKELY (!ret)) {
GST_CAT_ERROR_OBJECT (GST_CAT_MEMORY, self, "CUDA allocation failure");
if (!ret) {
GST_ERROR_OBJECT (self, "Failed to allocate CUDA memory");
return NULL;
}
mem = g_new0 (GstCudaMemory, 1);
g_mutex_init (&mem->lock);
mem->data = data;
mem->alloc_params = *params;
mem->stride = stride;
mem->priv = priv = g_new0 (GstCudaMemoryPrivate, 1);
plane_offset = 0;
for (i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) {
mem->offset[i] = plane_offset;
plane_offset += stride * GST_VIDEO_INFO_COMP_HEIGHT (info, i);
priv->data = data;
priv->pitch = pitch;
priv->width_in_bytes = width_in_bytes;
priv->height = alloc_height;
g_mutex_init (&priv->lock);
mem->context = gst_object_ref (context);
mem->info = *info;
mem->info.size = pitch * alloc_height;
alloc_info = &mem->info;
gst_memory_init (GST_MEMORY_CAST (mem), 0, GST_ALLOCATOR_CAST (self),
NULL, alloc_info->size, 0, 0, alloc_info->size);
switch (GST_VIDEO_INFO_FORMAT (info)) {
case GST_VIDEO_FORMAT_I420:
case GST_VIDEO_FORMAT_YV12:
case GST_VIDEO_FORMAT_I420_10LE:
/* we are wasting space yes, but required so that this memory
* can be used in kernel function */
alloc_info->stride[0] = pitch;
alloc_info->stride[1] = pitch;
alloc_info->stride[2] = pitch;
alloc_info->offset[0] = 0;
alloc_info->offset[1] = alloc_info->stride[0] * height;
alloc_info->offset[2] = alloc_info->offset[1] +
alloc_info->stride[1] * height / 2;
break;
case GST_VIDEO_FORMAT_NV12:
case GST_VIDEO_FORMAT_NV21:
case GST_VIDEO_FORMAT_P010_10LE:
case GST_VIDEO_FORMAT_P016_LE:
alloc_info->stride[0] = pitch;
alloc_info->stride[1] = pitch;
alloc_info->offset[0] = 0;
alloc_info->offset[1] = alloc_info->stride[0] * height;
break;
case GST_VIDEO_FORMAT_Y444:
case GST_VIDEO_FORMAT_Y444_16LE:
alloc_info->stride[0] = pitch;
alloc_info->stride[1] = pitch;
alloc_info->stride[2] = pitch;
alloc_info->offset[0] = 0;
alloc_info->offset[1] = alloc_info->stride[0] * height;
alloc_info->offset[2] = alloc_info->offset[1] * 2;
break;
case GST_VIDEO_FORMAT_BGRA:
case GST_VIDEO_FORMAT_RGBA:
case GST_VIDEO_FORMAT_RGBx:
case GST_VIDEO_FORMAT_BGRx:
case GST_VIDEO_FORMAT_ARGB:
case GST_VIDEO_FORMAT_ABGR:
case GST_VIDEO_FORMAT_RGB:
case GST_VIDEO_FORMAT_BGR:
case GST_VIDEO_FORMAT_BGR10A2_LE:
case GST_VIDEO_FORMAT_RGB10A2_LE:
alloc_info->stride[0] = pitch;
alloc_info->offset[0] = 0;
break;
default:
GST_ERROR_OBJECT (self, "Unexpected format %s",
gst_video_format_to_string (GST_VIDEO_INFO_FORMAT (info)));
g_assert_not_reached ();
gst_memory_unref (GST_MEMORY_CAST (mem));
return NULL;
}
mem->context = gst_object_ref (self->context);
gst_memory_init (GST_MEMORY_CAST (mem),
flags, GST_ALLOCATOR_CAST (self), NULL, maxsize, align, offset, size);
return GST_MEMORY_CAST (mem);
}
static void
gst_cuda_allocator_free (GstAllocator * allocator, GstMemory * memory)
{
GstCudaAllocator *self = GST_CUDA_ALLOCATOR_CAST (allocator);
GstCudaMemory *mem = GST_CUDA_MEMORY_CAST (memory);
GstCudaMemoryPrivate *priv = mem->priv;
GST_CAT_DEBUG_OBJECT (GST_CAT_MEMORY, allocator, "free cuda memory");
g_mutex_clear (&mem->lock);
gst_cuda_context_push (self->context);
if (mem->data)
gst_cuda_result (CuMemFree (mem->data));
if (mem->map_alloc_data)
gst_cuda_result (CuMemFreeHost (mem->map_alloc_data));
gst_cuda_context_push (mem->context);
if (priv->data)
gst_cuda_result (CuMemFree (priv->data));
if (priv->staging)
gst_cuda_result (CuMemFreeHost (priv->staging));
gst_cuda_context_pop (NULL);
gst_object_unref (mem->context);
g_mutex_clear (&priv->lock);
g_free (mem->priv);
g_free (mem);
}
/* called with lock */
static gboolean
gst_cuda_memory_upload_transfer (GstCudaMemory * mem)
gst_cuda_memory_upload (GstCudaAllocator * self, GstCudaMemory * mem)
{
gint i;
GstVideoInfo *info = &mem->alloc_params.info;
GstCudaMemoryPrivate *priv = mem->priv;
gboolean ret = TRUE;
CUDA_MEMCPY2D param = { 0, };
if (!mem->map_data) {
GST_CAT_ERROR (GST_CAT_MEMORY, "no staging memory to upload");
if (!priv->staging ||
!GST_MEMORY_FLAG_IS_SET (mem, GST_CUDA_MEMORY_TRANSFER_NEED_UPLOAD)) {
return TRUE;
}
if (!gst_cuda_context_push (mem->context)) {
GST_ERROR_OBJECT (self, "Failed to push cuda context");
return FALSE;
}
for (i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) {
CUDA_MEMCPY2D param = { 0, };
param.srcMemoryType = CU_MEMORYTYPE_HOST;
param.srcHost = priv->staging;
param.srcPitch = priv->pitch;
param.srcMemoryType = CU_MEMORYTYPE_HOST;
param.srcHost =
(guint8 *) mem->map_data + GST_VIDEO_INFO_PLANE_OFFSET (info, i);
param.srcPitch = GST_VIDEO_INFO_PLANE_STRIDE (info, i);
param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
param.dstDevice = (CUdeviceptr) priv->data;
param.dstPitch = priv->pitch;
param.WidthInBytes = priv->width_in_bytes;
param.Height = priv->height;
param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
param.dstDevice = mem->data + mem->offset[i];
param.dstPitch = mem->stride;
param.WidthInBytes = GST_VIDEO_INFO_COMP_WIDTH (info, i) *
GST_VIDEO_INFO_COMP_PSTRIDE (info, i);
param.Height = GST_VIDEO_INFO_COMP_HEIGHT (info, i);
ret = gst_cuda_result (CuMemcpy2D (&param));
gst_cuda_context_pop (NULL);
if (!gst_cuda_result (CuMemcpy2DAsync (&param, NULL))) {
GST_CAT_ERROR (GST_CAT_MEMORY, "Failed to copy %dth plane", i);
ret = FALSE;
break;
}
}
gst_cuda_result (CuStreamSynchronize (NULL));
if (!ret)
GST_ERROR_OBJECT (self, "Failed to upload memory");
return ret;
}
/* called with lock */
static gboolean
gst_cuda_memory_download_transfer (GstCudaMemory * mem)
gst_cuda_memory_download (GstCudaAllocator * self, GstCudaMemory * mem)
{
gint i;
GstVideoInfo *info = &mem->alloc_params.info;
GstCudaMemoryPrivate *priv = mem->priv;
gboolean ret = TRUE;
CUDA_MEMCPY2D param = { 0, };
if (!mem->map_data) {
GST_CAT_ERROR (GST_CAT_MEMORY, "no staging memory to upload");
if (!GST_MEMORY_FLAG_IS_SET (mem, GST_CUDA_MEMORY_TRANSFER_NEED_DOWNLOAD))
return TRUE;
if (!gst_cuda_context_push (mem->context)) {
GST_ERROR_OBJECT (self, "Failed to push cuda context");
return FALSE;
}
for (i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) {
CUDA_MEMCPY2D param = { 0, };
param.srcMemoryType = CU_MEMORYTYPE_DEVICE;
param.srcDevice = mem->data + mem->offset[i];
param.srcPitch = mem->stride;
param.dstMemoryType = CU_MEMORYTYPE_HOST;
param.dstHost =
(guint8 *) mem->map_data + GST_VIDEO_INFO_PLANE_OFFSET (info, i);
param.dstPitch = GST_VIDEO_INFO_PLANE_STRIDE (info, i);
param.WidthInBytes = GST_VIDEO_INFO_COMP_WIDTH (info, i) *
GST_VIDEO_INFO_COMP_PSTRIDE (info, i);
param.Height = GST_VIDEO_INFO_COMP_HEIGHT (info, i);
if (!gst_cuda_result (CuMemcpy2DAsync (&param, NULL))) {
GST_CAT_ERROR (GST_CAT_MEMORY, "Failed to copy %dth plane", i);
CuMemFreeHost (mem->map_alloc_data);
mem->map_alloc_data = mem->map_data = mem->align_data = NULL;
break;
}
}
gst_cuda_result (CuStreamSynchronize (NULL));
if (!mem->map_data)
return FALSE;
return TRUE;
}
static gpointer
gst_cuda_memory_device_memory_map (GstCudaMemory * mem)
{
GstMemory *memory = GST_MEMORY_CAST (mem);
gpointer data;
gsize aoffset;
gsize align = memory->align;
if (mem->map_data) {
return mem->map_data;
}
GST_CAT_DEBUG (GST_CAT_MEMORY, "alloc host memory for map");
if (!mem->map_alloc_data) {
gsize maxsize;
guint8 *align_data;
maxsize = memory->maxsize + align;
if (!gst_cuda_context_push (mem->context)) {
GST_CAT_ERROR (GST_CAT_MEMORY, "cannot push cuda context");
return NULL;
}
if (!gst_cuda_result (CuMemAllocHost (&data, maxsize))) {
GST_CAT_ERROR (GST_CAT_MEMORY, "cannot alloc host memory");
if (!priv->staging) {
ret = gst_cuda_result (CuMemAllocHost (&priv->staging,
GST_MEMORY_CAST (mem)->size));
if (!ret) {
GST_ERROR_OBJECT (self, "Failed to allocate staging memory");
gst_cuda_context_pop (NULL);
return NULL;
}
if (!gst_cuda_context_pop (NULL)) {
GST_CAT_WARNING (GST_CAT_MEMORY, "cannot pop cuda context");
}
mem->map_alloc_data = data;
align_data = data;
/* do align */
if ((aoffset = ((guintptr) align_data & align))) {
aoffset = (align + 1) - aoffset;
align_data += aoffset;
}
mem->align_data = align_data;
/* first memory, always need download to staging */
GST_MINI_OBJECT_FLAG_SET (mem, GST_CUDA_MEMORY_TRANSFER_NEED_DOWNLOAD);
}
mem->map_data = mem->align_data;
if (GST_MEMORY_FLAG_IS_SET (mem, GST_CUDA_MEMORY_TRANSFER_NEED_DOWNLOAD)) {
if (!gst_cuda_context_push (mem->context)) {
GST_CAT_ERROR (GST_CAT_MEMORY, "cannot push cuda context");
return NULL;
}
gst_cuda_memory_download_transfer (mem);
if (!gst_cuda_context_pop (NULL)) {
GST_CAT_WARNING (GST_CAT_MEMORY, "cannot pop cuda context");
return FALSE;
}
}
return mem->map_data;
param.srcMemoryType = CU_MEMORYTYPE_DEVICE;
param.srcDevice = (CUdeviceptr) priv->data;
param.srcPitch = priv->pitch;
param.dstMemoryType = CU_MEMORYTYPE_HOST;
param.dstHost = priv->staging;
param.dstPitch = priv->pitch;
param.WidthInBytes = priv->width_in_bytes;
param.Height = priv->height;
ret = gst_cuda_result (CuMemcpy2D (&param));
gst_cuda_context_pop (NULL);
if (!ret)
GST_ERROR_OBJECT (self, "Failed to upload memory");
return ret;
}
static gpointer
cuda_mem_map (GstCudaMemory * mem, gsize maxsize, GstMapFlags flags)
cuda_mem_map (GstMemory * mem, gsize maxsize, GstMapFlags flags)
{
GstCudaAllocator *self = GST_CUDA_ALLOCATOR (mem->allocator);
GstCudaMemory *cmem = GST_CUDA_MEMORY_CAST (mem);
GstCudaMemoryPrivate *priv = cmem->priv;
gpointer ret = NULL;
g_mutex_lock (&mem->lock);
mem->map_count++;
g_mutex_lock (&priv->lock);
if ((flags & GST_MAP_CUDA) == GST_MAP_CUDA) {
/* upload from staging to device memory if necessary */
if (GST_MEMORY_FLAG_IS_SET (mem, GST_CUDA_MEMORY_TRANSFER_NEED_UPLOAD)) {
if (!gst_cuda_context_push (mem->context)) {
GST_CAT_ERROR (GST_CAT_MEMORY, "cannot push cuda context");
g_mutex_unlock (&mem->lock);
return NULL;
}
if (!gst_cuda_memory_upload_transfer (mem)) {
g_mutex_unlock (&mem->lock);
return NULL;
}
gst_cuda_context_pop (NULL);
}
if (!gst_cuda_memory_upload (self, cmem))
goto out;
GST_MEMORY_FLAG_UNSET (mem, GST_CUDA_MEMORY_TRANSFER_NEED_UPLOAD);
if ((flags & GST_MAP_WRITE) == GST_MAP_WRITE)
GST_MINI_OBJECT_FLAG_SET (mem, GST_CUDA_MEMORY_TRANSFER_NEED_DOWNLOAD);
g_mutex_unlock (&mem->lock);
return (gpointer) mem->data;
ret = (gpointer) priv->data;
goto out;
}
ret = gst_cuda_memory_device_memory_map (mem);
if (ret == NULL) {
mem->map_count--;
g_mutex_unlock (&mem->lock);
return NULL;
}
/* First CPU access, must be downloaded */
if (!priv->staging)
GST_MINI_OBJECT_FLAG_SET (mem, GST_CUDA_MEMORY_TRANSFER_NEED_DOWNLOAD);
if (!gst_cuda_memory_download (self, cmem))
goto out;
ret = priv->staging;
if ((flags & GST_MAP_WRITE) == GST_MAP_WRITE)
GST_MINI_OBJECT_FLAG_SET (mem, GST_CUDA_MEMORY_TRANSFER_NEED_UPLOAD);
GST_MEMORY_FLAG_UNSET (mem, GST_CUDA_MEMORY_TRANSFER_NEED_DOWNLOAD);
g_mutex_unlock (&mem->lock);
out:
g_mutex_unlock (&priv->lock);
return ret;
}
static void
cuda_mem_unmap_full (GstCudaMemory * mem, GstMapInfo * info)
cuda_mem_unmap_full (GstMemory * mem, GstMapInfo * info)
{
g_mutex_lock (&mem->lock);
mem->map_count--;
GST_CAT_TRACE (GST_CAT_MEMORY,
"unmap CUDA memory %p, map count %d, have map_data %s",
mem, mem->map_count, mem->map_data ? "true" : "false");
GstCudaMemory *cmem = GST_CUDA_MEMORY_CAST (mem);
GstCudaMemoryPrivate *priv = cmem->priv;
g_mutex_lock (&priv->lock);
if ((info->flags & GST_MAP_CUDA) == GST_MAP_CUDA) {
if ((info->flags & GST_MAP_WRITE) == GST_MAP_WRITE)
GST_MINI_OBJECT_FLAG_SET (mem, GST_CUDA_MEMORY_TRANSFER_NEED_DOWNLOAD);
g_mutex_unlock (&mem->lock);
return;
goto out;
}
if ((info->flags & GST_MAP_WRITE))
if ((info->flags & GST_MAP_WRITE) == GST_MAP_WRITE)
GST_MINI_OBJECT_FLAG_SET (mem, GST_CUDA_MEMORY_TRANSFER_NEED_UPLOAD);
if (mem->map_count > 0 || !mem->map_data) {
g_mutex_unlock (&mem->lock);
return;
}
mem->map_data = NULL;
g_mutex_unlock (&mem->lock);
out:
g_mutex_unlock (&priv->lock);
return;
}
@ -412,72 +357,82 @@ cuda_mem_unmap_full (GstCudaMemory * mem, GstMapInfo * info)
static GstMemory *
cuda_mem_copy (GstMemory * mem, gssize offset, gssize size)
{
GstMemory *copy;
GstCudaAllocator *self = GST_CUDA_ALLOCATOR (mem->allocator);
GstCudaMemory *src_mem = GST_CUDA_MEMORY_CAST (mem);
GstCudaMemory *dst_mem;
GstCudaContext *ctx = GST_CUDA_ALLOCATOR_CAST (mem->allocator)->context;
gint i;
GstVideoInfo *info;
GstCudaContext *context = src_mem->context;
GstMapInfo src_info, dst_info;
CUDA_MEMCPY2D param = { 0, };
GstMemory *copy;
gboolean ret;
/* offset and size are ignored */
copy = gst_cuda_allocator_alloc (mem->allocator, mem->size,
&src_mem->alloc_params);
copy = gst_cuda_allocator_alloc_internal (self, context,
&src_mem->info, src_mem->priv->width_in_bytes, src_mem->priv->height);
dst_mem = GST_CUDA_MEMORY_CAST (copy);
if (!copy) {
GST_ERROR_OBJECT (self, "Failed to allocate memory for copying");
return NULL;
}
info = &src_mem->alloc_params.info;
if (!gst_memory_map (mem, &src_info, GST_MAP_READ | GST_MAP_CUDA)) {
GST_ERROR_OBJECT (self, "Failed to map src memory");
gst_memory_unref (copy);
return NULL;
}
if (!gst_cuda_context_push (ctx)) {
GST_CAT_ERROR (GST_CAT_MEMORY, "cannot push cuda context");
gst_cuda_allocator_free (mem->allocator, copy);
if (!gst_memory_map (copy, &dst_info, GST_MAP_WRITE | GST_MAP_CUDA)) {
GST_ERROR_OBJECT (self, "Failed to map dst memory");
gst_memory_unmap (mem, &src_info);
gst_memory_unref (copy);
return NULL;
}
if (!gst_cuda_context_push (context)) {
GST_ERROR_OBJECT (self, "Failed to push cuda context");
gst_memory_unmap (mem, &src_info);
gst_memory_unmap (copy, &dst_info);
return NULL;
}
for (i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) {
CUDA_MEMCPY2D param = { 0, };
param.srcMemoryType = CU_MEMORYTYPE_DEVICE;
param.srcDevice = (CUdeviceptr) src_info.data;
param.srcPitch = src_mem->priv->pitch;
param.srcMemoryType = CU_MEMORYTYPE_DEVICE;
param.srcDevice = src_mem->data + src_mem->offset[i];
param.srcPitch = src_mem->stride;
param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
param.dstDevice = (CUdeviceptr) dst_info.data;
param.dstPitch = src_mem->priv->pitch;
param.WidthInBytes = src_mem->priv->width_in_bytes;
param.Height = src_mem->priv->height;
param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
param.dstDevice = dst_mem->data + dst_mem->offset[i];
param.dstPitch = dst_mem->stride;
param.WidthInBytes = GST_VIDEO_INFO_COMP_WIDTH (info, i) *
GST_VIDEO_INFO_COMP_PSTRIDE (info, i);
param.Height = GST_VIDEO_INFO_COMP_HEIGHT (info, i);
ret = gst_cuda_result (CuMemcpy2D (&param));
gst_cuda_context_pop (NULL);
if (!gst_cuda_result (CuMemcpy2DAsync (&param, NULL))) {
GST_CAT_ERROR_OBJECT (GST_CAT_MEMORY,
mem->allocator, "Failed to copy %dth plane", i);
gst_cuda_context_pop (NULL);
gst_cuda_allocator_free (mem->allocator, copy);
gst_memory_unmap (mem, &src_info);
gst_memory_unmap (copy, &dst_info);
return NULL;
}
}
gst_cuda_result (CuStreamSynchronize (NULL));
if (!gst_cuda_context_pop (NULL)) {
GST_CAT_WARNING (GST_CAT_MEMORY, "cannot pop cuda context");
if (!ret) {
GST_ERROR_OBJECT (self, "Failed to copy memory");
gst_memory_unref (copy);
return NULL;
}
return copy;
}
GstAllocator *
gst_cuda_allocator_new (GstCudaContext * context)
void
gst_cuda_memory_init_once (void)
{
GstCudaAllocator *allocator;
static gsize _init = 0;
g_return_val_if_fail (GST_IS_CUDA_CONTEXT (context), NULL);
if (g_once_init_enter (&_init)) {
_gst_cuda_allocator =
(GstAllocator *) g_object_new (GST_TYPE_CUDA_ALLOCATOR, NULL);
gst_object_ref_sink (_gst_cuda_allocator);
allocator = g_object_new (GST_TYPE_CUDA_ALLOCATOR, NULL);
allocator->context = gst_object_ref (context);
return GST_ALLOCATOR_CAST (allocator);
gst_allocator_register (GST_CUDA_MEMORY_TYPE_NAME, _gst_cuda_allocator);
g_once_init_leave (&_init, 1);
}
}
gboolean
@ -486,3 +441,51 @@ gst_is_cuda_memory (GstMemory * mem)
return mem != NULL && mem->allocator != NULL &&
GST_IS_CUDA_ALLOCATOR (mem->allocator);
}
GstMemory *
gst_cuda_allocator_alloc (GstCudaAllocator * allocator,
GstCudaContext * context, const GstVideoInfo * info)
{
guint alloc_height;
g_return_val_if_fail (GST_IS_CUDA_ALLOCATOR (allocator), NULL);
g_return_val_if_fail (GST_IS_CUDA_CONTEXT (context), NULL);
g_return_val_if_fail (info != NULL, NULL);
alloc_height = GST_VIDEO_INFO_HEIGHT (info);
/* make sure valid height for subsampled formats */
switch (GST_VIDEO_INFO_FORMAT (info)) {
case GST_VIDEO_FORMAT_I420:
case GST_VIDEO_FORMAT_YV12:
case GST_VIDEO_FORMAT_NV12:
case GST_VIDEO_FORMAT_P010_10LE:
case GST_VIDEO_FORMAT_P016_LE:
case GST_VIDEO_FORMAT_I420_10LE:
alloc_height = GST_ROUND_UP_2 (alloc_height);
break;
default:
break;
}
switch (GST_VIDEO_INFO_FORMAT (info)) {
case GST_VIDEO_FORMAT_I420:
case GST_VIDEO_FORMAT_YV12:
case GST_VIDEO_FORMAT_I420_10LE:
case GST_VIDEO_FORMAT_NV12:
case GST_VIDEO_FORMAT_NV21:
case GST_VIDEO_FORMAT_P010_10LE:
case GST_VIDEO_FORMAT_P016_LE:
alloc_height *= 2;
break;
case GST_VIDEO_FORMAT_Y444:
case GST_VIDEO_FORMAT_Y444_16LE:
alloc_height *= 3;
break;
default:
break;
}
return gst_cuda_allocator_alloc_internal (allocator, context,
info, info->stride[0], alloc_height);
}

View file

@ -37,10 +37,11 @@ G_BEGIN_DECLS
#define GST_CUDA_ALLOCATOR_CAST(obj) ((GstCudaAllocator *)(obj))
#define GST_CUDA_MEMORY_CAST(mem) ((GstCudaMemory *) (mem))
typedef struct _GstCudaAllocationParams GstCudaAllocationParams;
typedef struct _GstCudaAllocator GstCudaAllocator;
typedef struct _GstCudaAllocatorClass GstCudaAllocatorClass;
typedef struct _GstCudaMemory GstCudaMemory;
typedef struct _GstCudaMemoryPrivate GstCudaMemoryPrivate;
/**
* GST_MAP_CUDA:
@ -65,32 +66,6 @@ typedef struct _GstCudaMemory GstCudaMemory;
*/
#define GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY "memory:CUDAMemory"
struct _GstCudaAllocationParams
{
GstAllocationParams parent;
GstVideoInfo info;
};
struct _GstCudaAllocator
{
GstAllocator parent;
GstCudaContext *context;
};
struct _GstCudaAllocatorClass
{
GstAllocatorClass parent_class;
};
GType gst_cuda_allocator_get_type (void);
GstAllocator * gst_cuda_allocator_new (GstCudaContext * context);
GstMemory * gst_cuda_allocator_alloc (GstAllocator * allocator,
gsize size,
GstCudaAllocationParams * params);
/**
* GstCudaMemoryTransfer:
* @GST_CUDA_MEMORY_TRANSFER_NEED_DOWNLOAD: the device memory needs downloading
@ -106,32 +81,36 @@ typedef enum
struct _GstCudaMemory
{
GstMemory mem;
GstMemory mem;
/*< public >*/
GstCudaContext *context;
CUdeviceptr data;
GstVideoInfo info;
GstCudaAllocationParams alloc_params;
/* offset and stride of CUDA device memory */
gsize offset[GST_VIDEO_MAX_PLANES];
gint stride;
/* allocated CUDA Host memory */
gpointer map_alloc_data;
/* aligned CUDA Host memory */
guint8 *align_data;
/* pointing align_data if the memory is mapped */
gpointer map_data;
gint map_count;
GMutex lock;
/*< private >*/
GstCudaMemoryPrivate *priv;
gpointer _gst_reserved[GST_PADDING];
};
gboolean gst_is_cuda_memory (GstMemory * mem);
struct _GstCudaAllocator
{
GstAllocator parent;
};
struct _GstCudaAllocatorClass
{
GstAllocatorClass parent_class;
};
void gst_cuda_memory_init_once (void);
gboolean gst_is_cuda_memory (GstMemory * mem);
GType gst_cuda_allocator_get_type (void);
GstMemory * gst_cuda_allocator_alloc (GstCudaAllocator * allocator,
GstCudaContext * context,
const GstVideoInfo * info);
G_END_DECLS

View file

@ -174,10 +174,6 @@ gst_cuda_upload_propose_allocation (GstBaseTransform * trans,
if (gst_query_get_n_allocation_pools (query) == 0) {
GstCapsFeatures *features;
GstStructure *config;
GstVideoAlignment align;
GstAllocationParams params = { 0, 31, 0, 0, };
GstAllocator *allocator = NULL;
gint i;
features = gst_caps_get_features (caps, 0);
@ -191,38 +187,25 @@ gst_cuda_upload_propose_allocation (GstBaseTransform * trans,
config = gst_buffer_pool_get_config (pool);
gst_video_alignment_reset (&align);
for (i = 0; i < GST_VIDEO_INFO_N_PLANES (&info); i++) {
align.stride_align[i] = 31;
}
gst_video_info_align (&info, &align);
gst_buffer_pool_config_add_option (config,
GST_BUFFER_POOL_OPTION_VIDEO_META);
gst_buffer_pool_config_add_option (config,
GST_BUFFER_POOL_OPTION_VIDEO_ALIGNMENT);
gst_buffer_pool_config_set_video_alignment (config, &align);
size = GST_VIDEO_INFO_SIZE (&info);
gst_buffer_pool_config_set_params (config, caps, size, 0, 0);
gst_query_add_allocation_meta (query, GST_VIDEO_META_API_TYPE, NULL);
gst_query_add_allocation_pool (query, pool, size, 0, 0);
if (gst_buffer_pool_config_get_allocator (config, &allocator, &params)) {
if (params.align < 31)
params.align = 31;
gst_query_add_allocation_param (query, allocator, &params);
gst_buffer_pool_config_set_allocator (config, allocator, &params);
}
if (!gst_buffer_pool_set_config (pool, config)) {
GST_ERROR_OBJECT (ctrans, "failed to set config");
gst_object_unref (pool);
return FALSE;
}
/* Get updated size by cuda buffer pool */
config = gst_buffer_pool_get_config (pool);
gst_buffer_pool_config_get_params (config, NULL, &size, NULL, NULL);
gst_structure_free (config);
gst_query_add_allocation_pool (query, pool, size, 0, 0);
gst_object_unref (pool);
}
@ -289,6 +272,12 @@ gst_cuda_upload_decide_allocation (GstBaseTransform * trans, GstQuery * query)
gst_buffer_pool_config_add_option (config, GST_BUFFER_POOL_OPTION_VIDEO_META);
gst_buffer_pool_config_set_params (config, outcaps, size, min, max);
gst_buffer_pool_set_config (pool, config);
/* Get updated size by cuda buffer pool */
config = gst_buffer_pool_get_config (pool);
gst_buffer_pool_config_get_params (config, NULL, &size, NULL, NULL);
gst_structure_free (config);
if (update_pool)
gst_query_set_nth_allocation_pool (query, 0, pool, size, min, max);
else

View file

@ -625,6 +625,7 @@ gst_nv_base_enc_propose_allocation (GstVideoEncoder * enc, GstQuery * query)
GstBufferPool *pool;
GstStructure *config;
GstCapsFeatures *features;
guint size;
GST_DEBUG_OBJECT (nvenc, "propose allocation");
@ -665,18 +666,25 @@ gst_nv_base_enc_propose_allocation (GstVideoEncoder * enc, GstQuery * query)
goto done;
}
config = gst_buffer_pool_get_config (pool);
gst_buffer_pool_config_set_params (config, caps, GST_VIDEO_INFO_SIZE (&info),
nvenc->items->len, nvenc->items->len);
size = GST_VIDEO_INFO_SIZE (&info);
gst_query_add_allocation_pool (query, pool, GST_VIDEO_INFO_SIZE (&info),
config = gst_buffer_pool_get_config (pool);
gst_buffer_pool_config_set_params (config, caps, size,
nvenc->items->len, nvenc->items->len);
gst_buffer_pool_config_add_option (config, GST_BUFFER_POOL_OPTION_VIDEO_META);
gst_query_add_allocation_meta (query, GST_VIDEO_META_API_TYPE, NULL);
if (!gst_buffer_pool_set_config (pool, config))
goto error_pool_config;
/* Get updated size by cuda buffer pool */
config = gst_buffer_pool_get_config (pool);
gst_buffer_pool_config_get_params (config, NULL, &size, NULL, NULL);
gst_structure_free (config);
gst_query_add_allocation_pool (query, pool, size,
nvenc->items->len, nvenc->items->len);
gst_query_add_allocation_meta (query, GST_VIDEO_META_API_TYPE, NULL);
gst_object_unref (pool);
done:
@ -2231,17 +2239,12 @@ gst_nv_base_enc_upload_frame (GstNvBaseEnc * nvenc, GstVideoFrame * frame,
CUdeviceptr dst = resource->cuda_pointer;
GstVideoInfo *info = &frame->info;
CUresult cuda_ret;
GstCudaMemory *cuda_mem = NULL;
if (!gst_cuda_context_push (nvenc->cuda_ctx)) {
GST_ERROR_OBJECT (nvenc, "cannot push context");
return FALSE;
}
if (use_device_memory) {
cuda_mem = (GstCudaMemory *) gst_buffer_peek_memory (frame->buffer, 0);
}
for (i = 0; i < GST_VIDEO_FRAME_N_PLANES (frame); i++) {
CUDA_MEMCPY2D param = { 0, };
guint dest_stride = _get_cuda_device_stride (&nvenc->input_info, i,
@ -2249,13 +2252,12 @@ gst_nv_base_enc_upload_frame (GstNvBaseEnc * nvenc, GstVideoFrame * frame,
if (use_device_memory) {
param.srcMemoryType = CU_MEMORYTYPE_DEVICE;
param.srcDevice = cuda_mem->data + cuda_mem->offset[i];
param.srcPitch = cuda_mem->stride;
param.srcDevice = (CUdeviceptr) GST_VIDEO_FRAME_PLANE_DATA (frame, i);
} else {
param.srcMemoryType = CU_MEMORYTYPE_HOST;
param.srcHost = GST_VIDEO_FRAME_PLANE_DATA (frame, i);
param.srcPitch = GST_VIDEO_FRAME_PLANE_STRIDE (frame, i);
}
param.srcPitch = GST_VIDEO_FRAME_PLANE_STRIDE (frame, i);
param.dstMemoryType = CU_MEMORYTYPE_DEVICE;
param.dstDevice = dst;

View file

@ -1241,34 +1241,25 @@ gst_nvdec_copy_device_to_memory (GstNvDec * nvdec,
GstVideoInfo *info = &nvdec->output_state->info;
gint i;
GstMemory *mem;
GstCudaMemory *cuda_mem = NULL;
if (!gst_cuda_context_push (nvdec->cuda_ctx)) {
GST_WARNING_OBJECT (nvdec, "failed to lock CUDA context");
return FALSE;
}
gboolean use_device_copy = FALSE;
GstMapFlags map_flags = GST_MAP_WRITE;
if (nvdec->mem_type == GST_NVDEC_MEM_TYPE_CUDA &&
(mem = gst_buffer_peek_memory (output_buffer, 0)) &&
gst_is_cuda_memory (mem)) {
GstCudaMemory *cmem = GST_CUDA_MEMORY_CAST (mem);
if (cmem->context == nvdec->cuda_ctx ||
gst_cuda_context_get_handle (cmem->context) ==
gst_cuda_context_get_handle (nvdec->cuda_ctx) ||
(gst_cuda_context_can_access_peer (cmem->context, nvdec->cuda_ctx) &&
gst_cuda_context_can_access_peer (nvdec->cuda_ctx,
cmem->context))) {
cuda_mem = cmem;
}
map_flags |= GST_MAP_CUDA;
use_device_copy = TRUE;
}
if (!cuda_mem) {
if (!gst_video_frame_map (&video_frame, info, output_buffer, GST_MAP_WRITE)) {
GST_ERROR_OBJECT (nvdec, "frame map failure");
gst_cuda_context_pop (NULL);
return FALSE;
}
if (!gst_video_frame_map (&video_frame, info, output_buffer, map_flags)) {
GST_ERROR_OBJECT (nvdec, "frame map failure");
return FALSE;
}
if (!gst_cuda_context_push (nvdec->cuda_ctx)) {
gst_video_frame_unmap (&video_frame);
GST_WARNING_OBJECT (nvdec, "failed to lock CUDA context");
return FALSE;
}
params.progressive_frame = dispinfo->progressive_frame;
@ -1286,17 +1277,17 @@ gst_nvdec_copy_device_to_memory (GstNvDec * nvdec,
copy_params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
copy_params.srcPitch = pitch;
copy_params.dstMemoryType =
cuda_mem ? CU_MEMORYTYPE_DEVICE : CU_MEMORYTYPE_HOST;
use_device_copy ? CU_MEMORYTYPE_DEVICE : CU_MEMORYTYPE_HOST;
for (i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) {
copy_params.srcDevice = dptr + (i * pitch * GST_VIDEO_INFO_HEIGHT (info));
if (cuda_mem) {
copy_params.dstDevice = cuda_mem->data + cuda_mem->offset[i];
copy_params.dstPitch = cuda_mem->stride;
if (use_device_copy) {
copy_params.dstDevice =
(CUdeviceptr) GST_VIDEO_FRAME_PLANE_DATA (&video_frame, i);
} else {
copy_params.dstHost = GST_VIDEO_FRAME_PLANE_DATA (&video_frame, i);
copy_params.dstPitch = GST_VIDEO_FRAME_PLANE_STRIDE (&video_frame, i);
}
copy_params.dstPitch = GST_VIDEO_FRAME_PLANE_STRIDE (&video_frame, i);
copy_params.WidthInBytes = GST_VIDEO_INFO_COMP_WIDTH (info, i)
* GST_VIDEO_INFO_COMP_PSTRIDE (info, i);
copy_params.Height = GST_VIDEO_INFO_COMP_HEIGHT (info, i);
@ -1304,8 +1295,7 @@ gst_nvdec_copy_device_to_memory (GstNvDec * nvdec,
if (!gst_cuda_result (CuMemcpy2DAsync (&copy_params, nvdec->cuda_stream))) {
GST_ERROR_OBJECT (nvdec, "failed to copy %dth plane", i);
CuvidUnmapVideoFrame (nvdec->decoder, dptr);
if (!cuda_mem)
gst_video_frame_unmap (&video_frame);
gst_video_frame_unmap (&video_frame);
gst_cuda_context_pop (NULL);
return FALSE;
}
@ -1313,8 +1303,7 @@ gst_nvdec_copy_device_to_memory (GstNvDec * nvdec,
gst_cuda_result (CuStreamSynchronize (nvdec->cuda_stream));
if (!cuda_mem)
gst_video_frame_unmap (&video_frame);
gst_video_frame_unmap (&video_frame);
if (!gst_cuda_result (CuvidUnmapVideoFrame (nvdec->decoder, dptr)))
GST_WARNING_OBJECT (nvdec, "failed to unmap video frame");
@ -1558,9 +1547,15 @@ gst_nvdec_ensure_cuda_pool (GstNvDec * nvdec, GstQuery * query)
n = gst_query_get_n_allocation_pools (query);
if (n > 0) {
gst_query_parse_nth_allocation_pool (query, 0, &pool, &size, &min, &max);
if (pool && !GST_IS_CUDA_BUFFER_POOL (pool)) {
gst_object_unref (pool);
pool = NULL;
if (pool) {
if (!GST_IS_CUDA_BUFFER_POOL (pool)) {
gst_clear_object (&pool);
} else {
GstCudaBufferPool *cpool = GST_CUDA_BUFFER_POOL (pool);
if (cpool->context != nvdec->cuda_ctx)
gst_clear_object (&pool);
}
}
}
@ -1578,6 +1573,12 @@ gst_nvdec_ensure_cuda_pool (GstNvDec * nvdec, GstQuery * query)
gst_buffer_pool_config_set_params (config, outcaps, size, min, max);
gst_buffer_pool_config_add_option (config, GST_BUFFER_POOL_OPTION_VIDEO_META);
gst_buffer_pool_set_config (pool, config);
/* Get updated size by cuda buffer pool */
config = gst_buffer_pool_get_config (pool);
gst_buffer_pool_config_get_params (config, NULL, &size, NULL, NULL);
gst_structure_free (config);
if (n > 0)
gst_query_set_nth_allocation_pool (query, 0, pool, size, min, max);
else

View file

@ -740,33 +740,24 @@ gst_nv_decoder_copy_frame_to_cuda (GstNvDecoder * decoder,
{
CUDA_MEMCPY2D copy_params = { 0, };
GstMemory *mem;
GstCudaMemory *cuda_mem = NULL;
gint i;
gboolean ret = FALSE;
GstVideoFrame video_frame;
mem = gst_buffer_peek_memory (buffer, 0);
if (!gst_is_cuda_memory (mem)) {
GST_WARNING_OBJECT (decoder, "Not a CUDA memory");
return FALSE;
} else {
GstCudaMemory *cmem = GST_CUDA_MEMORY_CAST (mem);
if (cmem->context == decoder->context ||
gst_cuda_context_get_handle (cmem->context) ==
gst_cuda_context_get_handle (decoder->context) ||
(gst_cuda_context_can_access_peer (cmem->context, decoder->context) &&
gst_cuda_context_can_access_peer (decoder->context,
cmem->context))) {
cuda_mem = cmem;
}
}
if (!cuda_mem) {
GST_WARNING_OBJECT (decoder, "Access to CUDA memory is not allowed");
if (!gst_video_frame_map (&video_frame,
&decoder->info, buffer, GST_MAP_WRITE | GST_MAP_CUDA)) {
GST_ERROR_OBJECT (decoder, "frame map failure");
return FALSE;
}
if (!gst_cuda_context_push (decoder->context)) {
gst_video_frame_unmap (&video_frame);
GST_ERROR_OBJECT (decoder, "Failed to push CUDA context");
return FALSE;
}
@ -778,8 +769,9 @@ gst_nv_decoder_copy_frame_to_cuda (GstNvDecoder * decoder,
for (i = 0; i < GST_VIDEO_INFO_N_PLANES (&decoder->info); i++) {
copy_params.srcDevice = frame->devptr +
(i * frame->pitch * GST_VIDEO_INFO_HEIGHT (&decoder->info));
copy_params.dstDevice = cuda_mem->data + cuda_mem->offset[i];
copy_params.dstPitch = cuda_mem->stride;
copy_params.dstDevice =
(CUdeviceptr) GST_VIDEO_FRAME_PLANE_DATA (&video_frame, i);
copy_params.dstPitch = GST_VIDEO_FRAME_PLANE_STRIDE (&video_frame, i);
copy_params.WidthInBytes = GST_VIDEO_INFO_COMP_WIDTH (&decoder->info, 0)
* GST_VIDEO_INFO_COMP_PSTRIDE (&decoder->info, 0);
copy_params.Height = GST_VIDEO_INFO_COMP_HEIGHT (&decoder->info, i);
@ -795,6 +787,7 @@ gst_nv_decoder_copy_frame_to_cuda (GstNvDecoder * decoder,
ret = TRUE;
done:
gst_video_frame_unmap (&video_frame);
gst_cuda_context_pop (NULL);
GST_LOG_OBJECT (decoder, "Copy frame to CUDA ret %d", ret);
@ -1567,6 +1560,12 @@ gst_nv_decoder_ensure_cuda_pool (GstNvDecoder * decoder, GstQuery * query)
gst_buffer_pool_config_set_params (config, outcaps, size, min, max);
gst_buffer_pool_config_add_option (config, GST_BUFFER_POOL_OPTION_VIDEO_META);
gst_buffer_pool_set_config (pool, config);
/* Get updated size by cuda buffer pool */
config = gst_buffer_pool_get_config (pool);
gst_buffer_pool_config_get_params (config, NULL, &size, NULL, NULL);
gst_structure_free (config);
if (n > 0)
gst_query_set_nth_allocation_pool (query, 0, pool, size, min, max);
else

View file

@ -39,6 +39,7 @@
#include "gstcudadownload.h"
#include "gstcudaupload.h"
#include "gstcudafilter.h"
#include "gstcudamemory.h"
GST_DEBUG_CATEGORY (gst_nvcodec_debug);
GST_DEBUG_CATEGORY (gst_nvdec_debug);
@ -238,6 +239,7 @@ plugin_init (GstPlugin * plugin)
GST_TYPE_CUDA_UPLOAD);
gst_cuda_filter_plugin_init (plugin);
gst_cuda_memory_init_once ();
return TRUE;
}