diff --git a/.indent_cpp_list b/.indent_cpp_list
index 8439878265..db2cc2d444 100644
--- a/.indent_cpp_list
+++ b/.indent_cpp_list
@@ -1,3 +1,4 @@
+subprojects/gst-plugins-bad/ext/nvcomp
 subprojects/gst-plugins-bad/ext/qt6d3d11
 subprojects/gst-plugins-bad/gst-libs/gst/cuda
 subprojects/gst-plugins-bad/gst-libs/gst/d3d11
diff --git a/subprojects/gst-plugins-bad/ext/meson.build b/subprojects/gst-plugins-bad/ext/meson.build
index 4f7139835f..f0bffe0c48 100644
--- a/subprojects/gst-plugins-bad/ext/meson.build
+++ b/subprojects/gst-plugins-bad/ext/meson.build
@@ -38,6 +38,7 @@ subdir('mpeg2enc')
 subdir('mplex')
 subdir('musepack')
 subdir('neon')
+subdir('nvcomp')
 subdir('onnx')
 subdir('openal')
 subdir('openaptx')
diff --git a/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcomp.cpp b/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcomp.cpp
new file mode 100644
index 0000000000..f3337fb7da
--- /dev/null
+++ b/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcomp.cpp
@@ -0,0 +1,60 @@
+/* GStreamer
+ * Copyright (C) 2024 Seungha Yang
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "gstnvcomp.h"
+#include <mutex> /* restored: target stripped in transit; std::once_flag/std::call_once below require it */
+
+/* Enum value table for GstNvCompMethod; array index must stay in sync with
+ * the GstNvCompMethod enum (gst_nv_comp_method_to_string indexes by value) */
+static const GEnumValue nvcomp_methods[] = {
+  {GST_NV_COMP_LZ4, "LZ4", "lz4"},
+  {GST_NV_COMP_SNAPPY, "SNAPPY", "snappy"},
+  {GST_NV_COMP_GDEFLATE, "GDEFLATE", "gdeflate"},
+  {GST_NV_COMP_DEFLATE, "DEFLATE", "deflate"},
+  {GST_NV_COMP_ZSTD, "ZSTD", "zstd"},
+  {GST_NV_COMP_CASCADED, "CASCADED", "cascaded"},
+  {GST_NV_COMP_BITCOMP, "BITCOMP", "bitcomp"},
+  {GST_NV_COMP_ANS, "ANS", "ans"},
+  {0, nullptr, nullptr},
+};
+
+/* Registers (once, thread-safely) and returns the GstNvCompMethod GType */
+GType
+gst_nv_comp_method_get_type (void)
+{
+  static GType method_type = 0;
+  static std::once_flag once;
+
+  std::call_once (once,[&] {
+        method_type = g_enum_register_static ("GstNvCompMethod",
+            nvcomp_methods);
+      });
+
+  return method_type;
+}
+
+/* Returns the nick ("lz4", "snappy", ...) for @method, or nullptr when
+ * @method is out of range */
+const gchar *
+gst_nv_comp_method_to_string (GstNvCompMethod method)
+{
+  if (method >= GST_NV_COMP_LAST)
+    return nullptr;
+
+  return nvcomp_methods[method].value_nick;
+}
diff --git a/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcomp.h b/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcomp.h
new file mode 100644
index 0000000000..af6f9e510b
--- /dev/null
+++ b/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcomp.h
@@ -0,0 +1,56 @@
+/* GStreamer
+ * Copyright (C) 2024 Seungha Yang
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#pragma once
+
+#include <gst/gst.h> /* TODO(review): include target lost in formatting; verify against upstream */
+#include <gst/cuda/gstcuda.h> /* TODO(review): include target lost in formatting; verify against upstream */
+
+#ifdef G_OS_WIN32
+#include <windows.h> /* TODO(review): include target lost in formatting; verify against upstream */
+#endif
+
+#include <cuda.h> /* TODO(review): include target lost in formatting; verify against upstream */
+#include <nvcomp.h> /* TODO(review): include target lost in formatting; verify against upstream */
+#include <nvcomp/nvcompManagerFactory.hpp> /* TODO(review): include target lost in formatting; verify against upstream */
+
+G_BEGIN_DECLS
+
+/* Compression backends supported by the nvCOMP elements; values index
+ * the nvcomp_methods GEnumValue table in gstnvcomp.cpp */
+enum GstNvCompMethod
+{
+  GST_NV_COMP_LZ4,
+  GST_NV_COMP_SNAPPY,
+  GST_NV_COMP_GDEFLATE,
+  GST_NV_COMP_DEFLATE,
+  GST_NV_COMP_ZSTD,
+  GST_NV_COMP_CASCADED,
+  GST_NV_COMP_BITCOMP,
+  GST_NV_COMP_ANS,
+  GST_NV_COMP_LAST,
+};
+
+#define GST_TYPE_NV_COMP_METHOD (gst_nv_comp_method_get_type())
+GType gst_nv_comp_method_get_type ();
+
+const gchar * gst_nv_comp_method_to_string (GstNvCompMethod method);
+
+/* Bitstream header: version tag and minimum serialized size (six guint32
+ * fields) expected by the decoder */
+#define GST_NV_COMP_HEADER_VERSION 1
+#define GST_NV_COMP_HEADER_MIN_SIZE (sizeof (guint32) * 6)
+
+G_END_DECLS
\ No newline at end of file
diff --git a/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcompvideodec.cpp b/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcompvideodec.cpp
new file mode 100644
index 0000000000..08a0c647b7
--- /dev/null
+++ b/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcompvideodec.cpp
@@ -0,0 +1,1739 @@
+/* GStreamer
+ * Copyright (C) 2024 Seungha Yang
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "gstnvcompvideodec.h" + +#ifdef HAVE_GST_GL +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +GST_DEBUG_CATEGORY_STATIC (gst_nv_comp_video_dec_debug); +#define GST_CAT_DEFAULT gst_nv_comp_video_dec_debug + +#ifdef HAVE_GST_GL +#define SRC_CAPS \ + GST_VIDEO_CAPS_MAKE_WITH_FEATURES (GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY, \ + GST_VIDEO_FORMATS_ALL) ";" \ + GST_VIDEO_CAPS_MAKE_WITH_FEATURES (GST_CAPS_FEATURE_MEMORY_GL_MEMORY, \ + GST_VIDEO_FORMATS_ALL) ";" \ + GST_VIDEO_CAPS_MAKE (GST_VIDEO_FORMATS_ALL) +#else +#define SRC_CAPS \ + GST_VIDEO_CAPS_MAKE_WITH_FEATURES (GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY, \ + GST_VIDEO_FORMATS_ALL) ";" \ + GST_VIDEO_CAPS_MAKE (GST_VIDEO_FORMATS_ALL) +#endif + +static GstStaticPadTemplate sink_template = + GST_STATIC_PAD_TEMPLATE ("sink", GST_PAD_SINK, GST_PAD_ALWAYS, + GST_STATIC_CAPS ("video/x-nvcomp; video/x-nvcomp-lz4; " + "video/x-nvcomp-snappy; video/x-nvcomp-gdeflate; " + "video/x-nvcomp-deflate; video/x-nvcomp-zstd; video/x-nvcomp-cascaded; " + "video/x-nvcomp-bitcomp; video/x-nvcomp-ans")); + +static GstStaticPadTemplate src_template = +GST_STATIC_PAD_TEMPLATE ("src", GST_PAD_SRC, GST_PAD_ALWAYS, + GST_STATIC_CAPS (SRC_CAPS)); + +/* *INDENT-OFF* */ +using namespace nvcomp; + +struct DecoderTask +{ + ~DecoderTask () + { + if (ctx) { + gst_cuda_context_push (ctx); + clear_resource (); + gst_cuda_context_pop (nullptr); + gst_object_unref (ctx); + } + } + + void clear_resource () + { + if (!ctx) + return; + + if (device_compressed) + CuMemFree ((CUdeviceptr) device_compressed); + device_compressed = nullptr; + + if 
(host_compressed) + CuMemFreeHost (host_compressed); + host_compressed = nullptr; + + if (device_compressed_bytes) + CuMemFree ((CUdeviceptr) device_compressed_bytes); + device_compressed_bytes = nullptr; + + if (device_compressed_ptrs) + CuMemFree ((CUdeviceptr) device_compressed_ptrs); + device_compressed_ptrs = nullptr; + + if (host_compressed_bytes) + CuMemFreeHost (host_compressed_bytes); + host_compressed_bytes = nullptr; + + if (host_compressed_ptrs) + CuMemFreeHost (host_compressed_ptrs); + host_compressed_ptrs = nullptr; + + if (device_uncompressed) + CuMemFree ((CUdeviceptr) device_uncompressed); + device_uncompressed = nullptr; + + if (device_uncompressed_temp) + CuMemFree ((CUdeviceptr) device_uncompressed_temp); + device_uncompressed_temp = nullptr; + + if (host_uncompressed) + CuMemFreeHost (host_uncompressed); + host_uncompressed = nullptr; + + if (device_uncompressed_bytes) + CuMemFree ((CUdeviceptr) device_uncompressed_bytes); + device_uncompressed_bytes = nullptr; + + if (device_uncompressed_ptrs) + CuMemFree ((CUdeviceptr) device_uncompressed_ptrs); + device_uncompressed_ptrs = nullptr; + + if (host_uncompressed_bytes) + CuMemFreeHost (host_uncompressed_bytes); + host_uncompressed_bytes = nullptr; + + if (host_uncompressed_ptrs) + CuMemFreeHost (host_uncompressed_ptrs); + host_uncompressed_ptrs = nullptr; + + if (device_actual_uncompressed_bytes) + CuMemFree ((CUdeviceptr) device_actual_uncompressed_bytes); + device_actual_uncompressed_bytes = nullptr; + + if (temp_ptr) + CuMemFree ((CUdeviceptr) temp_ptr); + temp_ptr = nullptr; + + if (device_statuses) + CuMemFree ((CUdeviceptr) device_statuses); + device_statuses = nullptr; + + batch_size = 0; + max_compressed_chunk_size = 0; + max_uncompressed_chunk_size = 0; + } + + bool allocate_batched (size_t num_chunks, + size_t compressed_chunk_size, + size_t uncompressed_chunk_size, size_t temp_bytes) + { + size_t compressed_alloc; + size_t uncompressed_alloc; + size_t alloc_size = num_chunks * sizeof 
(size_t); + uint8_t *src; + + compressed_chunk_size = GST_ROUND_UP_8 (compressed_chunk_size); + uncompressed_chunk_size = GST_ROUND_UP_8 (uncompressed_chunk_size); + + compressed_alloc = num_chunks * compressed_chunk_size; + uncompressed_alloc = num_chunks * uncompressed_chunk_size; + + auto ret = CuMemAlloc ((CUdeviceptr *) &device_compressed, + compressed_alloc); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemAllocHost ((void **) &host_compressed, compressed_alloc); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemAlloc ((CUdeviceptr *) &device_compressed_bytes, + alloc_size); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemAlloc ((CUdeviceptr *) &device_compressed_ptrs, + alloc_size); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemAllocHost ((void **) &host_compressed_bytes, + alloc_size); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemAllocHost ((void **) &host_compressed_ptrs, + alloc_size); + if (!gst_cuda_result (ret)) + return false; + + src = device_compressed; + for (size_t i = 0; i < num_chunks; i++) { + host_compressed_ptrs[i] = src; + src += compressed_chunk_size; + } + + ret = CuMemcpyHtoD ((CUdeviceptr) device_compressed_ptrs, + host_compressed_ptrs, alloc_size); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemAlloc ((CUdeviceptr *) &device_uncompressed_temp, + uncompressed_alloc); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemAlloc ((CUdeviceptr *) &device_uncompressed, + uncompressed_alloc); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemAllocHost ((void **) &host_uncompressed, uncompressed_alloc); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemAlloc ((CUdeviceptr *) &device_uncompressed_bytes, + alloc_size); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemAlloc ((CUdeviceptr *) &device_uncompressed_ptrs, + alloc_size); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemAllocHost ((void **) 
&host_uncompressed_bytes, + alloc_size); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemAllocHost ((void **) &host_uncompressed_ptrs, + alloc_size); + if (!gst_cuda_result (ret)) + return false; + + src = device_uncompressed_temp; + for (size_t i = 0; i < num_chunks; i++) { + host_uncompressed_bytes[i] = uncompressed_chunk_size; + host_uncompressed_ptrs[i] = src; + src += uncompressed_chunk_size; + } + + ret = CuMemcpyHtoD ((CUdeviceptr) device_uncompressed_bytes, + host_uncompressed_bytes, alloc_size); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemcpyHtoD ((CUdeviceptr) device_uncompressed_ptrs, + host_uncompressed_ptrs, alloc_size); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemAlloc ((CUdeviceptr *) &device_actual_uncompressed_bytes, + alloc_size); + if (!gst_cuda_result (ret)) + return false; + + if (temp_bytes > 0) { + ret = CuMemAlloc ((CUdeviceptr *) &temp_ptr, temp_bytes); + if (!gst_cuda_result (ret)) + return false; + } + + ret = CuMemAlloc ((CUdeviceptr *) &device_statuses, + sizeof (nvcompStatus_t) * num_chunks); + if (!gst_cuda_result (ret)) + return false; + + batched = TRUE; + batch_size = num_chunks; + temp_size = temp_bytes; + max_compressed_chunk_size = compressed_chunk_size; + max_uncompressed_chunk_size = uncompressed_chunk_size; + compressed_alloc_size = compressed_alloc; + uncompressed_alloc_size = uncompressed_alloc; + + return true; + } + + GstCudaContext *ctx = nullptr; + + uint8_t *device_compressed = nullptr; + uint8_t *host_compressed = nullptr; + + size_t *device_compressed_bytes = nullptr; + void **device_compressed_ptrs = nullptr; + + size_t *host_compressed_bytes = nullptr; + void **host_compressed_ptrs = nullptr; + + uint8_t *device_uncompressed = nullptr; + uint8_t *device_uncompressed_temp = nullptr; + uint8_t *host_uncompressed = nullptr; + + size_t *device_uncompressed_bytes = nullptr; + void **device_uncompressed_ptrs = nullptr; + + size_t *host_uncompressed_bytes = nullptr; + void 
**host_uncompressed_ptrs = nullptr; + + size_t *device_actual_uncompressed_bytes = nullptr; + + void *temp_ptr = nullptr; + size_t temp_size = 0; + + nvcompStatus_t *device_statuses = nullptr; + + gboolean batched = FALSE; + size_t batch_size = 0; + size_t max_uncompressed_chunk_size = 0; + size_t max_compressed_chunk_size = 0; + size_t uncompressed_alloc_size = 0; + size_t compressed_alloc_size = 0; +}; + +struct BatchedDecompBase +{ + virtual nvcompStatus_t get_temp_size( + size_t num_chunks, + size_t max_uncompressed_chunk_bytes, + size_t * temp_bytes) = 0; + + virtual nvcompStatus_t decompress( + void **device_compressed_ptrs, + size_t *device_compressed_bytes, + size_t *device_uncompressed_bytes, + size_t *device_actual_uncompressed_bytes, + size_t batch_size, + void *device_temp_ptr, + size_t temp_bytes, + void **device_uncompressed_ptrs, + nvcompStatus_t *device_statuses, + cudaStream_t stream) = 0; +}; + +template +class BatchedDecomp : public BatchedDecompBase +{ +public: + BatchedDecomp () {} + + nvcompStatus_t get_temp_size( + size_t num_chunks, + size_t max_uncompressed_chunk_bytes, + size_t * temp_bytes) + { + return T (num_chunks, max_uncompressed_chunk_bytes, temp_bytes); + } + + nvcompStatus_t decompress( + void **device_compressed_ptrs, + size_t *device_compressed_bytes, + size_t *device_uncompressed_bytes, + size_t *device_actual_uncompressed_bytes, + size_t batch_size, + void *device_temp_ptr, + size_t temp_bytes, + void **device_uncompressed_ptrs, + nvcompStatus_t *device_statuses, + cudaStream_t stream) + { + return D (device_compressed_ptrs, device_compressed_bytes, + device_uncompressed_bytes, device_actual_uncompressed_bytes, + batch_size, device_temp_ptr, temp_bytes, device_uncompressed_ptrs, + device_statuses, stream); + } +}; + +struct GstNvCompVideoDecPrivate +{ + GstNvCompVideoDecPrivate () + { + gst_video_info_init (&info); + } + + GstCudaContext *ctx = nullptr; + GstCudaStream *stream = nullptr; + +#ifdef HAVE_GST_GL + GstGLDisplay 
*gl_display = nullptr; + GstGLContext *gl_context = nullptr; + GstGLContext *other_gl_context = nullptr; +#endif + + GstVideoCodecState *state = nullptr; + std::shared_ptr manager; + std::shared_ptr batched_decomp; + std::shared_ptr task; + gboolean gl_interop = FALSE; + + GstVideoInfo info; + gboolean batched = FALSE; + GstNvCompMethod method; +}; +/* *INDENT-ON* */ + +struct _GstNvCompVideoDec +{ + GstVideoDecoder parent; + GstNvCompVideoDecPrivate *priv; +}; + +static void gst_nv_comp_video_dec_finalize (GObject * object); + +static void gst_nv_comp_video_dec_set_context (GstElement * element, + GstContext * context); + +static gboolean gst_nv_comp_video_dec_open (GstVideoDecoder * decoder); +static gboolean gst_nv_comp_video_dec_close (GstVideoDecoder * decoder); +static gboolean gst_nv_comp_video_dec_sink_query (GstVideoDecoder * decoder, + GstQuery * query); +static gboolean gst_nv_comp_video_dec_src_query (GstVideoDecoder * decoder, + GstQuery * query); +static gboolean +gst_nv_comp_video_dec_decide_allocation (GstVideoDecoder * decoder, + GstQuery * query); +static gboolean gst_nv_comp_video_dec_set_format (GstVideoDecoder * decoder, + GstVideoCodecState * state); +static gboolean gst_nv_comp_video_dec_negotiate (GstVideoDecoder * decoder); +static GstFlowReturn +gst_nv_comp_video_dec_handle_frame (GstVideoDecoder * decoder, + GstVideoCodecFrame * frame); + +#define gst_nv_comp_video_dec_parent_class parent_class +G_DEFINE_TYPE (GstNvCompVideoDec, + gst_nv_comp_video_dec, GST_TYPE_VIDEO_DECODER); + +static void +gst_nv_comp_video_dec_class_init (GstNvCompVideoDecClass * klass) +{ + auto object_class = G_OBJECT_CLASS (klass); + auto element_class = GST_ELEMENT_CLASS (klass); + auto decoder_class = GST_VIDEO_DECODER_CLASS (klass); + + object_class->finalize = gst_nv_comp_video_dec_finalize; + + gst_element_class_add_static_pad_template (element_class, &sink_template); + gst_element_class_add_static_pad_template (element_class, &src_template); + + 
gst_element_class_set_static_metadata (element_class, + "nvCOMP Video Decoder", "Decoder/Video/Hardware", + "Decompress a video stream using nvCOMP library", + "Seungha Yang "); + + element_class->set_context = + GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_set_context); + + decoder_class->open = GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_open); + decoder_class->close = GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_close); + decoder_class->sink_query = + GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_sink_query); + decoder_class->src_query = + GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_src_query); + decoder_class->decide_allocation = + GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_decide_allocation); + decoder_class->set_format = + GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_set_format); + decoder_class->negotiate = + GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_negotiate); + decoder_class->handle_frame = + GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_handle_frame); + + GST_DEBUG_CATEGORY_INIT (gst_nv_comp_video_dec_debug, + "nvcompvideodec", 0, "nvcompvideodec"); +} + +static void +gst_nv_comp_video_dec_init (GstNvCompVideoDec * self) +{ + self->priv = new GstNvCompVideoDecPrivate (); +} + +static void +gst_nv_comp_video_dec_finalize (GObject * object) +{ + auto self = GST_NV_COMP_VIDEO_DEC (object); + + delete self->priv; + + G_OBJECT_CLASS (parent_class)->finalize (object); +} + +static void +gst_nv_comp_video_dec_set_context (GstElement * element, GstContext * context) +{ + auto self = GST_NV_COMP_VIDEO_DEC (element); + auto priv = self->priv; + + gst_cuda_handle_set_context (element, context, -1, &priv->ctx); +#ifdef HAVE_GST_GL + if (gst_gl_handle_set_context (element, context, &priv->gl_display, + &priv->other_gl_context)) { + if (priv->gl_display) + gst_gl_display_filter_gl_api (priv->gl_display, GST_GL_API_OPENGL3); + } +#endif + + GST_ELEMENT_CLASS (parent_class)->set_context (element, context); +} + +static gboolean +gst_nv_comp_video_dec_open (GstVideoDecoder * decoder) +{ + auto self = 
GST_NV_COMP_VIDEO_DEC (decoder);
+  auto priv = self->priv;
+
+  if (!gst_cuda_ensure_element_context (GST_ELEMENT_CAST (decoder),
+          -1, &priv->ctx)) {
+    GST_ERROR_OBJECT (self, "Couldn't get cuda context");
+    return FALSE;
+  }
+
+  priv->stream = gst_cuda_stream_new (priv->ctx);
+
+  return TRUE;
+}
+
+/* Releases nvCOMP manager/task under the pushed CUDA context, then drops
+ * the CUDA stream/context and all GL objects acquired in set_context /
+ * ensure_gl_context */
+static gboolean
+gst_nv_comp_video_dec_close (GstVideoDecoder * decoder)
+{
+  auto self = GST_NV_COMP_VIDEO_DEC (decoder);
+  auto priv = self->priv;
+
+  if (priv->ctx) {
+    gst_cuda_context_push (priv->ctx);
+    priv->manager = nullptr;
+    priv->task = nullptr;
+
+    gst_cuda_context_pop (nullptr);
+  }
+
+  gst_clear_cuda_stream (&priv->stream);
+  gst_clear_object (&priv->ctx);
+
+#ifdef HAVE_GST_GL
+  gst_clear_object (&priv->other_gl_context);
+  gst_clear_object (&priv->gl_context);
+  /* FIX: was gst_clear_object (&priv->gl_context) twice; gl_display was
+   * never released, leaking the GstGLDisplay on every close */
+  gst_clear_object (&priv->gl_display);
+#endif
+
+  return TRUE;
+}
+
+/* Answers GST_QUERY_CONTEXT for both GL (display/local/other context) and
+ * CUDA contexts; returns TRUE when the query was handled */
+static gboolean
+gst_nv_comp_video_dec_handle_context_query (GstNvCompVideoDec * self,
+    GstQuery * query)
+{
+  auto priv = self->priv;
+
+#ifdef HAVE_GST_GL
+  {
+    GstGLDisplay *display = nullptr;
+    GstGLContext *other = nullptr;
+    GstGLContext *local = nullptr;
+
+    /* take refs so the objects stay alive while the query is answered */
+    if (priv->gl_display)
+      display = (GstGLDisplay *) gst_object_ref (priv->gl_display);
+    if (priv->gl_context)
+      local = (GstGLContext *) gst_object_ref (priv->gl_context);
+    if (priv->other_gl_context)
+      other = (GstGLContext *) gst_object_ref (priv->other_gl_context);
+
+    auto ret = gst_gl_handle_context_query (GST_ELEMENT (self), query,
+        display, local, other);
+    gst_clear_object (&display);
+    gst_clear_object (&other);
+    gst_clear_object (&local);
+
+    if (ret)
+      return TRUE;
+  }
+#endif
+
+  if (gst_cuda_handle_context_query (GST_ELEMENT (self), query, priv->ctx))
+    return TRUE;
+
+  return FALSE;
+}
+
+static gboolean
+gst_nv_comp_video_dec_sink_query (GstVideoDecoder * decoder, GstQuery * query)
+{
+  auto self = GST_NV_COMP_VIDEO_DEC (decoder);
+
+  switch (GST_QUERY_TYPE (query)) {
+    case GST_QUERY_CONTEXT:
+      if (gst_nv_comp_video_dec_handle_context_query (self,
query)) + return TRUE; + break; + default: + break; + } + + return GST_VIDEO_DECODER_CLASS (parent_class)->sink_query (decoder, query); +} + +static gboolean +gst_nv_comp_video_dec_src_query (GstVideoDecoder * decoder, GstQuery * query) +{ + auto self = GST_NV_COMP_VIDEO_DEC (decoder); + + switch (GST_QUERY_TYPE (query)) { + case GST_QUERY_CONTEXT: + if (gst_nv_comp_video_dec_handle_context_query (self, query)) + return TRUE; + break; + default: + break; + } + + return GST_VIDEO_DECODER_CLASS (parent_class)->src_query (decoder, query); +} + +#ifdef HAVE_GST_GL +static void +check_cuda_device_from_gl_context (GstGLContext * context, gboolean * ret) +{ + guint device_count = 0; + CUdevice device_list[1] = { 0, }; + CUresult cuda_ret; + + *ret = FALSE; + cuda_ret = CuGLGetDevices (&device_count, + device_list, 1, CU_GL_DEVICE_LIST_ALL); + + if (!gst_cuda_result (cuda_ret) || device_count == 0) + return; + + *ret = TRUE; +} + +static gboolean +gst_nv_comp_video_dec_ensure_gl_context (GstNvCompVideoDec * self) +{ + auto priv = self->priv; + gboolean ret = FALSE; + + if (!gst_gl_ensure_element_data (GST_ELEMENT (self), &priv->gl_display, + &priv->other_gl_context)) { + GST_DEBUG_OBJECT (self, "Couldn't get GL display"); + return FALSE; + } + + gst_gl_display_filter_gl_api (priv->gl_display, GST_GL_API_OPENGL3); + + if (!gst_gl_display_ensure_context (priv->gl_display, priv->other_gl_context, + &priv->gl_context, nullptr)) { + GST_DEBUG_OBJECT (self, "Couldn't get GL context"); + return FALSE; + } + + gst_gl_context_thread_add (priv->gl_context, + (GstGLContextThreadFunc) check_cuda_device_from_gl_context, &ret); + + return ret; +} +#endif + +static gboolean +gst_nv_comp_video_dec_decide_allocation (GstVideoDecoder * decoder, + GstQuery * query) +{ + auto self = GST_NV_COMP_VIDEO_DEC (decoder); + auto priv = self->priv; + GstBufferPool *pool = nullptr; + guint size; + guint min = 0; + guint max = 0; + GstCaps *caps; + + gst_query_parse_allocation (query, &caps, nullptr); 
+ if (!caps) { + GST_WARNING_OBJECT (self, "null caps in query"); + return FALSE; + } + + GstVideoInfo info; + if (!gst_video_info_from_caps (&info, caps)) { + GST_WARNING_OBJECT (self, "Failed to convert caps into info"); + return FALSE; + } + + gboolean update_pool = FALSE; + if (gst_query_get_n_allocation_pools (query) > 0) { + gst_query_parse_nth_allocation_pool (query, 0, &pool, &size, &min, &max); + update_pool = TRUE; + } + + auto features = gst_caps_get_features (caps, 0); + gboolean use_cuda_pool = FALSE; + if (gst_caps_features_contains (features, + GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY)) { + GST_DEBUG_OBJECT (self, "Downstream support CUDA memory"); + if (pool) { + if (!GST_IS_CUDA_BUFFER_POOL (pool)) { + gst_clear_object (&pool); + } else { + auto cuda_pool = GST_CUDA_BUFFER_POOL (pool); + if (cuda_pool->context != priv->ctx) + gst_clear_object (&pool); + } + } + + if (!pool) + pool = gst_cuda_buffer_pool_new (priv->ctx); + use_cuda_pool = TRUE; + } +#ifdef HAVE_GST_GL + else if (gst_caps_features_contains (features, + GST_CAPS_FEATURE_MEMORY_GL_MEMORY) && priv->gl_interop) { + GST_DEBUG_OBJECT (self, "Downstream support GL memory"); + if (!gst_nv_comp_video_dec_ensure_gl_context (self)) { + priv->gl_interop = FALSE; + } else { + if (pool && !GST_IS_GL_BUFFER_POOL (pool)) + gst_clear_object (&pool); + + if (!pool) + pool = gst_gl_buffer_pool_new (priv->gl_context); + } + } +#endif + + if (!pool) + pool = gst_video_buffer_pool_new (); + + auto config = gst_buffer_pool_get_config (pool); + + size = GST_VIDEO_INFO_SIZE (&info); + gst_buffer_pool_config_set_params (config, caps, size, 0, 0); + if (use_cuda_pool && priv->stream) { + /* Set our stream on buffer pool config so that CUstream can be shared */ + gst_buffer_pool_config_set_cuda_stream (config, priv->stream); + } + + if (!gst_buffer_pool_set_config (pool, config)) { + GST_WARNING_OBJECT (self, "Failed to set pool config"); + gst_object_unref (pool); + return FALSE; + } + + config = 
gst_buffer_pool_get_config (pool); + gst_buffer_pool_config_get_params (config, nullptr, &size, nullptr, nullptr); + gst_structure_free (config); + + if (update_pool) + gst_query_set_nth_allocation_pool (query, 0, pool, size, min, max); + else + gst_query_add_allocation_pool (query, pool, size, min, max); + gst_object_unref (pool); + + return TRUE; +} + +static gboolean +gst_nv_comp_video_dec_alloc_task (GstNvCompVideoDec * self, + DecoderTask * task, gboolean batched, gsize size) +{ + if (batched) + return TRUE; + + task->uncompressed_alloc_size = size; + auto cuda_ret = + CuMemAlloc ((CUdeviceptr *) & task->device_uncompressed, size); + if (!gst_cuda_result (cuda_ret)) + return FALSE; + + cuda_ret = CuMemAllocHost ((void **) &task->host_uncompressed, size); + if (!gst_cuda_result (cuda_ret)) + return FALSE; + + task->compressed_alloc_size = size; + cuda_ret = CuMemAlloc ((CUdeviceptr *) & task->device_compressed, size); + if (!gst_cuda_result (cuda_ret)) + return FALSE; + + cuda_ret = CuMemAllocHost ((void **) &task->host_compressed, size); + if (!gst_cuda_result (cuda_ret)) + return FALSE; + + return TRUE; +} + +static gboolean +gst_nv_comp_video_dec_set_format (GstVideoDecoder * decoder, + GstVideoCodecState * state) +{ + auto self = GST_NV_COMP_VIDEO_DEC (decoder); + auto priv = self->priv; + + if (!priv->ctx) { + GST_ERROR_OBJECT (self, "CUDA context was not configured"); + return FALSE; + } + + GST_DEBUG_OBJECT (self, "Set format with caps %" GST_PTR_FORMAT, state->caps); + + g_clear_pointer (&priv->state, gst_video_codec_state_unref); + priv->state = gst_video_codec_state_ref (state); + + auto s = gst_caps_get_structure (state->caps, 0); + std::string mime_type = gst_structure_get_name (s); + + auto format_str = gst_structure_get_string (s, "format"); + if (!format_str) { + GST_ERROR_OBJECT (self, "Unknown video format"); + return FALSE; + } + + GstVideoFormat format = gst_video_format_from_string (format_str); + if (format == GST_VIDEO_FORMAT_UNKNOWN || 
format == GST_VIDEO_FORMAT_ENCODED) { + GST_ERROR_OBJECT (self, "Invalid format string %s", format_str); + return FALSE; + } + + s = gst_structure_copy (s); + gst_structure_set_name (s, "video/x-raw"); + + auto video_caps = gst_caps_new_empty (); + gst_caps_append_structure (video_caps, s); + + auto ret = gst_video_info_from_caps (&priv->info, video_caps); + gst_caps_unref (video_caps); + if (!ret) { + GST_ERROR_OBJECT (self, "Couldn't build output caps"); + return FALSE; + } + + if (!gst_cuda_context_push (priv->ctx)) { + GST_ERROR_OBJECT (self, "Couldn't push context"); + return FALSE; + } + + priv->manager = nullptr; + priv->batched_decomp = nullptr; + priv->task = nullptr; + + priv->batched = TRUE; + if (mime_type == "video/x-nvcomp") { + priv->batched = FALSE; + } else if (mime_type == "video/x-nvcomp-lz4") { + priv->batched_decomp = std::make_shared < BatchedDecomp < + nvcompBatchedLZ4DecompressGetTempSize, + nvcompBatchedLZ4DecompressAsync >> (); + } else if (mime_type == "video/x-nvcomp-snappy") { + priv->batched_decomp = std::make_shared < BatchedDecomp < + nvcompBatchedSnappyDecompressGetTempSize, + nvcompBatchedSnappyDecompressAsync >> (); + } else if (mime_type == "video/x-nvcomp-gdeflate") { + priv->batched_decomp = std::make_shared < BatchedDecomp < + nvcompBatchedGdeflateDecompressGetTempSize, + nvcompBatchedGdeflateDecompressAsync >> (); + } else if (mime_type == "video/x-nvcomp-deflate") { + priv->batched_decomp = std::make_shared < BatchedDecomp < + nvcompBatchedDeflateDecompressGetTempSize, + nvcompBatchedDeflateDecompressAsync >> (); + } else if (mime_type == "video/x-nvcomp-zstd") { + priv->batched_decomp = std::make_shared < BatchedDecomp < + nvcompBatchedZstdDecompressGetTempSize, + nvcompBatchedZstdDecompressAsync >> (); + } else if (mime_type == "video/x-nvcomp-cascaded") { + priv->batched_decomp = std::make_shared < BatchedDecomp < + nvcompBatchedCascadedDecompressGetTempSize, + nvcompBatchedCascadedDecompressAsync >> (); + } else if 
(mime_type == "video/x-nvcomp-bitcomp") { + priv->batched_decomp = std::make_shared < BatchedDecomp < + nvcompBatchedBitcompDecompressGetTempSize, + nvcompBatchedBitcompDecompressAsync >> (); + } else if (mime_type == "video/x-nvcomp-ans") { + priv->batched_decomp = std::make_shared < BatchedDecomp < + nvcompBatchedANSDecompressGetTempSize, + nvcompBatchedANSDecompressAsync >> (); + } else { + gst_cuda_context_pop (nullptr); + g_assert_not_reached (); + return FALSE; + } + + auto task = std::make_shared < DecoderTask > (); + task->ctx = (GstCudaContext *) gst_object_ref (priv->ctx); + + if (!gst_nv_comp_video_dec_alloc_task (self, task.get (), priv->batched, + priv->info.size)) { + task = nullptr; + gst_cuda_context_pop (nullptr); + return FALSE; + } + + priv->task = task; + gst_cuda_context_pop (nullptr); + + return gst_video_decoder_negotiate (decoder); +} + +static gboolean +is_supported_cuda_format (GstVideoFormat format) +{ + switch (format) { + case GST_VIDEO_FORMAT_I420: + case GST_VIDEO_FORMAT_YV12: + case GST_VIDEO_FORMAT_NV12: + case GST_VIDEO_FORMAT_NV21: + case GST_VIDEO_FORMAT_P010_10LE: + case GST_VIDEO_FORMAT_P012_LE: + case GST_VIDEO_FORMAT_P016_LE: + case GST_VIDEO_FORMAT_I420_10LE: + case GST_VIDEO_FORMAT_I420_12LE: + case GST_VIDEO_FORMAT_Y444: + case GST_VIDEO_FORMAT_Y444_10LE: + case GST_VIDEO_FORMAT_Y444_12LE: + case GST_VIDEO_FORMAT_Y444_16LE: + case GST_VIDEO_FORMAT_BGRA: + case GST_VIDEO_FORMAT_RGBA: + case GST_VIDEO_FORMAT_RGBx: + case GST_VIDEO_FORMAT_BGRx: + case GST_VIDEO_FORMAT_ARGB: + case GST_VIDEO_FORMAT_ABGR: + case GST_VIDEO_FORMAT_RGB: + case GST_VIDEO_FORMAT_BGR: + case GST_VIDEO_FORMAT_BGR10A2_LE: + case GST_VIDEO_FORMAT_RGB10A2_LE: + case GST_VIDEO_FORMAT_Y42B: + case GST_VIDEO_FORMAT_I422_10LE: + case GST_VIDEO_FORMAT_I422_12LE: + case GST_VIDEO_FORMAT_YUY2: + case GST_VIDEO_FORMAT_UYVY: + case GST_VIDEO_FORMAT_RGBP: + case GST_VIDEO_FORMAT_BGRP: + case GST_VIDEO_FORMAT_GBR: + case GST_VIDEO_FORMAT_GBR_10LE: + case 
GST_VIDEO_FORMAT_GBR_12LE: + case GST_VIDEO_FORMAT_GBR_16LE: + case GST_VIDEO_FORMAT_GBRA: + case GST_VIDEO_FORMAT_VUYA: + return TRUE; + default: + break; + } + + return FALSE; +} + +#ifdef HAVE_GST_GL +static gboolean +is_supported_gl_format (GstVideoFormat format) +{ + auto gl_caps = gst_caps_from_string ("video/x-raw, format = (string) " + GST_GL_COLOR_CONVERT_FORMATS); + auto our_caps = gst_caps_new_empty_simple ("video/x-raw"); + gst_caps_set_simple (our_caps, + "format", G_TYPE_STRING, gst_video_format_to_string (format), nullptr); + auto ret = gst_caps_is_subset (our_caps, gl_caps); + gst_caps_unref (gl_caps); + gst_caps_unref (our_caps); + + return ret; +} +#endif + +static gboolean +gst_nv_comp_video_dec_negotiate (GstVideoDecoder * decoder) +{ + auto self = GST_NV_COMP_VIDEO_DEC (decoder); + auto priv = self->priv; + gboolean is_cuda = FALSE; +#ifdef HAVE_GST_GL + gboolean is_gl = FALSE; +#endif + + auto peer_caps = gst_pad_get_allowed_caps (decoder->srcpad); + GST_DEBUG_OBJECT (self, "Allowed caps %" GST_PTR_FORMAT, peer_caps); + + if (!peer_caps || gst_caps_is_any (peer_caps)) { + GST_DEBUG_OBJECT (self, + "cannot determine output format, use system memory"); + } else { + GstCapsFeatures *features; + guint size = gst_caps_get_size (peer_caps); + guint i; + + for (i = 0; i < size; i++) { + features = gst_caps_get_features (peer_caps, i); + + if (!features) + continue; + + if (gst_caps_features_contains (features, + GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY)) { + is_cuda = TRUE; + } +#ifdef HAVE_GST_GL + if (gst_caps_features_contains (features, + GST_CAPS_FEATURE_MEMORY_GL_MEMORY)) { + is_gl = TRUE; + } +#endif + } + } + gst_clear_caps (&peer_caps); + + auto state = gst_video_decoder_set_interlaced_output_state (decoder, + GST_VIDEO_INFO_FORMAT (&priv->info), + GST_VIDEO_INFO_INTERLACE_MODE (&priv->info), priv->info.width, + priv->info.height, priv->state); + + if (!state) { + GST_ERROR_OBJECT (self, "Couldn't set output state"); + return FALSE; + } + + 
priv->gl_interop = FALSE; + + state->caps = gst_video_info_to_caps (&state->info); + auto format = GST_VIDEO_INFO_FORMAT (&priv->info); + if (is_cuda && is_supported_cuda_format (format)) { + gst_caps_set_features_simple (state->caps, + gst_caps_features_new (GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY, nullptr)); + } +#ifdef HAVE_GST_GL + else if (is_gl && is_supported_gl_format (format)) { + gst_caps_set_features_simple (state->caps, + gst_caps_features_new (GST_CAPS_FEATURE_MEMORY_GL_MEMORY, nullptr)); + priv->gl_interop = TRUE; + } +#endif + + return GST_VIDEO_DECODER_CLASS (parent_class)->negotiate (decoder); +} + +static gboolean +gst_nv_comp_video_dec_download (GstNvCompVideoDec * self, GstVideoFrame * frame, + CUstream stream, gboolean is_device_copy) +{ + auto priv = self->priv; + auto info = &priv->info; + auto finfo = info->finfo; + gint comp[GST_VIDEO_MAX_COMPONENTS]; + CUresult ret = CUDA_SUCCESS; + auto task = priv->task; + + for (guint i = 0; i < GST_VIDEO_FRAME_N_PLANES (frame); i++) { + guint8 *sp; + if (is_device_copy) + sp = task->device_uncompressed + info->offset[i]; + else + sp = task->host_uncompressed + info->offset[i]; + + guint8 *dp = (guint8 *) GST_VIDEO_FRAME_PLANE_DATA (frame, i); + guint ss, ds; + guint w, h; + + if (GST_VIDEO_FORMAT_INFO_HAS_PALETTE (finfo) && i == 1) { + if (is_device_copy) { + ret = CuMemcpyDtoDAsync ((CUdeviceptr) dp, (CUdeviceptr) sp, + 256 * 4, stream); + } else { + memcpy (dp, sp, 256 * 4); + } + + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "CUDA memcpy failed"); + return FALSE; + } + + return TRUE; + } + + ds = GST_VIDEO_FRAME_PLANE_STRIDE (frame, i); + ss = GST_VIDEO_INFO_PLANE_STRIDE (info, i); + + gst_video_format_info_component (finfo, i, comp); + + w = GST_VIDEO_INFO_COMP_WIDTH (info, comp[0]) * + GST_VIDEO_INFO_COMP_PSTRIDE (info, comp[0]); + if (w == 0) + w = MIN (ss, ds); + + h = GST_VIDEO_INFO_COMP_HEIGHT (info, comp[0]); + + if (GST_VIDEO_FORMAT_INFO_IS_TILED (finfo)) { + gint tile_size; + gint 
sx_tiles, sy_tiles, dx_tiles, dy_tiles; + GstVideoTileMode mode; + + tile_size = GST_VIDEO_FORMAT_INFO_TILE_SIZE (info->finfo, i); + + mode = GST_VIDEO_FORMAT_INFO_TILE_MODE (info->finfo); + + sx_tiles = GST_VIDEO_TILE_X_TILES (ss); + sy_tiles = GST_VIDEO_TILE_Y_TILES (ss); + + dx_tiles = GST_VIDEO_TILE_X_TILES (ds); + dy_tiles = GST_VIDEO_TILE_Y_TILES (ds); + + w = MIN (sx_tiles, dx_tiles); + h = MIN (sy_tiles, dy_tiles); + + for (guint j = 0; j < h; j++) { + for (guint k = 0; k < w; k++) { + guint si, di; + guint8 *cur_dp; + guint8 *cur_sp; + + si = gst_video_tile_get_index (mode, k, j, sx_tiles, sy_tiles); + di = gst_video_tile_get_index (mode, k, j, dx_tiles, dy_tiles); + + cur_dp = dp + (di * tile_size); + cur_sp = sp + (si * tile_size); + + if (is_device_copy) { + ret = CuMemcpyDtoDAsync ((CUdeviceptr) cur_dp, (CUdeviceptr) cur_sp, + w, stream); + } else { + memcpy (cur_dp, cur_sp, w); + } + + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "CUDA memcpy failed"); + return FALSE; + } + } + } + } else { + if (is_device_copy) { + CUDA_MEMCPY2D params = { }; + params.srcMemoryType = CU_MEMORYTYPE_DEVICE; + params.srcDevice = (CUdeviceptr) sp; + params.srcPitch = ss; + + params.dstMemoryType = CU_MEMORYTYPE_DEVICE; + params.dstDevice = (CUdeviceptr) dp; + params.dstPitch = ds; + + params.WidthInBytes = w; + params.Height = h; + + ret = CuMemcpy2DAsync (¶ms, stream); + + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "CUDA memcpy failed"); + return FALSE; + } + } else { + for (guint j = 0; j < h; j++) { + memcpy (dp, sp, w); + dp += ds; + sp += ss; + } + } + } + } + + return TRUE; +} + +#ifdef HAVE_GST_GL +struct GLInteropData +{ + GstNvCompVideoDec *self = nullptr; + GstBuffer *buffer = nullptr; + gboolean ret = FALSE; +}; + +static GstCudaGraphicsResource * +ensure_gl_cuda_resource (GstNvCompVideoDec * self, GstMemory * mem) +{ + auto priv = self->priv; + GstCudaGraphicsResource *resource; + GQuark quark; + + if (!gst_is_gl_memory_pbo (mem)) { + 
GST_WARNING_OBJECT (self, "memory is not GL PBO memory, %s", + mem->allocator->mem_type); + return nullptr; + } + + quark = gst_cuda_quark_from_id (GST_CUDA_QUARK_GRAPHICS_RESOURCE); + resource = (GstCudaGraphicsResource *) + gst_mini_object_get_qdata (GST_MINI_OBJECT (mem), quark); + + if (!resource) { + GstMapInfo map_info; + GstGLMemoryPBO *pbo = (GstGLMemoryPBO *) mem; + GstGLBuffer *gl_buf = pbo->pbo; + gboolean ret; + + if (!gst_memory_map (mem, &map_info, + (GstMapFlags) (GST_MAP_READ | GST_MAP_GL))) { + GST_ERROR_OBJECT (self, "Couldn't map gl memory"); + return nullptr; + } + + resource = gst_cuda_graphics_resource_new (priv->ctx, + GST_OBJECT (GST_GL_BASE_MEMORY_CAST (mem)->context), + GST_CUDA_GRAPHICS_RESOURCE_GL_BUFFER); + + GST_LOG_OBJECT (self, "registering gl buffer %d to CUDA", gl_buf->id); + ret = gst_cuda_graphics_resource_register_gl_buffer (resource, gl_buf->id, + CU_GRAPHICS_REGISTER_FLAGS_NONE); + gst_memory_unmap (mem, &map_info); + + if (!ret) { + GST_ERROR_OBJECT (self, "Couldn't register gl buffer %d", gl_buf->id); + gst_cuda_graphics_resource_free (resource); + return nullptr; + } + + gst_mini_object_set_qdata (GST_MINI_OBJECT (mem), quark, resource, + (GDestroyNotify) gst_cuda_graphics_resource_free); + } + + return resource; +} + +static void +gst_nv_comp_video_dec_download_gl (GstGLContext * context, GLInteropData * data) +{ + auto self = data->self; + auto priv = self->priv; + auto info = &priv->info; + auto finfo = info->finfo; + GstCudaGraphicsResource *gst_res[GST_VIDEO_MAX_PLANES] = { nullptr, }; + CUgraphicsResource cuda_res[GST_VIDEO_MAX_PLANES] = { nullptr, }; + CUdeviceptr src_devptr[GST_VIDEO_MAX_PLANES] = { 0, }; + CUstream stream = gst_cuda_stream_get_handle (priv->stream); + CUresult ret; + gint comp[GST_VIDEO_MAX_COMPONENTS]; + auto task = priv->task; + + if (!gst_cuda_context_push (priv->ctx)) { + GST_ERROR_OBJECT (self, "Couldn't push context"); + return; + } + + for (guint i = 0; i < GST_VIDEO_INFO_N_PLANES (info); 
i++) { + GstMemory *mem = gst_buffer_peek_memory (data->buffer, i); + gsize src_size; + + if (!gst_is_gl_memory_pbo (mem)) { + GST_ERROR_OBJECT (self, "Not a GL PBO memory"); + goto out; + } + + gst_res[i] = ensure_gl_cuda_resource (self, mem); + if (!gst_res[i]) { + GST_ERROR_OBJECT (self, "Couldn't get resource %d", i); + goto out; + } + + cuda_res[i] = gst_cuda_graphics_resource_map (gst_res[i], stream, + CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD); + if (!cuda_res[i]) { + GST_ERROR_OBJECT (self, "Couldn't map resource"); + goto out; + } + + ret = CuGraphicsResourceGetMappedPointer (&src_devptr[i], + &src_size, cuda_res[i]); + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "Couldn't get mapped device pointer"); + goto out; + } + + /* Need PBO -> texture */ + GST_MINI_OBJECT_FLAG_SET (mem, GST_GL_BASE_MEMORY_TRANSFER_NEED_UPLOAD); + } + + for (guint i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) { + guint8 *sp = task->device_uncompressed + info->offset[i]; + guint8 *dp = (guint8 *) src_devptr[i]; + guint ss, ds; + guint w, h; + + if (GST_VIDEO_FORMAT_INFO_HAS_PALETTE (finfo) && i == 1) { + ret = CuMemcpyDtoDAsync ((CUdeviceptr) dp, (CUdeviceptr) sp, + 256 * 4, stream); + + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "CUDA memcpy failed"); + goto out; + } + + data->ret = TRUE; + goto out; + } + + auto meta = gst_buffer_get_video_meta (data->buffer); + if (meta) + ds = meta->stride[i]; + else + ds = GST_VIDEO_INFO_PLANE_STRIDE (info, i); + + ss = GST_VIDEO_INFO_PLANE_STRIDE (info, i); + + gst_video_format_info_component (finfo, i, comp); + + w = GST_VIDEO_INFO_COMP_WIDTH (info, comp[0]) * + GST_VIDEO_INFO_COMP_PSTRIDE (info, comp[0]); + if (w == 0) + w = MIN (ss, ds); + + h = GST_VIDEO_INFO_COMP_HEIGHT (info, comp[0]); + + if (GST_VIDEO_FORMAT_INFO_IS_TILED (finfo)) { + gint tile_size; + gint sx_tiles, sy_tiles, dx_tiles, dy_tiles; + GstVideoTileMode mode; + + tile_size = GST_VIDEO_FORMAT_INFO_TILE_SIZE (info->finfo, i); + + mode = 
GST_VIDEO_FORMAT_INFO_TILE_MODE (info->finfo); + + sx_tiles = GST_VIDEO_TILE_X_TILES (ss); + sy_tiles = GST_VIDEO_TILE_Y_TILES (ss); + + dx_tiles = GST_VIDEO_TILE_X_TILES (ds); + dy_tiles = GST_VIDEO_TILE_Y_TILES (ds); + + w = MIN (sx_tiles, dx_tiles); + h = MIN (sy_tiles, dy_tiles); + + for (guint j = 0; j < h; j++) { + for (guint k = 0; k < w; k++) { + guint si, di; + guint8 *cur_dp; + guint8 *cur_sp; + + si = gst_video_tile_get_index (mode, k, j, sx_tiles, sy_tiles); + di = gst_video_tile_get_index (mode, k, j, dx_tiles, dy_tiles); + + cur_dp = dp + (di * tile_size); + cur_sp = sp + (si * tile_size); + + ret = CuMemcpyDtoDAsync ((CUdeviceptr) cur_dp, (CUdeviceptr) cur_sp, + w, stream); + + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "CUDA memcpy failed"); + goto out; + } + } + } + } else { + CUDA_MEMCPY2D params = { }; + params.srcMemoryType = CU_MEMORYTYPE_DEVICE; + params.srcDevice = (CUdeviceptr) sp; + params.srcPitch = ss; + + params.dstMemoryType = CU_MEMORYTYPE_DEVICE; + params.dstDevice = (CUdeviceptr) dp; + params.dstPitch = ds; + + params.WidthInBytes = w; + params.Height = h; + + ret = CuMemcpy2DAsync (¶ms, stream); + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "CUDA memcpy failed"); + goto out; + } + } + } + + data->ret = TRUE; + +out: + for (guint i = 0; i < gst_buffer_n_memory (data->buffer); i++) { + if (!gst_res[i]) + break; + + gst_cuda_graphics_resource_unmap (gst_res[i], stream); + } + + CuStreamSynchronize (stream); + gst_cuda_context_pop (nullptr); +} +#endif + +struct ChunkData +{ + size_t uncomp_size = 0; + size_t comp_size = 0; + size_t offset = 0; +}; + +static gboolean +gst_nv_comp_video_dec_parse_header (GstNvCompVideoDec * self, + const guint8 * data, gsize size, + size_t &uncompressed_chunk_size, size_t &max_compressed_chunk_size, + size_t &batch_size, std::vector < ChunkData > &compressed_chunks) +{ + guint32 val; + const guint8 *ptr = data; + gsize remaining = size; + + if (size <= 
GST_NV_COMP_HEADER_MIN_SIZE) { + GST_ERROR_OBJECT (self, "Too small size"); + return FALSE; + } + + val = GST_READ_UINT32_LE (ptr); + if (val != GST_NV_COMP_HEADER_VERSION) { + GST_ERROR_OBJECT (self, "Invalid version"); + return FALSE; + } + ptr += sizeof (guint32); + remaining -= sizeof (guint32); + + uncompressed_chunk_size = GST_READ_UINT32_LE (ptr); + ptr += sizeof (guint32); + remaining -= sizeof (guint32); + + max_compressed_chunk_size = GST_READ_UINT32_LE (ptr); + ptr += sizeof (guint32); + remaining -= sizeof (guint32); + + batch_size = GST_READ_UINT32_LE (ptr); + ptr += sizeof (guint32); + remaining -= sizeof (guint32); + + compressed_chunks.resize (batch_size); + size_t total_compressed_size = 0; + for (size_t i = 0; i < batch_size; i++) { + if (remaining < sizeof (guint32)) + return FALSE; + + compressed_chunks[i].uncomp_size = GST_READ_UINT32_LE (ptr); + ptr += sizeof (guint32); + remaining -= sizeof (guint32); + + if (remaining < sizeof (guint32)) + return FALSE; + + compressed_chunks[i].comp_size = GST_READ_UINT32_LE (ptr); + total_compressed_size += compressed_chunks[i].comp_size; + + ptr += sizeof (guint32); + remaining -= sizeof (guint32); + } + + if (remaining != total_compressed_size) { + GST_ERROR_OBJECT (self, "Size mismatch, remaining: %" G_GSIZE_FORMAT + ", total compressed: %" G_GSIZE_FORMAT, remaining, + total_compressed_size); + return FALSE; + } + + for (size_t i = 0; i < batch_size; i++) { + compressed_chunks[i].offset = ptr - data; + ptr += compressed_chunks[i].comp_size; + } + + return TRUE; +} + +static GstFlowReturn +gst_nv_comp_video_dec_handle_frame (GstVideoDecoder * decoder, + GstVideoCodecFrame * frame) +{ + auto self = GST_NV_COMP_VIDEO_DEC (decoder); + auto priv = self->priv; + CUstream stream = nullptr; + GstVideoFrame vframe; + GstMapInfo map_info; + CUresult cuda_ret; + gboolean need_copy = TRUE; + GstMemory *mem; + nvcompStatus_t status; + auto task = priv->task; + GstFlowReturn ret; + + if (!priv->ctx || !priv->task) { + 
GST_ERROR_OBJECT (self, "Context was not configured"); + goto error; + } + + ret = gst_video_decoder_allocate_output_frame (decoder, frame); + if (ret != GST_FLOW_OK) { + gst_video_decoder_release_frame (decoder, frame); + return ret; + } + + if (!gst_cuda_context_push (priv->ctx)) { + GST_ERROR_OBJECT (self, "Couldn't push context"); + goto error; + } + + stream = gst_cuda_stream_get_handle (priv->stream); + + if (!gst_buffer_map (frame->input_buffer, &map_info, GST_MAP_READ)) { + GST_ERROR_OBJECT (self, "Couldn't map input buffer"); + gst_cuda_context_pop (nullptr); + goto error; + } + + if (priv->batched) { + g_assert (priv->batched_decomp); + + /* Parse custom header */ + size_t uncompressed_chunk_size; + size_t max_compressed_chunk_size; + size_t batch_size; + std::vector < ChunkData > compressed_chunks; + guint8 *mapped_data = map_info.data; + uint8_t *uncompressed; + if (!gst_nv_comp_video_dec_parse_header (self, mapped_data, + map_info.size, uncompressed_chunk_size, max_compressed_chunk_size, + batch_size, compressed_chunks)) { + gst_buffer_unmap (frame->input_buffer, &map_info); + gst_cuda_context_pop (nullptr); + goto error; + } + + GST_LOG_OBJECT (self, "batch size %" G_GSIZE_FORMAT + ", uncompressed-chunk-size %" G_GSIZE_FORMAT + ", compressed-chunk-size %" G_GSIZE_FORMAT, + batch_size, uncompressed_chunk_size, max_compressed_chunk_size); + + if (task->batch_size < batch_size || + task->max_uncompressed_chunk_size < uncompressed_chunk_size || + task->max_compressed_chunk_size < max_compressed_chunk_size) { + task->clear_resource (); + } + + if (task->batch_size == 0) { + size_t temp_size = 0; + + GST_DEBUG_OBJECT (self, "Allocating resource"); + + status = priv->batched_decomp->get_temp_size (batch_size, + uncompressed_chunk_size, &temp_size); + if (status != nvcompSuccess) { + GST_ERROR_OBJECT (self, "Couldn't get temp size"); + gst_buffer_unmap (frame->input_buffer, &map_info); + gst_cuda_context_pop (nullptr); + goto error; + } + + if 
(!task->allocate_batched (batch_size, + max_compressed_chunk_size, uncompressed_chunk_size, temp_size)) { + GST_ERROR_OBJECT (self, "Couldn't allocate resource"); + gst_buffer_unmap (frame->input_buffer, &map_info); + gst_cuda_context_pop (nullptr); + goto error; + } + } + + for (size_t i = 0; i < batch_size; i++) { + memcpy (task->host_compressed + (i * task->max_compressed_chunk_size), + mapped_data + compressed_chunks[i].offset, + compressed_chunks[i].comp_size); + task->host_compressed_bytes[i] = compressed_chunks[i].comp_size; + } + gst_buffer_unmap (frame->input_buffer, &map_info); + + for (size_t i = 0; i < batch_size; i++) { + GST_LOG_OBJECT (self, "Uploading chunk %" G_GSIZE_FORMAT + ", size %" G_GSIZE_FORMAT, i, compressed_chunks[i].comp_size); + auto offset = i * task->max_compressed_chunk_size; + + cuda_ret = CuMemcpyHtoDAsync ((CUdeviceptr) + (task->device_compressed + offset), + task->host_compressed + offset, + compressed_chunks[i].comp_size, stream); + if (!gst_cuda_result (cuda_ret)) { + gst_cuda_context_pop (nullptr); + goto error; + } + } + + cuda_ret = CuMemcpyHtoDAsync ((CUdeviceptr) task->device_compressed_bytes, + task->host_compressed_bytes, sizeof (size_t) * batch_size, stream); + if (!gst_cuda_result (cuda_ret)) { + gst_cuda_context_pop (nullptr); + goto error; + } + + status = priv->batched_decomp->decompress (task->device_compressed_ptrs, + task->device_compressed_bytes, task->device_uncompressed_bytes, + task->device_actual_uncompressed_bytes, batch_size, + task->temp_ptr, task->temp_size, task->device_uncompressed_ptrs, + task->device_statuses, (cudaStream_t) stream); + if (status != nvcompSuccess) { + GST_ERROR_OBJECT (self, "Couldn't decompress stream, status: %d", status); + gst_cuda_context_pop (nullptr); + goto error; + } + + uncompressed = task->device_uncompressed; + for (size_t i = 0; i < batch_size; i++) { + auto size = compressed_chunks[i].uncomp_size; + cuda_ret = CuMemcpyDtoDAsync ((CUdeviceptr) uncompressed, + 
(CUdeviceptr) task->host_uncompressed_ptrs[i], size, stream); + + if (!gst_cuda_result (cuda_ret)) { + gst_cuda_context_pop (nullptr); + goto error; + } + uncompressed += size; + } + } else { + if (task->compressed_alloc_size < map_info.size) { + if (task->device_compressed) + CuMemFree ((CUdeviceptr) task->device_compressed); + task->device_compressed = nullptr; + + if (task->host_compressed) + CuMemFreeHost (task->host_compressed); + task->host_compressed = nullptr; + + task->compressed_alloc_size = GST_ROUND_UP_128 (map_info.size); + auto cuda_ret = CuMemAlloc ((CUdeviceptr *) & task->device_compressed, + task->compressed_alloc_size); + if (!gst_cuda_result (cuda_ret)) { + gst_buffer_unmap (frame->input_buffer, &map_info); + gst_cuda_context_pop (nullptr); + goto error; + } + + cuda_ret = CuMemAllocHost ((void **) &task->host_compressed, + task->compressed_alloc_size); + if (!gst_cuda_result (cuda_ret)) { + gst_buffer_unmap (frame->input_buffer, &map_info); + gst_cuda_context_pop (nullptr); + goto error; + } + } + + memcpy (task->host_compressed, map_info.data, map_info.size); + + cuda_ret = CuMemcpyHtoDAsync ((CUdeviceptr) task->device_compressed, + task->host_compressed, map_info.size, stream); + gst_buffer_unmap (frame->input_buffer, &map_info); + + if (!gst_cuda_result (cuda_ret)) { + GST_ERROR_OBJECT (self, "Couldn't copy compressed memory"); + gst_cuda_context_pop (nullptr); + goto error; + } + + if (!priv->manager) { + priv->manager = create_manager (task->device_compressed, + (cudaStream_t) stream); + } + + { + auto config = + priv->manager->configure_decompression (task->device_compressed); + if (config.decomp_data_size != priv->info.size) { + GST_ERROR_OBJECT (self, "size mismatch, expected %" G_GSIZE_FORMAT + ", required %" G_GSIZE_FORMAT, priv->info.size, + config.decomp_data_size); + gst_cuda_context_pop (nullptr); + goto error; + } + + priv->manager->decompress (task->device_uncompressed, + task->device_compressed, config); + } + } + + mem = 
gst_buffer_peek_memory (frame->output_buffer, 0); +#ifdef HAVE_GST_GL + if (priv->gl_interop && gst_buffer_n_memory (frame->output_buffer) == + GST_VIDEO_INFO_N_PLANES (&priv->info)) { + GLInteropData interop_data; + interop_data.self = self; + interop_data.buffer = frame->output_buffer; + interop_data.ret = FALSE; + + auto gl_mem = (GstGLMemory *) mem; + gst_gl_context_thread_add (gl_mem->mem.context, + (GstGLContextThreadFunc) gst_nv_comp_video_dec_download_gl, + &interop_data); + if (interop_data.ret) { + need_copy = FALSE; + GST_TRACE_OBJECT (self, "CUDA -> GL copy done"); + } else { + priv->gl_interop = FALSE; + } + } +#endif + + if (need_copy) { + GstMapFlags map_flags = GST_MAP_WRITE; + gboolean device_copy = FALSE; + gboolean do_sync = TRUE; + if (gst_is_cuda_memory (mem)) { + auto cmem = GST_CUDA_MEMORY_CAST (mem); + if (cmem->context == priv->ctx) { + map_flags = (GstMapFlags) (GST_MAP_WRITE | GST_MAP_CUDA); + device_copy = TRUE; + auto mem_stream = gst_cuda_memory_get_stream (cmem); + if (mem_stream && mem_stream == priv->stream) + do_sync = FALSE; + } + } + + if (!device_copy) { + cuda_ret = CuMemcpyDtoHAsync (task->host_uncompressed, + (CUdeviceptr) task->device_uncompressed, priv->info.size, stream); + if (!gst_cuda_result (cuda_ret)) { + GST_ERROR_OBJECT (self, "Couldn't download image"); + gst_cuda_context_pop (nullptr); + goto error; + } + CuStreamSynchronize (stream); + do_sync = FALSE; + } + + gst_video_frame_map (&vframe, &priv->info, frame->output_buffer, map_flags); + gst_nv_comp_video_dec_download (self, &vframe, stream, device_copy); + if (do_sync) + CuStreamSynchronize (stream); + gst_video_frame_unmap (&vframe); + } + gst_cuda_context_pop (nullptr); + + return gst_video_decoder_finish_frame (decoder, frame); + +error: + gst_video_decoder_release_frame (decoder, frame); + return GST_FLOW_ERROR; +} diff --git a/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcompvideodec.h b/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcompvideodec.h new file 
mode 100644 index 0000000000..b53d539f7d --- /dev/null +++ b/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcompvideodec.h @@ -0,0 +1,32 @@ +/* GStreamer + * Copyright (C) 2024 Seungha Yang + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. + */ + +#pragma once + +#include +#include +#include "gstnvcomp.h" + +G_BEGIN_DECLS + +#define GST_TYPE_NV_COMP_VIDEO_DEC (gst_nv_comp_video_dec_get_type()) +G_DECLARE_FINAL_TYPE (GstNvCompVideoDec, gst_nv_comp_video_dec, + GST, NV_COMP_VIDEO_DEC, GstVideoDecoder) + +G_END_DECLS \ No newline at end of file diff --git a/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcompvideoenc.cpp b/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcompvideoenc.cpp new file mode 100644 index 0000000000..4aa6af905c --- /dev/null +++ b/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcompvideoenc.cpp @@ -0,0 +1,2014 @@ +/* GStreamer + * Copyright (C) 2024 Seungha Yang + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "gstnvcompvideoenc.h" +#ifdef HAVE_GST_GL +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +GST_DEBUG_CATEGORY_STATIC (gst_nv_comp_video_enc_debug); +#define GST_CAT_DEFAULT gst_nv_comp_video_enc_debug + +#ifdef HAVE_GST_GL +#define SINK_CAPS \ + GST_VIDEO_CAPS_MAKE_WITH_FEATURES (GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY, \ + GST_VIDEO_FORMATS_ALL) ";" \ + GST_VIDEO_CAPS_MAKE_WITH_FEATURES (GST_CAPS_FEATURE_MEMORY_GL_MEMORY, \ + GST_VIDEO_FORMATS_ALL) ";" \ + GST_VIDEO_CAPS_MAKE (GST_VIDEO_FORMATS_ALL) +#else +#define SINK_CAPS \ + GST_VIDEO_CAPS_MAKE_WITH_FEATURES (GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY, \ + GST_VIDEO_FORMATS_ALL) ";" \ + GST_VIDEO_CAPS_MAKE (GST_VIDEO_FORMATS_ALL) +#endif + +static GstStaticPadTemplate sink_template = +GST_STATIC_PAD_TEMPLATE ("sink", GST_PAD_SINK, GST_PAD_ALWAYS, + GST_STATIC_CAPS (SINK_CAPS)); + +static GstStaticPadTemplate src_template = + GST_STATIC_PAD_TEMPLATE ("src", GST_PAD_SRC, GST_PAD_ALWAYS, + GST_STATIC_CAPS ("video/x-nvcomp; video/x-nvcomp-lz4; " + "video/x-nvcomp-snappy; video/x-nvcomp-gdeflate; " + "video/x-nvcomp-deflate; video/x-nvcomp-zstd; video/x-nvcomp-cascaded; " + "video/x-nvcomp-bitcomp; video/x-nvcomp-ans")); + +enum GstNvCompDataType +{ + GST_NV_COMP_DATA_TYPE_DEFAULT = -1, + GST_NV_COMP_DATA_TYPE_CHAR = NVCOMP_TYPE_CHAR, + 
GST_NV_COMP_DATA_TYPE_UCHAR = NVCOMP_TYPE_UCHAR, + GST_NV_COMP_DATA_TYPE_SHORT = NVCOMP_TYPE_SHORT, + GST_NV_COMP_DATA_TYPE_USHORT = NVCOMP_TYPE_USHORT, + GST_NV_COMP_DATA_TYPE_INT = NVCOMP_TYPE_INT, + GST_NV_COMP_DATA_TYPE_UINT = NVCOMP_TYPE_UINT, + GST_NV_COMP_DATA_TYPE_LONGLONG = NVCOMP_TYPE_LONGLONG, + GST_NV_COMP_DATA_TYPE_ULONGLONG = NVCOMP_TYPE_ULONGLONG, + GST_NV_COMP_DATA_TYPE_BITS = NVCOMP_TYPE_BITS, +}; + +#define GST_TYPE_NV_COMP_DATA_TYPE (gst_nv_comp_data_type_type()) +static GType +gst_nv_comp_data_type_type (void) +{ + static GType data_type = 0; + static std::once_flag once; + static const GEnumValue types[] = { + {GST_NV_COMP_DATA_TYPE_DEFAULT, "Default", "default"}, + {GST_NV_COMP_DATA_TYPE_CHAR, "CHAR", "char"}, + {GST_NV_COMP_DATA_TYPE_UCHAR, "UCHAR", "uchar"}, + {GST_NV_COMP_DATA_TYPE_SHORT, "SHORT", "short"}, + {GST_NV_COMP_DATA_TYPE_USHORT, "USHORT", "ushort"}, + {GST_NV_COMP_DATA_TYPE_INT, "INT", "int"}, + {GST_NV_COMP_DATA_TYPE_UINT, "UINT", "uint"}, + {GST_NV_COMP_DATA_TYPE_LONGLONG, "LONGLONG", "longlong"}, + {GST_NV_COMP_DATA_TYPE_ULONGLONG, "ULONGLONG", "ulonglong"}, + {GST_NV_COMP_DATA_TYPE_BITS, "BITS", "bits"}, + {0, nullptr, nullptr}, + }; + + std::call_once (once,[&] { + data_type = g_enum_register_static ("GstNvCompDataType", types); + }); + + return data_type; +} + +enum GstNvCompDeflateAlgo +{ + GST_NV_COMP_DEFLATE_HIGH_THROUGHPUT, + GST_NV_COMP_DEFLATE_LOW_THROUGHPUT, + GST_NV_COMP_DEFLATE_HIGHEST_THROUGHPUT, +}; + +#define GST_TYPE_NV_COMP_DEFLATE_ALGO (gst_nv_comp_deflate_algo_get_type()) +static GType +gst_nv_comp_deflate_algo_get_type (void) +{ + static GType algo_type = 0; + static std::once_flag once; + static const GEnumValue algo[] = { + {GST_NV_COMP_DEFLATE_HIGH_THROUGHPUT, + "High throughput, low compression ratio", "high-throughput"}, + {GST_NV_COMP_DEFLATE_LOW_THROUGHPUT, + "Low throughput, high compression ratio", "low-throughput"}, + {GST_NV_COMP_DEFLATE_HIGHEST_THROUGHPUT, + "Highest throughput, entropy-only 
compression", "highest-throughput"}, + {0, nullptr, nullptr}, + }; + + std::call_once (once,[&] { + algo_type = g_enum_register_static ("GstNvCompDeflateAlgo", algo); + }); + + return algo_type; +} + +enum GstNvCompBitcompAlgo +{ + GST_NV_COMP_BITCOMP_DEFAULT, + GST_NV_COMP_BITCOMP_SPARSE, +}; + +#define GST_TYPE_NV_COMP_BITCOMP_ALGO (gst_nv_comp_bitcomp_algo_get_type()) +static GType +gst_nv_comp_bitcomp_algo_get_type (void) +{ + static GType algo_type = 0; + static std::once_flag once; + static const GEnumValue algo[] = { + {GST_NV_COMP_BITCOMP_DEFAULT, "Default", "default"}, + {GST_NV_COMP_BITCOMP_SPARSE, "Sparse", "sparse"}, + {0, nullptr, nullptr}, + }; + + std::call_once (once,[&] { + algo_type = g_enum_register_static ("GstNvCompBitcompAlgo", algo); + }); + + return algo_type; +} + +enum +{ + PROP_0, + PROP_METHOD, + PROP_DEFLATE_ALGO, + PROP_BITCOMP_ALGO, + PROP_DATA_TYPE, + PROP_CHUNK_SIZE, + PROP_ASYNC_DEPTH, + PROP_BATCHED, +}; + +#define DEFAULT_METHOD GST_NV_COMP_BITCOMP +#define DEFAULT_DEFLATE_ALGO GST_NV_COMP_DEFLATE_HIGH_THROUGHPUT +#define DEFAULT_BITCOMP_ALGO GST_NV_COMP_BITCOMP_SPARSE +#define DEFAULT_DATA_TYPE GST_NV_COMP_DATA_TYPE_DEFAULT +#define DEFAULT_CHUNK_SIZE 0 +#define DEFAULT_BATCHED TRUE +#define DEFAULT_ASYNC_DEPTH 2 + +/* *INDENT-OFF* */ +using namespace nvcomp; + +struct EncoderTask +{ + ~EncoderTask () + { + if (ctx) { + gst_cuda_context_push (ctx); + if (event) + CuEventDestroy (event); + if (device_uncompressed) + CuMemFree ((CUdeviceptr) device_uncompressed); + if (host_uncompressed) + CuMemFreeHost (host_uncompressed); + if (device_compressed) + CuMemFree ((CUdeviceptr) device_compressed); + if (host_compressed) + CuMemFreeHost (host_compressed); + if (device_uncompressed_bytes) + CuMemFree ((CUdeviceptr) device_uncompressed_bytes); + if (device_uncompressed_ptrs) + CuMemFree ((CUdeviceptr) device_uncompressed_ptrs); + if (device_compressed_bytes) + CuMemFree ((CUdeviceptr) device_compressed_bytes); + if 
(host_uncompressed_bytes) + CuMemFreeHost (host_uncompressed_bytes); + if (host_uncompressed_ptrs) + CuMemFreeHost (host_uncompressed_ptrs); + if (device_compressed_ptrs) + CuMemFree ((CUdeviceptr) device_compressed_ptrs); + if (host_compressed_bytes) + CuMemFreeHost (host_compressed_bytes); + if (host_compressed_ptrs) + CuMemFreeHost (host_compressed_ptrs); + if (temp_ptr) + CuMemFree ((CUdeviceptr) temp_ptr); + + gst_cuda_context_pop (nullptr); + gst_object_unref (ctx); + } + } + + GstCudaContext *ctx = nullptr; + CUevent event = nullptr; + uint8_t *device_uncompressed = nullptr; + uint8_t *host_uncompressed = nullptr; + + uint8_t *device_compressed = nullptr; + uint8_t *host_compressed = nullptr; + + size_t *device_uncompressed_bytes = nullptr; + void **device_uncompressed_ptrs = nullptr; + + size_t *host_uncompressed_bytes = nullptr; + void **host_uncompressed_ptrs = nullptr; + + size_t *device_compressed_bytes = nullptr; + void **device_compressed_ptrs = nullptr; + + size_t *host_compressed_bytes = nullptr; + void **host_compressed_ptrs = nullptr; + + void *temp_ptr = nullptr; + size_t temp_size = 0; + + size_t compressed_size = 0; + + gboolean batched; + size_t batch_size; + size_t chunk_size; + size_t max_output_chunk_size; + size_t compressed_alloc_size; +}; + +struct BatchedCompBase +{ + virtual nvcompStatus_t get_temp_size( + size_t batch_size, + size_t max_uncompressed_chunk_bytes, + size_t * temp_bytes) = 0; + + virtual nvcompStatus_t get_max_compressed_chunk_size( + size_t max_uncompressed_chunk_bytes, + size_t * max_compressed_bytes) = 0; + + virtual nvcompStatus_t compress( + void **device_uncompressed_ptrs, + size_t *device_uncompressed_bytes, + size_t max_uncompressed_chunk_bytes, + size_t batch_size, + void *device_temp_ptr, + size_t temp_bytes, + void **device_compressed_ptrs, + size_t *device_compressed_bytes, + cudaStream_t stream) = 0; +}; + +template +class BatchedComp : public BatchedCompBase +{ +public: + BatchedComp (const FormatOptT & 
opt) : opts_(opt) {} + + nvcompStatus_t get_temp_size( + size_t batch_size, + size_t max_uncompressed_chunk_bytes, + size_t * temp_bytes) + { + return T (batch_size, max_uncompressed_chunk_bytes, opts_, temp_bytes); + } + + nvcompStatus_t get_max_compressed_chunk_size( + size_t max_uncompressed_chunk_bytes, + size_t * max_compressed_bytes) + { + return O (max_uncompressed_chunk_bytes, opts_, max_compressed_bytes); + } + + nvcompStatus_t compress( + void **device_uncompressed_ptrs, + size_t *device_uncompressed_bytes, + size_t max_uncompressed_chunk_bytes, + size_t batch_size, + void *device_temp_ptr, + size_t temp_bytes, + void **device_compressed_ptrs, + size_t *device_compressed_bytes, + cudaStream_t stream) + { + return C (device_uncompressed_ptrs, device_uncompressed_bytes, + max_uncompressed_chunk_bytes, batch_size, device_temp_ptr, temp_bytes, + device_compressed_ptrs, device_compressed_bytes, opts_, stream); + } + +private: + FormatOptT opts_; +}; + +struct GstNvCompVideoEncPrivate +{ + GstCudaContext *ctx = nullptr; + GstCudaStream *stream = nullptr; + +#ifdef HAVE_GST_GL + GstGLDisplay *gl_display = nullptr; + GstGLContext *gl_context = nullptr; + GstGLContext *other_gl_context = nullptr; +#endif + + GstBufferPool *pool = nullptr; + + GstVideoCodecState *state = nullptr; + std::shared_ptr manager; + std::shared_ptr config; + std::shared_ptr batched_comp; + + gboolean gl_interop = FALSE; + + std::mutex lock; + std::mutex input_lock; + std::condition_variable input_cond; + std::mutex output_lock; + std::condition_variable output_cond; + + std::queue> input_task_queue; + std::queue> output_task_queue; + std::shared_ptr cur_task; + GThread *encode_thread = nullptr; + std::atomic last_flow = { GST_FLOW_OK }; + + GstNvCompMethod method = DEFAULT_METHOD; + GstNvCompDeflateAlgo deflate_algo = DEFAULT_DEFLATE_ALGO; + GstNvCompBitcompAlgo bitcomp_algo = DEFAULT_BITCOMP_ALGO; + GstNvCompDataType data_type = DEFAULT_DATA_TYPE; + guint chunk_size = DEFAULT_CHUNK_SIZE; 
+ gboolean batched = DEFAULT_BATCHED; + guint async_depth = DEFAULT_ASYNC_DEPTH; +}; +/* *INDENT-ON* */ + +struct _GstNvCompVideoEnc +{ + GstVideoEncoder parent; + GstNvCompVideoEncPrivate *priv; +}; + +static void gst_nv_comp_video_enc_finalize (GObject * object); +static void gst_nv_comp_video_enc_set_property (GObject * object, guint prop_id, + const GValue * value, GParamSpec * pspec); +static void gst_nv_comp_video_enc_get_property (GObject * object, guint prop_id, + GValue * value, GParamSpec * pspec); + +static void gst_nv_comp_video_enc_set_context (GstElement * element, + GstContext * context); + +static gboolean gst_nv_comp_video_enc_open (GstVideoEncoder * encoder); +static gboolean gst_nv_comp_video_enc_close (GstVideoEncoder * encoder); +static gboolean gst_nv_comp_video_enc_stop (GstVideoEncoder * encoder); +static gboolean gst_nv_comp_video_enc_flush (GstVideoEncoder * encoder); +static GstFlowReturn gst_nv_comp_video_enc_finish (GstVideoEncoder * encoder); +static gboolean gst_nv_comp_video_enc_sink_query (GstVideoEncoder * encoder, + GstQuery * query); +static gboolean gst_nv_comp_video_enc_src_query (GstVideoEncoder * encoder, + GstQuery * query); +static gboolean +gst_nv_comp_video_enc_propose_allocation (GstVideoEncoder * encoder, + GstQuery * query); +static gboolean gst_nv_comp_video_enc_set_format (GstVideoEncoder * encoder, + GstVideoCodecState * state); +static GstFlowReturn +gst_nv_comp_video_enc_handle_frame (GstVideoEncoder * encoder, + GstVideoCodecFrame * frame); + +#define gst_nv_comp_video_enc_parent_class parent_class +G_DEFINE_TYPE (GstNvCompVideoEnc, + gst_nv_comp_video_enc, GST_TYPE_VIDEO_ENCODER); + +static void +gst_nv_comp_video_enc_class_init (GstNvCompVideoEncClass * klass) +{ + auto object_class = G_OBJECT_CLASS (klass); + auto element_class = GST_ELEMENT_CLASS (klass); + auto encoder_class = GST_VIDEO_ENCODER_CLASS (klass); + + object_class->finalize = gst_nv_comp_video_enc_finalize; + object_class->set_property = 
gst_nv_comp_video_enc_set_property; + object_class->get_property = gst_nv_comp_video_enc_get_property; + + g_object_class_install_property (object_class, PROP_METHOD, + g_param_spec_enum ("method", "Method", + "Compression method", + GST_TYPE_NV_COMP_METHOD, DEFAULT_METHOD, + (GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS))); + g_object_class_install_property (object_class, PROP_DEFLATE_ALGO, + g_param_spec_enum ("deflate-algo", "Deflate Algo", + "Algorithm to use for deflate and gdeflate methods", + GST_TYPE_NV_COMP_DEFLATE_ALGO, DEFAULT_DEFLATE_ALGO, + (GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS))); + g_object_class_install_property (object_class, PROP_BITCOMP_ALGO, + g_param_spec_enum ("bitcomp-algo", "Bitcomp Algo", + "Algorithm to use for bitcomp method", + GST_TYPE_NV_COMP_BITCOMP_ALGO, DEFAULT_BITCOMP_ALGO, + (GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS))); + g_object_class_install_property (object_class, PROP_DATA_TYPE, + g_param_spec_enum ("data-type", "Data Type", + "Compression data type", + GST_TYPE_NV_COMP_DATA_TYPE, DEFAULT_DATA_TYPE, + (GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS))); + g_object_class_install_property (object_class, PROP_CHUNK_SIZE, + g_param_spec_uint ("chunk-size", "Chunk Size", + "Uncompressed chunk size for batched compression (0 = default)", + 0, G_MAXINT32, DEFAULT_CHUNK_SIZE, + (GParamFlags) (G_PARAM_READWRITE | GST_PARAM_MUTABLE_READY | + G_PARAM_STATIC_STRINGS))); + g_object_class_install_property (object_class, PROP_BATCHED, + g_param_spec_boolean ("batched", "Batched", + "Use low-level C API for batched operation", DEFAULT_BATCHED, + (GParamFlags) (G_PARAM_READWRITE | GST_PARAM_MUTABLE_READY | + G_PARAM_STATIC_STRINGS))); + g_object_class_install_property (object_class, PROP_ASYNC_DEPTH, + g_param_spec_uint ("async-depth", "Async Depth", + "Internal resource pool size for threaded encoding", + 1, 4, DEFAULT_ASYNC_DEPTH, + (GParamFlags) (G_PARAM_READWRITE | 
GST_PARAM_MUTABLE_READY | + G_PARAM_STATIC_STRINGS))); + + gst_element_class_add_static_pad_template (element_class, &sink_template); + gst_element_class_add_static_pad_template (element_class, &src_template); + + gst_element_class_set_static_metadata (element_class, + "nvCOMP Video Encoder", "Encoder/Video/Hardware", + "Lossless video compression element based on nvCOMP library", + "Seungha Yang "); + + element_class->set_context = + GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_set_context); + + encoder_class->open = GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_open); + encoder_class->close = GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_close); + encoder_class->stop = GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_stop); + encoder_class->flush = GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_flush); + encoder_class->finish = GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_finish); + encoder_class->sink_query = + GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_sink_query); + encoder_class->src_query = + GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_src_query); + encoder_class->propose_allocation = + GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_propose_allocation); + encoder_class->set_format = + GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_set_format); + encoder_class->handle_frame = + GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_handle_frame); + + GST_DEBUG_CATEGORY_INIT (gst_nv_comp_video_enc_debug, + "nvcompvideoenc", 0, "nvcompvideoenc"); +} + +static void +gst_nv_comp_video_enc_init (GstNvCompVideoEnc * self) +{ + self->priv = new GstNvCompVideoEncPrivate (); +} + +static void +gst_nv_comp_video_enc_finalize (GObject * object) +{ + auto self = GST_NV_COMP_VIDEO_ENC (object); + + delete self->priv; + + G_OBJECT_CLASS (parent_class)->finalize (object); +} + +static void +gst_nv_comp_video_enc_set_property (GObject * object, guint prop_id, + const GValue * value, GParamSpec * pspec) +{ + auto self = GST_NV_COMP_VIDEO_ENC (object); + auto priv = self->priv; + + std::lock_guard < std::mutex > lk (priv->lock); + + 
switch (prop_id) { + case PROP_METHOD: + priv->method = (GstNvCompMethod) g_value_get_enum (value); + break; + case PROP_DEFLATE_ALGO: + priv->deflate_algo = (GstNvCompDeflateAlgo) g_value_get_enum (value); + break; + case PROP_BITCOMP_ALGO: + priv->bitcomp_algo = (GstNvCompBitcompAlgo) g_value_get_enum (value); + break; + case PROP_DATA_TYPE: + priv->data_type = (GstNvCompDataType) g_value_get_enum (value); + break; + case PROP_CHUNK_SIZE: + priv->chunk_size = g_value_get_uint (value); + break; + case PROP_BATCHED: + priv->batched = g_value_get_boolean (value); + break; + case PROP_ASYNC_DEPTH: + priv->async_depth = g_value_get_uint (value); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec); + break; + } +} + +static void +gst_nv_comp_video_enc_get_property (GObject * object, guint prop_id, + GValue * value, GParamSpec * pspec) +{ + auto self = GST_NV_COMP_VIDEO_ENC (object); + auto priv = self->priv; + + std::lock_guard < std::mutex > lk (priv->lock); + + switch (prop_id) { + case PROP_METHOD: + g_value_set_enum (value, priv->method); + break; + case PROP_DEFLATE_ALGO: + g_value_set_enum (value, priv->deflate_algo); + break; + case PROP_BITCOMP_ALGO: + g_value_set_enum (value, priv->bitcomp_algo); + break; + case PROP_DATA_TYPE: + g_value_set_enum (value, priv->data_type); + break; + case PROP_CHUNK_SIZE: + g_value_set_uint (value, priv->chunk_size); + break; + case PROP_BATCHED: + g_value_set_boolean (value, priv->batched); + break; + case PROP_ASYNC_DEPTH: + g_value_set_uint (value, priv->async_depth); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec); + break; + } +} + +static void +gst_nv_comp_video_enc_set_context (GstElement * element, GstContext * context) +{ + auto self = GST_NV_COMP_VIDEO_ENC (element); + auto priv = self->priv; + + gst_cuda_handle_set_context (element, context, -1, &priv->ctx); +#ifdef HAVE_GST_GL + if (gst_gl_handle_set_context (element, context, &priv->gl_display, + 
&priv->other_gl_context)) { + if (priv->gl_display) + gst_gl_display_filter_gl_api (priv->gl_display, GST_GL_API_OPENGL3); + } +#endif + + GST_ELEMENT_CLASS (parent_class)->set_context (element, context); +} + +static gboolean +gst_nv_comp_video_enc_open (GstVideoEncoder * encoder) +{ + auto self = GST_NV_COMP_VIDEO_ENC (encoder); + auto priv = self->priv; + + if (!gst_cuda_ensure_element_context (GST_ELEMENT_CAST (encoder), + -1, &priv->ctx)) { + GST_ERROR_OBJECT (self, "Couldn't get cuda context"); + return FALSE; + } + + priv->stream = gst_cuda_stream_new (priv->ctx); + + return TRUE; +} + +static gboolean +gst_nv_comp_video_enc_close (GstVideoEncoder * encoder) +{ + auto self = GST_NV_COMP_VIDEO_ENC (encoder); + auto priv = self->priv; + + gst_clear_cuda_stream (&priv->stream); + gst_clear_object (&priv->ctx); + +#ifdef HAVE_GST_GL + gst_clear_object (&priv->other_gl_context); + gst_clear_object (&priv->gl_context); + gst_clear_object (&priv->gl_context); +#endif + + return TRUE; +} + +static void +gst_nv_comp_video_enc_drain (GstNvCompVideoEnc * self, gboolean locked) +{ + auto priv = self->priv; + if (!priv->encode_thread) + return; + + if (locked) + GST_VIDEO_ENCODER_STREAM_UNLOCK (self); + + { + std::lock_guard < std::mutex > lk (priv->output_lock); + priv->output_task_queue.push (nullptr); + priv->output_cond.notify_all (); + } + + g_clear_pointer (&priv->encode_thread, g_thread_join); + + if (locked) + GST_VIDEO_ENCODER_STREAM_LOCK (self); + + priv->last_flow = GST_FLOW_OK; +} + +static gboolean +gst_nv_comp_video_enc_stop (GstVideoEncoder * encoder) +{ + auto self = GST_NV_COMP_VIDEO_ENC (encoder); + auto priv = self->priv; + + gst_nv_comp_video_enc_drain (self, FALSE); + + if (priv->ctx) { + gst_cuda_context_push (priv->ctx); + priv->manager = nullptr; + priv->cur_task = nullptr; + priv->input_task_queue = { }; + priv->output_task_queue = { }; + + gst_cuda_context_pop (nullptr); + } + + g_clear_pointer (&priv->state, gst_video_codec_state_unref); + + if 
(priv->pool) { + gst_buffer_pool_set_active (priv->pool, FALSE); + gst_clear_object (&priv->pool); + } + + return TRUE; +} + +static gboolean +gst_nv_comp_video_enc_flush (GstVideoEncoder * encoder) +{ + auto self = GST_NV_COMP_VIDEO_ENC (encoder); + + gst_nv_comp_video_enc_drain (self, TRUE); + + return TRUE; +} + +static GstFlowReturn +gst_nv_comp_video_enc_finish (GstVideoEncoder * encoder) +{ + auto self = GST_NV_COMP_VIDEO_ENC (encoder); + + gst_nv_comp_video_enc_drain (self, TRUE); + + return GST_FLOW_OK; +} + +static gboolean +gst_nv_comp_video_enc_handle_context_query (GstNvCompVideoEnc * self, + GstQuery * query) +{ + auto priv = self->priv; + +#ifdef HAVE_GST_GL + { + GstGLDisplay *display = nullptr; + GstGLContext *other = nullptr; + GstGLContext *local = nullptr; + + if (priv->gl_display) + display = (GstGLDisplay *) gst_object_ref (priv->gl_display); + if (priv->gl_context) + local = (GstGLContext *) gst_object_ref (priv->gl_context); + if (priv->other_gl_context) + other = (GstGLContext *) gst_object_ref (priv->other_gl_context); + + auto ret = gst_gl_handle_context_query (GST_ELEMENT (self), query, + display, local, other); + gst_clear_object (&display); + gst_clear_object (&other); + gst_clear_object (&local); + + if (ret) + return TRUE; + } +#endif + + if (gst_cuda_handle_context_query (GST_ELEMENT (self), query, priv->ctx)) + return TRUE; + + return FALSE; +} + +static gboolean +gst_nv_comp_video_enc_sink_query (GstVideoEncoder * encoder, GstQuery * query) +{ + auto self = GST_NV_COMP_VIDEO_ENC (encoder); + + switch (GST_QUERY_TYPE (query)) { + case GST_QUERY_CONTEXT: + if (gst_nv_comp_video_enc_handle_context_query (self, query)) + return TRUE; + break; + default: + break; + } + + return GST_VIDEO_ENCODER_CLASS (parent_class)->sink_query (encoder, query); +} + +static gboolean +gst_nv_comp_video_enc_src_query (GstVideoEncoder * encoder, GstQuery * query) +{ + auto self = GST_NV_COMP_VIDEO_ENC (encoder); + + switch (GST_QUERY_TYPE (query)) { + 
case GST_QUERY_CONTEXT: + if (gst_nv_comp_video_enc_handle_context_query (self, query)) + return TRUE; + break; + default: + break; + } + + return GST_VIDEO_ENCODER_CLASS (parent_class)->src_query (encoder, query); +} + +#ifdef HAVE_GST_GL +static void +check_cuda_device_from_gl_context (GstGLContext * context, gboolean * ret) +{ + guint device_count = 0; + CUdevice device_list[1] = { 0, }; + CUresult cuda_ret; + + *ret = FALSE; + cuda_ret = CuGLGetDevices (&device_count, + device_list, 1, CU_GL_DEVICE_LIST_ALL); + + if (!gst_cuda_result (cuda_ret) || device_count == 0) + return; + + *ret = TRUE; +} + +static gboolean +gst_nv_comp_video_enc_ensure_gl_context (GstNvCompVideoEnc * self) +{ + auto priv = self->priv; + gboolean ret = FALSE; + + if (!gst_gl_ensure_element_data (GST_ELEMENT (self), &priv->gl_display, + &priv->other_gl_context)) { + GST_DEBUG_OBJECT (self, "Couldn't get GL display"); + return FALSE; + } + + gst_gl_display_filter_gl_api (priv->gl_display, GST_GL_API_OPENGL3); + + if (!gst_gl_display_ensure_context (priv->gl_display, priv->other_gl_context, + &priv->gl_context, nullptr)) { + GST_DEBUG_OBJECT (self, "Couldn't get GL context"); + return FALSE; + } + + gst_gl_context_thread_add (priv->gl_context, + (GstGLContextThreadFunc) check_cuda_device_from_gl_context, &ret); + + return ret; +} +#endif + +static gboolean +gst_nv_comp_video_enc_propose_allocation (GstVideoEncoder * encoder, + GstQuery * query) +{ + auto self = GST_NV_COMP_VIDEO_ENC (encoder); + auto priv = self->priv; + GstBufferPool *pool = nullptr; + guint size; + + GstCaps *caps; + gst_query_parse_allocation (query, &caps, nullptr); + if (!caps) { + GST_WARNING_OBJECT (self, "null caps in query"); + return FALSE; + } + + GstVideoInfo info; + if (!gst_video_info_from_caps (&info, caps)) { + GST_WARNING_OBJECT (self, "Failed to convert caps into info"); + return FALSE; + } + + auto features = gst_caps_get_features (caps, 0); + gboolean use_cuda_pool = FALSE; + if 
(gst_caps_features_contains (features, + GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY)) { + GST_DEBUG_OBJECT (self, "upstream support CUDA memory"); + pool = gst_cuda_buffer_pool_new (priv->ctx); + use_cuda_pool = TRUE; + } +#ifdef HAVE_GST_GL + else if (gst_caps_features_contains (features, + GST_CAPS_FEATURE_MEMORY_GL_MEMORY)) { + if (!gst_nv_comp_video_enc_ensure_gl_context (self)) { + priv->gl_interop = FALSE; + } else { + pool = gst_gl_buffer_pool_new (priv->gl_context); + } + } +#endif + + if (!pool) + pool = gst_video_buffer_pool_new (); + + auto config = gst_buffer_pool_get_config (pool); + gst_buffer_pool_config_add_option (config, GST_BUFFER_POOL_OPTION_VIDEO_META); + + size = GST_VIDEO_INFO_SIZE (&info); + gst_buffer_pool_config_set_params (config, caps, size, 0, 0); + if (use_cuda_pool && priv->stream) { + /* Set our stream on buffer pool config so that CUstream can be shared */ + gst_buffer_pool_config_set_cuda_stream (config, priv->stream); + } + + if (!gst_buffer_pool_set_config (pool, config)) { + GST_WARNING_OBJECT (self, "Failed to set pool config"); + gst_object_unref (pool); + return FALSE; + } + + config = gst_buffer_pool_get_config (pool); + gst_buffer_pool_config_get_params (config, nullptr, &size, nullptr, nullptr); + gst_structure_free (config); + + gst_query_add_allocation_pool (query, pool, size, 0, 0); + gst_query_add_allocation_meta (query, GST_VIDEO_META_API_TYPE, nullptr); + gst_object_unref (pool); + + return TRUE; +} + +static gboolean +gst_nv_comp_video_enc_alloc_task (GstNvCompVideoEnc * self, EncoderTask * task, + gboolean batched, size_t uncompressed_size, size_t compressed_size, + size_t batch_size, size_t chunk_size, size_t output_chunk_size, + size_t temp_size) +{ + size_t alloc_size = sizeof (size_t) * batch_size; + uint8_t *uncomp_data; + uint8_t *comp_data; + + auto ret = CuEventCreate (&task->event, + CU_EVENT_BLOCKING_SYNC | CU_EVENT_DISABLE_TIMING); + if (!gst_cuda_result (ret)) + return FALSE; + + auto aligned_uncompressed_size 
= uncompressed_size; + ret = CuMemAlloc ((CUdeviceptr *) & task->device_uncompressed, + aligned_uncompressed_size); + if (!gst_cuda_result (ret)) + return FALSE; + + ret = CuMemAllocHost ((void **) &task->host_uncompressed, + aligned_uncompressed_size); + if (!gst_cuda_result (ret)) + return FALSE; + + auto aligned_compressed_size = GST_ROUND_UP_8 (compressed_size); + ret = CuMemAlloc ((CUdeviceptr *) & task->device_compressed, + aligned_compressed_size); + if (!gst_cuda_result (ret)) + return FALSE; + + ret = CuMemAllocHost ((void **) &task->host_compressed, + aligned_compressed_size); + if (!gst_cuda_result (ret)) + return FALSE; + + if (!batched) + return TRUE; + + ret = CuMemAllocHost ((void **) &task->host_uncompressed_bytes, alloc_size); + if (!gst_cuda_result (ret)) + return FALSE; + + ret = CuMemAllocHost ((void **) &task->host_uncompressed_ptrs, alloc_size); + if (!gst_cuda_result (ret)) + return FALSE; + + for (size_t i = 0; i < batch_size; i++) { + if (i + 1 < batch_size) + task->host_uncompressed_bytes[i] = chunk_size; + else + task->host_uncompressed_bytes[i] = (uncompressed_size - (chunk_size * i)); + } + + ret = CuMemAlloc ((CUdeviceptr *) & task->device_uncompressed_bytes, + alloc_size); + if (!gst_cuda_result (ret)) + return FALSE; + + ret = CuMemAlloc ((CUdeviceptr *) & task->device_uncompressed_ptrs, + alloc_size); + if (!gst_cuda_result (ret)) + return FALSE; + + ret = CuMemAlloc ((CUdeviceptr *) & task->device_compressed_bytes, + alloc_size); + if (!gst_cuda_result (ret)) + return FALSE; + + ret = CuMemAlloc ((CUdeviceptr *) & task->device_compressed_ptrs, alloc_size); + if (!gst_cuda_result (ret)) + return FALSE; + + ret = CuMemAllocHost ((void **) &task->host_compressed_bytes, alloc_size); + if (!gst_cuda_result (ret)) + return FALSE; + + ret = CuMemAllocHost ((void **) &task->host_compressed_ptrs, alloc_size); + if (!gst_cuda_result (ret)) + return FALSE; + + if (temp_size > 0) { + ret = CuMemAlloc ((CUdeviceptr *) & task->temp_ptr, 
temp_size); + if (!gst_cuda_result (ret)) + return FALSE; + } + + task->temp_size = temp_size; + + uncomp_data = task->device_uncompressed; + comp_data = task->device_compressed; + for (size_t i = 0; i < batch_size; i++) { + task->host_uncompressed_ptrs[i] = uncomp_data; + uncomp_data += chunk_size; + + task->host_compressed_ptrs[i] = comp_data; + comp_data += output_chunk_size; + } + + ret = CuMemcpyHtoD ((CUdeviceptr) task->device_uncompressed_bytes, + task->host_uncompressed_bytes, alloc_size); + if (!gst_cuda_result (ret)) + return FALSE; + + ret = CuMemcpyHtoD ((CUdeviceptr) task->device_uncompressed_ptrs, + task->host_uncompressed_ptrs, alloc_size); + if (!gst_cuda_result (ret)) + return FALSE; + + ret = CuMemcpyHtoD ((CUdeviceptr) task->device_compressed_ptrs, + task->host_compressed_ptrs, alloc_size); + if (!gst_cuda_result (ret)) + return FALSE; + + task->batched = batched; + task->batch_size = batch_size; + task->chunk_size = chunk_size; + task->max_output_chunk_size = output_chunk_size; + task->compressed_alloc_size = aligned_compressed_size; + + return TRUE; +} + +static gboolean +gst_nv_comp_video_enc_set_format (GstVideoEncoder * encoder, + GstVideoCodecState * state) +{ + auto self = GST_NV_COMP_VIDEO_ENC (encoder); + auto priv = self->priv; + + gst_nv_comp_video_enc_drain (self, TRUE); + + std::lock_guard < std::mutex > lk (priv->lock); + + if (!priv->ctx) { + GST_ERROR_OBJECT (self, "CUDA context was not configured"); + return FALSE; + } + + if (priv->pool) { + gst_buffer_pool_set_active (priv->pool, FALSE); + gst_clear_object (&priv->pool); + } + + g_clear_pointer (&priv->state, gst_video_codec_state_unref); + priv->state = gst_video_codec_state_ref (state); + + std::string mime_type = "video/x-nvcomp"; + + if (!gst_cuda_context_push (priv->ctx)) { + GST_ERROR_OBJECT (self, "Couldn't push context"); + return FALSE; + } + + priv->gl_interop = FALSE; +#if HAVE_GST_GL + auto features = gst_caps_get_features (state->caps, 0); + if 
(gst_caps_features_contains (features, GST_CAPS_FEATURE_MEMORY_GL_MEMORY)) + priv->gl_interop = TRUE; +#endif + + priv->manager = nullptr; + priv->config = nullptr; + priv->batched_comp = nullptr; + priv->input_task_queue = { }; + priv->output_task_queue = { }; + auto stream = (cudaStream_t) gst_cuda_stream_get_handle (priv->stream); + guint device_id = 0; + g_object_get (priv->ctx, "cuda-device-id", &device_id, nullptr); + size_t chunk_size = priv->chunk_size; + size_t batch_size = 0; + + switch (priv->method) { + case GST_NV_COMP_LZ4: + { + nvcompBatchedLZ4Opts_t opts = nvcompBatchedLZ4DefaultOpts; + if (priv->data_type != GST_NV_COMP_DATA_TYPE_DEFAULT) + opts.data_type = (nvcompType_t) priv->data_type; + + if (chunk_size == 0) + chunk_size = 65536; + + chunk_size = MAX (32768, chunk_size); + chunk_size = GST_ROUND_UP_8 (chunk_size); + chunk_size = MIN (chunk_size, nvcompLZ4CompressionMaxAllowedChunkSize); + + if (priv->batched) { + batch_size = (state->info.size + chunk_size - 1) / chunk_size; + + priv->batched_comp = + std::make_shared < BatchedComp < nvcompBatchedLZ4Opts_t, + nvcompBatchedLZ4CompressGetTempSize, + nvcompBatchedLZ4CompressGetMaxOutputChunkSize, + nvcompBatchedLZ4CompressAsync >> (opts); + mime_type = "video/x-nvcomp-lz4"; + } else { + priv->manager = std::make_shared < LZ4Manager > (chunk_size, + opts, stream, device_id); + } + GST_DEBUG_OBJECT (self, "Using LZ4"); + break; + } + case GST_NV_COMP_SNAPPY: + { + if (chunk_size == 0) + chunk_size = 65536; + + chunk_size = MAX (32768, chunk_size); + chunk_size = GST_ROUND_UP_8 (chunk_size); + chunk_size = MIN (chunk_size, nvcompSnappyCompressionMaxAllowedChunkSize); + + if (priv->batched) { + batch_size = (state->info.size + chunk_size - 1) / chunk_size; + + priv->batched_comp = + std::make_shared < BatchedComp < nvcompBatchedSnappyOpts_t, + nvcompBatchedSnappyCompressGetTempSize, + nvcompBatchedSnappyCompressGetMaxOutputChunkSize, + nvcompBatchedSnappyCompressAsync >> + 
(nvcompBatchedSnappyDefaultOpts); + mime_type = "video/x-nvcomp-snappy"; + } else { + priv->manager = std::make_shared < SnappyManager > (chunk_size, + nvcompBatchedSnappyDefaultOpts, stream, device_id); + } + GST_DEBUG_OBJECT (self, "Using SNAPPY"); + break; + } + case GST_NV_COMP_GDEFLATE: + { + nvcompBatchedGdeflateOpts_t opts; + opts.algo = (int) priv->deflate_algo; + + if (chunk_size == 0) + chunk_size = 65536; + + chunk_size = MAX (32768, chunk_size); + chunk_size = GST_ROUND_UP_8 (chunk_size); + chunk_size = MIN (chunk_size, + nvcompGdeflateCompressionMaxAllowedChunkSize); + + + if (priv->batched) { + batch_size = (state->info.size + chunk_size - 1) / chunk_size; + + priv->batched_comp = + std::make_shared < BatchedComp < nvcompBatchedGdeflateOpts_t, + nvcompBatchedGdeflateCompressGetTempSize, + nvcompBatchedGdeflateCompressGetMaxOutputChunkSize, + nvcompBatchedGdeflateCompressAsync >> (opts); + mime_type = "video/x-nvcomp-gdeflate"; + } else { + priv->manager = std::make_shared < GdeflateManager > (chunk_size, + opts, stream, device_id); + } + GST_DEBUG_OBJECT (self, "Using GDEFLATE"); + break; + } + case GST_NV_COMP_DEFLATE: + { + if (chunk_size == 0) + chunk_size = 65536; + + chunk_size = MAX (32768, chunk_size); + chunk_size = GST_ROUND_UP_8 (chunk_size); + chunk_size = MIN (chunk_size, + nvcompDeflateCompressionMaxAllowedChunkSize); + + nvcompBatchedDeflateOpts_t opts; + opts.algo = (int) priv->deflate_algo; + if (priv->batched) { + batch_size = (state->info.size + chunk_size - 1) / chunk_size; + + priv->batched_comp = + std::make_shared < BatchedComp < nvcompBatchedDeflateOpts_t, + nvcompBatchedDeflateCompressGetTempSize, + nvcompBatchedDeflateCompressGetMaxOutputChunkSize, + nvcompBatchedDeflateCompressAsync >> (opts); + mime_type = "video/x-nvcomp-deflate"; + } else { + priv->manager = std::make_shared < DeflateManager > (chunk_size, + opts, stream, device_id); + } + GST_DEBUG_OBJECT (self, "Using DEFLATE"); + break; + } + case GST_NV_COMP_ZSTD: + { 
+ if (chunk_size == 0) + chunk_size = 65536; + + chunk_size = MAX (32768, chunk_size); + chunk_size = GST_ROUND_UP_8 (chunk_size); + chunk_size = MIN (chunk_size, nvcompZstdCompressionMaxAllowedChunkSize); + + if (priv->batched) { + batch_size = (state->info.size + chunk_size - 1) / chunk_size; + + priv->batched_comp = + std::make_shared < BatchedComp < nvcompBatchedZstdOpts_t, + nvcompBatchedZstdCompressGetTempSize, + nvcompBatchedZstdCompressGetMaxOutputChunkSize, + nvcompBatchedZstdCompressAsync >> (nvcompBatchedZstdDefaultOpts); + mime_type = "video/x-nvcomp-zstd"; + } else { + priv->manager = std::make_shared < ZstdManager > (chunk_size, + nvcompBatchedZstdDefaultOpts, stream, device_id); + } + GST_DEBUG_OBJECT (self, "Using ZSTD"); + break; + } + case GST_NV_COMP_CASCADED: + { + if (chunk_size == 0) + chunk_size = 4096; + + chunk_size = MAX (512, chunk_size); + chunk_size = GST_ROUND_UP_8 (chunk_size); + chunk_size = MIN (chunk_size, 16384); + + nvcompBatchedCascadedOpts_t opts = nvcompBatchedCascadedDefaultOpts; + opts.chunk_size = chunk_size; + if (priv->data_type != GST_NV_COMP_DATA_TYPE_DEFAULT) + opts.type = (nvcompType_t) priv->data_type; + + if (priv->batched) { + batch_size = (state->info.size + chunk_size - 1) / chunk_size; + + priv->batched_comp = + std::make_shared < BatchedComp < nvcompBatchedCascadedOpts_t, + nvcompBatchedCascadedCompressGetTempSize, + nvcompBatchedCascadedCompressGetMaxOutputChunkSize, + nvcompBatchedCascadedCompressAsync >> (opts); + mime_type = "video/x-nvcomp-cascaded"; + } else { + priv->manager = std::make_shared < CascadedManager > (chunk_size, + opts, stream, device_id); + } + GST_DEBUG_OBJECT (self, "Using CASCADED"); + break; + } + case GST_NV_COMP_BITCOMP: + { + if (chunk_size == 0) + chunk_size = 65536; + + chunk_size = MAX (32768, chunk_size); + chunk_size = GST_ROUND_UP_8 (chunk_size); + chunk_size = MIN (chunk_size, + nvcompBitcompCompressionMaxAllowedChunkSize); + + nvcompBatchedBitcompFormatOpts opts = 
nvcompBatchedBitcompDefaultOpts; + opts.algorithm_type = (int) priv->bitcomp_algo; + if (priv->data_type != GST_NV_COMP_DATA_TYPE_DEFAULT) + opts.data_type = (nvcompType_t) priv->data_type; + + if (priv->batched) { + batch_size = (state->info.size + chunk_size - 1) / chunk_size; + + priv->batched_comp = + std::make_shared < BatchedComp < nvcompBatchedBitcompFormatOpts, + nvcompBatchedBitcompCompressGetTempSize, + nvcompBatchedBitcompCompressGetMaxOutputChunkSize, + nvcompBatchedBitcompCompressAsync >> (opts); + mime_type = "video/x-nvcomp-bitcomp"; + } else { + priv->manager = std::make_shared < BitcompManager > (chunk_size, + opts, stream, device_id); + } + GST_DEBUG_OBJECT (self, "Using BITCOMP"); + break; + } + case GST_NV_COMP_ANS: + { + if (chunk_size == 0) + chunk_size = 65536; + + chunk_size = MAX (32768, chunk_size); + chunk_size = GST_ROUND_UP_8 (chunk_size); + chunk_size = MIN (chunk_size, nvcompANSCompressionMaxAllowedChunkSize); + + if (priv->batched) { + batch_size = (state->info.size + chunk_size - 1) / chunk_size; + priv->batched_comp = + std::make_shared < BatchedComp < nvcompBatchedANSOpts_t, + nvcompBatchedANSCompressGetTempSize, + nvcompBatchedANSCompressGetMaxOutputChunkSize, + nvcompBatchedANSCompressAsync >> (nvcompBatchedANSDefaultOpts); + mime_type = "video/x-nvcomp-ans"; + } else { + priv->manager = std::make_shared < ANSManager > (chunk_size, + nvcompBatchedANSDefaultOpts, stream, device_id); + } + + GST_DEBUG_OBJECT (self, "Using ANS"); + break; + } + default: + g_assert_not_reached (); + return FALSE; + } + + size_t max_output_size = 0; + size_t max_output_chunk_size = 0; + size_t temp_size = 0; + if (priv->batched) { + auto status = priv->batched_comp->get_temp_size (batch_size, + chunk_size, &temp_size); + if (status != nvcompSuccess) { + GST_ERROR_OBJECT (self, "Couldn't get temp size"); + gst_cuda_context_pop (nullptr); + return FALSE; + } + + status = priv->batched_comp->get_max_compressed_chunk_size (chunk_size, + 
&max_output_chunk_size); + if (status != nvcompSuccess) { + GST_ERROR_OBJECT (self, "Couldn't get max output chunk size"); + gst_cuda_context_pop (nullptr); + return FALSE; + } + + max_output_chunk_size = GST_ROUND_UP_8 (max_output_chunk_size); + max_output_size = max_output_chunk_size * batch_size; + } else { + priv->config = std::make_shared < CompressionConfig > + (priv->manager->configure_compression (state->info.size)); + max_output_size = priv->config->max_compressed_buffer_size; + } + + GST_DEBUG_OBJECT (self, "Allocating resource, batched: %d" + ", uncompressed size: %" G_GSIZE_FORMAT + ", max-output-size: %" G_GSIZE_FORMAT + ", batch-size: %" G_GSIZE_FORMAT + ", chunk-size: %" G_GSIZE_FORMAT + ", max-output-chunk-size: %" G_GSIZE_FORMAT + ", temp-size: %" G_GSIZE_FORMAT, priv->batched, state->info.size, + max_output_size, batch_size, chunk_size, max_output_chunk_size, + temp_size); + + for (guint i = 0; i < priv->async_depth; i++) { + auto task = std::make_shared < EncoderTask > (); + task->ctx = (GstCudaContext *) gst_object_ref (priv->ctx); + + if (!gst_nv_comp_video_enc_alloc_task (self, task.get (), priv->batched, + state->info.size, max_output_size, batch_size, chunk_size, + max_output_chunk_size, temp_size)) { + priv->manager = nullptr; + priv->input_task_queue = { }; + task = nullptr; + gst_cuda_context_pop (nullptr); + return FALSE; + } + + priv->input_task_queue.push (task); + } + + /* In case of batched, custom header is added to signal chunk and batch size */ + if (priv->batched) { + /* version */ + max_output_size += sizeof (guint32); + + /* max uncompressed chunk size */ + max_output_size += sizeof (guint32); + + /* max compressed chunk size */ + max_output_size += sizeof (guint32); + + /* batch size */ + max_output_size += sizeof (guint32); + + /* each uncompressed/compressed chunk size */ + max_output_size += (sizeof (guint32) * batch_size * 2); + } + + priv->pool = gst_buffer_pool_new (); + auto config = gst_buffer_pool_get_config 
(priv->pool); + gst_buffer_pool_config_set_params (config, nullptr, max_output_size, 0, 0); + gst_buffer_pool_set_config (priv->pool, config); + gst_buffer_pool_set_active (priv->pool, TRUE); + + gst_cuda_context_pop (nullptr); + + auto caps = gst_caps_new_simple (mime_type.c_str (), "format", G_TYPE_STRING, + gst_video_format_to_string (GST_VIDEO_INFO_FORMAT (&state->info)), + nullptr); + auto out_state = + gst_video_encoder_set_output_state (GST_VIDEO_ENCODER (encoder), + caps, state); + gst_video_codec_state_unref (out_state); + + return TRUE; +} + +static gboolean +gst_nv_comp_video_enc_upload (GstNvCompVideoEnc * self, GstVideoFrame * frame, + CUstream stream, gboolean is_device_copy) +{ + auto priv = self->priv; + auto info = &priv->state->info; + auto finfo = info->finfo; + gint comp[GST_VIDEO_MAX_COMPONENTS]; + CUresult ret = CUDA_SUCCESS; + auto cur_task = priv->cur_task; + + for (guint i = 0; i < GST_VIDEO_FRAME_N_PLANES (frame); i++) { + guint8 *sp = (guint8 *) GST_VIDEO_FRAME_PLANE_DATA (frame, i); + guint8 *dp; + if (is_device_copy) + dp = cur_task->device_uncompressed + info->offset[i]; + else + dp = cur_task->host_uncompressed + info->offset[i]; + + guint ss, ds; + guint w, h; + + if (GST_VIDEO_FORMAT_INFO_HAS_PALETTE (finfo) && i == 1) { + if (is_device_copy) { + ret = CuMemcpyDtoDAsync ((CUdeviceptr) dp, (CUdeviceptr) sp, + 256 * 4, stream); + } else { + memcpy (dp, sp, 256 * 4); + } + + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "CUDA memcpy failed"); + return FALSE; + } + + return TRUE; + } + + ss = GST_VIDEO_FRAME_PLANE_STRIDE (frame, i); + ds = GST_VIDEO_INFO_PLANE_STRIDE (info, i); + + gst_video_format_info_component (finfo, i, comp); + + w = GST_VIDEO_INFO_COMP_WIDTH (info, comp[0]) * + GST_VIDEO_INFO_COMP_PSTRIDE (info, comp[0]); + if (w == 0) + w = MIN (ss, ds); + + h = GST_VIDEO_INFO_COMP_HEIGHT (info, comp[0]); + + if (GST_VIDEO_FORMAT_INFO_IS_TILED (finfo)) { + gint tile_size; + gint sx_tiles, sy_tiles, dx_tiles, dy_tiles; + 
GstVideoTileMode mode; + + tile_size = GST_VIDEO_FORMAT_INFO_TILE_SIZE (info->finfo, i); + + mode = GST_VIDEO_FORMAT_INFO_TILE_MODE (info->finfo); + + sx_tiles = GST_VIDEO_TILE_X_TILES (ss); + sy_tiles = GST_VIDEO_TILE_Y_TILES (ss); + + dx_tiles = GST_VIDEO_TILE_X_TILES (ds); + dy_tiles = GST_VIDEO_TILE_Y_TILES (ds); + + w = MIN (sx_tiles, dx_tiles); + h = MIN (sy_tiles, dy_tiles); + + for (guint j = 0; j < h; j++) { + for (guint k = 0; k < w; k++) { + guint si, di; + guint8 *cur_dp; + guint8 *cur_sp; + + si = gst_video_tile_get_index (mode, k, j, sx_tiles, sy_tiles); + di = gst_video_tile_get_index (mode, k, j, dx_tiles, dy_tiles); + + cur_dp = dp + (di * tile_size); + cur_sp = sp + (si * tile_size); + + if (is_device_copy) { + ret = CuMemcpyDtoDAsync ((CUdeviceptr) cur_dp, (CUdeviceptr) cur_sp, + w, stream); + } else { + memcpy (cur_dp, cur_sp, w); + } + + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "CUDA memcpy failed"); + return FALSE; + } + } + } + } else { + if (is_device_copy) { + CUDA_MEMCPY2D params = { }; + params.srcMemoryType = CU_MEMORYTYPE_DEVICE; + params.srcDevice = (CUdeviceptr) sp; + params.srcPitch = ss; + + params.dstMemoryType = CU_MEMORYTYPE_DEVICE; + params.dstDevice = (CUdeviceptr) dp; + params.dstPitch = ds; + + params.WidthInBytes = w; + params.Height = h; + + ret = CuMemcpy2DAsync (¶ms, stream); + + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "CUDA memcpy failed"); + return FALSE; + } + } else { + for (guint j = 0; j < h; j++) { + memcpy (dp, sp, w); + dp += ds; + sp += ss; + } + } + } + } + + return TRUE; +} + +#ifdef HAVE_GST_GL +struct GLInteropData +{ + GstNvCompVideoEnc *self = nullptr; + GstBuffer *buffer = nullptr; + gboolean ret = FALSE; +}; + +static GstCudaGraphicsResource * +ensure_gl_cuda_resource (GstNvCompVideoEnc * self, GstMemory * mem) +{ + auto priv = self->priv; + GstCudaGraphicsResource *resource; + GQuark quark; + + if (!gst_is_gl_memory_pbo (mem)) { + GST_WARNING_OBJECT (self, "memory is not 
GL PBO memory, %s", + mem->allocator->mem_type); + return nullptr; + } + + quark = gst_cuda_quark_from_id (GST_CUDA_QUARK_GRAPHICS_RESOURCE); + resource = (GstCudaGraphicsResource *) + gst_mini_object_get_qdata (GST_MINI_OBJECT (mem), quark); + + if (!resource) { + GstMapInfo map_info; + GstGLMemoryPBO *pbo = (GstGLMemoryPBO *) mem; + GstGLBuffer *gl_buf = pbo->pbo; + gboolean ret; + + if (!gst_memory_map (mem, &map_info, + (GstMapFlags) (GST_MAP_READ | GST_MAP_GL))) { + GST_ERROR_OBJECT (self, "Couldn't map gl memory"); + return nullptr; + } + + resource = gst_cuda_graphics_resource_new (priv->ctx, + GST_OBJECT (GST_GL_BASE_MEMORY_CAST (mem)->context), + GST_CUDA_GRAPHICS_RESOURCE_GL_BUFFER); + + GST_LOG_OBJECT (self, "registering gl buffer %d to CUDA", gl_buf->id); + ret = gst_cuda_graphics_resource_register_gl_buffer (resource, gl_buf->id, + CU_GRAPHICS_REGISTER_FLAGS_NONE); + gst_memory_unmap (mem, &map_info); + + if (!ret) { + GST_ERROR_OBJECT (self, "Couldn't register gl buffer %d", gl_buf->id); + gst_cuda_graphics_resource_free (resource); + return nullptr; + } + + gst_mini_object_set_qdata (GST_MINI_OBJECT (mem), quark, resource, + (GDestroyNotify) gst_cuda_graphics_resource_free); + } + + return resource; +} + +static void +gst_nv_comp_video_enc_upload_gl (GstGLContext * context, GLInteropData * data) +{ + auto self = data->self; + auto priv = self->priv; + auto info = &priv->state->info; + auto finfo = info->finfo; + GstCudaGraphicsResource *gst_res[GST_VIDEO_MAX_PLANES] = { nullptr, }; + CUgraphicsResource cuda_res[GST_VIDEO_MAX_PLANES] = { nullptr, }; + CUdeviceptr src_devptr[GST_VIDEO_MAX_PLANES] = { 0, }; + CUstream stream = gst_cuda_stream_get_handle (priv->stream); + CUresult ret; + gint comp[GST_VIDEO_MAX_COMPONENTS]; + auto cur_task = priv->cur_task; + + if (!gst_cuda_context_push (priv->ctx)) { + GST_ERROR_OBJECT (self, "Couldn't push context"); + return; + } + + for (guint i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) { + GstMemory *mem = 
gst_buffer_peek_memory (data->buffer, i); + GstGLMemoryPBO *pbo = (GstGLMemoryPBO *) mem; + gsize src_size; + + if (!gst_is_gl_memory_pbo (mem)) { + GST_ERROR_OBJECT (self, "Not a GL PBO memory"); + goto out; + } + + gst_res[i] = ensure_gl_cuda_resource (self, mem); + if (!gst_res[i]) { + GST_ERROR_OBJECT (self, "Couldn't get resource %d", i); + goto out; + } + + gst_gl_memory_pbo_upload_transfer (pbo); + gst_gl_memory_pbo_download_transfer (pbo); + + cuda_res[i] = gst_cuda_graphics_resource_map (gst_res[i], stream, + CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY); + if (!cuda_res[i]) { + GST_ERROR_OBJECT (self, "Couldn't map resource"); + goto out; + } + + ret = CuGraphicsResourceGetMappedPointer (&src_devptr[i], + &src_size, cuda_res[i]); + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "Couldn't get mapped device pointer"); + goto out; + } + } + + + for (guint i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) { + guint8 *sp = (guint8 *) src_devptr[i]; + guint8 *dp = cur_task->device_uncompressed + info->offset[i]; + guint ss, ds; + guint w, h; + + if (GST_VIDEO_FORMAT_INFO_HAS_PALETTE (finfo) && i == 1) { + ret = CuMemcpyDtoDAsync ((CUdeviceptr) dp, (CUdeviceptr) sp, + 256 * 4, stream); + + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "CUDA memcpy failed"); + goto out; + } + + data->ret = TRUE; + goto out; + } + + auto meta = gst_buffer_get_video_meta (data->buffer); + if (meta) + ss = meta->stride[i]; + else + ss = GST_VIDEO_INFO_PLANE_STRIDE (info, i); + + ds = GST_VIDEO_INFO_PLANE_STRIDE (info, i); + + gst_video_format_info_component (finfo, i, comp); + + w = GST_VIDEO_INFO_COMP_WIDTH (info, comp[0]) * + GST_VIDEO_INFO_COMP_PSTRIDE (info, comp[0]); + if (w == 0) + w = MIN (ss, ds); + + h = GST_VIDEO_INFO_COMP_HEIGHT (info, comp[0]); + + if (GST_VIDEO_FORMAT_INFO_IS_TILED (finfo)) { + gint tile_size; + gint sx_tiles, sy_tiles, dx_tiles, dy_tiles; + GstVideoTileMode mode; + + tile_size = GST_VIDEO_FORMAT_INFO_TILE_SIZE (info->finfo, i); + + mode = 
GST_VIDEO_FORMAT_INFO_TILE_MODE (info->finfo); + + sx_tiles = GST_VIDEO_TILE_X_TILES (ss); + sy_tiles = GST_VIDEO_TILE_Y_TILES (ss); + + dx_tiles = GST_VIDEO_TILE_X_TILES (ds); + dy_tiles = GST_VIDEO_TILE_Y_TILES (ds); + + w = MIN (sx_tiles, dx_tiles); + h = MIN (sy_tiles, dy_tiles); + + for (guint j = 0; j < h; j++) { + for (guint k = 0; k < w; k++) { + guint si, di; + guint8 *cur_dp; + guint8 *cur_sp; + + si = gst_video_tile_get_index (mode, k, j, sx_tiles, sy_tiles); + di = gst_video_tile_get_index (mode, k, j, dx_tiles, dy_tiles); + + cur_dp = dp + (di * tile_size); + cur_sp = sp + (si * tile_size); + + ret = CuMemcpyDtoDAsync ((CUdeviceptr) cur_dp, (CUdeviceptr) cur_sp, + w, stream); + + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "CUDA memcpy failed"); + goto out; + } + } + } + } else { + CUDA_MEMCPY2D params = { }; + params.srcMemoryType = CU_MEMORYTYPE_DEVICE; + params.srcDevice = (CUdeviceptr) sp; + params.srcPitch = ss; + + params.dstMemoryType = CU_MEMORYTYPE_DEVICE; + params.dstDevice = (CUdeviceptr) dp; + params.dstPitch = ds; + + params.WidthInBytes = w; + params.Height = h; + + ret = CuMemcpy2DAsync (&params, stream); + + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "CUDA memcpy failed"); + goto out; + } + } + } + + data->ret = TRUE; + +out: + for (guint i = 0; i < gst_buffer_n_memory (data->buffer); i++) { + if (!gst_res[i]) + break; + + gst_cuda_graphics_resource_unmap (gst_res[i], stream); + } + + CuStreamSynchronize (stream); + gst_cuda_context_pop (nullptr); +} +#endif + +static gpointer +gst_nv_comp_video_enc_thread_func (GstNvCompVideoEnc * self) +{ + auto encoder = GST_VIDEO_ENCODER (self); + auto priv = self->priv; + + GST_DEBUG_OBJECT (self, "Entering loop"); + + while (1) { + std::shared_ptr < EncoderTask > task; + + { + std::unique_lock < std::mutex > lk (priv->output_lock); + while (priv->output_task_queue.empty ()) + priv->output_cond.wait (lk); + + task = priv->output_task_queue.front (); + 
priv->output_task_queue.pop (); + } + + if (!task) { + GST_DEBUG_OBJECT (self, "Got empty task, terminate"); + break; + } + + auto frame = gst_video_encoder_get_oldest_frame (encoder); + + gst_cuda_context_push (priv->ctx); + CuEventSynchronize (task->event); + gst_cuda_context_pop (nullptr); + + gst_buffer_pool_acquire_buffer (priv->pool, &frame->output_buffer, nullptr); + GstMapInfo map_info; + gst_buffer_map (frame->output_buffer, &map_info, GST_MAP_WRITE); + if (task->batched) { + task->compressed_size = 0; + auto dst = (uint8_t *) map_info.data; + + /* Write custom header */ + GST_WRITE_UINT32_LE (dst, GST_NV_COMP_HEADER_VERSION); + dst += sizeof (guint32); + task->compressed_size += sizeof (guint32); + + GST_WRITE_UINT32_LE (dst, task->chunk_size); + dst += sizeof (guint32); + task->compressed_size += sizeof (guint32); + + GST_WRITE_UINT32_LE (dst, task->max_output_chunk_size); + dst += sizeof (guint32); + task->compressed_size += sizeof (guint32); + + GST_WRITE_UINT32_LE (dst, task->batch_size); + dst += sizeof (guint32); + task->compressed_size += sizeof (guint32); + + for (size_t i = 0; i < task->batch_size; i++) { + GST_WRITE_UINT32_LE (dst, task->host_uncompressed_bytes[i]); + dst += sizeof (guint32); + task->compressed_size += sizeof (guint32); + + GST_WRITE_UINT32_LE (dst, task->host_compressed_bytes[i]); + dst += sizeof (guint32); + task->compressed_size += sizeof (guint32); + } + + /* Write compressed data */ + for (size_t i = 0; i < task->batch_size; i++) { + auto size = task->host_compressed_bytes[i]; + auto src = task->host_compressed + (i * task->max_output_chunk_size); + memcpy (dst, src, size); + dst += size; + task->compressed_size += size; + } + } else { + memcpy (map_info.data, task->host_compressed, task->compressed_size); + } + gst_buffer_unmap (frame->output_buffer, &map_info); + + if (task->compressed_size > 0) { + gst_buffer_set_size (frame->output_buffer, task->compressed_size); + frame->dts = frame->pts; + + auto ratio = (double) 
priv->state->info.size / task->compressed_size; + GST_LOG_OBJECT (self, "compressed buffer size %" G_GSIZE_FORMAT + ", ratio %.2f", task->compressed_size, ratio); + } else { + GST_ERROR_OBJECT (self, "Zero compressed size"); + gst_clear_buffer (&frame->output_buffer); + } + + { + std::lock_guard < std::mutex > lk (priv->input_lock); + priv->input_task_queue.push (task); + priv->input_cond.notify_all (); + } + + priv->last_flow = gst_video_encoder_finish_frame (encoder, frame); + }; + + GST_DEBUG_OBJECT (self, "Leaving loop"); + + return nullptr; +} + +static GstFlowReturn +gst_nv_comp_video_enc_handle_frame (GstVideoEncoder * encoder, + GstVideoCodecFrame * frame) +{ + auto self = GST_NV_COMP_VIDEO_ENC (encoder); + auto priv = self->priv; + GstMemory *mem; + CUstream stream = nullptr; + GstVideoFrame vframe; + auto info = &priv->state->info; + size_t compressed_size = 0; + gboolean need_copy = TRUE; + std::shared_ptr < EncoderTask > task; + + if (!priv->ctx || (!priv->manager && !priv->batched_comp)) { + GST_ERROR_OBJECT (self, "Context was not configured"); + goto error; + } + + if (priv->last_flow != GST_FLOW_OK) { + GST_INFO_OBJECT (self, "Last flow was %s", + gst_flow_get_name (priv->last_flow)); + gst_video_encoder_finish_frame (encoder, frame); + return priv->last_flow; + } + + if (!priv->encode_thread) { + priv->encode_thread = g_thread_new ("nvcompvideoenc", + (GThreadFunc) gst_nv_comp_video_enc_thread_func, self); + } + + GST_VIDEO_ENCODER_STREAM_UNLOCK (encoder); + { + std::unique_lock < std::mutex > lk (priv->input_lock); + while (priv->input_task_queue.empty ()) + priv->input_cond.wait (lk); + + priv->cur_task = priv->input_task_queue.front (); + priv->input_task_queue.pop (); + } + GST_VIDEO_ENCODER_STREAM_LOCK (encoder); + + mem = gst_buffer_peek_memory (frame->input_buffer, 0); +#ifdef HAVE_GST_GL + if (priv->gl_interop && gst_is_gl_memory (mem) && + gst_buffer_n_memory (frame->input_buffer) == + GST_VIDEO_INFO_N_PLANES (info)) { + GLInteropData 
interop_data; + interop_data.self = self; + interop_data.buffer = frame->input_buffer; + interop_data.ret = FALSE; + + auto gl_mem = (GstGLMemory *) mem; + gst_gl_context_thread_add (gl_mem->mem.context, + (GstGLContextThreadFunc) gst_nv_comp_video_enc_upload_gl, + &interop_data); + if (interop_data.ret) { + need_copy = FALSE; + GST_TRACE_OBJECT (self, "GL -> CUDA copy done"); + } else { + priv->gl_interop = FALSE; + } + } +#endif + + if (!gst_cuda_context_push (priv->ctx)) { + GST_ERROR_OBJECT (self, "Couldn't push context"); + std::lock_guard < std::mutex > lk (priv->input_lock); + priv->input_task_queue.push (std::move (priv->cur_task)); + goto error; + } + + stream = gst_cuda_stream_get_handle (priv->stream); + + if (need_copy) { + gboolean device_copy = FALSE; + if (gst_is_cuda_memory (mem)) { + GstCudaMemory *cmem = GST_CUDA_MEMORY_CAST (mem); + if (cmem->context == priv->ctx) { + device_copy = TRUE; + if (!gst_video_frame_map (&vframe, info, frame->input_buffer, + (GstMapFlags) (GST_MAP_READ | GST_MAP_CUDA))) { + GST_ERROR_OBJECT (self, "Couldn't map cuda memory"); + gst_cuda_context_pop (nullptr); + std::lock_guard < std::mutex > lk (priv->input_lock); + priv->input_task_queue.push (std::move (priv->cur_task)); + goto error; + } + + if (gst_cuda_memory_get_stream (cmem) != priv->stream) { + GST_DEBUG_OBJECT (self, "Different stream, need sync"); + gst_cuda_memory_sync (cmem); + } + } + } + + if (!device_copy && !gst_video_frame_map (&vframe, + info, frame->input_buffer, GST_MAP_READ)) { + GST_ERROR_OBJECT (self, "Couldn't map input frame"); + gst_cuda_context_pop (nullptr); + std::lock_guard < std::mutex > lk (priv->input_lock); + priv->input_task_queue.push (std::move (priv->cur_task)); + goto error; + } + + if (!gst_nv_comp_video_enc_upload (self, &vframe, stream, device_copy)) { + gst_video_frame_unmap (&vframe); + gst_cuda_context_pop (nullptr); + std::lock_guard < std::mutex > lk (priv->input_lock); + priv->input_task_queue.push (std::move 
(priv->cur_task)); + goto error; + } + + gst_video_frame_unmap (&vframe); + + if (!device_copy) { + CuMemcpyHtoDAsync ((CUdeviceptr) priv->cur_task->device_uncompressed, + priv->cur_task->host_uncompressed, info->size, stream); + } + } + + task = std::move (priv->cur_task); + if (task->batched) { + g_assert (priv->batched_comp); + + auto status = priv->batched_comp->compress (task->device_uncompressed_ptrs, + task->device_uncompressed_bytes, task->chunk_size, task->batch_size, + task->temp_ptr, task->temp_size, task->device_compressed_ptrs, + task->device_compressed_bytes, (cudaStream_t) stream); + if (status != nvcompSuccess) { + GST_ERROR_OBJECT (self, "Compression failed, ret %d", status); + gst_cuda_context_pop (nullptr); + std::lock_guard < std::mutex > lk (priv->input_lock); + priv->input_task_queue.push (std::move (task)); + goto error; + } + + CuMemcpyDtoHAsync (task->host_compressed_bytes, + (CUdeviceptr) task->device_compressed_bytes, + sizeof (size_t) * task->batch_size, stream); + CuMemcpyDtoHAsync (task->host_compressed, + (CUdeviceptr) task->device_compressed, + task->compressed_alloc_size, stream); + } else { + g_assert (priv->manager); + + priv->manager->compress (task->device_uncompressed, + task->device_compressed, *priv->config); + + compressed_size = + priv->manager->get_compressed_output_size (task->device_compressed); + + task->compressed_size = compressed_size; + CuMemcpyDtoHAsync (task->host_compressed, + (CUdeviceptr) task->device_compressed, compressed_size, stream); + } + + CuEventRecord (task->event, stream); + gst_cuda_context_pop (nullptr); + + { + std::lock_guard < std::mutex > lk (priv->output_lock); + priv->output_task_queue.push (std::move (task)); + priv->output_cond.notify_one (); + } + + gst_video_codec_frame_unref (frame); + + return priv->last_flow; + +error: + gst_video_encoder_finish_frame (encoder, frame); + + return GST_FLOW_ERROR; +} diff --git a/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcompvideoenc.h 
b/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcompvideoenc.h new file mode 100644 index 0000000000..d7ef55743e --- /dev/null +++ b/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcompvideoenc.h @@ -0,0 +1,32 @@ +/* GStreamer + * Copyright (C) 2024 Seungha Yang + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. 
+ */ + +#pragma once + +#include <gst/gst.h> +#include <gst/video/video.h> +#include "gstnvcomp.h" + +G_BEGIN_DECLS + +#define GST_TYPE_NV_COMP_VIDEO_ENC (gst_nv_comp_video_enc_get_type()) +G_DECLARE_FINAL_TYPE (GstNvCompVideoEnc, gst_nv_comp_video_enc, + GST, NV_COMP_VIDEO_ENC, GstVideoEncoder) + +G_END_DECLS \ No newline at end of file diff --git a/subprojects/gst-plugins-bad/ext/nvcomp/meson.build b/subprojects/gst-plugins-bad/ext/nvcomp/meson.build new file mode 100644 index 0000000000..ffb40867b2 --- /dev/null +++ b/subprojects/gst-plugins-bad/ext/nvcomp/meson.build @@ -0,0 +1,70 @@ +nvcomp_sources = [ + 'gstnvcomp.cpp', + 'gstnvcompvideodec.cpp', + 'gstnvcompvideoenc.cpp', + 'plugin.cpp', +] +extra_args = ['-DGST_USE_UNSTABLE_API'] + +nvcomp_opt = get_option('nvcomp') +if nvcomp_opt.disabled() or host_system not in ['windows', 'linux'] + subdir_done() +endif + +nvcomp_sdk_path = get_option('nvcomp-sdk-path') +if nvcomp_sdk_path == '' + nvcomp_sdk_path = run_command(python3, '-c', 'import os; print(os.environ.get("NVCOMP_SDK_PATH"))', check: false).stdout().strip() +endif + +if nvcomp_sdk_path == '' or nvcomp_sdk_path == 'None' + if nvcomp_opt.enabled() + error('nvcomp-sdk-path option must be specified for nvCOMP plugin') + endif + subdir_done() +endif + +if not gstcuda_dep.found() + if nvcomp_opt.enabled() + error('nvCOMP plugin was enabled explicitly, but required gstcuda was not found') + endif + subdir_done() +endif + +nvcomp_inc_dirs = [include_directories(join_paths(nvcomp_sdk_path, 'include'), './stub'), + cuda_stubinc] + +nvcomp_lib_path = join_paths(nvcomp_sdk_path, 'lib') +nvcomp_lib = cc.find_library('nvcomp', + dirs: nvcomp_lib_path, required: nvcomp_opt) +if not nvcomp_lib.found() + subdir_done() +endif + +nvcomp_bitcomp_lib = cc.find_library('nvcomp_bitcomp', + dirs: nvcomp_lib_path, required: nvcomp_opt) +if not nvcomp_bitcomp_lib.found() + subdir_done() +endif + +nvcomp_gdeflate_lib = cc.find_library('nvcomp_gdeflate', + dirs: nvcomp_lib_path, required: nvcomp_opt) +if not 
nvcomp_gdeflate_lib.found() + subdir_done() +endif + +if gstgl_dep.found() + extra_args += ['-DHAVE_GST_GL'] +endif + +gstnvcomp = library('gstnvcomp', + nvcomp_sources, + c_args : gst_plugins_bad_args + extra_args, + cpp_args : gst_plugins_bad_args + extra_args, + include_directories : [configinc] + nvcomp_inc_dirs, + dependencies : [gstbase_dep, gstvideo_dep, gstcuda_dep, gstgl_dep, nvcomp_lib, + nvcomp_bitcomp_lib, nvcomp_gdeflate_lib], + override_options : ['cpp_std=c++17'], + install : true, + install_dir : plugins_install_dir, +) +plugins += [gstnvcomp] diff --git a/subprojects/gst-plugins-bad/ext/nvcomp/plugin.cpp b/subprojects/gst-plugins-bad/ext/nvcomp/plugin.cpp new file mode 100644 index 0000000000..375a587fa3 --- /dev/null +++ b/subprojects/gst-plugins-bad/ext/nvcomp/plugin.cpp @@ -0,0 +1,47 @@ +/* GStreamer + * Copyright (C) 2024 Seungha Yang + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <gst/cuda/gstcuda.h> +#include "gstnvcomp.h" +#include "gstnvcompvideodec.h" +#include "gstnvcompvideoenc.h" + +static gboolean +plugin_init (GstPlugin * plugin) +{ + if (!gst_cuda_load_library ()) { + gst_plugin_add_status_warning (plugin, "CUDA library was not found."); + return TRUE; + } + + gst_element_register (plugin, + "nvcompvideodec", GST_RANK_NONE, GST_TYPE_NV_COMP_VIDEO_DEC); + gst_element_register (plugin, + "nvcompvideoenc", GST_RANK_NONE, GST_TYPE_NV_COMP_VIDEO_ENC); + + return TRUE; +} + +GST_PLUGIN_DEFINE (GST_VERSION_MAJOR, GST_VERSION_MINOR, nvcomp, + "GStreamer nvCOMP plugin", plugin_init, VERSION, "LGPL", + GST_PACKAGE_NAME, GST_PACKAGE_ORIGIN) diff --git a/subprojects/gst-plugins-bad/ext/nvcomp/stub/cuda_runtime.h b/subprojects/gst-plugins-bad/ext/nvcomp/stub/cuda_runtime.h new file mode 100644 index 0000000000..f6df2d0bf9 --- /dev/null +++ b/subprojects/gst-plugins-bad/ext/nvcomp/stub/cuda_runtime.h @@ -0,0 +1,9 @@ +#pragma once + +#include <glib.h> + +G_BEGIN_DECLS + +typedef struct CUstream_st* cudaStream_t; + +G_END_DECLS diff --git a/subprojects/gst-plugins-bad/meson_options.txt b/subprojects/gst-plugins-bad/meson_options.txt index 5752b83fee..fb3db29119 100644 --- a/subprojects/gst-plugins-bad/meson_options.txt +++ b/subprojects/gst-plugins-bad/meson_options.txt @@ -143,6 +143,7 @@ option('mplex', type : 'feature', value : 'auto', description : 'mplex audio/vid option('msdk', type : 'feature', value : 'auto', description : 'Intel Media SDK video encoder/decoder plugin') option('musepack', type : 'feature', value : 'auto', description : 'libmpcdec Musepack decoder plugin') option('neon', type : 'feature', value : 'auto', description : 'NEON HTTP source plugin') +option('nvcomp', type : 'feature', value : 'auto', description : 'NVIDIA nvCOMP compression/decompression plugin') option('nvcodec', type : 'feature', value : 'auto', description : 'NVIDIA GPU codec plugin') option('onnx', type : 'feature', 
value : 'auto', description : 'ONNX neural network plugin') option('openal', type : 'feature', value : 'auto', description : 'OpenAL plugin') @@ -241,6 +242,10 @@ option('sctp-internal-usrsctp', type: 'feature', value : 'enabled', option('mfx_api', type : 'combo', choices : ['MSDK', 'oneVPL', 'auto'], value : 'auto', description : 'Select MFX API to build against') +# nvCOMP plugin options +option('nvcomp-sdk-path', type: 'string', value : '', + description : 'nvCOMP SDK root directory') + # QSV plugin options option('mfx-modules-dir', type: 'string', value : '', description : 'libmfx runtime module dir, linux only')