diff --git a/.indent_cpp_list b/.indent_cpp_list
index 8439878265..db2cc2d444 100644
--- a/.indent_cpp_list
+++ b/.indent_cpp_list
@@ -1,3 +1,4 @@
+subprojects/gst-plugins-bad/ext/nvcomp
 subprojects/gst-plugins-bad/ext/qt6d3d11
 subprojects/gst-plugins-bad/gst-libs/gst/cuda
 subprojects/gst-plugins-bad/gst-libs/gst/d3d11
diff --git a/subprojects/gst-plugins-bad/ext/meson.build b/subprojects/gst-plugins-bad/ext/meson.build
index 4f7139835f..f0bffe0c48 100644
--- a/subprojects/gst-plugins-bad/ext/meson.build
+++ b/subprojects/gst-plugins-bad/ext/meson.build
@@ -38,6 +38,7 @@ subdir('mpeg2enc')
 subdir('mplex')
 subdir('musepack')
 subdir('neon')
+subdir('nvcomp')
 subdir('onnx')
 subdir('openal')
 subdir('openaptx')
diff --git a/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcomp.cpp b/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcomp.cpp
new file mode 100644
index 0000000000..f3337fb7da
--- /dev/null
+++ b/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcomp.cpp
@@ -0,0 +1,60 @@
+/* GStreamer
+ * Copyright (C) 2024 Seungha Yang
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "gstnvcomp.h"
+#include <mutex> /* restored: target stripped in transit; std::once_flag/std::call_once below require it */
+
+/* Enum value table for GstNvCompMethod; array index must stay in sync with
+ * the GstNvCompMethod enum (gst_nv_comp_method_to_string indexes by value) */
+static const GEnumValue nvcomp_methods[] = {
+  {GST_NV_COMP_LZ4, "LZ4", "lz4"},
+  {GST_NV_COMP_SNAPPY, "SNAPPY", "snappy"},
+  {GST_NV_COMP_GDEFLATE, "GDEFLATE", "gdeflate"},
+  {GST_NV_COMP_DEFLATE, "DEFLATE", "deflate"},
+  {GST_NV_COMP_ZSTD, "ZSTD", "zstd"},
+  {GST_NV_COMP_CASCADED, "CASCADED", "cascaded"},
+  {GST_NV_COMP_BITCOMP, "BITCOMP", "bitcomp"},
+  {GST_NV_COMP_ANS, "ANS", "ans"},
+  {0, nullptr, nullptr},
+};
+
+/* Registers (once, thread-safely) and returns the GstNvCompMethod GType */
+GType
+gst_nv_comp_method_get_type (void)
+{
+  static GType method_type = 0;
+  static std::once_flag once;
+
+  std::call_once (once,[&] {
+        method_type = g_enum_register_static ("GstNvCompMethod",
+            nvcomp_methods);
+      });
+
+  return method_type;
+}
+
+/* Returns the nick ("lz4", "snappy", ...) for @method, or nullptr when
+ * @method is out of range */
+const gchar *
+gst_nv_comp_method_to_string (GstNvCompMethod method)
+{
+  if (method >= GST_NV_COMP_LAST)
+    return nullptr;
+
+  return nvcomp_methods[method].value_nick;
+}
diff --git a/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcomp.h b/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcomp.h
new file mode 100644
index 0000000000..af6f9e510b
--- /dev/null
+++ b/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcomp.h
@@ -0,0 +1,56 @@
+/* GStreamer
+ * Copyright (C) 2024 Seungha Yang
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#pragma once
+
+#include <gst/gst.h> /* TODO(review): include target lost in formatting; verify against upstream */
+#include <gst/cuda/gstcuda.h> /* TODO(review): include target lost in formatting; verify against upstream */
+
+#ifdef G_OS_WIN32
+#include <windows.h> /* TODO(review): include target lost in formatting; verify against upstream */
+#endif
+
+#include <cuda.h> /* TODO(review): include target lost in formatting; verify against upstream */
+#include <nvcomp.h> /* TODO(review): include target lost in formatting; verify against upstream */
+#include <nvcomp/nvcompManagerFactory.hpp> /* TODO(review): include target lost in formatting; verify against upstream */
+
+G_BEGIN_DECLS
+
+/* Compression backends supported by the nvCOMP elements; values index
+ * the nvcomp_methods GEnumValue table in gstnvcomp.cpp */
+enum GstNvCompMethod
+{
+  GST_NV_COMP_LZ4,
+  GST_NV_COMP_SNAPPY,
+  GST_NV_COMP_GDEFLATE,
+  GST_NV_COMP_DEFLATE,
+  GST_NV_COMP_ZSTD,
+  GST_NV_COMP_CASCADED,
+  GST_NV_COMP_BITCOMP,
+  GST_NV_COMP_ANS,
+  GST_NV_COMP_LAST,
+};
+
+#define GST_TYPE_NV_COMP_METHOD (gst_nv_comp_method_get_type())
+GType gst_nv_comp_method_get_type ();
+
+const gchar * gst_nv_comp_method_to_string (GstNvCompMethod method);
+
+/* Bitstream header: version tag and minimum serialized size (six guint32
+ * fields) expected by the decoder */
+#define GST_NV_COMP_HEADER_VERSION 1
+#define GST_NV_COMP_HEADER_MIN_SIZE (sizeof (guint32) * 6)
+
+G_END_DECLS
\ No newline at end of file
diff --git a/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcompvideodec.cpp b/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcompvideodec.cpp
new file mode 100644
index 0000000000..08a0c647b7
--- /dev/null
+++ b/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcompvideodec.cpp
@@ -0,0 +1,1739 @@
+/* GStreamer
+ * Copyright (C) 2024 Seungha Yang
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Library General Public License for more details.
+ * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "gstnvcompvideodec.h" + +#ifdef HAVE_GST_GL +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +GST_DEBUG_CATEGORY_STATIC (gst_nv_comp_video_dec_debug); +#define GST_CAT_DEFAULT gst_nv_comp_video_dec_debug + +#ifdef HAVE_GST_GL +#define SRC_CAPS \ + GST_VIDEO_CAPS_MAKE_WITH_FEATURES (GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY, \ + GST_VIDEO_FORMATS_ALL) ";" \ + GST_VIDEO_CAPS_MAKE_WITH_FEATURES (GST_CAPS_FEATURE_MEMORY_GL_MEMORY, \ + GST_VIDEO_FORMATS_ALL) ";" \ + GST_VIDEO_CAPS_MAKE (GST_VIDEO_FORMATS_ALL) +#else +#define SRC_CAPS \ + GST_VIDEO_CAPS_MAKE_WITH_FEATURES (GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY, \ + GST_VIDEO_FORMATS_ALL) ";" \ + GST_VIDEO_CAPS_MAKE (GST_VIDEO_FORMATS_ALL) +#endif + +static GstStaticPadTemplate sink_template = + GST_STATIC_PAD_TEMPLATE ("sink", GST_PAD_SINK, GST_PAD_ALWAYS, + GST_STATIC_CAPS ("video/x-nvcomp; video/x-nvcomp-lz4; " + "video/x-nvcomp-snappy; video/x-nvcomp-gdeflate; " + "video/x-nvcomp-deflate; video/x-nvcomp-zstd; video/x-nvcomp-cascaded; " + "video/x-nvcomp-bitcomp; video/x-nvcomp-ans")); + +static GstStaticPadTemplate src_template = +GST_STATIC_PAD_TEMPLATE ("src", GST_PAD_SRC, GST_PAD_ALWAYS, + GST_STATIC_CAPS (SRC_CAPS)); + +/* *INDENT-OFF* */ +using namespace nvcomp; + +struct DecoderTask +{ + ~DecoderTask () + { + if (ctx) { + gst_cuda_context_push (ctx); + clear_resource (); + gst_cuda_context_pop (nullptr); + gst_object_unref (ctx); + } + } + + void clear_resource () + { + if (!ctx) + return; + + if (device_compressed) + CuMemFree ((CUdeviceptr) device_compressed); + device_compressed = nullptr; + + if 
(host_compressed) + CuMemFreeHost (host_compressed); + host_compressed = nullptr; + + if (device_compressed_bytes) + CuMemFree ((CUdeviceptr) device_compressed_bytes); + device_compressed_bytes = nullptr; + + if (device_compressed_ptrs) + CuMemFree ((CUdeviceptr) device_compressed_ptrs); + device_compressed_ptrs = nullptr; + + if (host_compressed_bytes) + CuMemFreeHost (host_compressed_bytes); + host_compressed_bytes = nullptr; + + if (host_compressed_ptrs) + CuMemFreeHost (host_compressed_ptrs); + host_compressed_ptrs = nullptr; + + if (device_uncompressed) + CuMemFree ((CUdeviceptr) device_uncompressed); + device_uncompressed = nullptr; + + if (device_uncompressed_temp) + CuMemFree ((CUdeviceptr) device_uncompressed_temp); + device_uncompressed_temp = nullptr; + + if (host_uncompressed) + CuMemFreeHost (host_uncompressed); + host_uncompressed = nullptr; + + if (device_uncompressed_bytes) + CuMemFree ((CUdeviceptr) device_uncompressed_bytes); + device_uncompressed_bytes = nullptr; + + if (device_uncompressed_ptrs) + CuMemFree ((CUdeviceptr) device_uncompressed_ptrs); + device_uncompressed_ptrs = nullptr; + + if (host_uncompressed_bytes) + CuMemFreeHost (host_uncompressed_bytes); + host_uncompressed_bytes = nullptr; + + if (host_uncompressed_ptrs) + CuMemFreeHost (host_uncompressed_ptrs); + host_uncompressed_ptrs = nullptr; + + if (device_actual_uncompressed_bytes) + CuMemFree ((CUdeviceptr) device_actual_uncompressed_bytes); + device_actual_uncompressed_bytes = nullptr; + + if (temp_ptr) + CuMemFree ((CUdeviceptr) temp_ptr); + temp_ptr = nullptr; + + if (device_statuses) + CuMemFree ((CUdeviceptr) device_statuses); + device_statuses = nullptr; + + batch_size = 0; + max_compressed_chunk_size = 0; + max_uncompressed_chunk_size = 0; + } + + bool allocate_batched (size_t num_chunks, + size_t compressed_chunk_size, + size_t uncompressed_chunk_size, size_t temp_bytes) + { + size_t compressed_alloc; + size_t uncompressed_alloc; + size_t alloc_size = num_chunks * sizeof 
(size_t); + uint8_t *src; + + compressed_chunk_size = GST_ROUND_UP_8 (compressed_chunk_size); + uncompressed_chunk_size = GST_ROUND_UP_8 (uncompressed_chunk_size); + + compressed_alloc = num_chunks * compressed_chunk_size; + uncompressed_alloc = num_chunks * uncompressed_chunk_size; + + auto ret = CuMemAlloc ((CUdeviceptr *) &device_compressed, + compressed_alloc); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemAllocHost ((void **) &host_compressed, compressed_alloc); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemAlloc ((CUdeviceptr *) &device_compressed_bytes, + alloc_size); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemAlloc ((CUdeviceptr *) &device_compressed_ptrs, + alloc_size); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemAllocHost ((void **) &host_compressed_bytes, + alloc_size); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemAllocHost ((void **) &host_compressed_ptrs, + alloc_size); + if (!gst_cuda_result (ret)) + return false; + + src = device_compressed; + for (size_t i = 0; i < num_chunks; i++) { + host_compressed_ptrs[i] = src; + src += compressed_chunk_size; + } + + ret = CuMemcpyHtoD ((CUdeviceptr) device_compressed_ptrs, + host_compressed_ptrs, alloc_size); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemAlloc ((CUdeviceptr *) &device_uncompressed_temp, + uncompressed_alloc); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemAlloc ((CUdeviceptr *) &device_uncompressed, + uncompressed_alloc); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemAllocHost ((void **) &host_uncompressed, uncompressed_alloc); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemAlloc ((CUdeviceptr *) &device_uncompressed_bytes, + alloc_size); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemAlloc ((CUdeviceptr *) &device_uncompressed_ptrs, + alloc_size); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemAllocHost ((void **) 
&host_uncompressed_bytes, + alloc_size); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemAllocHost ((void **) &host_uncompressed_ptrs, + alloc_size); + if (!gst_cuda_result (ret)) + return false; + + src = device_uncompressed_temp; + for (size_t i = 0; i < num_chunks; i++) { + host_uncompressed_bytes[i] = uncompressed_chunk_size; + host_uncompressed_ptrs[i] = src; + src += uncompressed_chunk_size; + } + + ret = CuMemcpyHtoD ((CUdeviceptr) device_uncompressed_bytes, + host_uncompressed_bytes, alloc_size); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemcpyHtoD ((CUdeviceptr) device_uncompressed_ptrs, + host_uncompressed_ptrs, alloc_size); + if (!gst_cuda_result (ret)) + return false; + + ret = CuMemAlloc ((CUdeviceptr *) &device_actual_uncompressed_bytes, + alloc_size); + if (!gst_cuda_result (ret)) + return false; + + if (temp_bytes > 0) { + ret = CuMemAlloc ((CUdeviceptr *) &temp_ptr, temp_bytes); + if (!gst_cuda_result (ret)) + return false; + } + + ret = CuMemAlloc ((CUdeviceptr *) &device_statuses, + sizeof (nvcompStatus_t) * num_chunks); + if (!gst_cuda_result (ret)) + return false; + + batched = TRUE; + batch_size = num_chunks; + temp_size = temp_bytes; + max_compressed_chunk_size = compressed_chunk_size; + max_uncompressed_chunk_size = uncompressed_chunk_size; + compressed_alloc_size = compressed_alloc; + uncompressed_alloc_size = uncompressed_alloc; + + return true; + } + + GstCudaContext *ctx = nullptr; + + uint8_t *device_compressed = nullptr; + uint8_t *host_compressed = nullptr; + + size_t *device_compressed_bytes = nullptr; + void **device_compressed_ptrs = nullptr; + + size_t *host_compressed_bytes = nullptr; + void **host_compressed_ptrs = nullptr; + + uint8_t *device_uncompressed = nullptr; + uint8_t *device_uncompressed_temp = nullptr; + uint8_t *host_uncompressed = nullptr; + + size_t *device_uncompressed_bytes = nullptr; + void **device_uncompressed_ptrs = nullptr; + + size_t *host_uncompressed_bytes = nullptr; + void 
**host_uncompressed_ptrs = nullptr; + + size_t *device_actual_uncompressed_bytes = nullptr; + + void *temp_ptr = nullptr; + size_t temp_size = 0; + + nvcompStatus_t *device_statuses = nullptr; + + gboolean batched = FALSE; + size_t batch_size = 0; + size_t max_uncompressed_chunk_size = 0; + size_t max_compressed_chunk_size = 0; + size_t uncompressed_alloc_size = 0; + size_t compressed_alloc_size = 0; +}; + +struct BatchedDecompBase +{ + virtual nvcompStatus_t get_temp_size( + size_t num_chunks, + size_t max_uncompressed_chunk_bytes, + size_t * temp_bytes) = 0; + + virtual nvcompStatus_t decompress( + void **device_compressed_ptrs, + size_t *device_compressed_bytes, + size_t *device_uncompressed_bytes, + size_t *device_actual_uncompressed_bytes, + size_t batch_size, + void *device_temp_ptr, + size_t temp_bytes, + void **device_uncompressed_ptrs, + nvcompStatus_t *device_statuses, + cudaStream_t stream) = 0; +}; + +template +class BatchedDecomp : public BatchedDecompBase +{ +public: + BatchedDecomp () {} + + nvcompStatus_t get_temp_size( + size_t num_chunks, + size_t max_uncompressed_chunk_bytes, + size_t * temp_bytes) + { + return T (num_chunks, max_uncompressed_chunk_bytes, temp_bytes); + } + + nvcompStatus_t decompress( + void **device_compressed_ptrs, + size_t *device_compressed_bytes, + size_t *device_uncompressed_bytes, + size_t *device_actual_uncompressed_bytes, + size_t batch_size, + void *device_temp_ptr, + size_t temp_bytes, + void **device_uncompressed_ptrs, + nvcompStatus_t *device_statuses, + cudaStream_t stream) + { + return D (device_compressed_ptrs, device_compressed_bytes, + device_uncompressed_bytes, device_actual_uncompressed_bytes, + batch_size, device_temp_ptr, temp_bytes, device_uncompressed_ptrs, + device_statuses, stream); + } +}; + +struct GstNvCompVideoDecPrivate +{ + GstNvCompVideoDecPrivate () + { + gst_video_info_init (&info); + } + + GstCudaContext *ctx = nullptr; + GstCudaStream *stream = nullptr; + +#ifdef HAVE_GST_GL + GstGLDisplay 
*gl_display = nullptr; + GstGLContext *gl_context = nullptr; + GstGLContext *other_gl_context = nullptr; +#endif + + GstVideoCodecState *state = nullptr; + std::shared_ptr manager; + std::shared_ptr batched_decomp; + std::shared_ptr task; + gboolean gl_interop = FALSE; + + GstVideoInfo info; + gboolean batched = FALSE; + GstNvCompMethod method; +}; +/* *INDENT-ON* */ + +struct _GstNvCompVideoDec +{ + GstVideoDecoder parent; + GstNvCompVideoDecPrivate *priv; +}; + +static void gst_nv_comp_video_dec_finalize (GObject * object); + +static void gst_nv_comp_video_dec_set_context (GstElement * element, + GstContext * context); + +static gboolean gst_nv_comp_video_dec_open (GstVideoDecoder * decoder); +static gboolean gst_nv_comp_video_dec_close (GstVideoDecoder * decoder); +static gboolean gst_nv_comp_video_dec_sink_query (GstVideoDecoder * decoder, + GstQuery * query); +static gboolean gst_nv_comp_video_dec_src_query (GstVideoDecoder * decoder, + GstQuery * query); +static gboolean +gst_nv_comp_video_dec_decide_allocation (GstVideoDecoder * decoder, + GstQuery * query); +static gboolean gst_nv_comp_video_dec_set_format (GstVideoDecoder * decoder, + GstVideoCodecState * state); +static gboolean gst_nv_comp_video_dec_negotiate (GstVideoDecoder * decoder); +static GstFlowReturn +gst_nv_comp_video_dec_handle_frame (GstVideoDecoder * decoder, + GstVideoCodecFrame * frame); + +#define gst_nv_comp_video_dec_parent_class parent_class +G_DEFINE_TYPE (GstNvCompVideoDec, + gst_nv_comp_video_dec, GST_TYPE_VIDEO_DECODER); + +static void +gst_nv_comp_video_dec_class_init (GstNvCompVideoDecClass * klass) +{ + auto object_class = G_OBJECT_CLASS (klass); + auto element_class = GST_ELEMENT_CLASS (klass); + auto decoder_class = GST_VIDEO_DECODER_CLASS (klass); + + object_class->finalize = gst_nv_comp_video_dec_finalize; + + gst_element_class_add_static_pad_template (element_class, &sink_template); + gst_element_class_add_static_pad_template (element_class, &src_template); + + 
gst_element_class_set_static_metadata (element_class, + "nvCOMP Video Decoder", "Decoder/Video/Hardware", + "Decompress a video stream using nvCOMP library", + "Seungha Yang "); + + element_class->set_context = + GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_set_context); + + decoder_class->open = GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_open); + decoder_class->close = GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_close); + decoder_class->sink_query = + GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_sink_query); + decoder_class->src_query = + GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_src_query); + decoder_class->decide_allocation = + GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_decide_allocation); + decoder_class->set_format = + GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_set_format); + decoder_class->negotiate = + GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_negotiate); + decoder_class->handle_frame = + GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_handle_frame); + + GST_DEBUG_CATEGORY_INIT (gst_nv_comp_video_dec_debug, + "nvcompvideodec", 0, "nvcompvideodec"); +} + +static void +gst_nv_comp_video_dec_init (GstNvCompVideoDec * self) +{ + self->priv = new GstNvCompVideoDecPrivate (); +} + +static void +gst_nv_comp_video_dec_finalize (GObject * object) +{ + auto self = GST_NV_COMP_VIDEO_DEC (object); + + delete self->priv; + + G_OBJECT_CLASS (parent_class)->finalize (object); +} + +static void +gst_nv_comp_video_dec_set_context (GstElement * element, GstContext * context) +{ + auto self = GST_NV_COMP_VIDEO_DEC (element); + auto priv = self->priv; + + gst_cuda_handle_set_context (element, context, -1, &priv->ctx); +#ifdef HAVE_GST_GL + if (gst_gl_handle_set_context (element, context, &priv->gl_display, + &priv->other_gl_context)) { + if (priv->gl_display) + gst_gl_display_filter_gl_api (priv->gl_display, GST_GL_API_OPENGL3); + } +#endif + + GST_ELEMENT_CLASS (parent_class)->set_context (element, context); +} + +static gboolean +gst_nv_comp_video_dec_open (GstVideoDecoder * decoder) +{ + auto self = 
GST_NV_COMP_VIDEO_DEC (decoder);
+  auto priv = self->priv;
+
+  if (!gst_cuda_ensure_element_context (GST_ELEMENT_CAST (decoder),
+          -1, &priv->ctx)) {
+    GST_ERROR_OBJECT (self, "Couldn't get cuda context");
+    return FALSE;
+  }
+
+  priv->stream = gst_cuda_stream_new (priv->ctx);
+
+  return TRUE;
+}
+
+/* Releases nvCOMP manager/task under the pushed CUDA context, then drops
+ * the CUDA stream/context and all GL objects acquired in set_context /
+ * ensure_gl_context */
+static gboolean
+gst_nv_comp_video_dec_close (GstVideoDecoder * decoder)
+{
+  auto self = GST_NV_COMP_VIDEO_DEC (decoder);
+  auto priv = self->priv;
+
+  if (priv->ctx) {
+    gst_cuda_context_push (priv->ctx);
+    priv->manager = nullptr;
+    priv->task = nullptr;
+
+    gst_cuda_context_pop (nullptr);
+  }
+
+  gst_clear_cuda_stream (&priv->stream);
+  gst_clear_object (&priv->ctx);
+
+#ifdef HAVE_GST_GL
+  gst_clear_object (&priv->other_gl_context);
+  gst_clear_object (&priv->gl_context);
+  /* FIX: was gst_clear_object (&priv->gl_context) twice; gl_display was
+   * never released, leaking the GstGLDisplay on every close */
+  gst_clear_object (&priv->gl_display);
+#endif
+
+  return TRUE;
+}
+
+/* Answers GST_QUERY_CONTEXT for both GL (display/local/other context) and
+ * CUDA contexts; returns TRUE when the query was handled */
+static gboolean
+gst_nv_comp_video_dec_handle_context_query (GstNvCompVideoDec * self,
+    GstQuery * query)
+{
+  auto priv = self->priv;
+
+#ifdef HAVE_GST_GL
+  {
+    GstGLDisplay *display = nullptr;
+    GstGLContext *other = nullptr;
+    GstGLContext *local = nullptr;
+
+    /* take refs so the objects stay alive while the query is answered */
+    if (priv->gl_display)
+      display = (GstGLDisplay *) gst_object_ref (priv->gl_display);
+    if (priv->gl_context)
+      local = (GstGLContext *) gst_object_ref (priv->gl_context);
+    if (priv->other_gl_context)
+      other = (GstGLContext *) gst_object_ref (priv->other_gl_context);
+
+    auto ret = gst_gl_handle_context_query (GST_ELEMENT (self), query,
+        display, local, other);
+    gst_clear_object (&display);
+    gst_clear_object (&other);
+    gst_clear_object (&local);
+
+    if (ret)
+      return TRUE;
+  }
+#endif
+
+  if (gst_cuda_handle_context_query (GST_ELEMENT (self), query, priv->ctx))
+    return TRUE;
+
+  return FALSE;
+}
+
+static gboolean
+gst_nv_comp_video_dec_sink_query (GstVideoDecoder * decoder, GstQuery * query)
+{
+  auto self = GST_NV_COMP_VIDEO_DEC (decoder);
+
+  switch (GST_QUERY_TYPE (query)) {
+    case GST_QUERY_CONTEXT:
+      if (gst_nv_comp_video_dec_handle_context_query (self,
query)) + return TRUE; + break; + default: + break; + } + + return GST_VIDEO_DECODER_CLASS (parent_class)->sink_query (decoder, query); +} + +static gboolean +gst_nv_comp_video_dec_src_query (GstVideoDecoder * decoder, GstQuery * query) +{ + auto self = GST_NV_COMP_VIDEO_DEC (decoder); + + switch (GST_QUERY_TYPE (query)) { + case GST_QUERY_CONTEXT: + if (gst_nv_comp_video_dec_handle_context_query (self, query)) + return TRUE; + break; + default: + break; + } + + return GST_VIDEO_DECODER_CLASS (parent_class)->src_query (decoder, query); +} + +#ifdef HAVE_GST_GL +static void +check_cuda_device_from_gl_context (GstGLContext * context, gboolean * ret) +{ + guint device_count = 0; + CUdevice device_list[1] = { 0, }; + CUresult cuda_ret; + + *ret = FALSE; + cuda_ret = CuGLGetDevices (&device_count, + device_list, 1, CU_GL_DEVICE_LIST_ALL); + + if (!gst_cuda_result (cuda_ret) || device_count == 0) + return; + + *ret = TRUE; +} + +static gboolean +gst_nv_comp_video_dec_ensure_gl_context (GstNvCompVideoDec * self) +{ + auto priv = self->priv; + gboolean ret = FALSE; + + if (!gst_gl_ensure_element_data (GST_ELEMENT (self), &priv->gl_display, + &priv->other_gl_context)) { + GST_DEBUG_OBJECT (self, "Couldn't get GL display"); + return FALSE; + } + + gst_gl_display_filter_gl_api (priv->gl_display, GST_GL_API_OPENGL3); + + if (!gst_gl_display_ensure_context (priv->gl_display, priv->other_gl_context, + &priv->gl_context, nullptr)) { + GST_DEBUG_OBJECT (self, "Couldn't get GL context"); + return FALSE; + } + + gst_gl_context_thread_add (priv->gl_context, + (GstGLContextThreadFunc) check_cuda_device_from_gl_context, &ret); + + return ret; +} +#endif + +static gboolean +gst_nv_comp_video_dec_decide_allocation (GstVideoDecoder * decoder, + GstQuery * query) +{ + auto self = GST_NV_COMP_VIDEO_DEC (decoder); + auto priv = self->priv; + GstBufferPool *pool = nullptr; + guint size; + guint min = 0; + guint max = 0; + GstCaps *caps; + + gst_query_parse_allocation (query, &caps, nullptr); 
+ if (!caps) { + GST_WARNING_OBJECT (self, "null caps in query"); + return FALSE; + } + + GstVideoInfo info; + if (!gst_video_info_from_caps (&info, caps)) { + GST_WARNING_OBJECT (self, "Failed to convert caps into info"); + return FALSE; + } + + gboolean update_pool = FALSE; + if (gst_query_get_n_allocation_pools (query) > 0) { + gst_query_parse_nth_allocation_pool (query, 0, &pool, &size, &min, &max); + update_pool = TRUE; + } + + auto features = gst_caps_get_features (caps, 0); + gboolean use_cuda_pool = FALSE; + if (gst_caps_features_contains (features, + GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY)) { + GST_DEBUG_OBJECT (self, "Downstream support CUDA memory"); + if (pool) { + if (!GST_IS_CUDA_BUFFER_POOL (pool)) { + gst_clear_object (&pool); + } else { + auto cuda_pool = GST_CUDA_BUFFER_POOL (pool); + if (cuda_pool->context != priv->ctx) + gst_clear_object (&pool); + } + } + + if (!pool) + pool = gst_cuda_buffer_pool_new (priv->ctx); + use_cuda_pool = TRUE; + } +#ifdef HAVE_GST_GL + else if (gst_caps_features_contains (features, + GST_CAPS_FEATURE_MEMORY_GL_MEMORY) && priv->gl_interop) { + GST_DEBUG_OBJECT (self, "Downstream support GL memory"); + if (!gst_nv_comp_video_dec_ensure_gl_context (self)) { + priv->gl_interop = FALSE; + } else { + if (pool && !GST_IS_GL_BUFFER_POOL (pool)) + gst_clear_object (&pool); + + if (!pool) + pool = gst_gl_buffer_pool_new (priv->gl_context); + } + } +#endif + + if (!pool) + pool = gst_video_buffer_pool_new (); + + auto config = gst_buffer_pool_get_config (pool); + + size = GST_VIDEO_INFO_SIZE (&info); + gst_buffer_pool_config_set_params (config, caps, size, 0, 0); + if (use_cuda_pool && priv->stream) { + /* Set our stream on buffer pool config so that CUstream can be shared */ + gst_buffer_pool_config_set_cuda_stream (config, priv->stream); + } + + if (!gst_buffer_pool_set_config (pool, config)) { + GST_WARNING_OBJECT (self, "Failed to set pool config"); + gst_object_unref (pool); + return FALSE; + } + + config = 
gst_buffer_pool_get_config (pool); + gst_buffer_pool_config_get_params (config, nullptr, &size, nullptr, nullptr); + gst_structure_free (config); + + if (update_pool) + gst_query_set_nth_allocation_pool (query, 0, pool, size, min, max); + else + gst_query_add_allocation_pool (query, pool, size, min, max); + gst_object_unref (pool); + + return TRUE; +} + +static gboolean +gst_nv_comp_video_dec_alloc_task (GstNvCompVideoDec * self, + DecoderTask * task, gboolean batched, gsize size) +{ + if (batched) + return TRUE; + + task->uncompressed_alloc_size = size; + auto cuda_ret = + CuMemAlloc ((CUdeviceptr *) & task->device_uncompressed, size); + if (!gst_cuda_result (cuda_ret)) + return FALSE; + + cuda_ret = CuMemAllocHost ((void **) &task->host_uncompressed, size); + if (!gst_cuda_result (cuda_ret)) + return FALSE; + + task->compressed_alloc_size = size; + cuda_ret = CuMemAlloc ((CUdeviceptr *) & task->device_compressed, size); + if (!gst_cuda_result (cuda_ret)) + return FALSE; + + cuda_ret = CuMemAllocHost ((void **) &task->host_compressed, size); + if (!gst_cuda_result (cuda_ret)) + return FALSE; + + return TRUE; +} + +static gboolean +gst_nv_comp_video_dec_set_format (GstVideoDecoder * decoder, + GstVideoCodecState * state) +{ + auto self = GST_NV_COMP_VIDEO_DEC (decoder); + auto priv = self->priv; + + if (!priv->ctx) { + GST_ERROR_OBJECT (self, "CUDA context was not configured"); + return FALSE; + } + + GST_DEBUG_OBJECT (self, "Set format with caps %" GST_PTR_FORMAT, state->caps); + + g_clear_pointer (&priv->state, gst_video_codec_state_unref); + priv->state = gst_video_codec_state_ref (state); + + auto s = gst_caps_get_structure (state->caps, 0); + std::string mime_type = gst_structure_get_name (s); + + auto format_str = gst_structure_get_string (s, "format"); + if (!format_str) { + GST_ERROR_OBJECT (self, "Unknown video format"); + return FALSE; + } + + GstVideoFormat format = gst_video_format_from_string (format_str); + if (format == GST_VIDEO_FORMAT_UNKNOWN || 
format == GST_VIDEO_FORMAT_ENCODED) { + GST_ERROR_OBJECT (self, "Invalid format string %s", format_str); + return FALSE; + } + + s = gst_structure_copy (s); + gst_structure_set_name (s, "video/x-raw"); + + auto video_caps = gst_caps_new_empty (); + gst_caps_append_structure (video_caps, s); + + auto ret = gst_video_info_from_caps (&priv->info, video_caps); + gst_caps_unref (video_caps); + if (!ret) { + GST_ERROR_OBJECT (self, "Couldn't build output caps"); + return FALSE; + } + + if (!gst_cuda_context_push (priv->ctx)) { + GST_ERROR_OBJECT (self, "Couldn't push context"); + return FALSE; + } + + priv->manager = nullptr; + priv->batched_decomp = nullptr; + priv->task = nullptr; + + priv->batched = TRUE; + if (mime_type == "video/x-nvcomp") { + priv->batched = FALSE; + } else if (mime_type == "video/x-nvcomp-lz4") { + priv->batched_decomp = std::make_shared < BatchedDecomp < + nvcompBatchedLZ4DecompressGetTempSize, + nvcompBatchedLZ4DecompressAsync >> (); + } else if (mime_type == "video/x-nvcomp-snappy") { + priv->batched_decomp = std::make_shared < BatchedDecomp < + nvcompBatchedSnappyDecompressGetTempSize, + nvcompBatchedSnappyDecompressAsync >> (); + } else if (mime_type == "video/x-nvcomp-gdeflate") { + priv->batched_decomp = std::make_shared < BatchedDecomp < + nvcompBatchedGdeflateDecompressGetTempSize, + nvcompBatchedGdeflateDecompressAsync >> (); + } else if (mime_type == "video/x-nvcomp-deflate") { + priv->batched_decomp = std::make_shared < BatchedDecomp < + nvcompBatchedDeflateDecompressGetTempSize, + nvcompBatchedDeflateDecompressAsync >> (); + } else if (mime_type == "video/x-nvcomp-zstd") { + priv->batched_decomp = std::make_shared < BatchedDecomp < + nvcompBatchedZstdDecompressGetTempSize, + nvcompBatchedZstdDecompressAsync >> (); + } else if (mime_type == "video/x-nvcomp-cascaded") { + priv->batched_decomp = std::make_shared < BatchedDecomp < + nvcompBatchedCascadedDecompressGetTempSize, + nvcompBatchedCascadedDecompressAsync >> (); + } else if 
(mime_type == "video/x-nvcomp-bitcomp") { + priv->batched_decomp = std::make_shared < BatchedDecomp < + nvcompBatchedBitcompDecompressGetTempSize, + nvcompBatchedBitcompDecompressAsync >> (); + } else if (mime_type == "video/x-nvcomp-ans") { + priv->batched_decomp = std::make_shared < BatchedDecomp < + nvcompBatchedANSDecompressGetTempSize, + nvcompBatchedANSDecompressAsync >> (); + } else { + gst_cuda_context_pop (nullptr); + g_assert_not_reached (); + return FALSE; + } + + auto task = std::make_shared < DecoderTask > (); + task->ctx = (GstCudaContext *) gst_object_ref (priv->ctx); + + if (!gst_nv_comp_video_dec_alloc_task (self, task.get (), priv->batched, + priv->info.size)) { + task = nullptr; + gst_cuda_context_pop (nullptr); + return FALSE; + } + + priv->task = task; + gst_cuda_context_pop (nullptr); + + return gst_video_decoder_negotiate (decoder); +} + +static gboolean +is_supported_cuda_format (GstVideoFormat format) +{ + switch (format) { + case GST_VIDEO_FORMAT_I420: + case GST_VIDEO_FORMAT_YV12: + case GST_VIDEO_FORMAT_NV12: + case GST_VIDEO_FORMAT_NV21: + case GST_VIDEO_FORMAT_P010_10LE: + case GST_VIDEO_FORMAT_P012_LE: + case GST_VIDEO_FORMAT_P016_LE: + case GST_VIDEO_FORMAT_I420_10LE: + case GST_VIDEO_FORMAT_I420_12LE: + case GST_VIDEO_FORMAT_Y444: + case GST_VIDEO_FORMAT_Y444_10LE: + case GST_VIDEO_FORMAT_Y444_12LE: + case GST_VIDEO_FORMAT_Y444_16LE: + case GST_VIDEO_FORMAT_BGRA: + case GST_VIDEO_FORMAT_RGBA: + case GST_VIDEO_FORMAT_RGBx: + case GST_VIDEO_FORMAT_BGRx: + case GST_VIDEO_FORMAT_ARGB: + case GST_VIDEO_FORMAT_ABGR: + case GST_VIDEO_FORMAT_RGB: + case GST_VIDEO_FORMAT_BGR: + case GST_VIDEO_FORMAT_BGR10A2_LE: + case GST_VIDEO_FORMAT_RGB10A2_LE: + case GST_VIDEO_FORMAT_Y42B: + case GST_VIDEO_FORMAT_I422_10LE: + case GST_VIDEO_FORMAT_I422_12LE: + case GST_VIDEO_FORMAT_YUY2: + case GST_VIDEO_FORMAT_UYVY: + case GST_VIDEO_FORMAT_RGBP: + case GST_VIDEO_FORMAT_BGRP: + case GST_VIDEO_FORMAT_GBR: + case GST_VIDEO_FORMAT_GBR_10LE: + case 
GST_VIDEO_FORMAT_GBR_12LE: + case GST_VIDEO_FORMAT_GBR_16LE: + case GST_VIDEO_FORMAT_GBRA: + case GST_VIDEO_FORMAT_VUYA: + return TRUE; + default: + break; + } + + return FALSE; +} + +#ifdef HAVE_GST_GL +static gboolean +is_supported_gl_format (GstVideoFormat format) +{ + auto gl_caps = gst_caps_from_string ("video/x-raw, format = (string) " + GST_GL_COLOR_CONVERT_FORMATS); + auto our_caps = gst_caps_new_empty_simple ("video/x-raw"); + gst_caps_set_simple (our_caps, + "format", G_TYPE_STRING, gst_video_format_to_string (format), nullptr); + auto ret = gst_caps_is_subset (our_caps, gl_caps); + gst_caps_unref (gl_caps); + gst_caps_unref (our_caps); + + return ret; +} +#endif + +static gboolean +gst_nv_comp_video_dec_negotiate (GstVideoDecoder * decoder) +{ + auto self = GST_NV_COMP_VIDEO_DEC (decoder); + auto priv = self->priv; + gboolean is_cuda = FALSE; +#ifdef HAVE_GST_GL + gboolean is_gl = FALSE; +#endif + + auto peer_caps = gst_pad_get_allowed_caps (decoder->srcpad); + GST_DEBUG_OBJECT (self, "Allowed caps %" GST_PTR_FORMAT, peer_caps); + + if (!peer_caps || gst_caps_is_any (peer_caps)) { + GST_DEBUG_OBJECT (self, + "cannot determine output format, use system memory"); + } else { + GstCapsFeatures *features; + guint size = gst_caps_get_size (peer_caps); + guint i; + + for (i = 0; i < size; i++) { + features = gst_caps_get_features (peer_caps, i); + + if (!features) + continue; + + if (gst_caps_features_contains (features, + GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY)) { + is_cuda = TRUE; + } +#ifdef HAVE_GST_GL + if (gst_caps_features_contains (features, + GST_CAPS_FEATURE_MEMORY_GL_MEMORY)) { + is_gl = TRUE; + } +#endif + } + } + gst_clear_caps (&peer_caps); + + auto state = gst_video_decoder_set_interlaced_output_state (decoder, + GST_VIDEO_INFO_FORMAT (&priv->info), + GST_VIDEO_INFO_INTERLACE_MODE (&priv->info), priv->info.width, + priv->info.height, priv->state); + + if (!state) { + GST_ERROR_OBJECT (self, "Couldn't set output state"); + return FALSE; + } + + 
priv->gl_interop = FALSE; + + state->caps = gst_video_info_to_caps (&state->info); + auto format = GST_VIDEO_INFO_FORMAT (&priv->info); + if (is_cuda && is_supported_cuda_format (format)) { + gst_caps_set_features_simple (state->caps, + gst_caps_features_new (GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY, nullptr)); + } +#ifdef HAVE_GST_GL + else if (is_gl && is_supported_gl_format (format)) { + gst_caps_set_features_simple (state->caps, + gst_caps_features_new (GST_CAPS_FEATURE_MEMORY_GL_MEMORY, nullptr)); + priv->gl_interop = TRUE; + } +#endif + + return GST_VIDEO_DECODER_CLASS (parent_class)->negotiate (decoder); +} + +static gboolean +gst_nv_comp_video_dec_download (GstNvCompVideoDec * self, GstVideoFrame * frame, + CUstream stream, gboolean is_device_copy) +{ + auto priv = self->priv; + auto info = &priv->info; + auto finfo = info->finfo; + gint comp[GST_VIDEO_MAX_COMPONENTS]; + CUresult ret = CUDA_SUCCESS; + auto task = priv->task; + + for (guint i = 0; i < GST_VIDEO_FRAME_N_PLANES (frame); i++) { + guint8 *sp; + if (is_device_copy) + sp = task->device_uncompressed + info->offset[i]; + else + sp = task->host_uncompressed + info->offset[i]; + + guint8 *dp = (guint8 *) GST_VIDEO_FRAME_PLANE_DATA (frame, i); + guint ss, ds; + guint w, h; + + if (GST_VIDEO_FORMAT_INFO_HAS_PALETTE (finfo) && i == 1) { + if (is_device_copy) { + ret = CuMemcpyDtoDAsync ((CUdeviceptr) dp, (CUdeviceptr) sp, + 256 * 4, stream); + } else { + memcpy (dp, sp, 256 * 4); + } + + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "CUDA memcpy failed"); + return FALSE; + } + + return TRUE; + } + + ds = GST_VIDEO_FRAME_PLANE_STRIDE (frame, i); + ss = GST_VIDEO_INFO_PLANE_STRIDE (info, i); + + gst_video_format_info_component (finfo, i, comp); + + w = GST_VIDEO_INFO_COMP_WIDTH (info, comp[0]) * + GST_VIDEO_INFO_COMP_PSTRIDE (info, comp[0]); + if (w == 0) + w = MIN (ss, ds); + + h = GST_VIDEO_INFO_COMP_HEIGHT (info, comp[0]); + + if (GST_VIDEO_FORMAT_INFO_IS_TILED (finfo)) { + gint tile_size; + gint 
sx_tiles, sy_tiles, dx_tiles, dy_tiles; + GstVideoTileMode mode; + + tile_size = GST_VIDEO_FORMAT_INFO_TILE_SIZE (info->finfo, i); + + mode = GST_VIDEO_FORMAT_INFO_TILE_MODE (info->finfo); + + sx_tiles = GST_VIDEO_TILE_X_TILES (ss); + sy_tiles = GST_VIDEO_TILE_Y_TILES (ss); + + dx_tiles = GST_VIDEO_TILE_X_TILES (ds); + dy_tiles = GST_VIDEO_TILE_Y_TILES (ds); + + w = MIN (sx_tiles, dx_tiles); + h = MIN (sy_tiles, dy_tiles); + + for (guint j = 0; j < h; j++) { + for (guint k = 0; k < w; k++) { + guint si, di; + guint8 *cur_dp; + guint8 *cur_sp; + + si = gst_video_tile_get_index (mode, k, j, sx_tiles, sy_tiles); + di = gst_video_tile_get_index (mode, k, j, dx_tiles, dy_tiles); + + cur_dp = dp + (di * tile_size); + cur_sp = sp + (si * tile_size); + + if (is_device_copy) { + ret = CuMemcpyDtoDAsync ((CUdeviceptr) cur_dp, (CUdeviceptr) cur_sp, + w, stream); + } else { + memcpy (cur_dp, cur_sp, w); + } + + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "CUDA memcpy failed"); + return FALSE; + } + } + } + } else { + if (is_device_copy) { + CUDA_MEMCPY2D params = { }; + params.srcMemoryType = CU_MEMORYTYPE_DEVICE; + params.srcDevice = (CUdeviceptr) sp; + params.srcPitch = ss; + + params.dstMemoryType = CU_MEMORYTYPE_DEVICE; + params.dstDevice = (CUdeviceptr) dp; + params.dstPitch = ds; + + params.WidthInBytes = w; + params.Height = h; + + ret = CuMemcpy2DAsync (¶ms, stream); + + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "CUDA memcpy failed"); + return FALSE; + } + } else { + for (guint j = 0; j < h; j++) { + memcpy (dp, sp, w); + dp += ds; + sp += ss; + } + } + } + } + + return TRUE; +} + +#ifdef HAVE_GST_GL +struct GLInteropData +{ + GstNvCompVideoDec *self = nullptr; + GstBuffer *buffer = nullptr; + gboolean ret = FALSE; +}; + +static GstCudaGraphicsResource * +ensure_gl_cuda_resource (GstNvCompVideoDec * self, GstMemory * mem) +{ + auto priv = self->priv; + GstCudaGraphicsResource *resource; + GQuark quark; + + if (!gst_is_gl_memory_pbo (mem)) { + 
GST_WARNING_OBJECT (self, "memory is not GL PBO memory, %s", + mem->allocator->mem_type); + return nullptr; + } + + quark = gst_cuda_quark_from_id (GST_CUDA_QUARK_GRAPHICS_RESOURCE); + resource = (GstCudaGraphicsResource *) + gst_mini_object_get_qdata (GST_MINI_OBJECT (mem), quark); + + if (!resource) { + GstMapInfo map_info; + GstGLMemoryPBO *pbo = (GstGLMemoryPBO *) mem; + GstGLBuffer *gl_buf = pbo->pbo; + gboolean ret; + + if (!gst_memory_map (mem, &map_info, + (GstMapFlags) (GST_MAP_READ | GST_MAP_GL))) { + GST_ERROR_OBJECT (self, "Couldn't map gl memory"); + return nullptr; + } + + resource = gst_cuda_graphics_resource_new (priv->ctx, + GST_OBJECT (GST_GL_BASE_MEMORY_CAST (mem)->context), + GST_CUDA_GRAPHICS_RESOURCE_GL_BUFFER); + + GST_LOG_OBJECT (self, "registering gl buffer %d to CUDA", gl_buf->id); + ret = gst_cuda_graphics_resource_register_gl_buffer (resource, gl_buf->id, + CU_GRAPHICS_REGISTER_FLAGS_NONE); + gst_memory_unmap (mem, &map_info); + + if (!ret) { + GST_ERROR_OBJECT (self, "Couldn't register gl buffer %d", gl_buf->id); + gst_cuda_graphics_resource_free (resource); + return nullptr; + } + + gst_mini_object_set_qdata (GST_MINI_OBJECT (mem), quark, resource, + (GDestroyNotify) gst_cuda_graphics_resource_free); + } + + return resource; +} + +static void +gst_nv_comp_video_dec_download_gl (GstGLContext * context, GLInteropData * data) +{ + auto self = data->self; + auto priv = self->priv; + auto info = &priv->info; + auto finfo = info->finfo; + GstCudaGraphicsResource *gst_res[GST_VIDEO_MAX_PLANES] = { nullptr, }; + CUgraphicsResource cuda_res[GST_VIDEO_MAX_PLANES] = { nullptr, }; + CUdeviceptr src_devptr[GST_VIDEO_MAX_PLANES] = { 0, }; + CUstream stream = gst_cuda_stream_get_handle (priv->stream); + CUresult ret; + gint comp[GST_VIDEO_MAX_COMPONENTS]; + auto task = priv->task; + + if (!gst_cuda_context_push (priv->ctx)) { + GST_ERROR_OBJECT (self, "Couldn't push context"); + return; + } + + for (guint i = 0; i < GST_VIDEO_INFO_N_PLANES (info); 
i++) { + GstMemory *mem = gst_buffer_peek_memory (data->buffer, i); + gsize src_size; + + if (!gst_is_gl_memory_pbo (mem)) { + GST_ERROR_OBJECT (self, "Not a GL PBO memory"); + goto out; + } + + gst_res[i] = ensure_gl_cuda_resource (self, mem); + if (!gst_res[i]) { + GST_ERROR_OBJECT (self, "Couldn't get resource %d", i); + goto out; + } + + cuda_res[i] = gst_cuda_graphics_resource_map (gst_res[i], stream, + CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD); + if (!cuda_res[i]) { + GST_ERROR_OBJECT (self, "Couldn't map resource"); + goto out; + } + + ret = CuGraphicsResourceGetMappedPointer (&src_devptr[i], + &src_size, cuda_res[i]); + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "Couldn't get mapped device pointer"); + goto out; + } + + /* Need PBO -> texture */ + GST_MINI_OBJECT_FLAG_SET (mem, GST_GL_BASE_MEMORY_TRANSFER_NEED_UPLOAD); + } + + for (guint i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) { + guint8 *sp = task->device_uncompressed + info->offset[i]; + guint8 *dp = (guint8 *) src_devptr[i]; + guint ss, ds; + guint w, h; + + if (GST_VIDEO_FORMAT_INFO_HAS_PALETTE (finfo) && i == 1) { + ret = CuMemcpyDtoDAsync ((CUdeviceptr) dp, (CUdeviceptr) sp, + 256 * 4, stream); + + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "CUDA memcpy failed"); + goto out; + } + + data->ret = TRUE; + goto out; + } + + auto meta = gst_buffer_get_video_meta (data->buffer); + if (meta) + ds = meta->stride[i]; + else + ds = GST_VIDEO_INFO_PLANE_STRIDE (info, i); + + ss = GST_VIDEO_INFO_PLANE_STRIDE (info, i); + + gst_video_format_info_component (finfo, i, comp); + + w = GST_VIDEO_INFO_COMP_WIDTH (info, comp[0]) * + GST_VIDEO_INFO_COMP_PSTRIDE (info, comp[0]); + if (w == 0) + w = MIN (ss, ds); + + h = GST_VIDEO_INFO_COMP_HEIGHT (info, comp[0]); + + if (GST_VIDEO_FORMAT_INFO_IS_TILED (finfo)) { + gint tile_size; + gint sx_tiles, sy_tiles, dx_tiles, dy_tiles; + GstVideoTileMode mode; + + tile_size = GST_VIDEO_FORMAT_INFO_TILE_SIZE (info->finfo, i); + + mode = 
GST_VIDEO_FORMAT_INFO_TILE_MODE (info->finfo); + + sx_tiles = GST_VIDEO_TILE_X_TILES (ss); + sy_tiles = GST_VIDEO_TILE_Y_TILES (ss); + + dx_tiles = GST_VIDEO_TILE_X_TILES (ds); + dy_tiles = GST_VIDEO_TILE_Y_TILES (ds); + + w = MIN (sx_tiles, dx_tiles); + h = MIN (sy_tiles, dy_tiles); + + for (guint j = 0; j < h; j++) { + for (guint k = 0; k < w; k++) { + guint si, di; + guint8 *cur_dp; + guint8 *cur_sp; + + si = gst_video_tile_get_index (mode, k, j, sx_tiles, sy_tiles); + di = gst_video_tile_get_index (mode, k, j, dx_tiles, dy_tiles); + + cur_dp = dp + (di * tile_size); + cur_sp = sp + (si * tile_size); + + ret = CuMemcpyDtoDAsync ((CUdeviceptr) cur_dp, (CUdeviceptr) cur_sp, + w, stream); + + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "CUDA memcpy failed"); + goto out; + } + } + } + } else { + CUDA_MEMCPY2D params = { }; + params.srcMemoryType = CU_MEMORYTYPE_DEVICE; + params.srcDevice = (CUdeviceptr) sp; + params.srcPitch = ss; + + params.dstMemoryType = CU_MEMORYTYPE_DEVICE; + params.dstDevice = (CUdeviceptr) dp; + params.dstPitch = ds; + + params.WidthInBytes = w; + params.Height = h; + + ret = CuMemcpy2DAsync (¶ms, stream); + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "CUDA memcpy failed"); + goto out; + } + } + } + + data->ret = TRUE; + +out: + for (guint i = 0; i < gst_buffer_n_memory (data->buffer); i++) { + if (!gst_res[i]) + break; + + gst_cuda_graphics_resource_unmap (gst_res[i], stream); + } + + CuStreamSynchronize (stream); + gst_cuda_context_pop (nullptr); +} +#endif + +struct ChunkData +{ + size_t uncomp_size = 0; + size_t comp_size = 0; + size_t offset = 0; +}; + +static gboolean +gst_nv_comp_video_dec_parse_header (GstNvCompVideoDec * self, + const guint8 * data, gsize size, + size_t &uncompressed_chunk_size, size_t &max_compressed_chunk_size, + size_t &batch_size, std::vector < ChunkData > &compressed_chunks) +{ + guint32 val; + const guint8 *ptr = data; + gsize remaining = size; + + if (size <= 
GST_NV_COMP_HEADER_MIN_SIZE) { + GST_ERROR_OBJECT (self, "Too small size"); + return FALSE; + } + + val = GST_READ_UINT32_LE (ptr); + if (val != GST_NV_COMP_HEADER_VERSION) { + GST_ERROR_OBJECT (self, "Invalid version"); + return FALSE; + } + ptr += sizeof (guint32); + remaining -= sizeof (guint32); + + uncompressed_chunk_size = GST_READ_UINT32_LE (ptr); + ptr += sizeof (guint32); + remaining -= sizeof (guint32); + + max_compressed_chunk_size = GST_READ_UINT32_LE (ptr); + ptr += sizeof (guint32); + remaining -= sizeof (guint32); + + batch_size = GST_READ_UINT32_LE (ptr); + ptr += sizeof (guint32); + remaining -= sizeof (guint32); + + compressed_chunks.resize (batch_size); + size_t total_compressed_size = 0; + for (size_t i = 0; i < batch_size; i++) { + if (remaining < sizeof (guint32)) + return FALSE; + + compressed_chunks[i].uncomp_size = GST_READ_UINT32_LE (ptr); + ptr += sizeof (guint32); + remaining -= sizeof (guint32); + + if (remaining < sizeof (guint32)) + return FALSE; + + compressed_chunks[i].comp_size = GST_READ_UINT32_LE (ptr); + total_compressed_size += compressed_chunks[i].comp_size; + + ptr += sizeof (guint32); + remaining -= sizeof (guint32); + } + + if (remaining != total_compressed_size) { + GST_ERROR_OBJECT (self, "Size mismatch, remaining: %" G_GSIZE_FORMAT + ", total compressed: %" G_GSIZE_FORMAT, remaining, + total_compressed_size); + return FALSE; + } + + for (size_t i = 0; i < batch_size; i++) { + compressed_chunks[i].offset = ptr - data; + ptr += compressed_chunks[i].comp_size; + } + + return TRUE; +} + +static GstFlowReturn +gst_nv_comp_video_dec_handle_frame (GstVideoDecoder * decoder, + GstVideoCodecFrame * frame) +{ + auto self = GST_NV_COMP_VIDEO_DEC (decoder); + auto priv = self->priv; + CUstream stream = nullptr; + GstVideoFrame vframe; + GstMapInfo map_info; + CUresult cuda_ret; + gboolean need_copy = TRUE; + GstMemory *mem; + nvcompStatus_t status; + auto task = priv->task; + GstFlowReturn ret; + + if (!priv->ctx || !priv->task) { + 
GST_ERROR_OBJECT (self, "Context was not configured"); + goto error; + } + + ret = gst_video_decoder_allocate_output_frame (decoder, frame); + if (ret != GST_FLOW_OK) { + gst_video_decoder_release_frame (decoder, frame); + return ret; + } + + if (!gst_cuda_context_push (priv->ctx)) { + GST_ERROR_OBJECT (self, "Couldn't push context"); + goto error; + } + + stream = gst_cuda_stream_get_handle (priv->stream); + + if (!gst_buffer_map (frame->input_buffer, &map_info, GST_MAP_READ)) { + GST_ERROR_OBJECT (self, "Couldn't map input buffer"); + gst_cuda_context_pop (nullptr); + goto error; + } + + if (priv->batched) { + g_assert (priv->batched_decomp); + + /* Parse custom header */ + size_t uncompressed_chunk_size; + size_t max_compressed_chunk_size; + size_t batch_size; + std::vector < ChunkData > compressed_chunks; + guint8 *mapped_data = map_info.data; + uint8_t *uncompressed; + if (!gst_nv_comp_video_dec_parse_header (self, mapped_data, + map_info.size, uncompressed_chunk_size, max_compressed_chunk_size, + batch_size, compressed_chunks)) { + gst_buffer_unmap (frame->input_buffer, &map_info); + gst_cuda_context_pop (nullptr); + goto error; + } + + GST_LOG_OBJECT (self, "batch size %" G_GSIZE_FORMAT + ", uncompressed-chunk-size %" G_GSIZE_FORMAT + ", compressed-chunk-size %" G_GSIZE_FORMAT, + batch_size, uncompressed_chunk_size, max_compressed_chunk_size); + + if (task->batch_size < batch_size || + task->max_uncompressed_chunk_size < uncompressed_chunk_size || + task->max_compressed_chunk_size < max_compressed_chunk_size) { + task->clear_resource (); + } + + if (task->batch_size == 0) { + size_t temp_size = 0; + + GST_DEBUG_OBJECT (self, "Allocating resource"); + + status = priv->batched_decomp->get_temp_size (batch_size, + uncompressed_chunk_size, &temp_size); + if (status != nvcompSuccess) { + GST_ERROR_OBJECT (self, "Couldn't get temp size"); + gst_buffer_unmap (frame->input_buffer, &map_info); + gst_cuda_context_pop (nullptr); + goto error; + } + + if 
(!task->allocate_batched (batch_size, + max_compressed_chunk_size, uncompressed_chunk_size, temp_size)) { + GST_ERROR_OBJECT (self, "Couldn't allocate resource"); + gst_buffer_unmap (frame->input_buffer, &map_info); + gst_cuda_context_pop (nullptr); + goto error; + } + } + + for (size_t i = 0; i < batch_size; i++) { + memcpy (task->host_compressed + (i * task->max_compressed_chunk_size), + mapped_data + compressed_chunks[i].offset, + compressed_chunks[i].comp_size); + task->host_compressed_bytes[i] = compressed_chunks[i].comp_size; + } + gst_buffer_unmap (frame->input_buffer, &map_info); + + for (size_t i = 0; i < batch_size; i++) { + GST_LOG_OBJECT (self, "Uploading chunk %" G_GSIZE_FORMAT + ", size %" G_GSIZE_FORMAT, i, compressed_chunks[i].comp_size); + auto offset = i * task->max_compressed_chunk_size; + + cuda_ret = CuMemcpyHtoDAsync ((CUdeviceptr) + (task->device_compressed + offset), + task->host_compressed + offset, + compressed_chunks[i].comp_size, stream); + if (!gst_cuda_result (cuda_ret)) { + gst_cuda_context_pop (nullptr); + goto error; + } + } + + cuda_ret = CuMemcpyHtoDAsync ((CUdeviceptr) task->device_compressed_bytes, + task->host_compressed_bytes, sizeof (size_t) * batch_size, stream); + if (!gst_cuda_result (cuda_ret)) { + gst_cuda_context_pop (nullptr); + goto error; + } + + status = priv->batched_decomp->decompress (task->device_compressed_ptrs, + task->device_compressed_bytes, task->device_uncompressed_bytes, + task->device_actual_uncompressed_bytes, batch_size, + task->temp_ptr, task->temp_size, task->device_uncompressed_ptrs, + task->device_statuses, (cudaStream_t) stream); + if (status != nvcompSuccess) { + GST_ERROR_OBJECT (self, "Couldn't decompress stream, status: %d", status); + gst_cuda_context_pop (nullptr); + goto error; + } + + uncompressed = task->device_uncompressed; + for (size_t i = 0; i < batch_size; i++) { + auto size = compressed_chunks[i].uncomp_size; + cuda_ret = CuMemcpyDtoDAsync ((CUdeviceptr) uncompressed, + 
(CUdeviceptr) task->host_uncompressed_ptrs[i], size, stream); + + if (!gst_cuda_result (cuda_ret)) { + gst_cuda_context_pop (nullptr); + goto error; + } + uncompressed += size; + } + } else { + if (task->compressed_alloc_size < map_info.size) { + if (task->device_compressed) + CuMemFree ((CUdeviceptr) task->device_compressed); + task->device_compressed = nullptr; + + if (task->host_compressed) + CuMemFreeHost (task->host_compressed); + task->host_compressed = nullptr; + + task->compressed_alloc_size = GST_ROUND_UP_128 (map_info.size); + auto cuda_ret = CuMemAlloc ((CUdeviceptr *) & task->device_compressed, + task->compressed_alloc_size); + if (!gst_cuda_result (cuda_ret)) { + gst_buffer_unmap (frame->input_buffer, &map_info); + gst_cuda_context_pop (nullptr); + goto error; + } + + cuda_ret = CuMemAllocHost ((void **) &task->host_compressed, + task->compressed_alloc_size); + if (!gst_cuda_result (cuda_ret)) { + gst_buffer_unmap (frame->input_buffer, &map_info); + gst_cuda_context_pop (nullptr); + goto error; + } + } + + memcpy (task->host_compressed, map_info.data, map_info.size); + + cuda_ret = CuMemcpyHtoDAsync ((CUdeviceptr) task->device_compressed, + task->host_compressed, map_info.size, stream); + gst_buffer_unmap (frame->input_buffer, &map_info); + + if (!gst_cuda_result (cuda_ret)) { + GST_ERROR_OBJECT (self, "Couldn't copy compressed memory"); + gst_cuda_context_pop (nullptr); + goto error; + } + + if (!priv->manager) { + priv->manager = create_manager (task->device_compressed, + (cudaStream_t) stream); + } + + { + auto config = + priv->manager->configure_decompression (task->device_compressed); + if (config.decomp_data_size != priv->info.size) { + GST_ERROR_OBJECT (self, "size mismatch, expected %" G_GSIZE_FORMAT + ", required %" G_GSIZE_FORMAT, priv->info.size, + config.decomp_data_size); + gst_cuda_context_pop (nullptr); + goto error; + } + + priv->manager->decompress (task->device_uncompressed, + task->device_compressed, config); + } + } + + mem = 
gst_buffer_peek_memory (frame->output_buffer, 0); +#ifdef HAVE_GST_GL + if (priv->gl_interop && gst_buffer_n_memory (frame->output_buffer) == + GST_VIDEO_INFO_N_PLANES (&priv->info)) { + GLInteropData interop_data; + interop_data.self = self; + interop_data.buffer = frame->output_buffer; + interop_data.ret = FALSE; + + auto gl_mem = (GstGLMemory *) mem; + gst_gl_context_thread_add (gl_mem->mem.context, + (GstGLContextThreadFunc) gst_nv_comp_video_dec_download_gl, + &interop_data); + if (interop_data.ret) { + need_copy = FALSE; + GST_TRACE_OBJECT (self, "CUDA -> GL copy done"); + } else { + priv->gl_interop = FALSE; + } + } +#endif + + if (need_copy) { + GstMapFlags map_flags = GST_MAP_WRITE; + gboolean device_copy = FALSE; + gboolean do_sync = TRUE; + if (gst_is_cuda_memory (mem)) { + auto cmem = GST_CUDA_MEMORY_CAST (mem); + if (cmem->context == priv->ctx) { + map_flags = (GstMapFlags) (GST_MAP_WRITE | GST_MAP_CUDA); + device_copy = TRUE; + auto mem_stream = gst_cuda_memory_get_stream (cmem); + if (mem_stream && mem_stream == priv->stream) + do_sync = FALSE; + } + } + + if (!device_copy) { + cuda_ret = CuMemcpyDtoHAsync (task->host_uncompressed, + (CUdeviceptr) task->device_uncompressed, priv->info.size, stream); + if (!gst_cuda_result (cuda_ret)) { + GST_ERROR_OBJECT (self, "Couldn't download image"); + gst_cuda_context_pop (nullptr); + goto error; + } + CuStreamSynchronize (stream); + do_sync = FALSE; + } + + gst_video_frame_map (&vframe, &priv->info, frame->output_buffer, map_flags); + gst_nv_comp_video_dec_download (self, &vframe, stream, device_copy); + if (do_sync) + CuStreamSynchronize (stream); + gst_video_frame_unmap (&vframe); + } + gst_cuda_context_pop (nullptr); + + return gst_video_decoder_finish_frame (decoder, frame); + +error: + gst_video_decoder_release_frame (decoder, frame); + return GST_FLOW_ERROR; +} diff --git a/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcompvideodec.h b/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcompvideodec.h new file 
mode 100644 index 0000000000..b53d539f7d --- /dev/null +++ b/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcompvideodec.h @@ -0,0 +1,32 @@ +/* GStreamer + * Copyright (C) 2024 Seungha Yang + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. + */ + +#pragma once + +#include +#include +#include "gstnvcomp.h" + +G_BEGIN_DECLS + +#define GST_TYPE_NV_COMP_VIDEO_DEC (gst_nv_comp_video_dec_get_type()) +G_DECLARE_FINAL_TYPE (GstNvCompVideoDec, gst_nv_comp_video_dec, + GST, NV_COMP_VIDEO_DEC, GstVideoDecoder) + +G_END_DECLS \ No newline at end of file diff --git a/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcompvideoenc.cpp b/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcompvideoenc.cpp new file mode 100644 index 0000000000..4aa6af905c --- /dev/null +++ b/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcompvideoenc.cpp @@ -0,0 +1,2014 @@ +/* GStreamer + * Copyright (C) 2024 Seungha Yang + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "gstnvcompvideoenc.h" +#ifdef HAVE_GST_GL +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +GST_DEBUG_CATEGORY_STATIC (gst_nv_comp_video_enc_debug); +#define GST_CAT_DEFAULT gst_nv_comp_video_enc_debug + +#ifdef HAVE_GST_GL +#define SINK_CAPS \ + GST_VIDEO_CAPS_MAKE_WITH_FEATURES (GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY, \ + GST_VIDEO_FORMATS_ALL) ";" \ + GST_VIDEO_CAPS_MAKE_WITH_FEATURES (GST_CAPS_FEATURE_MEMORY_GL_MEMORY, \ + GST_VIDEO_FORMATS_ALL) ";" \ + GST_VIDEO_CAPS_MAKE (GST_VIDEO_FORMATS_ALL) +#else +#define SINK_CAPS \ + GST_VIDEO_CAPS_MAKE_WITH_FEATURES (GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY, \ + GST_VIDEO_FORMATS_ALL) ";" \ + GST_VIDEO_CAPS_MAKE (GST_VIDEO_FORMATS_ALL) +#endif + +static GstStaticPadTemplate sink_template = +GST_STATIC_PAD_TEMPLATE ("sink", GST_PAD_SINK, GST_PAD_ALWAYS, + GST_STATIC_CAPS (SINK_CAPS)); + +static GstStaticPadTemplate src_template = + GST_STATIC_PAD_TEMPLATE ("src", GST_PAD_SRC, GST_PAD_ALWAYS, + GST_STATIC_CAPS ("video/x-nvcomp; video/x-nvcomp-lz4; " + "video/x-nvcomp-snappy; video/x-nvcomp-gdeflate; " + "video/x-nvcomp-deflate; video/x-nvcomp-zstd; video/x-nvcomp-cascaded; " + "video/x-nvcomp-bitcomp; video/x-nvcomp-ans")); + +enum GstNvCompDataType +{ + GST_NV_COMP_DATA_TYPE_DEFAULT = -1, + GST_NV_COMP_DATA_TYPE_CHAR = NVCOMP_TYPE_CHAR, + 
GST_NV_COMP_DATA_TYPE_UCHAR = NVCOMP_TYPE_UCHAR, + GST_NV_COMP_DATA_TYPE_SHORT = NVCOMP_TYPE_SHORT, + GST_NV_COMP_DATA_TYPE_USHORT = NVCOMP_TYPE_USHORT, + GST_NV_COMP_DATA_TYPE_INT = NVCOMP_TYPE_INT, + GST_NV_COMP_DATA_TYPE_UINT = NVCOMP_TYPE_UINT, + GST_NV_COMP_DATA_TYPE_LONGLONG = NVCOMP_TYPE_LONGLONG, + GST_NV_COMP_DATA_TYPE_ULONGLONG = NVCOMP_TYPE_ULONGLONG, + GST_NV_COMP_DATA_TYPE_BITS = NVCOMP_TYPE_BITS, +}; + +#define GST_TYPE_NV_COMP_DATA_TYPE (gst_nv_comp_data_type_type()) +static GType +gst_nv_comp_data_type_type (void) +{ + static GType data_type = 0; + static std::once_flag once; + static const GEnumValue types[] = { + {GST_NV_COMP_DATA_TYPE_DEFAULT, "Default", "default"}, + {GST_NV_COMP_DATA_TYPE_CHAR, "CHAR", "char"}, + {GST_NV_COMP_DATA_TYPE_UCHAR, "UCHAR", "uchar"}, + {GST_NV_COMP_DATA_TYPE_SHORT, "SHORT", "short"}, + {GST_NV_COMP_DATA_TYPE_USHORT, "USHORT", "ushort"}, + {GST_NV_COMP_DATA_TYPE_INT, "INT", "int"}, + {GST_NV_COMP_DATA_TYPE_UINT, "UINT", "uint"}, + {GST_NV_COMP_DATA_TYPE_LONGLONG, "LONGLONG", "longlong"}, + {GST_NV_COMP_DATA_TYPE_ULONGLONG, "ULONGLONG", "ulonglong"}, + {GST_NV_COMP_DATA_TYPE_BITS, "BITS", "bits"}, + {0, nullptr, nullptr}, + }; + + std::call_once (once,[&] { + data_type = g_enum_register_static ("GstNvCompDataType", types); + }); + + return data_type; +} + +enum GstNvCompDeflateAlgo +{ + GST_NV_COMP_DEFLATE_HIGH_THROUGHPUT, + GST_NV_COMP_DEFLATE_LOW_THROUGHPUT, + GST_NV_COMP_DEFLATE_HIGHEST_THROUGHPUT, +}; + +#define GST_TYPE_NV_COMP_DEFLATE_ALGO (gst_nv_comp_deflate_algo_get_type()) +static GType +gst_nv_comp_deflate_algo_get_type (void) +{ + static GType algo_type = 0; + static std::once_flag once; + static const GEnumValue algo[] = { + {GST_NV_COMP_DEFLATE_HIGH_THROUGHPUT, + "High throughput, low compression ratio", "high-throughput"}, + {GST_NV_COMP_DEFLATE_LOW_THROUGHPUT, + "Low throughput, high compression ratio", "low-throughput"}, + {GST_NV_COMP_DEFLATE_HIGHEST_THROUGHPUT, + "Highest throughput, entropy-only 
compression", "highest-throughput"}, + {0, nullptr, nullptr}, + }; + + std::call_once (once,[&] { + algo_type = g_enum_register_static ("GstNvCompDeflateAlgo", algo); + }); + + return algo_type; +} + +enum GstNvCompBitcompAlgo +{ + GST_NV_COMP_BITCOMP_DEFAULT, + GST_NV_COMP_BITCOMP_SPARSE, +}; + +#define GST_TYPE_NV_COMP_BITCOMP_ALGO (gst_nv_comp_bitcomp_algo_get_type()) +static GType +gst_nv_comp_bitcomp_algo_get_type (void) +{ + static GType algo_type = 0; + static std::once_flag once; + static const GEnumValue algo[] = { + {GST_NV_COMP_BITCOMP_DEFAULT, "Default", "default"}, + {GST_NV_COMP_BITCOMP_SPARSE, "Sparse", "sparse"}, + {0, nullptr, nullptr}, + }; + + std::call_once (once,[&] { + algo_type = g_enum_register_static ("GstNvCompBitcompAlgo", algo); + }); + + return algo_type; +} + +enum +{ + PROP_0, + PROP_METHOD, + PROP_DEFLATE_ALGO, + PROP_BITCOMP_ALGO, + PROP_DATA_TYPE, + PROP_CHUNK_SIZE, + PROP_ASYNC_DEPTH, + PROP_BATCHED, +}; + +#define DEFAULT_METHOD GST_NV_COMP_BITCOMP +#define DEFAULT_DEFLATE_ALGO GST_NV_COMP_DEFLATE_HIGH_THROUGHPUT +#define DEFAULT_BITCOMP_ALGO GST_NV_COMP_BITCOMP_SPARSE +#define DEFAULT_DATA_TYPE GST_NV_COMP_DATA_TYPE_DEFAULT +#define DEFAULT_CHUNK_SIZE 0 +#define DEFAULT_BATCHED TRUE +#define DEFAULT_ASYNC_DEPTH 2 + +/* *INDENT-OFF* */ +using namespace nvcomp; + +struct EncoderTask +{ + ~EncoderTask () + { + if (ctx) { + gst_cuda_context_push (ctx); + if (event) + CuEventDestroy (event); + if (device_uncompressed) + CuMemFree ((CUdeviceptr) device_uncompressed); + if (host_uncompressed) + CuMemFreeHost (host_uncompressed); + if (device_compressed) + CuMemFree ((CUdeviceptr) device_compressed); + if (host_compressed) + CuMemFreeHost (host_compressed); + if (device_uncompressed_bytes) + CuMemFree ((CUdeviceptr) device_uncompressed_bytes); + if (device_uncompressed_ptrs) + CuMemFree ((CUdeviceptr) device_uncompressed_ptrs); + if (device_compressed_bytes) + CuMemFree ((CUdeviceptr) device_compressed_bytes); + if 
(host_uncompressed_bytes) + CuMemFreeHost (host_uncompressed_bytes); + if (host_uncompressed_ptrs) + CuMemFreeHost (host_uncompressed_ptrs); + if (device_compressed_ptrs) + CuMemFree ((CUdeviceptr) device_compressed_ptrs); + if (host_compressed_bytes) + CuMemFreeHost (host_compressed_bytes); + if (host_compressed_ptrs) + CuMemFreeHost (host_compressed_ptrs); + if (temp_ptr) + CuMemFree ((CUdeviceptr) temp_ptr); + + gst_cuda_context_pop (nullptr); + gst_object_unref (ctx); + } + } + + GstCudaContext *ctx = nullptr; + CUevent event = nullptr; + uint8_t *device_uncompressed = nullptr; + uint8_t *host_uncompressed = nullptr; + + uint8_t *device_compressed = nullptr; + uint8_t *host_compressed = nullptr; + + size_t *device_uncompressed_bytes = nullptr; + void **device_uncompressed_ptrs = nullptr; + + size_t *host_uncompressed_bytes = nullptr; + void **host_uncompressed_ptrs = nullptr; + + size_t *device_compressed_bytes = nullptr; + void **device_compressed_ptrs = nullptr; + + size_t *host_compressed_bytes = nullptr; + void **host_compressed_ptrs = nullptr; + + void *temp_ptr = nullptr; + size_t temp_size = 0; + + size_t compressed_size = 0; + + gboolean batched; + size_t batch_size; + size_t chunk_size; + size_t max_output_chunk_size; + size_t compressed_alloc_size; +}; + +struct BatchedCompBase +{ + virtual nvcompStatus_t get_temp_size( + size_t batch_size, + size_t max_uncompressed_chunk_bytes, + size_t * temp_bytes) = 0; + + virtual nvcompStatus_t get_max_compressed_chunk_size( + size_t max_uncompressed_chunk_bytes, + size_t * max_compressed_bytes) = 0; + + virtual nvcompStatus_t compress( + void **device_uncompressed_ptrs, + size_t *device_uncompressed_bytes, + size_t max_uncompressed_chunk_bytes, + size_t batch_size, + void *device_temp_ptr, + size_t temp_bytes, + void **device_compressed_ptrs, + size_t *device_compressed_bytes, + cudaStream_t stream) = 0; +}; + +template +class BatchedComp : public BatchedCompBase +{ +public: + BatchedComp (const FormatOptT & 
opt) : opts_(opt) {} + + nvcompStatus_t get_temp_size( + size_t batch_size, + size_t max_uncompressed_chunk_bytes, + size_t * temp_bytes) + { + return T (batch_size, max_uncompressed_chunk_bytes, opts_, temp_bytes); + } + + nvcompStatus_t get_max_compressed_chunk_size( + size_t max_uncompressed_chunk_bytes, + size_t * max_compressed_bytes) + { + return O (max_uncompressed_chunk_bytes, opts_, max_compressed_bytes); + } + + nvcompStatus_t compress( + void **device_uncompressed_ptrs, + size_t *device_uncompressed_bytes, + size_t max_uncompressed_chunk_bytes, + size_t batch_size, + void *device_temp_ptr, + size_t temp_bytes, + void **device_compressed_ptrs, + size_t *device_compressed_bytes, + cudaStream_t stream) + { + return C (device_uncompressed_ptrs, device_uncompressed_bytes, + max_uncompressed_chunk_bytes, batch_size, device_temp_ptr, temp_bytes, + device_compressed_ptrs, device_compressed_bytes, opts_, stream); + } + +private: + FormatOptT opts_; +}; + +struct GstNvCompVideoEncPrivate +{ + GstCudaContext *ctx = nullptr; + GstCudaStream *stream = nullptr; + +#ifdef HAVE_GST_GL + GstGLDisplay *gl_display = nullptr; + GstGLContext *gl_context = nullptr; + GstGLContext *other_gl_context = nullptr; +#endif + + GstBufferPool *pool = nullptr; + + GstVideoCodecState *state = nullptr; + std::shared_ptr manager; + std::shared_ptr config; + std::shared_ptr batched_comp; + + gboolean gl_interop = FALSE; + + std::mutex lock; + std::mutex input_lock; + std::condition_variable input_cond; + std::mutex output_lock; + std::condition_variable output_cond; + + std::queue> input_task_queue; + std::queue> output_task_queue; + std::shared_ptr cur_task; + GThread *encode_thread = nullptr; + std::atomic last_flow = { GST_FLOW_OK }; + + GstNvCompMethod method = DEFAULT_METHOD; + GstNvCompDeflateAlgo deflate_algo = DEFAULT_DEFLATE_ALGO; + GstNvCompBitcompAlgo bitcomp_algo = DEFAULT_BITCOMP_ALGO; + GstNvCompDataType data_type = DEFAULT_DATA_TYPE; + guint chunk_size = DEFAULT_CHUNK_SIZE; 
+ gboolean batched = DEFAULT_BATCHED; + guint async_depth = DEFAULT_ASYNC_DEPTH; +}; +/* *INDENT-ON* */ + +struct _GstNvCompVideoEnc +{ + GstVideoEncoder parent; + GstNvCompVideoEncPrivate *priv; +}; + +static void gst_nv_comp_video_enc_finalize (GObject * object); +static void gst_nv_comp_video_enc_set_property (GObject * object, guint prop_id, + const GValue * value, GParamSpec * pspec); +static void gst_nv_comp_video_enc_get_property (GObject * object, guint prop_id, + GValue * value, GParamSpec * pspec); + +static void gst_nv_comp_video_enc_set_context (GstElement * element, + GstContext * context); + +static gboolean gst_nv_comp_video_enc_open (GstVideoEncoder * encoder); +static gboolean gst_nv_comp_video_enc_close (GstVideoEncoder * encoder); +static gboolean gst_nv_comp_video_enc_stop (GstVideoEncoder * encoder); +static gboolean gst_nv_comp_video_enc_flush (GstVideoEncoder * encoder); +static GstFlowReturn gst_nv_comp_video_enc_finish (GstVideoEncoder * encoder); +static gboolean gst_nv_comp_video_enc_sink_query (GstVideoEncoder * encoder, + GstQuery * query); +static gboolean gst_nv_comp_video_enc_src_query (GstVideoEncoder * encoder, + GstQuery * query); +static gboolean +gst_nv_comp_video_enc_propose_allocation (GstVideoEncoder * encoder, + GstQuery * query); +static gboolean gst_nv_comp_video_enc_set_format (GstVideoEncoder * encoder, + GstVideoCodecState * state); +static GstFlowReturn +gst_nv_comp_video_enc_handle_frame (GstVideoEncoder * encoder, + GstVideoCodecFrame * frame); + +#define gst_nv_comp_video_enc_parent_class parent_class +G_DEFINE_TYPE (GstNvCompVideoEnc, + gst_nv_comp_video_enc, GST_TYPE_VIDEO_ENCODER); + +static void +gst_nv_comp_video_enc_class_init (GstNvCompVideoEncClass * klass) +{ + auto object_class = G_OBJECT_CLASS (klass); + auto element_class = GST_ELEMENT_CLASS (klass); + auto encoder_class = GST_VIDEO_ENCODER_CLASS (klass); + + object_class->finalize = gst_nv_comp_video_enc_finalize; + object_class->set_property = 
gst_nv_comp_video_enc_set_property; + object_class->get_property = gst_nv_comp_video_enc_get_property; + + g_object_class_install_property (object_class, PROP_METHOD, + g_param_spec_enum ("method", "Method", + "Compression method", + GST_TYPE_NV_COMP_METHOD, DEFAULT_METHOD, + (GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS))); + g_object_class_install_property (object_class, PROP_DEFLATE_ALGO, + g_param_spec_enum ("deflate-algo", "Deflate Algo", + "Algorithm to use for deflate and gdeflate methods", + GST_TYPE_NV_COMP_DEFLATE_ALGO, DEFAULT_DEFLATE_ALGO, + (GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS))); + g_object_class_install_property (object_class, PROP_BITCOMP_ALGO, + g_param_spec_enum ("bitcomp-algo", "Bitcomp Algo", + "Algorithm to use for bitcomp method", + GST_TYPE_NV_COMP_BITCOMP_ALGO, DEFAULT_BITCOMP_ALGO, + (GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS))); + g_object_class_install_property (object_class, PROP_DATA_TYPE, + g_param_spec_enum ("data-type", "Data Type", + "Compression data type", + GST_TYPE_NV_COMP_DATA_TYPE, DEFAULT_DATA_TYPE, + (GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS))); + g_object_class_install_property (object_class, PROP_CHUNK_SIZE, + g_param_spec_uint ("chunk-size", "Chunk Size", + "Uncompressed chunk size for batched compression (0 = default)", + 0, G_MAXINT32, DEFAULT_CHUNK_SIZE, + (GParamFlags) (G_PARAM_READWRITE | GST_PARAM_MUTABLE_READY | + G_PARAM_STATIC_STRINGS))); + g_object_class_install_property (object_class, PROP_BATCHED, + g_param_spec_boolean ("batched", "Batched", + "Use low-level C API for batched operation", DEFAULT_BATCHED, + (GParamFlags) (G_PARAM_READWRITE | GST_PARAM_MUTABLE_READY | + G_PARAM_STATIC_STRINGS))); + g_object_class_install_property (object_class, PROP_ASYNC_DEPTH, + g_param_spec_uint ("async-depth", "Async Depth", + "Internal resource pool size for threaded encoding", + 1, 4, DEFAULT_ASYNC_DEPTH, + (GParamFlags) (G_PARAM_READWRITE | 
GST_PARAM_MUTABLE_READY | + G_PARAM_STATIC_STRINGS))); + + gst_element_class_add_static_pad_template (element_class, &sink_template); + gst_element_class_add_static_pad_template (element_class, &src_template); + + gst_element_class_set_static_metadata (element_class, + "nvCOMP Video Encoder", "Encoder/Video/Hardware", + "Lossless video compression element based on nvCOMP library", + "Seungha Yang "); + + element_class->set_context = + GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_set_context); + + encoder_class->open = GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_open); + encoder_class->close = GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_close); + encoder_class->stop = GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_stop); + encoder_class->flush = GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_flush); + encoder_class->finish = GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_finish); + encoder_class->sink_query = + GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_sink_query); + encoder_class->src_query = + GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_src_query); + encoder_class->propose_allocation = + GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_propose_allocation); + encoder_class->set_format = + GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_set_format); + encoder_class->handle_frame = + GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_handle_frame); + + GST_DEBUG_CATEGORY_INIT (gst_nv_comp_video_enc_debug, + "nvcompvideoenc", 0, "nvcompvideoenc"); +} + +static void +gst_nv_comp_video_enc_init (GstNvCompVideoEnc * self) +{ + self->priv = new GstNvCompVideoEncPrivate (); +} + +static void +gst_nv_comp_video_enc_finalize (GObject * object) +{ + auto self = GST_NV_COMP_VIDEO_ENC (object); + + delete self->priv; + + G_OBJECT_CLASS (parent_class)->finalize (object); +} + +static void +gst_nv_comp_video_enc_set_property (GObject * object, guint prop_id, + const GValue * value, GParamSpec * pspec) +{ + auto self = GST_NV_COMP_VIDEO_ENC (object); + auto priv = self->priv; + + std::lock_guard < std::mutex > lk (priv->lock); + + 
switch (prop_id) { + case PROP_METHOD: + priv->method = (GstNvCompMethod) g_value_get_enum (value); + break; + case PROP_DEFLATE_ALGO: + priv->deflate_algo = (GstNvCompDeflateAlgo) g_value_get_enum (value); + break; + case PROP_BITCOMP_ALGO: + priv->bitcomp_algo = (GstNvCompBitcompAlgo) g_value_get_enum (value); + break; + case PROP_DATA_TYPE: + priv->data_type = (GstNvCompDataType) g_value_get_enum (value); + break; + case PROP_CHUNK_SIZE: + priv->chunk_size = g_value_get_uint (value); + break; + case PROP_BATCHED: + priv->batched = g_value_get_boolean (value); + break; + case PROP_ASYNC_DEPTH: + priv->async_depth = g_value_get_uint (value); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec); + break; + } +} + +static void +gst_nv_comp_video_enc_get_property (GObject * object, guint prop_id, + GValue * value, GParamSpec * pspec) +{ + auto self = GST_NV_COMP_VIDEO_ENC (object); + auto priv = self->priv; + + std::lock_guard < std::mutex > lk (priv->lock); + + switch (prop_id) { + case PROP_METHOD: + g_value_set_enum (value, priv->method); + break; + case PROP_DEFLATE_ALGO: + g_value_set_enum (value, priv->deflate_algo); + break; + case PROP_BITCOMP_ALGO: + g_value_set_enum (value, priv->bitcomp_algo); + break; + case PROP_DATA_TYPE: + g_value_set_enum (value, priv->data_type); + break; + case PROP_CHUNK_SIZE: + g_value_set_uint (value, priv->chunk_size); + break; + case PROP_BATCHED: + g_value_set_boolean (value, priv->batched); + break; + case PROP_ASYNC_DEPTH: + g_value_set_uint (value, priv->async_depth); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec); + break; + } +} + +static void +gst_nv_comp_video_enc_set_context (GstElement * element, GstContext * context) +{ + auto self = GST_NV_COMP_VIDEO_ENC (element); + auto priv = self->priv; + + gst_cuda_handle_set_context (element, context, -1, &priv->ctx); +#ifdef HAVE_GST_GL + if (gst_gl_handle_set_context (element, context, &priv->gl_display, + 
&priv->other_gl_context)) { + if (priv->gl_display) + gst_gl_display_filter_gl_api (priv->gl_display, GST_GL_API_OPENGL3); + } +#endif + + GST_ELEMENT_CLASS (parent_class)->set_context (element, context); +} + +static gboolean +gst_nv_comp_video_enc_open (GstVideoEncoder * encoder) +{ + auto self = GST_NV_COMP_VIDEO_ENC (encoder); + auto priv = self->priv; + + if (!gst_cuda_ensure_element_context (GST_ELEMENT_CAST (encoder), + -1, &priv->ctx)) { + GST_ERROR_OBJECT (self, "Couldn't get cuda context"); + return FALSE; + } + + priv->stream = gst_cuda_stream_new (priv->ctx); + + return TRUE; +} + +static gboolean +gst_nv_comp_video_enc_close (GstVideoEncoder * encoder) +{ + auto self = GST_NV_COMP_VIDEO_ENC (encoder); + auto priv = self->priv; + + gst_clear_cuda_stream (&priv->stream); + gst_clear_object (&priv->ctx); + +#ifdef HAVE_GST_GL + gst_clear_object (&priv->other_gl_context); + gst_clear_object (&priv->gl_context); + gst_clear_object (&priv->gl_context); +#endif + + return TRUE; +} + +static void +gst_nv_comp_video_enc_drain (GstNvCompVideoEnc * self, gboolean locked) +{ + auto priv = self->priv; + if (!priv->encode_thread) + return; + + if (locked) + GST_VIDEO_ENCODER_STREAM_UNLOCK (self); + + { + std::lock_guard < std::mutex > lk (priv->output_lock); + priv->output_task_queue.push (nullptr); + priv->output_cond.notify_all (); + } + + g_clear_pointer (&priv->encode_thread, g_thread_join); + + if (locked) + GST_VIDEO_ENCODER_STREAM_LOCK (self); + + priv->last_flow = GST_FLOW_OK; +} + +static gboolean +gst_nv_comp_video_enc_stop (GstVideoEncoder * encoder) +{ + auto self = GST_NV_COMP_VIDEO_ENC (encoder); + auto priv = self->priv; + + gst_nv_comp_video_enc_drain (self, FALSE); + + if (priv->ctx) { + gst_cuda_context_push (priv->ctx); + priv->manager = nullptr; + priv->cur_task = nullptr; + priv->input_task_queue = { }; + priv->output_task_queue = { }; + + gst_cuda_context_pop (nullptr); + } + + g_clear_pointer (&priv->state, gst_video_codec_state_unref); + + if 
(priv->pool) { + gst_buffer_pool_set_active (priv->pool, FALSE); + gst_clear_object (&priv->pool); + } + + return TRUE; +} + +static gboolean +gst_nv_comp_video_enc_flush (GstVideoEncoder * encoder) +{ + auto self = GST_NV_COMP_VIDEO_ENC (encoder); + + gst_nv_comp_video_enc_drain (self, TRUE); + + return TRUE; +} + +static GstFlowReturn +gst_nv_comp_video_enc_finish (GstVideoEncoder * encoder) +{ + auto self = GST_NV_COMP_VIDEO_ENC (encoder); + + gst_nv_comp_video_enc_drain (self, TRUE); + + return GST_FLOW_OK; +} + +static gboolean +gst_nv_comp_video_enc_handle_context_query (GstNvCompVideoEnc * self, + GstQuery * query) +{ + auto priv = self->priv; + +#ifdef HAVE_GST_GL + { + GstGLDisplay *display = nullptr; + GstGLContext *other = nullptr; + GstGLContext *local = nullptr; + + if (priv->gl_display) + display = (GstGLDisplay *) gst_object_ref (priv->gl_display); + if (priv->gl_context) + local = (GstGLContext *) gst_object_ref (priv->gl_context); + if (priv->other_gl_context) + other = (GstGLContext *) gst_object_ref (priv->other_gl_context); + + auto ret = gst_gl_handle_context_query (GST_ELEMENT (self), query, + display, local, other); + gst_clear_object (&display); + gst_clear_object (&other); + gst_clear_object (&local); + + if (ret) + return TRUE; + } +#endif + + if (gst_cuda_handle_context_query (GST_ELEMENT (self), query, priv->ctx)) + return TRUE; + + return FALSE; +} + +static gboolean +gst_nv_comp_video_enc_sink_query (GstVideoEncoder * encoder, GstQuery * query) +{ + auto self = GST_NV_COMP_VIDEO_ENC (encoder); + + switch (GST_QUERY_TYPE (query)) { + case GST_QUERY_CONTEXT: + if (gst_nv_comp_video_enc_handle_context_query (self, query)) + return TRUE; + break; + default: + break; + } + + return GST_VIDEO_ENCODER_CLASS (parent_class)->sink_query (encoder, query); +} + +static gboolean +gst_nv_comp_video_enc_src_query (GstVideoEncoder * encoder, GstQuery * query) +{ + auto self = GST_NV_COMP_VIDEO_ENC (encoder); + + switch (GST_QUERY_TYPE (query)) { + 
case GST_QUERY_CONTEXT: + if (gst_nv_comp_video_enc_handle_context_query (self, query)) + return TRUE; + break; + default: + break; + } + + return GST_VIDEO_ENCODER_CLASS (parent_class)->src_query (encoder, query); +} + +#ifdef HAVE_GST_GL +static void +check_cuda_device_from_gl_context (GstGLContext * context, gboolean * ret) +{ + guint device_count = 0; + CUdevice device_list[1] = { 0, }; + CUresult cuda_ret; + + *ret = FALSE; + cuda_ret = CuGLGetDevices (&device_count, + device_list, 1, CU_GL_DEVICE_LIST_ALL); + + if (!gst_cuda_result (cuda_ret) || device_count == 0) + return; + + *ret = TRUE; +} + +static gboolean +gst_nv_comp_video_enc_ensure_gl_context (GstNvCompVideoEnc * self) +{ + auto priv = self->priv; + gboolean ret = FALSE; + + if (!gst_gl_ensure_element_data (GST_ELEMENT (self), &priv->gl_display, + &priv->other_gl_context)) { + GST_DEBUG_OBJECT (self, "Couldn't get GL display"); + return FALSE; + } + + gst_gl_display_filter_gl_api (priv->gl_display, GST_GL_API_OPENGL3); + + if (!gst_gl_display_ensure_context (priv->gl_display, priv->other_gl_context, + &priv->gl_context, nullptr)) { + GST_DEBUG_OBJECT (self, "Couldn't get GL context"); + return FALSE; + } + + gst_gl_context_thread_add (priv->gl_context, + (GstGLContextThreadFunc) check_cuda_device_from_gl_context, &ret); + + return ret; +} +#endif + +static gboolean +gst_nv_comp_video_enc_propose_allocation (GstVideoEncoder * encoder, + GstQuery * query) +{ + auto self = GST_NV_COMP_VIDEO_ENC (encoder); + auto priv = self->priv; + GstBufferPool *pool = nullptr; + guint size; + + GstCaps *caps; + gst_query_parse_allocation (query, &caps, nullptr); + if (!caps) { + GST_WARNING_OBJECT (self, "null caps in query"); + return FALSE; + } + + GstVideoInfo info; + if (!gst_video_info_from_caps (&info, caps)) { + GST_WARNING_OBJECT (self, "Failed to convert caps into info"); + return FALSE; + } + + auto features = gst_caps_get_features (caps, 0); + gboolean use_cuda_pool = FALSE; + if 
(gst_caps_features_contains (features, + GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY)) { + GST_DEBUG_OBJECT (self, "upstream support CUDA memory"); + pool = gst_cuda_buffer_pool_new (priv->ctx); + use_cuda_pool = TRUE; + } +#ifdef HAVE_GST_GL + else if (gst_caps_features_contains (features, + GST_CAPS_FEATURE_MEMORY_GL_MEMORY)) { + if (!gst_nv_comp_video_enc_ensure_gl_context (self)) { + priv->gl_interop = FALSE; + } else { + pool = gst_gl_buffer_pool_new (priv->gl_context); + } + } +#endif + + if (!pool) + pool = gst_video_buffer_pool_new (); + + auto config = gst_buffer_pool_get_config (pool); + gst_buffer_pool_config_add_option (config, GST_BUFFER_POOL_OPTION_VIDEO_META); + + size = GST_VIDEO_INFO_SIZE (&info); + gst_buffer_pool_config_set_params (config, caps, size, 0, 0); + if (use_cuda_pool && priv->stream) { + /* Set our stream on buffer pool config so that CUstream can be shared */ + gst_buffer_pool_config_set_cuda_stream (config, priv->stream); + } + + if (!gst_buffer_pool_set_config (pool, config)) { + GST_WARNING_OBJECT (self, "Failed to set pool config"); + gst_object_unref (pool); + return FALSE; + } + + config = gst_buffer_pool_get_config (pool); + gst_buffer_pool_config_get_params (config, nullptr, &size, nullptr, nullptr); + gst_structure_free (config); + + gst_query_add_allocation_pool (query, pool, size, 0, 0); + gst_query_add_allocation_meta (query, GST_VIDEO_META_API_TYPE, nullptr); + gst_object_unref (pool); + + return TRUE; +} + +static gboolean +gst_nv_comp_video_enc_alloc_task (GstNvCompVideoEnc * self, EncoderTask * task, + gboolean batched, size_t uncompressed_size, size_t compressed_size, + size_t batch_size, size_t chunk_size, size_t output_chunk_size, + size_t temp_size) +{ + size_t alloc_size = sizeof (size_t) * batch_size; + uint8_t *uncomp_data; + uint8_t *comp_data; + + auto ret = CuEventCreate (&task->event, + CU_EVENT_BLOCKING_SYNC | CU_EVENT_DISABLE_TIMING); + if (!gst_cuda_result (ret)) + return FALSE; + + auto aligned_uncompressed_size 
= uncompressed_size; + ret = CuMemAlloc ((CUdeviceptr *) & task->device_uncompressed, + aligned_uncompressed_size); + if (!gst_cuda_result (ret)) + return FALSE; + + ret = CuMemAllocHost ((void **) &task->host_uncompressed, + aligned_uncompressed_size); + if (!gst_cuda_result (ret)) + return FALSE; + + auto aligned_compressed_size = GST_ROUND_UP_8 (compressed_size); + ret = CuMemAlloc ((CUdeviceptr *) & task->device_compressed, + aligned_compressed_size); + if (!gst_cuda_result (ret)) + return FALSE; + + ret = CuMemAllocHost ((void **) &task->host_compressed, + aligned_compressed_size); + if (!gst_cuda_result (ret)) + return FALSE; + + if (!batched) + return TRUE; + + ret = CuMemAllocHost ((void **) &task->host_uncompressed_bytes, alloc_size); + if (!gst_cuda_result (ret)) + return FALSE; + + ret = CuMemAllocHost ((void **) &task->host_uncompressed_ptrs, alloc_size); + if (!gst_cuda_result (ret)) + return FALSE; + + for (size_t i = 0; i < batch_size; i++) { + if (i + 1 < batch_size) + task->host_uncompressed_bytes[i] = chunk_size; + else + task->host_uncompressed_bytes[i] = (uncompressed_size - (chunk_size * i)); + } + + ret = CuMemAlloc ((CUdeviceptr *) & task->device_uncompressed_bytes, + alloc_size); + if (!gst_cuda_result (ret)) + return FALSE; + + ret = CuMemAlloc ((CUdeviceptr *) & task->device_uncompressed_ptrs, + alloc_size); + if (!gst_cuda_result (ret)) + return FALSE; + + ret = CuMemAlloc ((CUdeviceptr *) & task->device_compressed_bytes, + alloc_size); + if (!gst_cuda_result (ret)) + return FALSE; + + ret = CuMemAlloc ((CUdeviceptr *) & task->device_compressed_ptrs, alloc_size); + if (!gst_cuda_result (ret)) + return FALSE; + + ret = CuMemAllocHost ((void **) &task->host_compressed_bytes, alloc_size); + if (!gst_cuda_result (ret)) + return FALSE; + + ret = CuMemAllocHost ((void **) &task->host_compressed_ptrs, alloc_size); + if (!gst_cuda_result (ret)) + return FALSE; + + if (temp_size > 0) { + ret = CuMemAlloc ((CUdeviceptr *) & task->temp_ptr, 
temp_size); + if (!gst_cuda_result (ret)) + return FALSE; + } + + task->temp_size = temp_size; + + uncomp_data = task->device_uncompressed; + comp_data = task->device_compressed; + for (size_t i = 0; i < batch_size; i++) { + task->host_uncompressed_ptrs[i] = uncomp_data; + uncomp_data += chunk_size; + + task->host_compressed_ptrs[i] = comp_data; + comp_data += output_chunk_size; + } + + ret = CuMemcpyHtoD ((CUdeviceptr) task->device_uncompressed_bytes, + task->host_uncompressed_bytes, alloc_size); + if (!gst_cuda_result (ret)) + return FALSE; + + ret = CuMemcpyHtoD ((CUdeviceptr) task->device_uncompressed_ptrs, + task->host_uncompressed_ptrs, alloc_size); + if (!gst_cuda_result (ret)) + return FALSE; + + ret = CuMemcpyHtoD ((CUdeviceptr) task->device_compressed_ptrs, + task->host_compressed_ptrs, alloc_size); + if (!gst_cuda_result (ret)) + return FALSE; + + task->batched = batched; + task->batch_size = batch_size; + task->chunk_size = chunk_size; + task->max_output_chunk_size = output_chunk_size; + task->compressed_alloc_size = aligned_compressed_size; + + return TRUE; +} + +static gboolean +gst_nv_comp_video_enc_set_format (GstVideoEncoder * encoder, + GstVideoCodecState * state) +{ + auto self = GST_NV_COMP_VIDEO_ENC (encoder); + auto priv = self->priv; + + gst_nv_comp_video_enc_drain (self, TRUE); + + std::lock_guard < std::mutex > lk (priv->lock); + + if (!priv->ctx) { + GST_ERROR_OBJECT (self, "CUDA context was not configured"); + return FALSE; + } + + if (priv->pool) { + gst_buffer_pool_set_active (priv->pool, FALSE); + gst_clear_object (&priv->pool); + } + + g_clear_pointer (&priv->state, gst_video_codec_state_unref); + priv->state = gst_video_codec_state_ref (state); + + std::string mime_type = "video/x-nvcomp"; + + if (!gst_cuda_context_push (priv->ctx)) { + GST_ERROR_OBJECT (self, "Couldn't push context"); + return FALSE; + } + + priv->gl_interop = FALSE; +#if HAVE_GST_GL + auto features = gst_caps_get_features (state->caps, 0); + if 
(gst_caps_features_contains (features, GST_CAPS_FEATURE_MEMORY_GL_MEMORY)) + priv->gl_interop = TRUE; +#endif + + priv->manager = nullptr; + priv->config = nullptr; + priv->batched_comp = nullptr; + priv->input_task_queue = { }; + priv->output_task_queue = { }; + auto stream = (cudaStream_t) gst_cuda_stream_get_handle (priv->stream); + guint device_id = 0; + g_object_get (priv->ctx, "cuda-device-id", &device_id, nullptr); + size_t chunk_size = priv->chunk_size; + size_t batch_size = 0; + + switch (priv->method) { + case GST_NV_COMP_LZ4: + { + nvcompBatchedLZ4Opts_t opts = nvcompBatchedLZ4DefaultOpts; + if (priv->data_type != GST_NV_COMP_DATA_TYPE_DEFAULT) + opts.data_type = (nvcompType_t) priv->data_type; + + if (chunk_size == 0) + chunk_size = 65536; + + chunk_size = MAX (32768, chunk_size); + chunk_size = GST_ROUND_UP_8 (chunk_size); + chunk_size = MIN (chunk_size, nvcompLZ4CompressionMaxAllowedChunkSize); + + if (priv->batched) { + batch_size = (state->info.size + chunk_size - 1) / chunk_size; + + priv->batched_comp = + std::make_shared < BatchedComp < nvcompBatchedLZ4Opts_t, + nvcompBatchedLZ4CompressGetTempSize, + nvcompBatchedLZ4CompressGetMaxOutputChunkSize, + nvcompBatchedLZ4CompressAsync >> (opts); + mime_type = "video/x-nvcomp-lz4"; + } else { + priv->manager = std::make_shared < LZ4Manager > (chunk_size, + opts, stream, device_id); + } + GST_DEBUG_OBJECT (self, "Using LZ4"); + break; + } + case GST_NV_COMP_SNAPPY: + { + if (chunk_size == 0) + chunk_size = 65536; + + chunk_size = MAX (32768, chunk_size); + chunk_size = GST_ROUND_UP_8 (chunk_size); + chunk_size = MIN (chunk_size, nvcompSnappyCompressionMaxAllowedChunkSize); + + if (priv->batched) { + batch_size = (state->info.size + chunk_size - 1) / chunk_size; + + priv->batched_comp = + std::make_shared < BatchedComp < nvcompBatchedSnappyOpts_t, + nvcompBatchedSnappyCompressGetTempSize, + nvcompBatchedSnappyCompressGetMaxOutputChunkSize, + nvcompBatchedSnappyCompressAsync >> + 
(nvcompBatchedSnappyDefaultOpts); + mime_type = "video/x-nvcomp-snappy"; + } else { + priv->manager = std::make_shared < SnappyManager > (chunk_size, + nvcompBatchedSnappyDefaultOpts, stream, device_id); + } + GST_DEBUG_OBJECT (self, "Using SNAPPY"); + break; + } + case GST_NV_COMP_GDEFLATE: + { + nvcompBatchedGdeflateOpts_t opts; + opts.algo = (int) priv->deflate_algo; + + if (chunk_size == 0) + chunk_size = 65536; + + chunk_size = MAX (32768, chunk_size); + chunk_size = GST_ROUND_UP_8 (chunk_size); + chunk_size = MIN (chunk_size, + nvcompGdeflateCompressionMaxAllowedChunkSize); + + + if (priv->batched) { + batch_size = (state->info.size + chunk_size - 1) / chunk_size; + + priv->batched_comp = + std::make_shared < BatchedComp < nvcompBatchedGdeflateOpts_t, + nvcompBatchedGdeflateCompressGetTempSize, + nvcompBatchedGdeflateCompressGetMaxOutputChunkSize, + nvcompBatchedGdeflateCompressAsync >> (opts); + mime_type = "video/x-nvcomp-gdeflate"; + } else { + priv->manager = std::make_shared < GdeflateManager > (chunk_size, + opts, stream, device_id); + } + GST_DEBUG_OBJECT (self, "Using GDEFLATE"); + break; + } + case GST_NV_COMP_DEFLATE: + { + if (chunk_size == 0) + chunk_size = 65536; + + chunk_size = MAX (32768, chunk_size); + chunk_size = GST_ROUND_UP_8 (chunk_size); + chunk_size = MIN (chunk_size, + nvcompDeflateCompressionMaxAllowedChunkSize); + + nvcompBatchedDeflateOpts_t opts; + opts.algo = (int) priv->deflate_algo; + if (priv->batched) { + batch_size = (state->info.size + chunk_size - 1) / chunk_size; + + priv->batched_comp = + std::make_shared < BatchedComp < nvcompBatchedDeflateOpts_t, + nvcompBatchedDeflateCompressGetTempSize, + nvcompBatchedDeflateCompressGetMaxOutputChunkSize, + nvcompBatchedDeflateCompressAsync >> (opts); + mime_type = "video/x-nvcomp-deflate"; + } else { + priv->manager = std::make_shared < DeflateManager > (chunk_size, + opts, stream, device_id); + } + GST_DEBUG_OBJECT (self, "Using DEFLATE"); + break; + } + case GST_NV_COMP_ZSTD: + { 
+ if (chunk_size == 0) + chunk_size = 65536; + + chunk_size = MAX (32768, chunk_size); + chunk_size = GST_ROUND_UP_8 (chunk_size); + chunk_size = MIN (chunk_size, nvcompZstdCompressionMaxAllowedChunkSize); + + if (priv->batched) { + batch_size = (state->info.size + chunk_size - 1) / chunk_size; + + priv->batched_comp = + std::make_shared < BatchedComp < nvcompBatchedZstdOpts_t, + nvcompBatchedZstdCompressGetTempSize, + nvcompBatchedZstdCompressGetMaxOutputChunkSize, + nvcompBatchedZstdCompressAsync >> (nvcompBatchedZstdDefaultOpts); + mime_type = "video/x-nvcomp-zstd"; + } else { + priv->manager = std::make_shared < ZstdManager > (chunk_size, + nvcompBatchedZstdDefaultOpts, stream, device_id); + } + GST_DEBUG_OBJECT (self, "Using ZSTD"); + break; + } + case GST_NV_COMP_CASCADED: + { + if (chunk_size == 0) + chunk_size = 4096; + + chunk_size = MAX (512, chunk_size); + chunk_size = GST_ROUND_UP_8 (chunk_size); + chunk_size = MIN (chunk_size, 16384); + + nvcompBatchedCascadedOpts_t opts = nvcompBatchedCascadedDefaultOpts; + opts.chunk_size = chunk_size; + if (priv->data_type != GST_NV_COMP_DATA_TYPE_DEFAULT) + opts.type = (nvcompType_t) priv->data_type; + + if (priv->batched) { + batch_size = (state->info.size + chunk_size - 1) / chunk_size; + + priv->batched_comp = + std::make_shared < BatchedComp < nvcompBatchedCascadedOpts_t, + nvcompBatchedCascadedCompressGetTempSize, + nvcompBatchedCascadedCompressGetMaxOutputChunkSize, + nvcompBatchedCascadedCompressAsync >> (opts); + mime_type = "video/x-nvcomp-cascaded"; + } else { + priv->manager = std::make_shared < CascadedManager > (chunk_size, + opts, stream, device_id); + } + GST_DEBUG_OBJECT (self, "Using CASCADED"); + break; + } + case GST_NV_COMP_BITCOMP: + { + if (chunk_size == 0) + chunk_size = 65536; + + chunk_size = MAX (32768, chunk_size); + chunk_size = GST_ROUND_UP_8 (chunk_size); + chunk_size = MIN (chunk_size, + nvcompBitcompCompressionMaxAllowedChunkSize); + + nvcompBatchedBitcompFormatOpts opts = 
nvcompBatchedBitcompDefaultOpts; + opts.algorithm_type = (int) priv->bitcomp_algo; + if (priv->data_type != GST_NV_COMP_DATA_TYPE_DEFAULT) + opts.data_type = (nvcompType_t) priv->data_type; + + if (priv->batched) { + batch_size = (state->info.size + chunk_size - 1) / chunk_size; + + priv->batched_comp = + std::make_shared < BatchedComp < nvcompBatchedBitcompFormatOpts, + nvcompBatchedBitcompCompressGetTempSize, + nvcompBatchedBitcompCompressGetMaxOutputChunkSize, + nvcompBatchedBitcompCompressAsync >> (opts); + mime_type = "video/x-nvcomp-bitcomp"; + } else { + priv->manager = std::make_shared < BitcompManager > (chunk_size, + opts, stream, device_id); + } + GST_DEBUG_OBJECT (self, "Using BITCOMP"); + break; + } + case GST_NV_COMP_ANS: + { + if (chunk_size == 0) + chunk_size = 65536; + + chunk_size = MAX (32768, chunk_size); + chunk_size = GST_ROUND_UP_8 (chunk_size); + chunk_size = MIN (chunk_size, nvcompANSCompressionMaxAllowedChunkSize); + + if (priv->batched) { + batch_size = (state->info.size + chunk_size - 1) / chunk_size; + priv->batched_comp = + std::make_shared < BatchedComp < nvcompBatchedANSOpts_t, + nvcompBatchedANSCompressGetTempSize, + nvcompBatchedANSCompressGetMaxOutputChunkSize, + nvcompBatchedANSCompressAsync >> (nvcompBatchedANSDefaultOpts); + mime_type = "video/x-nvcomp-ans"; + } else { + priv->manager = std::make_shared < ANSManager > (chunk_size, + nvcompBatchedANSDefaultOpts, stream, device_id); + } + + GST_DEBUG_OBJECT (self, "Using ANS"); + break; + } + default: + g_assert_not_reached (); + return FALSE; + } + + size_t max_output_size = 0; + size_t max_output_chunk_size = 0; + size_t temp_size = 0; + if (priv->batched) { + auto status = priv->batched_comp->get_temp_size (batch_size, + chunk_size, &temp_size); + if (status != nvcompSuccess) { + GST_ERROR_OBJECT (self, "Couldn't get temp size"); + gst_cuda_context_pop (nullptr); + return FALSE; + } + + status = priv->batched_comp->get_max_compressed_chunk_size (chunk_size, + 
&max_output_chunk_size); + if (status != nvcompSuccess) { + GST_ERROR_OBJECT (self, "Couldn't get max output chunk size"); + gst_cuda_context_pop (nullptr); + return FALSE; + } + + max_output_chunk_size = GST_ROUND_UP_8 (max_output_chunk_size); + max_output_size = max_output_chunk_size * batch_size; + } else { + priv->config = std::make_shared < CompressionConfig > + (priv->manager->configure_compression (state->info.size)); + max_output_size = priv->config->max_compressed_buffer_size; + } + + GST_DEBUG_OBJECT (self, "Allocating resource, batched: %d" + ", uncompressed size: %" G_GSIZE_FORMAT + ", max-output-size: %" G_GSIZE_FORMAT + ", batch-size: %" G_GSIZE_FORMAT + ", chunk-size: %" G_GSIZE_FORMAT + ", max-output-chunk-size: %" G_GSIZE_FORMAT + ", temp-size: %" G_GSIZE_FORMAT, priv->batched, state->info.size, + max_output_size, batch_size, chunk_size, max_output_chunk_size, + temp_size); + + for (guint i = 0; i < priv->async_depth; i++) { + auto task = std::make_shared < EncoderTask > (); + task->ctx = (GstCudaContext *) gst_object_ref (priv->ctx); + + if (!gst_nv_comp_video_enc_alloc_task (self, task.get (), priv->batched, + state->info.size, max_output_size, batch_size, chunk_size, + max_output_chunk_size, temp_size)) { + priv->manager = nullptr; + priv->input_task_queue = { }; + task = nullptr; + gst_cuda_context_pop (nullptr); + return FALSE; + } + + priv->input_task_queue.push (task); + } + + /* In case of batched, custom header is added to signal chunk and batch size */ + if (priv->batched) { + /* version */ + max_output_size += sizeof (guint32); + + /* max uncompressed chunk size */ + max_output_size += sizeof (guint32); + + /* max compressed chunk size */ + max_output_size += sizeof (guint32); + + /* batch size */ + max_output_size += sizeof (guint32); + + /* each uncompressed/compressed chunk size */ + max_output_size += (sizeof (guint32) * batch_size * 2); + } + + priv->pool = gst_buffer_pool_new (); + auto config = gst_buffer_pool_get_config 
(priv->pool); + gst_buffer_pool_config_set_params (config, nullptr, max_output_size, 0, 0); + gst_buffer_pool_set_config (priv->pool, config); + gst_buffer_pool_set_active (priv->pool, TRUE); + + gst_cuda_context_pop (nullptr); + + auto caps = gst_caps_new_simple (mime_type.c_str (), "format", G_TYPE_STRING, + gst_video_format_to_string (GST_VIDEO_INFO_FORMAT (&state->info)), + nullptr); + auto out_state = + gst_video_encoder_set_output_state (GST_VIDEO_ENCODER (encoder), + caps, state); + gst_video_codec_state_unref (out_state); + + return TRUE; +} + +static gboolean +gst_nv_comp_video_enc_upload (GstNvCompVideoEnc * self, GstVideoFrame * frame, + CUstream stream, gboolean is_device_copy) +{ + auto priv = self->priv; + auto info = &priv->state->info; + auto finfo = info->finfo; + gint comp[GST_VIDEO_MAX_COMPONENTS]; + CUresult ret = CUDA_SUCCESS; + auto cur_task = priv->cur_task; + + for (guint i = 0; i < GST_VIDEO_FRAME_N_PLANES (frame); i++) { + guint8 *sp = (guint8 *) GST_VIDEO_FRAME_PLANE_DATA (frame, i); + guint8 *dp; + if (is_device_copy) + dp = cur_task->device_uncompressed + info->offset[i]; + else + dp = cur_task->host_uncompressed + info->offset[i]; + + guint ss, ds; + guint w, h; + + if (GST_VIDEO_FORMAT_INFO_HAS_PALETTE (finfo) && i == 1) { + if (is_device_copy) { + ret = CuMemcpyDtoDAsync ((CUdeviceptr) dp, (CUdeviceptr) sp, + 256 * 4, stream); + } else { + memcpy (dp, sp, 256 * 4); + } + + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "CUDA memcpy failed"); + return FALSE; + } + + return TRUE; + } + + ss = GST_VIDEO_FRAME_PLANE_STRIDE (frame, i); + ds = GST_VIDEO_INFO_PLANE_STRIDE (info, i); + + gst_video_format_info_component (finfo, i, comp); + + w = GST_VIDEO_INFO_COMP_WIDTH (info, comp[0]) * + GST_VIDEO_INFO_COMP_PSTRIDE (info, comp[0]); + if (w == 0) + w = MIN (ss, ds); + + h = GST_VIDEO_INFO_COMP_HEIGHT (info, comp[0]); + + if (GST_VIDEO_FORMAT_INFO_IS_TILED (finfo)) { + gint tile_size; + gint sx_tiles, sy_tiles, dx_tiles, dy_tiles; + 
GstVideoTileMode mode; + + tile_size = GST_VIDEO_FORMAT_INFO_TILE_SIZE (info->finfo, i); + + mode = GST_VIDEO_FORMAT_INFO_TILE_MODE (info->finfo); + + sx_tiles = GST_VIDEO_TILE_X_TILES (ss); + sy_tiles = GST_VIDEO_TILE_Y_TILES (ss); + + dx_tiles = GST_VIDEO_TILE_X_TILES (ds); + dy_tiles = GST_VIDEO_TILE_Y_TILES (ds); + + w = MIN (sx_tiles, dx_tiles); + h = MIN (sy_tiles, dy_tiles); + + for (guint j = 0; j < h; j++) { + for (guint k = 0; k < w; k++) { + guint si, di; + guint8 *cur_dp; + guint8 *cur_sp; + + si = gst_video_tile_get_index (mode, k, j, sx_tiles, sy_tiles); + di = gst_video_tile_get_index (mode, k, j, dx_tiles, dy_tiles); + + cur_dp = dp + (di * tile_size); + cur_sp = sp + (si * tile_size); + + if (is_device_copy) { + ret = CuMemcpyDtoDAsync ((CUdeviceptr) cur_dp, (CUdeviceptr) cur_sp, + w, stream); + } else { + memcpy (cur_dp, cur_sp, w); + } + + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "CUDA memcpy failed"); + return FALSE; + } + } + } + } else { + if (is_device_copy) { + CUDA_MEMCPY2D params = { }; + params.srcMemoryType = CU_MEMORYTYPE_DEVICE; + params.srcDevice = (CUdeviceptr) sp; + params.srcPitch = ss; + + params.dstMemoryType = CU_MEMORYTYPE_DEVICE; + params.dstDevice = (CUdeviceptr) dp; + params.dstPitch = ds; + + params.WidthInBytes = w; + params.Height = h; + + ret = CuMemcpy2DAsync (¶ms, stream); + + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "CUDA memcpy failed"); + return FALSE; + } + } else { + for (guint j = 0; j < h; j++) { + memcpy (dp, sp, w); + dp += ds; + sp += ss; + } + } + } + } + + return TRUE; +} + +#ifdef HAVE_GST_GL +struct GLInteropData +{ + GstNvCompVideoEnc *self = nullptr; + GstBuffer *buffer = nullptr; + gboolean ret = FALSE; +}; + +static GstCudaGraphicsResource * +ensure_gl_cuda_resource (GstNvCompVideoEnc * self, GstMemory * mem) +{ + auto priv = self->priv; + GstCudaGraphicsResource *resource; + GQuark quark; + + if (!gst_is_gl_memory_pbo (mem)) { + GST_WARNING_OBJECT (self, "memory is not 
GL PBO memory, %s", + mem->allocator->mem_type); + return nullptr; + } + + quark = gst_cuda_quark_from_id (GST_CUDA_QUARK_GRAPHICS_RESOURCE); + resource = (GstCudaGraphicsResource *) + gst_mini_object_get_qdata (GST_MINI_OBJECT (mem), quark); + + if (!resource) { + GstMapInfo map_info; + GstGLMemoryPBO *pbo = (GstGLMemoryPBO *) mem; + GstGLBuffer *gl_buf = pbo->pbo; + gboolean ret; + + if (!gst_memory_map (mem, &map_info, + (GstMapFlags) (GST_MAP_READ | GST_MAP_GL))) { + GST_ERROR_OBJECT (self, "Couldn't map gl memory"); + return nullptr; + } + + resource = gst_cuda_graphics_resource_new (priv->ctx, + GST_OBJECT (GST_GL_BASE_MEMORY_CAST (mem)->context), + GST_CUDA_GRAPHICS_RESOURCE_GL_BUFFER); + + GST_LOG_OBJECT (self, "registering gl buffer %d to CUDA", gl_buf->id); + ret = gst_cuda_graphics_resource_register_gl_buffer (resource, gl_buf->id, + CU_GRAPHICS_REGISTER_FLAGS_NONE); + gst_memory_unmap (mem, &map_info); + + if (!ret) { + GST_ERROR_OBJECT (self, "Couldn't register gl buffer %d", gl_buf->id); + gst_cuda_graphics_resource_free (resource); + return nullptr; + } + + gst_mini_object_set_qdata (GST_MINI_OBJECT (mem), quark, resource, + (GDestroyNotify) gst_cuda_graphics_resource_free); + } + + return resource; +} + +static void +gst_nv_comp_video_enc_upload_gl (GstGLContext * context, GLInteropData * data) +{ + auto self = data->self; + auto priv = self->priv; + auto info = &priv->state->info; + auto finfo = info->finfo; + GstCudaGraphicsResource *gst_res[GST_VIDEO_MAX_PLANES] = { nullptr, }; + CUgraphicsResource cuda_res[GST_VIDEO_MAX_PLANES] = { nullptr, }; + CUdeviceptr src_devptr[GST_VIDEO_MAX_PLANES] = { 0, }; + CUstream stream = gst_cuda_stream_get_handle (priv->stream); + CUresult ret; + gint comp[GST_VIDEO_MAX_COMPONENTS]; + auto cur_task = priv->cur_task; + + if (!gst_cuda_context_push (priv->ctx)) { + GST_ERROR_OBJECT (self, "Couldn't push context"); + return; + } + + for (guint i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) { + GstMemory *mem = 
gst_buffer_peek_memory (data->buffer, i); + GstGLMemoryPBO *pbo = (GstGLMemoryPBO *) mem; + gsize src_size; + + if (!gst_is_gl_memory_pbo (mem)) { + GST_ERROR_OBJECT (self, "Not a GL PBO memory"); + goto out; + } + + gst_res[i] = ensure_gl_cuda_resource (self, mem); + if (!gst_res[i]) { + GST_ERROR_OBJECT (self, "Couldn't get resource %d", i); + goto out; + } + + gst_gl_memory_pbo_upload_transfer (pbo); + gst_gl_memory_pbo_download_transfer (pbo); + + cuda_res[i] = gst_cuda_graphics_resource_map (gst_res[i], stream, + CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY); + if (!cuda_res[i]) { + GST_ERROR_OBJECT (self, "Couldn't map resource"); + goto out; + } + + ret = CuGraphicsResourceGetMappedPointer (&src_devptr[i], + &src_size, cuda_res[i]); + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "Couldn't get mapped device pointer"); + goto out; + } + } + + + for (guint i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) { + guint8 *sp = (guint8 *) src_devptr[i]; + guint8 *dp = cur_task->device_uncompressed + info->offset[i]; + guint ss, ds; + guint w, h; + + if (GST_VIDEO_FORMAT_INFO_HAS_PALETTE (finfo) && i == 1) { + ret = CuMemcpyDtoDAsync ((CUdeviceptr) dp, (CUdeviceptr) sp, + 256 * 4, stream); + + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "CUDA memcpy failed"); + goto out; + } + + data->ret = TRUE; + goto out; + } + + auto meta = gst_buffer_get_video_meta (data->buffer); + if (meta) + ss = meta->stride[i]; + else + ss = GST_VIDEO_INFO_PLANE_STRIDE (info, i); + + ds = GST_VIDEO_INFO_PLANE_STRIDE (info, i); + + gst_video_format_info_component (finfo, i, comp); + + w = GST_VIDEO_INFO_COMP_WIDTH (info, comp[0]) * + GST_VIDEO_INFO_COMP_PSTRIDE (info, comp[0]); + if (w == 0) + w = MIN (ss, ds); + + h = GST_VIDEO_INFO_COMP_HEIGHT (info, comp[0]); + + if (GST_VIDEO_FORMAT_INFO_IS_TILED (finfo)) { + gint tile_size; + gint sx_tiles, sy_tiles, dx_tiles, dy_tiles; + GstVideoTileMode mode; + + tile_size = GST_VIDEO_FORMAT_INFO_TILE_SIZE (info->finfo, i); + + mode = 
GST_VIDEO_FORMAT_INFO_TILE_MODE (info->finfo); + + sx_tiles = GST_VIDEO_TILE_X_TILES (ss); + sy_tiles = GST_VIDEO_TILE_Y_TILES (ss); + + dx_tiles = GST_VIDEO_TILE_X_TILES (ds); + dy_tiles = GST_VIDEO_TILE_Y_TILES (ds); + + w = MIN (sx_tiles, dx_tiles); + h = MIN (sy_tiles, dy_tiles); + + for (guint j = 0; j < h; j++) { + for (guint k = 0; k < w; k++) { + guint si, di; + guint8 *cur_dp; + guint8 *cur_sp; + + si = gst_video_tile_get_index (mode, k, j, sx_tiles, sy_tiles); + di = gst_video_tile_get_index (mode, k, j, dx_tiles, dy_tiles); + + cur_dp = dp + (di * tile_size); + cur_sp = sp + (si * tile_size); + + ret = CuMemcpyDtoDAsync ((CUdeviceptr) cur_dp, (CUdeviceptr) cur_sp, + w, stream); + + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "CUDA memcpy failed"); + goto out; + } + } + } + } else { + CUDA_MEMCPY2D params = { }; + params.srcMemoryType = CU_MEMORYTYPE_DEVICE; + params.srcDevice = (CUdeviceptr) sp; + params.srcPitch = ss; + + params.dstMemoryType = CU_MEMORYTYPE_DEVICE; + params.dstDevice = (CUdeviceptr) dp; + params.dstPitch = ds; + + params.WidthInBytes = w; + params.Height = h; + + ret = CuMemcpy2DAsync (&params, stream); + + if (!gst_cuda_result (ret)) { + GST_ERROR_OBJECT (self, "CUDA memcpy failed"); + goto out; + } + } + } + + data->ret = TRUE; + +out: + for (guint i = 0; i < gst_buffer_n_memory (data->buffer); i++) { + if (!gst_res[i]) + break; + + gst_cuda_graphics_resource_unmap (gst_res[i], stream); + } + + CuStreamSynchronize (stream); + gst_cuda_context_pop (nullptr); +} +#endif + +static gpointer +gst_nv_comp_video_enc_thread_func (GstNvCompVideoEnc * self) +{ + auto encoder = GST_VIDEO_ENCODER (self); + auto priv = self->priv; + + GST_DEBUG_OBJECT (self, "Entering loop"); + + while (1) { + std::shared_ptr < EncoderTask > task; + + { + std::unique_lock < std::mutex > lk (priv->output_lock); + while (priv->output_task_queue.empty ()) + priv->output_cond.wait (lk); + + task = priv->output_task_queue.front (); + 
priv->output_task_queue.pop (); + } + + if (!task) { + GST_DEBUG_OBJECT (self, "Got empty task, terminate"); + break; + } + + auto frame = gst_video_encoder_get_oldest_frame (encoder); + + gst_cuda_context_push (priv->ctx); + CuEventSynchronize (task->event); + gst_cuda_context_pop (nullptr); + + gst_buffer_pool_acquire_buffer (priv->pool, &frame->output_buffer, nullptr); + GstMapInfo map_info; + gst_buffer_map (frame->output_buffer, &map_info, GST_MAP_WRITE); + if (task->batched) { + task->compressed_size = 0; + auto dst = (uint8_t *) map_info.data; + + /* Write custom header */ + GST_WRITE_UINT32_LE (dst, GST_NV_COMP_HEADER_VERSION); + dst += sizeof (guint32); + task->compressed_size += sizeof (guint32); + + GST_WRITE_UINT32_LE (dst, task->chunk_size); + dst += sizeof (guint32); + task->compressed_size += sizeof (guint32); + + GST_WRITE_UINT32_LE (dst, task->max_output_chunk_size); + dst += sizeof (guint32); + task->compressed_size += sizeof (guint32); + + GST_WRITE_UINT32_LE (dst, task->batch_size); + dst += sizeof (guint32); + task->compressed_size += sizeof (guint32); + + for (size_t i = 0; i < task->batch_size; i++) { + GST_WRITE_UINT32_LE (dst, task->host_uncompressed_bytes[i]); + dst += sizeof (guint32); + task->compressed_size += sizeof (guint32); + + GST_WRITE_UINT32_LE (dst, task->host_compressed_bytes[i]); + dst += sizeof (guint32); + task->compressed_size += sizeof (guint32); + } + + /* Write compressed data */ + for (size_t i = 0; i < task->batch_size; i++) { + auto size = task->host_compressed_bytes[i]; + auto src = task->host_compressed + (i * task->max_output_chunk_size); + memcpy (dst, src, size); + dst += size; + task->compressed_size += size; + } + } else { + memcpy (map_info.data, task->host_compressed, task->compressed_size); + } + gst_buffer_unmap (frame->output_buffer, &map_info); + + if (task->compressed_size > 0) { + gst_buffer_set_size (frame->output_buffer, task->compressed_size); + frame->dts = frame->pts; + + auto ratio = (double) 
priv->state->info.size / task->compressed_size; + GST_LOG_OBJECT (self, "compressed buffer size %" G_GSIZE_FORMAT + ", ratio %.2f", task->compressed_size, ratio); + } else { + GST_ERROR_OBJECT (self, "Zero compressed size"); + gst_clear_buffer (&frame->output_buffer); + } + + { + std::lock_guard < std::mutex > lk (priv->input_lock); + priv->input_task_queue.push (task); + priv->input_cond.notify_all (); + } + + priv->last_flow = gst_video_encoder_finish_frame (encoder, frame); + }; + + GST_DEBUG_OBJECT (self, "Leaving loop"); + + return nullptr; +} + +static GstFlowReturn +gst_nv_comp_video_enc_handle_frame (GstVideoEncoder * encoder, + GstVideoCodecFrame * frame) +{ + auto self = GST_NV_COMP_VIDEO_ENC (encoder); + auto priv = self->priv; + GstMemory *mem; + CUstream stream = nullptr; + GstVideoFrame vframe; + auto info = &priv->state->info; + size_t compressed_size = 0; + gboolean need_copy = TRUE; + std::shared_ptr < EncoderTask > task; + + if (!priv->ctx || (!priv->manager && !priv->batched_comp)) { + GST_ERROR_OBJECT (self, "Context was not configured"); + goto error; + } + + if (priv->last_flow != GST_FLOW_OK) { + GST_INFO_OBJECT (self, "Last flow was %s", + gst_flow_get_name (priv->last_flow)); + gst_video_encoder_finish_frame (encoder, frame); + return priv->last_flow; + } + + if (!priv->encode_thread) { + priv->encode_thread = g_thread_new ("nvcompvideoenc", + (GThreadFunc) gst_nv_comp_video_enc_thread_func, self); + } + + GST_VIDEO_ENCODER_STREAM_UNLOCK (encoder); + { + std::unique_lock < std::mutex > lk (priv->input_lock); + while (priv->input_task_queue.empty ()) + priv->input_cond.wait (lk); + + priv->cur_task = priv->input_task_queue.front (); + priv->input_task_queue.pop (); + } + GST_VIDEO_ENCODER_STREAM_LOCK (encoder); + + mem = gst_buffer_peek_memory (frame->input_buffer, 0); +#ifdef HAVE_GST_GL + if (priv->gl_interop && gst_is_gl_memory (mem) && + gst_buffer_n_memory (frame->input_buffer) == + GST_VIDEO_INFO_N_PLANES (info)) { + GLInteropData 
interop_data; + interop_data.self = self; + interop_data.buffer = frame->input_buffer; + interop_data.ret = FALSE; + + auto gl_mem = (GstGLMemory *) mem; + gst_gl_context_thread_add (gl_mem->mem.context, + (GstGLContextThreadFunc) gst_nv_comp_video_enc_upload_gl, + &interop_data); + if (interop_data.ret) { + need_copy = FALSE; + GST_TRACE_OBJECT (self, "GL -> CUDA copy done"); + } else { + priv->gl_interop = FALSE; + } + } +#endif + + if (!gst_cuda_context_push (priv->ctx)) { + GST_ERROR_OBJECT (self, "Couldn't push context"); + std::lock_guard < std::mutex > lk (priv->input_lock); + priv->input_task_queue.push (std::move (priv->cur_task)); + goto error; + } + + stream = gst_cuda_stream_get_handle (priv->stream); + + if (need_copy) { + gboolean device_copy = FALSE; + if (gst_is_cuda_memory (mem)) { + GstCudaMemory *cmem = GST_CUDA_MEMORY_CAST (mem); + if (cmem->context == priv->ctx) { + device_copy = TRUE; + if (!gst_video_frame_map (&vframe, info, frame->input_buffer, + (GstMapFlags) (GST_MAP_READ | GST_MAP_CUDA))) { + GST_ERROR_OBJECT (self, "Couldn't map cuda memory"); + gst_cuda_context_pop (nullptr); + std::lock_guard < std::mutex > lk (priv->input_lock); + priv->input_task_queue.push (std::move (priv->cur_task)); + goto error; + } + + if (gst_cuda_memory_get_stream (cmem) != priv->stream) { + GST_DEBUG_OBJECT (self, "Different stream, need sync"); + gst_cuda_memory_sync (cmem); + } + } + } + + if (!device_copy && !gst_video_frame_map (&vframe, + info, frame->input_buffer, GST_MAP_READ)) { + GST_ERROR_OBJECT (self, "Couldn't map input frame"); + gst_cuda_context_pop (nullptr); + std::lock_guard < std::mutex > lk (priv->input_lock); + priv->input_task_queue.push (std::move (priv->cur_task)); + goto error; + } + + if (!gst_nv_comp_video_enc_upload (self, &vframe, stream, device_copy)) { + gst_video_frame_unmap (&vframe); + gst_cuda_context_pop (nullptr); + std::lock_guard < std::mutex > lk (priv->input_lock); + priv->input_task_queue.push (std::move 
(priv->cur_task)); + goto error; + } + + gst_video_frame_unmap (&vframe); + + if (!device_copy) { + CuMemcpyHtoDAsync ((CUdeviceptr) priv->cur_task->device_uncompressed, + priv->cur_task->host_uncompressed, info->size, stream); + } + } + + task = std::move (priv->cur_task); + if (task->batched) { + g_assert (priv->batched_comp); + + auto status = priv->batched_comp->compress (task->device_uncompressed_ptrs, + task->device_uncompressed_bytes, task->chunk_size, task->batch_size, + task->temp_ptr, task->temp_size, task->device_compressed_ptrs, + task->device_compressed_bytes, (cudaStream_t) stream); + if (status != nvcompSuccess) { + GST_ERROR_OBJECT (self, "Compression failed, ret %d", status); + gst_cuda_context_pop (nullptr); + std::lock_guard < std::mutex > lk (priv->input_lock); + priv->input_task_queue.push (std::move (task)); + goto error; + } + + CuMemcpyDtoHAsync (task->host_compressed_bytes, + (CUdeviceptr) task->device_compressed_bytes, + sizeof (size_t) * task->batch_size, stream); + CuMemcpyDtoHAsync (task->host_compressed, + (CUdeviceptr) task->device_compressed, + task->compressed_alloc_size, stream); + } else { + g_assert (priv->manager); + + priv->manager->compress (task->device_uncompressed, + task->device_compressed, *priv->config); + + compressed_size = + priv->manager->get_compressed_output_size (task->device_compressed); + + task->compressed_size = compressed_size; + CuMemcpyDtoHAsync (task->host_compressed, + (CUdeviceptr) task->device_compressed, compressed_size, stream); + } + + CuEventRecord (task->event, stream); + gst_cuda_context_pop (nullptr); + + { + std::lock_guard < std::mutex > lk (priv->output_lock); + priv->output_task_queue.push (std::move (task)); + priv->output_cond.notify_one (); + } + + gst_video_codec_frame_unref (frame); + + return priv->last_flow; + +error: + gst_video_encoder_finish_frame (encoder, frame); + + return GST_FLOW_ERROR; +} diff --git a/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcompvideoenc.h 
b/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcompvideoenc.h new file mode 100644 index 0000000000..d7ef55743e --- /dev/null +++ b/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcompvideoenc.h @@ -0,0 +1,32 @@ +/* GStreamer + * Copyright (C) 2024 Seungha Yang + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. 
+ */ + +#pragma once + +#include <gst/gst.h> +#include <gst/video/video.h> +#include "gstnvcomp.h" + +G_BEGIN_DECLS + +#define GST_TYPE_NV_COMP_VIDEO_ENC (gst_nv_comp_video_enc_get_type()) +G_DECLARE_FINAL_TYPE (GstNvCompVideoEnc, gst_nv_comp_video_enc, + GST, NV_COMP_VIDEO_ENC, GstVideoEncoder) + +G_END_DECLS \ No newline at end of file diff --git a/subprojects/gst-plugins-bad/ext/nvcomp/meson.build b/subprojects/gst-plugins-bad/ext/nvcomp/meson.build new file mode 100644 index 0000000000..ffb40867b2 --- /dev/null +++ b/subprojects/gst-plugins-bad/ext/nvcomp/meson.build @@ -0,0 +1,70 @@ +nvcomp_sources = [ + 'gstnvcomp.cpp', + 'gstnvcompvideodec.cpp', + 'gstnvcompvideoenc.cpp', + 'plugin.cpp', +] +extra_args = ['-DGST_USE_UNSTABLE_API'] + +nvcomp_opt = get_option('nvcomp') +if nvcomp_opt.disabled() or host_system not in ['windows', 'linux'] + subdir_done() +endif + +nvcomp_sdk_path = get_option('nvcomp-sdk-path') +if nvcomp_sdk_path == '' + nvcomp_sdk_path = run_command(python3, '-c', 'import os; print(os.environ.get("NVCOMP_SDK_PATH"))', check: false).stdout().strip() +endif + +if nvcomp_sdk_path == '' or nvcomp_sdk_path == 'None' + if nvcomp_opt.enabled() + error('nvcomp-sdk-path option must be specified for nvCOMP plugin') + endif + subdir_done() +endif + +if not gstcuda_dep.found() + if nvcomp_opt.enabled() + error('nvCOMP plugin was enabled explicitly, but required gstcuda was not found') + endif + subdir_done() +endif + +nvcomp_inc_dirs = [include_directories(join_paths(nvcomp_sdk_path, 'include'), './stub'), + cuda_stubinc] + +nvcomp_lib_path = join_paths(nvcomp_sdk_path, 'lib') +nvcomp_lib = cc.find_library('nvcomp', + dirs: nvcomp_lib_path, required: nvcomp_opt) +if not nvcomp_lib.found() + subdir_done() +endif + +nvcomp_bitcomp_lib = cc.find_library('nvcomp_bitcomp', + dirs: nvcomp_lib_path, required: nvcomp_opt) +if not nvcomp_bitcomp_lib.found() + subdir_done() +endif + +nvcomp_gdeflate_lib = cc.find_library('nvcomp_gdeflate', + dirs: nvcomp_lib_path, required: nvcomp_opt) +if not 
nvcomp_gdeflate_lib.found() + subdir_done() +endif + +if gstgl_dep.found() + extra_args += ['-DHAVE_GST_GL'] +endif + +gstnvcomp = library('gstnvcomp', + nvcomp_sources, + c_args : gst_plugins_bad_args + extra_args, + cpp_args : gst_plugins_bad_args + extra_args, + include_directories : [configinc] + nvcomp_inc_dirs, + dependencies : [gstbase_dep, gstvideo_dep, gstcuda_dep, gstgl_dep, nvcomp_lib, + nvcomp_bitcomp_lib, nvcomp_gdeflate_lib], + override_options : ['cpp_std=c++17'], + install : true, + install_dir : plugins_install_dir, +) +plugins += [gstnvcomp] diff --git a/subprojects/gst-plugins-bad/ext/nvcomp/plugin.cpp b/subprojects/gst-plugins-bad/ext/nvcomp/plugin.cpp new file mode 100644 index 0000000000..375a587fa3 --- /dev/null +++ b/subprojects/gst-plugins-bad/ext/nvcomp/plugin.cpp @@ -0,0 +1,47 @@ +/* GStreamer + * Copyright (C) 2024 Seungha Yang + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this library; if not, write to the + * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, + * Boston, MA 02110-1301, USA. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <gst/cuda/gstcuda.h> +#include "gstnvcomp.h" +#include "gstnvcompvideodec.h" +#include "gstnvcompvideoenc.h" + +static gboolean +plugin_init (GstPlugin * plugin) +{ + if (!gst_cuda_load_library ()) { + gst_plugin_add_status_warning (plugin, "CUDA library was not found."); + return TRUE; + } + + gst_element_register (plugin, + "nvcompvideodec", GST_RANK_NONE, GST_TYPE_NV_COMP_VIDEO_DEC); + gst_element_register (plugin, + "nvcompvideoenc", GST_RANK_NONE, GST_TYPE_NV_COMP_VIDEO_ENC); + + return TRUE; +} + +GST_PLUGIN_DEFINE (GST_VERSION_MAJOR, GST_VERSION_MINOR, nvcomp, + "GStreamer nvCOMP plugin", plugin_init, VERSION, "LGPL", + GST_PACKAGE_NAME, GST_PACKAGE_ORIGIN) diff --git a/subprojects/gst-plugins-bad/ext/nvcomp/stub/cuda_runtime.h b/subprojects/gst-plugins-bad/ext/nvcomp/stub/cuda_runtime.h new file mode 100644 index 0000000000..f6df2d0bf9 --- /dev/null +++ b/subprojects/gst-plugins-bad/ext/nvcomp/stub/cuda_runtime.h @@ -0,0 +1,9 @@ +#pragma once + +#include <glib.h> + +G_BEGIN_DECLS + +typedef struct CUstream_st* cudaStream_t; + +G_END_DECLS diff --git a/subprojects/gst-plugins-bad/meson_options.txt b/subprojects/gst-plugins-bad/meson_options.txt index 5752b83fee..fb3db29119 100644 --- a/subprojects/gst-plugins-bad/meson_options.txt +++ b/subprojects/gst-plugins-bad/meson_options.txt @@ -143,6 +143,7 @@ option('mplex', type : 'feature', value : 'auto', description : 'mplex audio/vid option('msdk', type : 'feature', value : 'auto', description : 'Intel Media SDK video encoder/decoder plugin') option('musepack', type : 'feature', value : 'auto', description : 'libmpcdec Musepack decoder plugin') option('neon', type : 'feature', value : 'auto', description : 'NEON HTTP source plugin') +option('nvcomp', type : 'feature', value : 'auto', description : 'NVIDIA nvCOMP compression/decompression plugin') option('nvcodec', type : 'feature', value : 'auto', description : 'NVIDIA GPU codec plugin') option('onnx', type : 'feature', 
value : 'auto', description : 'ONNX neural network plugin') option('openal', type : 'feature', value : 'auto', description : 'OpenAL plugin') @@ -241,6 +242,10 @@ option('sctp-internal-usrsctp', type: 'feature', value : 'enabled', option('mfx_api', type : 'combo', choices : ['MSDK', 'oneVPL', 'auto'], value : 'auto', description : 'Select MFX API to build against') +# nvCOMP plugin options +option('nvcomp-sdk-path', type: 'string', value : '', + description : 'nvCOMP SDK root directory') + # QSV plugin options option('mfx-modules-dir', type: 'string', value : '', description : 'libmfx runtime module dir, linux only')