gstreamer/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcompvideodec.cpp
Seungha Yang 0ed9c39835 nvcomp: Add nvCOMP library based GPU lossless compression plugin
Adding NVIDIA nvCOMP library based plugin for lossless raw video
compression/decompression. To build this plugin, user should
install nvCOMP SDK first and specify the SDK path via
"nvcomp-sdk-path" build option or NVCOMP_SDK_PATH env.

Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/6912>
2024-06-13 18:19:08 +00:00

1739 lines
50 KiB
C++

/* GStreamer
* Copyright (C) 2024 Seungha Yang <seungha@centricular.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301, USA.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "gstnvcompvideodec.h"
#ifdef HAVE_GST_GL
#include <gst/gl/gl.h>
#include <gst/gl/gstglfuncs.h>
#endif
#include <nvcomp/nvcompManagerFactory.hpp>
#include <nvcomp/ans.h>
#include <nvcomp/bitcomp.h>
#include <nvcomp/cascaded.h>
#include <nvcomp/deflate.h>
#include <nvcomp/gdeflate.h>
#include <nvcomp/lz4.h>
#include <nvcomp/snappy.h>
#include <nvcomp/zstd.h>
#include <memory>
#include <string>
#include <string.h>
#include <vector>
/* Debug category used by the GST_*_OBJECT logging calls in this file */
GST_DEBUG_CATEGORY_STATIC (gst_nv_comp_video_dec_debug);
#define GST_CAT_DEFAULT gst_nv_comp_video_dec_debug
/* Source pad caps: raw video in CUDA memory, in GL memory (only when built
 * with HAVE_GST_GL), or without any memory caps feature */
#ifdef HAVE_GST_GL
#define SRC_CAPS \
GST_VIDEO_CAPS_MAKE_WITH_FEATURES (GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY, \
GST_VIDEO_FORMATS_ALL) ";" \
GST_VIDEO_CAPS_MAKE_WITH_FEATURES (GST_CAPS_FEATURE_MEMORY_GL_MEMORY, \
GST_VIDEO_FORMATS_ALL) ";" \
GST_VIDEO_CAPS_MAKE (GST_VIDEO_FORMATS_ALL)
#else
#define SRC_CAPS \
GST_VIDEO_CAPS_MAKE_WITH_FEATURES (GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY, \
GST_VIDEO_FORMATS_ALL) ";" \
GST_VIDEO_CAPS_MAKE (GST_VIDEO_FORMATS_ALL)
#endif
/* Sink pad template: one caps name per compressed stream type accepted by
 * this element */
static GstStaticPadTemplate sink_template =
GST_STATIC_PAD_TEMPLATE ("sink", GST_PAD_SINK, GST_PAD_ALWAYS,
GST_STATIC_CAPS ("video/x-nvcomp; video/x-nvcomp-lz4; "
"video/x-nvcomp-snappy; video/x-nvcomp-gdeflate; "
"video/x-nvcomp-deflate; video/x-nvcomp-zstd; video/x-nvcomp-cascaded; "
"video/x-nvcomp-bitcomp; video/x-nvcomp-ans"));
/* Source pad template built from SRC_CAPS above */
static GstStaticPadTemplate src_template =
GST_STATIC_PAD_TEMPLATE ("src", GST_PAD_SRC, GST_PAD_ALWAYS,
GST_STATIC_CAPS (SRC_CAPS));
/* *INDENT-OFF* */
using namespace nvcomp;
struct DecoderTask
{
~DecoderTask ()
{
if (ctx) {
gst_cuda_context_push (ctx);
clear_resource ();
gst_cuda_context_pop (nullptr);
gst_object_unref (ctx);
}
}
void clear_resource ()
{
if (!ctx)
return;
if (device_compressed)
CuMemFree ((CUdeviceptr) device_compressed);
device_compressed = nullptr;
if (host_compressed)
CuMemFreeHost (host_compressed);
host_compressed = nullptr;
if (device_compressed_bytes)
CuMemFree ((CUdeviceptr) device_compressed_bytes);
device_compressed_bytes = nullptr;
if (device_compressed_ptrs)
CuMemFree ((CUdeviceptr) device_compressed_ptrs);
device_compressed_ptrs = nullptr;
if (host_compressed_bytes)
CuMemFreeHost (host_compressed_bytes);
host_compressed_bytes = nullptr;
if (host_compressed_ptrs)
CuMemFreeHost (host_compressed_ptrs);
host_compressed_ptrs = nullptr;
if (device_uncompressed)
CuMemFree ((CUdeviceptr) device_uncompressed);
device_uncompressed = nullptr;
if (device_uncompressed_temp)
CuMemFree ((CUdeviceptr) device_uncompressed_temp);
device_uncompressed_temp = nullptr;
if (host_uncompressed)
CuMemFreeHost (host_uncompressed);
host_uncompressed = nullptr;
if (device_uncompressed_bytes)
CuMemFree ((CUdeviceptr) device_uncompressed_bytes);
device_uncompressed_bytes = nullptr;
if (device_uncompressed_ptrs)
CuMemFree ((CUdeviceptr) device_uncompressed_ptrs);
device_uncompressed_ptrs = nullptr;
if (host_uncompressed_bytes)
CuMemFreeHost (host_uncompressed_bytes);
host_uncompressed_bytes = nullptr;
if (host_uncompressed_ptrs)
CuMemFreeHost (host_uncompressed_ptrs);
host_uncompressed_ptrs = nullptr;
if (device_actual_uncompressed_bytes)
CuMemFree ((CUdeviceptr) device_actual_uncompressed_bytes);
device_actual_uncompressed_bytes = nullptr;
if (temp_ptr)
CuMemFree ((CUdeviceptr) temp_ptr);
temp_ptr = nullptr;
if (device_statuses)
CuMemFree ((CUdeviceptr) device_statuses);
device_statuses = nullptr;
batch_size = 0;
max_compressed_chunk_size = 0;
max_uncompressed_chunk_size = 0;
}
bool allocate_batched (size_t num_chunks,
size_t compressed_chunk_size,
size_t uncompressed_chunk_size, size_t temp_bytes)
{
size_t compressed_alloc;
size_t uncompressed_alloc;
size_t alloc_size = num_chunks * sizeof (size_t);
uint8_t *src;
compressed_chunk_size = GST_ROUND_UP_8 (compressed_chunk_size);
uncompressed_chunk_size = GST_ROUND_UP_8 (uncompressed_chunk_size);
compressed_alloc = num_chunks * compressed_chunk_size;
uncompressed_alloc = num_chunks * uncompressed_chunk_size;
auto ret = CuMemAlloc ((CUdeviceptr *) &device_compressed,
compressed_alloc);
if (!gst_cuda_result (ret))
return false;
ret = CuMemAllocHost ((void **) &host_compressed, compressed_alloc);
if (!gst_cuda_result (ret))
return false;
ret = CuMemAlloc ((CUdeviceptr *) &device_compressed_bytes,
alloc_size);
if (!gst_cuda_result (ret))
return false;
ret = CuMemAlloc ((CUdeviceptr *) &device_compressed_ptrs,
alloc_size);
if (!gst_cuda_result (ret))
return false;
ret = CuMemAllocHost ((void **) &host_compressed_bytes,
alloc_size);
if (!gst_cuda_result (ret))
return false;
ret = CuMemAllocHost ((void **) &host_compressed_ptrs,
alloc_size);
if (!gst_cuda_result (ret))
return false;
src = device_compressed;
for (size_t i = 0; i < num_chunks; i++) {
host_compressed_ptrs[i] = src;
src += compressed_chunk_size;
}
ret = CuMemcpyHtoD ((CUdeviceptr) device_compressed_ptrs,
host_compressed_ptrs, alloc_size);
if (!gst_cuda_result (ret))
return false;
ret = CuMemAlloc ((CUdeviceptr *) &device_uncompressed_temp,
uncompressed_alloc);
if (!gst_cuda_result (ret))
return false;
ret = CuMemAlloc ((CUdeviceptr *) &device_uncompressed,
uncompressed_alloc);
if (!gst_cuda_result (ret))
return false;
ret = CuMemAllocHost ((void **) &host_uncompressed, uncompressed_alloc);
if (!gst_cuda_result (ret))
return false;
ret = CuMemAlloc ((CUdeviceptr *) &device_uncompressed_bytes,
alloc_size);
if (!gst_cuda_result (ret))
return false;
ret = CuMemAlloc ((CUdeviceptr *) &device_uncompressed_ptrs,
alloc_size);
if (!gst_cuda_result (ret))
return false;
ret = CuMemAllocHost ((void **) &host_uncompressed_bytes,
alloc_size);
if (!gst_cuda_result (ret))
return false;
ret = CuMemAllocHost ((void **) &host_uncompressed_ptrs,
alloc_size);
if (!gst_cuda_result (ret))
return false;
src = device_uncompressed_temp;
for (size_t i = 0; i < num_chunks; i++) {
host_uncompressed_bytes[i] = uncompressed_chunk_size;
host_uncompressed_ptrs[i] = src;
src += uncompressed_chunk_size;
}
ret = CuMemcpyHtoD ((CUdeviceptr) device_uncompressed_bytes,
host_uncompressed_bytes, alloc_size);
if (!gst_cuda_result (ret))
return false;
ret = CuMemcpyHtoD ((CUdeviceptr) device_uncompressed_ptrs,
host_uncompressed_ptrs, alloc_size);
if (!gst_cuda_result (ret))
return false;
ret = CuMemAlloc ((CUdeviceptr *) &device_actual_uncompressed_bytes,
alloc_size);
if (!gst_cuda_result (ret))
return false;
if (temp_bytes > 0) {
ret = CuMemAlloc ((CUdeviceptr *) &temp_ptr, temp_bytes);
if (!gst_cuda_result (ret))
return false;
}
ret = CuMemAlloc ((CUdeviceptr *) &device_statuses,
sizeof (nvcompStatus_t) * num_chunks);
if (!gst_cuda_result (ret))
return false;
batched = TRUE;
batch_size = num_chunks;
temp_size = temp_bytes;
max_compressed_chunk_size = compressed_chunk_size;
max_uncompressed_chunk_size = uncompressed_chunk_size;
compressed_alloc_size = compressed_alloc;
uncompressed_alloc_size = uncompressed_alloc;
return true;
}
GstCudaContext *ctx = nullptr;
uint8_t *device_compressed = nullptr;
uint8_t *host_compressed = nullptr;
size_t *device_compressed_bytes = nullptr;
void **device_compressed_ptrs = nullptr;
size_t *host_compressed_bytes = nullptr;
void **host_compressed_ptrs = nullptr;
uint8_t *device_uncompressed = nullptr;
uint8_t *device_uncompressed_temp = nullptr;
uint8_t *host_uncompressed = nullptr;
size_t *device_uncompressed_bytes = nullptr;
void **device_uncompressed_ptrs = nullptr;
size_t *host_uncompressed_bytes = nullptr;
void **host_uncompressed_ptrs = nullptr;
size_t *device_actual_uncompressed_bytes = nullptr;
void *temp_ptr = nullptr;
size_t temp_size = 0;
nvcompStatus_t *device_statuses = nullptr;
gboolean batched = FALSE;
size_t batch_size = 0;
size_t max_uncompressed_chunk_size = 0;
size_t max_compressed_chunk_size = 0;
size_t uncompressed_alloc_size = 0;
size_t compressed_alloc_size = 0;
};
/**
 * BatchedDecompBase:
 *
 * Abstract interface over one pair of batched decompression entry points:
 * a temp-size query and the decompress call itself.
 */
struct BatchedDecompBase
{
  /* Polymorphic base: make destruction through a base pointer well-defined */
  virtual ~BatchedDecompBase () = default;

  /* Writes into @temp_bytes the scratch size needed for @num_chunks chunks
   * of at most @max_uncompressed_chunk_bytes bytes each */
  virtual nvcompStatus_t get_temp_size(
      size_t num_chunks,
      size_t max_uncompressed_chunk_bytes,
      size_t * temp_bytes) = 0;

  /* Runs the batched decompression on @stream. The arguments are forwarded
   * unchanged to the underlying decompress function. */
  virtual nvcompStatus_t decompress(
      void **device_compressed_ptrs,
      size_t *device_compressed_bytes,
      size_t *device_uncompressed_bytes,
      size_t *device_actual_uncompressed_bytes,
      size_t batch_size,
      void *device_temp_ptr,
      size_t temp_bytes,
      void **device_uncompressed_ptrs,
      nvcompStatus_t *device_statuses,
      cudaStream_t stream) = 0;
};
/**
 * BatchedDecomp:
 *
 * Binds one temp-size function and one decompress function, both given as
 * template arguments, to the BatchedDecompBase interface. Each method simply
 * forwards its arguments to the bound function.
 */
template <auto GetTempSizeFunc, auto DecompressFunc>
class BatchedDecomp : public BatchedDecompBase
{
public:
  BatchedDecomp () = default;

  nvcompStatus_t get_temp_size(
      size_t num_chunks,
      size_t max_uncompressed_chunk_bytes,
      size_t * temp_bytes) override
  {
    return GetTempSizeFunc (num_chunks, max_uncompressed_chunk_bytes,
        temp_bytes);
  }

  nvcompStatus_t decompress(
      void **device_compressed_ptrs,
      size_t *device_compressed_bytes,
      size_t *device_uncompressed_bytes,
      size_t *device_actual_uncompressed_bytes,
      size_t batch_size,
      void *device_temp_ptr,
      size_t temp_bytes,
      void **device_uncompressed_ptrs,
      nvcompStatus_t *device_statuses,
      cudaStream_t stream) override
  {
    return DecompressFunc (device_compressed_ptrs,
        device_compressed_bytes,
        device_uncompressed_bytes,
        device_actual_uncompressed_bytes,
        batch_size,
        device_temp_ptr,
        temp_bytes,
        device_uncompressed_ptrs,
        device_statuses,
        stream);
  }
};
/**
 * GstNvCompVideoDecPrivate:
 *
 * C++ side state of the element. It is created with new in the instance
 * init function and deleted in finalize.
 */
struct GstNvCompVideoDecPrivate
{
  GstNvCompVideoDecPrivate ()
  {
    gst_video_info_init (&info);
  }

  /* CUDA context and stream used for allocations and copies */
  GstCudaContext *ctx = nullptr;
  GstCudaStream *stream = nullptr;

#ifdef HAVE_GST_GL
  GstGLDisplay *gl_display = nullptr;
  GstGLContext *gl_context = nullptr;
  GstGLContext *other_gl_context = nullptr;
#endif

  /* Input state, referenced in set_format */
  GstVideoCodecState *state = nullptr;

  std::shared_ptr<nvcompManagerBase> manager;
  std::shared_ptr<BatchedDecompBase> batched_decomp;
  std::shared_ptr<DecoderTask> task;

  /* TRUE once GL memory output has been negotiated */
  gboolean gl_interop = FALSE;

  /* Raw video layout derived from the sink caps */
  GstVideoInfo info;

  gboolean batched = FALSE;

  /* Value-initialized so it never holds an indeterminate value */
  GstNvCompMethod method { };
};
/* *INDENT-ON* */
/* Instance structure: the GstVideoDecoder parent followed by a pointer to the
 * C++ private data allocated in the instance init function */
struct _GstNvCompVideoDec
{
GstVideoDecoder parent;
GstNvCompVideoDecPrivate *priv;
};
/* Forward declarations of the GObject / GstElement / GstVideoDecoder virtual
 * method implementations installed in class_init */
static void gst_nv_comp_video_dec_finalize (GObject * object);
static void gst_nv_comp_video_dec_set_context (GstElement * element,
GstContext * context);
static gboolean gst_nv_comp_video_dec_open (GstVideoDecoder * decoder);
static gboolean gst_nv_comp_video_dec_close (GstVideoDecoder * decoder);
static gboolean gst_nv_comp_video_dec_sink_query (GstVideoDecoder * decoder,
GstQuery * query);
static gboolean gst_nv_comp_video_dec_src_query (GstVideoDecoder * decoder,
GstQuery * query);
static gboolean
gst_nv_comp_video_dec_decide_allocation (GstVideoDecoder * decoder,
GstQuery * query);
static gboolean gst_nv_comp_video_dec_set_format (GstVideoDecoder * decoder,
GstVideoCodecState * state);
static gboolean gst_nv_comp_video_dec_negotiate (GstVideoDecoder * decoder);
static GstFlowReturn
gst_nv_comp_video_dec_handle_frame (GstVideoDecoder * decoder,
GstVideoCodecFrame * frame);
/* Registers the type as a GstVideoDecoder subclass and defines parent_class */
#define gst_nv_comp_video_dec_parent_class parent_class
G_DEFINE_TYPE (GstNvCompVideoDec,
gst_nv_comp_video_dec, GST_TYPE_VIDEO_DECODER);
/* Class initializer: installs pad templates, element metadata, the virtual
 * method table and the debug category */
static void
gst_nv_comp_video_dec_class_init (GstNvCompVideoDecClass * klass)
{
  GObjectClass *gobject_klass = G_OBJECT_CLASS (klass);
  GstElementClass *elem_klass = GST_ELEMENT_CLASS (klass);
  GstVideoDecoderClass *viddec_klass = GST_VIDEO_DECODER_CLASS (klass);

  /* GObject */
  gobject_klass->finalize = gst_nv_comp_video_dec_finalize;

  /* GstElement */
  gst_element_class_add_static_pad_template (elem_klass, &sink_template);
  gst_element_class_add_static_pad_template (elem_klass, &src_template);
  gst_element_class_set_static_metadata (elem_klass,
      "nvCOMP Video Decoder", "Decoder/Video/Hardware",
      "Decompress a video stream using nvCOMP library",
      "Seungha Yang <seungha@centricular.com>");
  elem_klass->set_context =
      GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_set_context);

  /* GstVideoDecoder */
  viddec_klass->open = GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_open);
  viddec_klass->close = GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_close);
  viddec_klass->sink_query =
      GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_sink_query);
  viddec_klass->src_query =
      GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_src_query);
  viddec_klass->decide_allocation =
      GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_decide_allocation);
  viddec_klass->set_format =
      GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_set_format);
  viddec_klass->negotiate =
      GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_negotiate);
  viddec_klass->handle_frame =
      GST_DEBUG_FUNCPTR (gst_nv_comp_video_dec_handle_frame);

  GST_DEBUG_CATEGORY_INIT (gst_nv_comp_video_dec_debug,
      "nvcompvideodec", 0, "nvcompvideodec");
}
/* Instance initializer: allocates the C++ private data */
static void
gst_nv_comp_video_dec_init (GstNvCompVideoDec * self)
{
  auto priv_data = new GstNvCompVideoDecPrivate ();

  self->priv = priv_data;
}
/* GObject finalize: deletes the private data, then chains up */
static void
gst_nv_comp_video_dec_finalize (GObject * object)
{
  GstNvCompVideoDec *dec = GST_NV_COMP_VIDEO_DEC (object);

  delete dec->priv;

  G_OBJECT_CLASS (parent_class)->finalize (object);
}
/* GstElement::set_context: offers @context to the CUDA helper and, in GL
 * builds, to the GL helper, before chaining up to the parent class */
static void
gst_nv_comp_video_dec_set_context (GstElement * element, GstContext * context)
{
  GstNvCompVideoDecPrivate *priv = GST_NV_COMP_VIDEO_DEC (element)->priv;

  gst_cuda_handle_set_context (element, context, -1, &priv->ctx);

#ifdef HAVE_GST_GL
  gboolean gl_handled = gst_gl_handle_set_context (element, context,
      &priv->gl_display, &priv->other_gl_context);

  /* Restrict the display's GL API once a display is known */
  if (gl_handled && priv->gl_display)
    gst_gl_display_filter_gl_api (priv->gl_display, GST_GL_API_OPENGL3);
#endif

  GST_ELEMENT_CLASS (parent_class)->set_context (element, context);
}
/* GstVideoDecoder::open: makes sure a CUDA context is available and creates
 * a CUDA stream on it. Returns FALSE when no context could be obtained. */
static gboolean
gst_nv_comp_video_dec_open (GstVideoDecoder * decoder)
{
  GstNvCompVideoDec *dec = GST_NV_COMP_VIDEO_DEC (decoder);
  GstNvCompVideoDecPrivate *priv = dec->priv;

  gboolean have_ctx = gst_cuda_ensure_element_context (GST_ELEMENT_CAST (decoder),
      -1, &priv->ctx);

  if (have_ctx) {
    priv->stream = gst_cuda_stream_new (priv->ctx);
    return TRUE;
  }

  GST_ERROR_OBJECT (dec, "Couldn't get cuda context");
  return FALSE;
}
/* GstVideoDecoder::close: releases everything acquired by open, set_format
 * and the GL helpers. Always returns TRUE. */
static gboolean
gst_nv_comp_video_dec_close (GstVideoDecoder * decoder)
{
  auto self = GST_NV_COMP_VIDEO_DEC (decoder);
  auto priv = self->priv;

  if (priv->ctx) {
    /* These objects are dropped with the context pushed */
    gst_cuda_context_push (priv->ctx);
    priv->manager = nullptr;
    priv->task = nullptr;
    gst_cuda_context_pop (nullptr);
  }

  priv->batched_decomp = nullptr;

  /* Drop the input state reference taken in set_format */
  g_clear_pointer (&priv->state, gst_video_codec_state_unref);

  gst_clear_cuda_stream (&priv->stream);
  gst_clear_object (&priv->ctx);

#ifdef HAVE_GST_GL
  gst_clear_object (&priv->other_gl_context);
  gst_clear_object (&priv->gl_context);
  /* Previously gl_context was cleared twice and the display was never
   * released */
  gst_clear_object (&priv->gl_display);
#endif

  return TRUE;
}
/* Answers a context query: in GL builds the GL helper is tried first with
 * temporary references to the GL objects, then the CUDA helper.
 * Returns TRUE when either helper handled the query. */
static gboolean
gst_nv_comp_video_dec_handle_context_query (GstNvCompVideoDec * self,
    GstQuery * query)
{
  GstNvCompVideoDecPrivate *priv = self->priv;

#ifdef HAVE_GST_GL
  {
    /* Hold our own references for the duration of the call */
    GstGLDisplay *display_ref = priv->gl_display ?
        (GstGLDisplay *) gst_object_ref (priv->gl_display) : nullptr;
    GstGLContext *local_ref = priv->gl_context ?
        (GstGLContext *) gst_object_ref (priv->gl_context) : nullptr;
    GstGLContext *other_ref = priv->other_gl_context ?
        (GstGLContext *) gst_object_ref (priv->other_gl_context) : nullptr;

    gboolean gl_handled = gst_gl_handle_context_query (GST_ELEMENT (self),
        query, display_ref, local_ref, other_ref);

    gst_clear_object (&display_ref);
    gst_clear_object (&other_ref);
    gst_clear_object (&local_ref);

    if (gl_handled)
      return TRUE;
  }
#endif

  gboolean cuda_handled =
      gst_cuda_handle_context_query (GST_ELEMENT (self), query, priv->ctx);

  return cuda_handled ? TRUE : FALSE;
}
/* GstVideoDecoder::sink_query: context queries are tried locally first;
 * everything not handled here goes to the parent class */
static gboolean
gst_nv_comp_video_dec_sink_query (GstVideoDecoder * decoder, GstQuery * query)
{
  GstNvCompVideoDec *dec = GST_NV_COMP_VIDEO_DEC (decoder);

  if (GST_QUERY_TYPE (query) == GST_QUERY_CONTEXT &&
      gst_nv_comp_video_dec_handle_context_query (dec, query)) {
    return TRUE;
  }

  return GST_VIDEO_DECODER_CLASS (parent_class)->sink_query (decoder, query);
}
/* GstVideoDecoder::src_query: context queries are tried locally first;
 * everything not handled here goes to the parent class */
static gboolean
gst_nv_comp_video_dec_src_query (GstVideoDecoder * decoder, GstQuery * query)
{
  GstNvCompVideoDec *dec = GST_NV_COMP_VIDEO_DEC (decoder);

  if (GST_QUERY_TYPE (query) == GST_QUERY_CONTEXT &&
      gst_nv_comp_video_dec_handle_context_query (dec, query)) {
    return TRUE;
  }

  return GST_VIDEO_DECODER_CLASS (parent_class)->src_query (decoder, query);
}
#ifdef HAVE_GST_GL
/* Callback passed to gst_gl_context_thread_add(): stores TRUE in @ret when
 * CuGLGetDevices() succeeds and reports at least one device, FALSE
 * otherwise */
static void
check_cuda_device_from_gl_context (GstGLContext * context, gboolean * ret)
{
  CUdevice devices[1] = { 0, };
  guint num_devices = 0;

  *ret = FALSE;

  CUresult status = CuGLGetDevices (&num_devices,
      devices, 1, CU_GL_DEVICE_LIST_ALL);

  if (gst_cuda_result (status) && num_devices != 0)
    *ret = TRUE;
}
/* Obtains a GL display and context for the element, then checks on the GL
 * thread whether a CUDA device is reported for that context.
 * Returns TRUE only when all three steps succeed. */
static gboolean
gst_nv_comp_video_dec_ensure_gl_context (GstNvCompVideoDec * self)
{
  GstNvCompVideoDecPrivate *priv = self->priv;
  gboolean cuda_capable = FALSE;

  gboolean have_display = gst_gl_ensure_element_data (GST_ELEMENT (self),
      &priv->gl_display, &priv->other_gl_context);
  if (!have_display) {
    GST_DEBUG_OBJECT (self, "Couldn't get GL display");
    return FALSE;
  }

  gst_gl_display_filter_gl_api (priv->gl_display, GST_GL_API_OPENGL3);

  gboolean have_context = gst_gl_display_ensure_context (priv->gl_display,
      priv->other_gl_context, &priv->gl_context, nullptr);
  if (!have_context) {
    GST_DEBUG_OBJECT (self, "Couldn't get GL context");
    return FALSE;
  }

  /* The callback writes its verdict into cuda_capable */
  gst_gl_context_thread_add (priv->gl_context,
      (GstGLContextThreadFunc) check_cuda_device_from_gl_context,
      &cuda_capable);

  return cuda_capable;
}
#endif
/* GstVideoDecoder::decide_allocation: picks the buffer pool for output
 * buffers. A CUDA pool is used when the negotiated caps carry the CUDA memory
 * feature, a GL pool when they carry the GL memory feature and GL interop was
 * negotiated, and a plain video buffer pool otherwise. The chosen pool is
 * configured and stored back into @query.
 *
 * Returns: FALSE when the query has no usable caps or the pool rejects the
 * configuration, TRUE otherwise */
static gboolean
gst_nv_comp_video_dec_decide_allocation (GstVideoDecoder * decoder,
GstQuery * query)
{
auto self = GST_NV_COMP_VIDEO_DEC (decoder);
auto priv = self->priv;
GstBufferPool *pool = nullptr;
guint size;
guint min = 0;
guint max = 0;
GstCaps *caps;
gst_query_parse_allocation (query, &caps, nullptr);
if (!caps) {
GST_WARNING_OBJECT (self, "null caps in query");
return FALSE;
}
GstVideoInfo info;
if (!gst_video_info_from_caps (&info, caps)) {
GST_WARNING_OBJECT (self, "Failed to convert caps into info");
return FALSE;
}
/* Remember whether the query already had a pool entry so that it is
 * replaced at the end instead of a second one being appended */
gboolean update_pool = FALSE;
if (gst_query_get_n_allocation_pools (query) > 0) {
gst_query_parse_nth_allocation_pool (query, 0, &pool, &size, &min, &max);
update_pool = TRUE;
}
auto features = gst_caps_get_features (caps, 0);
gboolean use_cuda_pool = FALSE;
if (gst_caps_features_contains (features,
GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY)) {
GST_DEBUG_OBJECT (self, "Downstream support CUDA memory");
/* A proposed pool is only kept when it is a CUDA pool bound to our own
 * context */
if (pool) {
if (!GST_IS_CUDA_BUFFER_POOL (pool)) {
gst_clear_object (&pool);
} else {
auto cuda_pool = GST_CUDA_BUFFER_POOL (pool);
if (cuda_pool->context != priv->ctx)
gst_clear_object (&pool);
}
}
if (!pool)
pool = gst_cuda_buffer_pool_new (priv->ctx);
use_cuda_pool = TRUE;
}
#ifdef HAVE_GST_GL
else if (gst_caps_features_contains (features,
GST_CAPS_FEATURE_MEMORY_GL_MEMORY) && priv->gl_interop) {
GST_DEBUG_OBJECT (self, "Downstream support GL memory");
/* GL interop is switched off when no usable GL context is available */
if (!gst_nv_comp_video_dec_ensure_gl_context (self)) {
priv->gl_interop = FALSE;
} else {
if (pool && !GST_IS_GL_BUFFER_POOL (pool))
gst_clear_object (&pool);
if (!pool)
pool = gst_gl_buffer_pool_new (priv->gl_context);
}
}
#endif
/* No pool chosen above: use a plain video buffer pool */
if (!pool)
pool = gst_video_buffer_pool_new ();
auto config = gst_buffer_pool_get_config (pool);
size = GST_VIDEO_INFO_SIZE (&info);
gst_buffer_pool_config_set_params (config, caps, size, 0, 0);
if (use_cuda_pool && priv->stream) {
/* Set our stream on buffer pool config so that CUstream can be shared */
gst_buffer_pool_config_set_cuda_stream (config, priv->stream);
}
if (!gst_buffer_pool_set_config (pool, config)) {
GST_WARNING_OBJECT (self, "Failed to set pool config");
gst_object_unref (pool);
return FALSE;
}
/* Read the size back from the pool's configuration after set_config */
config = gst_buffer_pool_get_config (pool);
gst_buffer_pool_config_get_params (config, nullptr, &size, nullptr, nullptr);
gst_structure_free (config);
if (update_pool)
gst_query_set_nth_allocation_pool (query, 0, pool, size, min, max);
else
gst_query_add_allocation_pool (query, pool, size, min, max);
gst_object_unref (pool);
return TRUE;
}
/* Allocates the device and pinned-host buffers of @task for non-batched
 * operation, each @size bytes large. Nothing is allocated when @batched is
 * set. Returns FALSE as soon as one allocation fails. */
static gboolean
gst_nv_comp_video_dec_alloc_task (GstNvCompVideoDec * self,
    DecoderTask * task, gboolean batched, gsize size)
{
  if (batched)
    return TRUE;

  /* Uncompressed output buffers */
  task->uncompressed_alloc_size = size;

  if (!gst_cuda_result (CuMemAlloc ((CUdeviceptr *) & task->device_uncompressed,
              size))) {
    return FALSE;
  }

  if (!gst_cuda_result (CuMemAllocHost ((void **) &task->host_uncompressed,
              size))) {
    return FALSE;
  }

  /* Compressed input buffers, same size */
  task->compressed_alloc_size = size;

  if (!gst_cuda_result (CuMemAlloc ((CUdeviceptr *) & task->device_compressed,
              size))) {
    return FALSE;
  }

  if (!gst_cuda_result (CuMemAllocHost ((void **) &task->host_compressed,
              size))) {
    return FALSE;
  }

  return TRUE;
}
static gboolean
gst_nv_comp_video_dec_set_format (GstVideoDecoder * decoder,
GstVideoCodecState * state)
{
auto self = GST_NV_COMP_VIDEO_DEC (decoder);
auto priv = self->priv;
if (!priv->ctx) {
GST_ERROR_OBJECT (self, "CUDA context was not configured");
return FALSE;
}
GST_DEBUG_OBJECT (self, "Set format with caps %" GST_PTR_FORMAT, state->caps);
g_clear_pointer (&priv->state, gst_video_codec_state_unref);
priv->state = gst_video_codec_state_ref (state);
auto s = gst_caps_get_structure (state->caps, 0);
std::string mime_type = gst_structure_get_name (s);
auto format_str = gst_structure_get_string (s, "format");
if (!format_str) {
GST_ERROR_OBJECT (self, "Unknown video format");
return FALSE;
}
GstVideoFormat format = gst_video_format_from_string (format_str);
if (format == GST_VIDEO_FORMAT_UNKNOWN || format == GST_VIDEO_FORMAT_ENCODED) {
GST_ERROR_OBJECT (self, "Invalid format string %s", format_str);
return FALSE;
}
s = gst_structure_copy (s);
gst_structure_set_name (s, "video/x-raw");
auto video_caps = gst_caps_new_empty ();
gst_caps_append_structure (video_caps, s);
auto ret = gst_video_info_from_caps (&priv->info, video_caps);
gst_caps_unref (video_caps);
if (!ret) {
GST_ERROR_OBJECT (self, "Couldn't build output caps");
return FALSE;
}
if (!gst_cuda_context_push (priv->ctx)) {
GST_ERROR_OBJECT (self, "Couldn't push context");
return FALSE;
}
priv->manager = nullptr;
priv->batched_decomp = nullptr;
priv->task = nullptr;
priv->batched = TRUE;
if (mime_type == "video/x-nvcomp") {
priv->batched = FALSE;
} else if (mime_type == "video/x-nvcomp-lz4") {
priv->batched_decomp = std::make_shared < BatchedDecomp <
nvcompBatchedLZ4DecompressGetTempSize,
nvcompBatchedLZ4DecompressAsync >> ();
} else if (mime_type == "video/x-nvcomp-snappy") {
priv->batched_decomp = std::make_shared < BatchedDecomp <
nvcompBatchedSnappyDecompressGetTempSize,
nvcompBatchedSnappyDecompressAsync >> ();
} else if (mime_type == "video/x-nvcomp-gdeflate") {
priv->batched_decomp = std::make_shared < BatchedDecomp <
nvcompBatchedGdeflateDecompressGetTempSize,
nvcompBatchedGdeflateDecompressAsync >> ();
} else if (mime_type == "video/x-nvcomp-deflate") {
priv->batched_decomp = std::make_shared < BatchedDecomp <
nvcompBatchedDeflateDecompressGetTempSize,
nvcompBatchedDeflateDecompressAsync >> ();
} else if (mime_type == "video/x-nvcomp-zstd") {
priv->batched_decomp = std::make_shared < BatchedDecomp <
nvcompBatchedZstdDecompressGetTempSize,
nvcompBatchedZstdDecompressAsync >> ();
} else if (mime_type == "video/x-nvcomp-cascaded") {
priv->batched_decomp = std::make_shared < BatchedDecomp <
nvcompBatchedCascadedDecompressGetTempSize,
nvcompBatchedCascadedDecompressAsync >> ();
} else if (mime_type == "video/x-nvcomp-bitcomp") {
priv->batched_decomp = std::make_shared < BatchedDecomp <
nvcompBatchedBitcompDecompressGetTempSize,
nvcompBatchedBitcompDecompressAsync >> ();
} else if (mime_type == "video/x-nvcomp-ans") {
priv->batched_decomp = std::make_shared < BatchedDecomp <
nvcompBatchedANSDecompressGetTempSize,
nvcompBatchedANSDecompressAsync >> ();
} else {
gst_cuda_context_pop (nullptr);
g_assert_not_reached ();
return FALSE;
}
auto task = std::make_shared < DecoderTask > ();
task->ctx = (GstCudaContext *) gst_object_ref (priv->ctx);
if (!gst_nv_comp_video_dec_alloc_task (self, task.get (), priv->batched,
priv->info.size)) {
task = nullptr;
gst_cuda_context_pop (nullptr);
return FALSE;
}
priv->task = task;
gst_cuda_context_pop (nullptr);
return gst_video_decoder_negotiate (decoder);
}
/* Returns TRUE when @format is in the list of formats for which this element
 * offers CUDA memory output */
static gboolean
is_supported_cuda_format (GstVideoFormat format)
{
  static const GstVideoFormat cuda_formats[] = {
    GST_VIDEO_FORMAT_I420,
    GST_VIDEO_FORMAT_YV12,
    GST_VIDEO_FORMAT_NV12,
    GST_VIDEO_FORMAT_NV21,
    GST_VIDEO_FORMAT_P010_10LE,
    GST_VIDEO_FORMAT_P012_LE,
    GST_VIDEO_FORMAT_P016_LE,
    GST_VIDEO_FORMAT_I420_10LE,
    GST_VIDEO_FORMAT_I420_12LE,
    GST_VIDEO_FORMAT_Y444,
    GST_VIDEO_FORMAT_Y444_10LE,
    GST_VIDEO_FORMAT_Y444_12LE,
    GST_VIDEO_FORMAT_Y444_16LE,
    GST_VIDEO_FORMAT_BGRA,
    GST_VIDEO_FORMAT_RGBA,
    GST_VIDEO_FORMAT_RGBx,
    GST_VIDEO_FORMAT_BGRx,
    GST_VIDEO_FORMAT_ARGB,
    GST_VIDEO_FORMAT_ABGR,
    GST_VIDEO_FORMAT_RGB,
    GST_VIDEO_FORMAT_BGR,
    GST_VIDEO_FORMAT_BGR10A2_LE,
    GST_VIDEO_FORMAT_RGB10A2_LE,
    GST_VIDEO_FORMAT_Y42B,
    GST_VIDEO_FORMAT_I422_10LE,
    GST_VIDEO_FORMAT_I422_12LE,
    GST_VIDEO_FORMAT_YUY2,
    GST_VIDEO_FORMAT_UYVY,
    GST_VIDEO_FORMAT_RGBP,
    GST_VIDEO_FORMAT_BGRP,
    GST_VIDEO_FORMAT_GBR,
    GST_VIDEO_FORMAT_GBR_10LE,
    GST_VIDEO_FORMAT_GBR_12LE,
    GST_VIDEO_FORMAT_GBR_16LE,
    GST_VIDEO_FORMAT_GBRA,
    GST_VIDEO_FORMAT_VUYA,
  };

  for (GstVideoFormat candidate : cuda_formats) {
    if (candidate == format)
      return TRUE;
  }

  return FALSE;
}
#ifdef HAVE_GST_GL
/* Returns TRUE when raw video caps carrying @format are a subset of the caps
 * built from GST_GL_COLOR_CONVERT_FORMATS */
static gboolean
is_supported_gl_format (GstVideoFormat format)
{
  GstCaps *supported = gst_caps_from_string ("video/x-raw, format = (string) "
      GST_GL_COLOR_CONVERT_FORMATS);

  GstCaps *wanted = gst_caps_new_empty_simple ("video/x-raw");
  const gchar *format_name = gst_video_format_to_string (format);
  gst_caps_set_simple (wanted, "format", G_TYPE_STRING, format_name, nullptr);

  gboolean is_subset = gst_caps_is_subset (wanted, supported);

  gst_caps_unref (supported);
  gst_caps_unref (wanted);

  return is_subset;
}
#endif
/* GstVideoDecoder::negotiate: inspects the caps allowed by the peer to decide
 * whether output goes to CUDA memory, GL memory (GL builds only) or system
 * memory, configures the output state accordingly and chains up.
 *
 * Returns: FALSE when the output state cannot be set, otherwise the result
 * of the parent class negotiate() */
static gboolean
gst_nv_comp_video_dec_negotiate (GstVideoDecoder * decoder)
{
  auto self = GST_NV_COMP_VIDEO_DEC (decoder);
  auto priv = self->priv;
  gboolean is_cuda = FALSE;
#ifdef HAVE_GST_GL
  gboolean is_gl = FALSE;
#endif

  auto peer_caps = gst_pad_get_allowed_caps (decoder->srcpad);
  GST_DEBUG_OBJECT (self, "Allowed caps %" GST_PTR_FORMAT, peer_caps);

  if (!peer_caps || gst_caps_is_any (peer_caps)) {
    GST_DEBUG_OBJECT (self,
        "cannot determine output format, use system memory");
  } else {
    guint size = gst_caps_get_size (peer_caps);

    /* Collect which memory features appear anywhere in the allowed caps */
    for (guint i = 0; i < size; i++) {
      GstCapsFeatures *features = gst_caps_get_features (peer_caps, i);
      if (!features)
        continue;

      if (gst_caps_features_contains (features,
              GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY)) {
        is_cuda = TRUE;
      }
#ifdef HAVE_GST_GL
      if (gst_caps_features_contains (features,
              GST_CAPS_FEATURE_MEMORY_GL_MEMORY)) {
        is_gl = TRUE;
      }
#endif
    }
  }
  gst_clear_caps (&peer_caps);

  auto state = gst_video_decoder_set_interlaced_output_state (decoder,
      GST_VIDEO_INFO_FORMAT (&priv->info),
      GST_VIDEO_INFO_INTERLACE_MODE (&priv->info), priv->info.width,
      priv->info.height, priv->state);
  if (!state) {
    GST_ERROR_OBJECT (self, "Couldn't set output state");
    return FALSE;
  }

  priv->gl_interop = FALSE;
  state->caps = gst_video_info_to_caps (&state->info);

  /* CUDA memory is preferred over GL memory when both are possible */
  auto format = GST_VIDEO_INFO_FORMAT (&priv->info);
  if (is_cuda && is_supported_cuda_format (format)) {
    gst_caps_set_features_simple (state->caps,
        gst_caps_features_new (GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY, nullptr));
  }
#ifdef HAVE_GST_GL
  else if (is_gl && is_supported_gl_format (format)) {
    gst_caps_set_features_simple (state->caps,
        gst_caps_features_new (GST_CAPS_FEATURE_MEMORY_GL_MEMORY, nullptr));
    priv->gl_interop = TRUE;
  }
#endif

  auto ret = GST_VIDEO_DECODER_CLASS (parent_class)->negotiate (decoder);

  /* The output state is returned with a reference owned by us; without this
   * unref one state object leaked on every negotiation */
  gst_video_codec_state_unref (state);

  return ret;
}
/* Copies the decompressed image held by the current task into @frame, plane
 * by plane.
 *
 * @frame: destination frame, already mapped by the caller
 * @stream: CUDA stream used for the device copies
 * @is_device_copy: when TRUE the source is task->device_uncompressed and the
 *   copies are CUDA device-to-device copies queued on @stream; when FALSE the
 *   source is task->host_uncompressed and plain memcpy() is used
 *
 * The source layout (offsets and strides) comes from priv->info, the
 * destination strides from @frame.
 *
 * Returns: FALSE when a CUDA copy fails, TRUE otherwise */
static gboolean
gst_nv_comp_video_dec_download (GstNvCompVideoDec * self, GstVideoFrame * frame,
    CUstream stream, gboolean is_device_copy)
{
  auto priv = self->priv;
  auto info = &priv->info;
  auto finfo = info->finfo;
  gint comp[GST_VIDEO_MAX_COMPONENTS];
  CUresult ret = CUDA_SUCCESS;
  auto task = priv->task;

  for (guint i = 0; i < GST_VIDEO_FRAME_N_PLANES (frame); i++) {
    guint8 *sp;
    if (is_device_copy)
      sp = task->device_uncompressed + info->offset[i];
    else
      sp = task->host_uncompressed + info->offset[i];

    guint8 *dp = (guint8 *) GST_VIDEO_FRAME_PLANE_DATA (frame, i);
    guint ss, ds;
    guint w, h;

    /* Plane 1 of a paletted format is the 256-entry, 4 bytes per entry
     * palette; it is copied as one block and ends the copy */
    if (GST_VIDEO_FORMAT_INFO_HAS_PALETTE (finfo) && i == 1) {
      if (is_device_copy) {
        ret = CuMemcpyDtoDAsync ((CUdeviceptr) dp, (CUdeviceptr) sp,
            256 * 4, stream);
      } else {
        memcpy (dp, sp, 256 * 4);
      }

      if (!gst_cuda_result (ret)) {
        GST_ERROR_OBJECT (self, "CUDA memcpy failed");
        return FALSE;
      }

      return TRUE;
    }

    ds = GST_VIDEO_FRAME_PLANE_STRIDE (frame, i);
    ss = GST_VIDEO_INFO_PLANE_STRIDE (info, i);

    gst_video_format_info_component (finfo, i, comp);
    w = GST_VIDEO_INFO_COMP_WIDTH (info, comp[0]) *
        GST_VIDEO_INFO_COMP_PSTRIDE (info, comp[0]);
    /* Formats without a meaningful pixel stride: copy whole rows */
    if (w == 0)
      w = MIN (ss, ds);

    h = GST_VIDEO_INFO_COMP_HEIGHT (info, comp[0]);

    if (GST_VIDEO_FORMAT_INFO_IS_TILED (finfo)) {
      gint tile_size;
      gint sx_tiles, sy_tiles, dx_tiles, dy_tiles;
      GstVideoTileMode mode;

      tile_size = GST_VIDEO_FORMAT_INFO_TILE_SIZE (info->finfo, i);
      mode = GST_VIDEO_FORMAT_INFO_TILE_MODE (info->finfo);

      /* For tiled formats the stride encodes the tile counts */
      sx_tiles = GST_VIDEO_TILE_X_TILES (ss);
      sy_tiles = GST_VIDEO_TILE_Y_TILES (ss);
      dx_tiles = GST_VIDEO_TILE_X_TILES (ds);
      dy_tiles = GST_VIDEO_TILE_Y_TILES (ds);

      /* From here on w and h count tiles, not bytes or rows */
      w = MIN (sx_tiles, dx_tiles);
      h = MIN (sy_tiles, dy_tiles);

      for (guint j = 0; j < h; j++) {
        for (guint k = 0; k < w; k++) {
          guint si, di;
          guint8 *cur_dp;
          guint8 *cur_sp;

          si = gst_video_tile_get_index (mode, k, j, sx_tiles, sy_tiles);
          di = gst_video_tile_get_index (mode, k, j, dx_tiles, dy_tiles);

          cur_dp = dp + (di * tile_size);
          cur_sp = sp + (si * tile_size);

          /* Copy one complete tile. The previous code passed the tile
           * column count (w) as the byte count, so tiles were only
           * partially copied. */
          if (is_device_copy) {
            ret = CuMemcpyDtoDAsync ((CUdeviceptr) cur_dp,
                (CUdeviceptr) cur_sp, tile_size, stream);
          } else {
            memcpy (cur_dp, cur_sp, tile_size);
          }

          if (!gst_cuda_result (ret)) {
            GST_ERROR_OBJECT (self, "CUDA memcpy failed");
            return FALSE;
          }
        }
      }
    } else {
      if (is_device_copy) {
        /* Pitched 2D copy handles differing source/destination strides */
        CUDA_MEMCPY2D params = { };

        params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
        params.srcDevice = (CUdeviceptr) sp;
        params.srcPitch = ss;

        params.dstMemoryType = CU_MEMORYTYPE_DEVICE;
        params.dstDevice = (CUdeviceptr) dp;
        params.dstPitch = ds;

        params.WidthInBytes = w;
        params.Height = h;

        ret = CuMemcpy2DAsync (&params, stream);
        if (!gst_cuda_result (ret)) {
          GST_ERROR_OBJECT (self, "CUDA memcpy failed");
          return FALSE;
        }
      } else {
        /* Row by row on the host, advancing by each side's own stride */
        for (guint j = 0; j < h; j++) {
          memcpy (dp, sp, w);
          dp += ds;
          sp += ss;
        }
      }
    }
  }

  return TRUE;
}
#ifdef HAVE_GST_GL
/* Argument bundle for the GL download callback, which receives a pointer to
 * it, reads self and buffer, and sets ret to TRUE on success */
struct GLInteropData
{
GstNvCompVideoDec *self = nullptr;
GstBuffer *buffer = nullptr;
gboolean ret = FALSE;
};
/* Returns the CUDA graphics resource attached to the GL PBO memory @mem,
 * creating and registering it on first use. The resource is stored as qdata
 * on @mem, with gst_cuda_graphics_resource_free() as its destroy notify.
 *
 * Returns: the resource, or nullptr when @mem is not PBO memory, cannot be
 * mapped, or cannot be registered */
static GstCudaGraphicsResource *
ensure_gl_cuda_resource (GstNvCompVideoDec * self, GstMemory * mem)
{
  GstNvCompVideoDecPrivate *priv = self->priv;

  if (!gst_is_gl_memory_pbo (mem)) {
    GST_WARNING_OBJECT (self, "memory is not GL PBO memory, %s",
        mem->allocator->mem_type);
    return nullptr;
  }

  GQuark res_quark = gst_cuda_quark_from_id (GST_CUDA_QUARK_GRAPHICS_RESOURCE);

  /* Fast path: already registered on an earlier call */
  auto cached = (GstCudaGraphicsResource *)
      gst_mini_object_get_qdata (GST_MINI_OBJECT (mem), res_quark);
  if (cached)
    return cached;

  GstGLBuffer *pbo_buf = ((GstGLMemoryPBO *) mem)->pbo;
  GstMapInfo gl_map;

  /* The memory stays mapped for GL while the buffer is registered */
  if (!gst_memory_map (mem, &gl_map,
          (GstMapFlags) (GST_MAP_READ | GST_MAP_GL))) {
    GST_ERROR_OBJECT (self, "Couldn't map gl memory");
    return nullptr;
  }

  GstCudaGraphicsResource *created =
      gst_cuda_graphics_resource_new (priv->ctx,
      GST_OBJECT (GST_GL_BASE_MEMORY_CAST (mem)->context),
      GST_CUDA_GRAPHICS_RESOURCE_GL_BUFFER);

  GST_LOG_OBJECT (self, "registering gl buffer %d to CUDA", pbo_buf->id);
  gboolean registered =
      gst_cuda_graphics_resource_register_gl_buffer (created, pbo_buf->id,
      CU_GRAPHICS_REGISTER_FLAGS_NONE);

  gst_memory_unmap (mem, &gl_map);

  if (!registered) {
    GST_ERROR_OBJECT (self, "Couldn't register gl buffer %d", pbo_buf->id);
    gst_cuda_graphics_resource_free (created);
    return nullptr;
  }

  gst_mini_object_set_qdata (GST_MINI_OBJECT (mem), res_quark, created,
      (GDestroyNotify) gst_cuda_graphics_resource_free);

  return created;
}
/* GL thread callback: copies the decoded image held in
 * task->device_uncompressed into the GL PBO memories of data->buffer using
 * CUDA/GL interop.
 *
 * @context: GL context the callback runs on (unused)
 * @data: in/out argument; data->ret is set to TRUE only when every plane copy
 *   was queued successfully and the stream synchronization succeeded.
 *
 * Each plane's PBO is mapped into CUDA, filled with a device-to-device copy
 * and flagged as needing a PBO -> texture upload. All mapped resources are
 * unmapped and the stream is synchronized before returning. */
static void
gst_nv_comp_video_dec_download_gl (GstGLContext * context, GLInteropData * data)
{
  auto self = data->self;
  auto priv = self->priv;
  auto info = &priv->info;
  auto finfo = info->finfo;
  GstCudaGraphicsResource *gst_res[GST_VIDEO_MAX_PLANES] = { nullptr, };
  CUgraphicsResource cuda_res[GST_VIDEO_MAX_PLANES] = { nullptr, };
  CUdeviceptr src_devptr[GST_VIDEO_MAX_PLANES] = { 0, };
  CUstream stream = gst_cuda_stream_get_handle (priv->stream);
  CUresult ret;
  gint comp[GST_VIDEO_MAX_COMPONENTS];
  auto task = priv->task;

  if (!gst_cuda_context_push (priv->ctx)) {
    GST_ERROR_OBJECT (self, "Couldn't push context");
    return;
  }

  /* Map every plane's PBO into the CUDA address space */
  for (guint i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) {
    GstMemory *mem = gst_buffer_peek_memory (data->buffer, i);
    gsize src_size;

    if (!gst_is_gl_memory_pbo (mem)) {
      GST_ERROR_OBJECT (self, "Not a GL PBO memory");
      goto out;
    }

    gst_res[i] = ensure_gl_cuda_resource (self, mem);
    if (!gst_res[i]) {
      GST_ERROR_OBJECT (self, "Couldn't get resource %u", i);
      goto out;
    }

    cuda_res[i] = gst_cuda_graphics_resource_map (gst_res[i], stream,
        CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD);
    if (!cuda_res[i]) {
      GST_ERROR_OBJECT (self, "Couldn't map resource");
      goto out;
    }

    ret = CuGraphicsResourceGetMappedPointer (&src_devptr[i],
        &src_size, cuda_res[i]);
    if (!gst_cuda_result (ret)) {
      GST_ERROR_OBJECT (self, "Couldn't get mapped device pointer");
      goto out;
    }

    /* Need PBO -> texture */
    GST_MINI_OBJECT_FLAG_SET (mem, GST_GL_BASE_MEMORY_TRANSFER_NEED_UPLOAD);
  }

  /* Queue the per-plane device-to-device copies */
  for (guint i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) {
    guint8 *sp = task->device_uncompressed + info->offset[i];
    guint8 *dp = (guint8 *) src_devptr[i];
    guint ss, ds;
    guint w, h;

    /* Plane 1 of a paletted format is the 256-entry, 4 bytes per entry
     * palette; it is the last thing to copy */
    if (GST_VIDEO_FORMAT_INFO_HAS_PALETTE (finfo) && i == 1) {
      ret = CuMemcpyDtoDAsync ((CUdeviceptr) dp, (CUdeviceptr) sp,
          256 * 4, stream);
      if (!gst_cuda_result (ret)) {
        GST_ERROR_OBJECT (self, "CUDA memcpy failed");
        goto out;
      }

      data->ret = TRUE;
      goto out;
    }

    /* Destination stride comes from the buffer's video meta when present */
    auto meta = gst_buffer_get_video_meta (data->buffer);
    if (meta)
      ds = meta->stride[i];
    else
      ds = GST_VIDEO_INFO_PLANE_STRIDE (info, i);

    ss = GST_VIDEO_INFO_PLANE_STRIDE (info, i);

    gst_video_format_info_component (finfo, i, comp);
    w = GST_VIDEO_INFO_COMP_WIDTH (info, comp[0]) *
        GST_VIDEO_INFO_COMP_PSTRIDE (info, comp[0]);
    if (w == 0)
      w = MIN (ss, ds);

    h = GST_VIDEO_INFO_COMP_HEIGHT (info, comp[0]);

    if (GST_VIDEO_FORMAT_INFO_IS_TILED (finfo)) {
      gint tile_size;
      gint sx_tiles, sy_tiles, dx_tiles, dy_tiles;
      GstVideoTileMode mode;

      tile_size = GST_VIDEO_FORMAT_INFO_TILE_SIZE (info->finfo, i);
      mode = GST_VIDEO_FORMAT_INFO_TILE_MODE (info->finfo);

      /* For tiled formats the stride encodes the tile counts */
      sx_tiles = GST_VIDEO_TILE_X_TILES (ss);
      sy_tiles = GST_VIDEO_TILE_Y_TILES (ss);

      dx_tiles = GST_VIDEO_TILE_X_TILES (ds);
      dy_tiles = GST_VIDEO_TILE_Y_TILES (ds);

      /* From here on w/h are tile counts, not bytes/rows */
      w = MIN (sx_tiles, dx_tiles);
      h = MIN (sy_tiles, dy_tiles);

      for (guint j = 0; j < h; j++) {
        for (guint k = 0; k < w; k++) {
          guint si, di;
          guint8 *cur_dp;
          guint8 *cur_sp;

          si = gst_video_tile_get_index (mode, k, j, sx_tiles, sy_tiles);
          di = gst_video_tile_get_index (mode, k, j, dx_tiles, dy_tiles);

          cur_dp = dp + (di * tile_size);
          cur_sp = sp + (si * tile_size);

          /* Copy one whole tile; the byte count is the tile size, not the
           * number of tiles per row */
          ret = CuMemcpyDtoDAsync ((CUdeviceptr) cur_dp, (CUdeviceptr) cur_sp,
              tile_size, stream);
          if (!gst_cuda_result (ret)) {
            GST_ERROR_OBJECT (self, "CUDA memcpy failed");
            goto out;
          }
        }
      }
    } else {
      CUDA_MEMCPY2D params = { };

      params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
      params.srcDevice = (CUdeviceptr) sp;
      params.srcPitch = ss;

      params.dstMemoryType = CU_MEMORYTYPE_DEVICE;
      params.dstDevice = (CUdeviceptr) dp;
      params.dstPitch = ds;

      params.WidthInBytes = w;
      params.Height = h;

      ret = CuMemcpy2DAsync (&params, stream);
      if (!gst_cuda_result (ret)) {
        GST_ERROR_OBJECT (self, "CUDA memcpy failed");
        goto out;
      }
    }
  }

  data->ret = TRUE;

out:
  /* Resources are acquired in plane order, so stop at the first empty slot.
   * The loop is bounded by the array size rather than the buffer's memory
   * count so it can never index past gst_res[] */
  for (guint i = 0; i < G_N_ELEMENTS (gst_res); i++) {
    if (!gst_res[i])
      break;

    gst_cuda_graphics_resource_unmap (gst_res[i], stream);
  }

  /* The copies were only queued; a failure surfacing here means the PBOs do
   * not hold valid data, so do not report success */
  if (!gst_cuda_result (CuStreamSynchronize (stream))) {
    GST_ERROR_OBJECT (self, "Couldn't synchronize stream");
    data->ret = FALSE;
  }

  gst_cuda_context_pop (nullptr);
}
#endif
/* Description of a single chunk listed in the batched stream header */
struct ChunkData
{
  /* Size of the chunk once decompressed, in bytes */
  size_t uncomp_size { 0 };
  /* Size of the compressed chunk payload, in bytes */
  size_t comp_size { 0 };
  /* Byte offset of the chunk payload from the start of the input data */
  size_t offset { 0 };
};
/* Parses the custom header that precedes batched compressed data.
 *
 * Layout as read here (all fields little-endian uint32):
 *   version, uncompressed chunk size, max compressed chunk size, batch size,
 *   then batch-size pairs of (uncompressed size, compressed size),
 *   followed by the compressed chunk payloads back to back.
 *
 * @data/@size: mapped input buffer
 * @uncompressed_chunk_size, @max_compressed_chunk_size, @batch_size:
 *   filled from the fixed part of the header
 * @compressed_chunks: resized to @batch_size and filled with each chunk's
 *   sizes and payload offset relative to @data
 *
 * Returns: TRUE if the header is well-formed and the payload sizes add up
 * exactly to the remaining input size. Output arguments may be partially
 * written on failure.
 *
 * The input is untrusted: every size is validated before it is used for
 * allocation or pointer arithmetic. */
static gboolean
gst_nv_comp_video_dec_parse_header (GstNvCompVideoDec * self,
    const guint8 * data, gsize size,
    size_t &uncompressed_chunk_size, size_t &max_compressed_chunk_size,
    size_t &batch_size, std::vector < ChunkData > &compressed_chunks)
{
  const gsize fixed_header_size = 4 * sizeof (guint32);
  const gsize chunk_entry_size = 2 * sizeof (guint32);
  guint32 val;
  const guint8 *ptr = data;
  gsize remaining = size;

  /* The second condition guarantees the four fixed fields are readable
   * regardless of how the minimum-size constant is defined */
  if (size <= GST_NV_COMP_HEADER_MIN_SIZE || size < fixed_header_size) {
    GST_ERROR_OBJECT (self, "Too small size");
    return FALSE;
  }

  val = GST_READ_UINT32_LE (ptr);
  if (val != GST_NV_COMP_HEADER_VERSION) {
    GST_ERROR_OBJECT (self, "Invalid version");
    return FALSE;
  }
  ptr += sizeof (guint32);
  remaining -= sizeof (guint32);

  uncompressed_chunk_size = GST_READ_UINT32_LE (ptr);
  ptr += sizeof (guint32);
  remaining -= sizeof (guint32);

  max_compressed_chunk_size = GST_READ_UINT32_LE (ptr);
  ptr += sizeof (guint32);
  remaining -= sizeof (guint32);

  batch_size = GST_READ_UINT32_LE (ptr);
  ptr += sizeof (guint32);
  remaining -= sizeof (guint32);

  if (batch_size == 0) {
    GST_ERROR_OBJECT (self, "Empty batch");
    return FALSE;
  }

  /* Reject batch sizes whose size table cannot fit in the input before
   * allocating anything, so a corrupted header cannot trigger a huge
   * allocation */
  if (batch_size > remaining / chunk_entry_size) {
    GST_ERROR_OBJECT (self, "Invalid batch size %" G_GSIZE_FORMAT
        ", remaining: %" G_GSIZE_FORMAT, batch_size, remaining);
    return FALSE;
  }

  compressed_chunks.resize (batch_size);

  size_t total_compressed_size = 0;
  for (size_t i = 0; i < batch_size; i++) {
    /* The table is known to fit, see the batch size check above */
    size_t uncomp_size = GST_READ_UINT32_LE (ptr);
    ptr += sizeof (guint32);
    remaining -= sizeof (guint32);

    size_t comp_size = GST_READ_UINT32_LE (ptr);
    ptr += sizeof (guint32);
    remaining -= sizeof (guint32);

    /* Chunks must respect the maxima announced in the header, since buffers
     * are sized from those values */
    if (uncomp_size > uncompressed_chunk_size ||
        comp_size > max_compressed_chunk_size) {
      GST_ERROR_OBJECT (self, "Chunk %" G_GSIZE_FORMAT
          " exceeds declared chunk size", i);
      return FALSE;
    }

    /* Keeps the running total bounded by the input size, which also rules
     * out wrap-around of the sum */
    if (comp_size > size - total_compressed_size) {
      GST_ERROR_OBJECT (self, "Chunk %" G_GSIZE_FORMAT
          " is larger than the input", i);
      return FALSE;
    }

    compressed_chunks[i].uncomp_size = uncomp_size;
    compressed_chunks[i].comp_size = comp_size;
    total_compressed_size += comp_size;
  }

  if (remaining != total_compressed_size) {
    GST_ERROR_OBJECT (self, "Size mismatch, remaining: %" G_GSIZE_FORMAT
        ", total compressed: %" G_GSIZE_FORMAT, remaining,
        total_compressed_size);
    return FALSE;
  }

  /* Payloads follow the size table contiguously, in chunk order */
  for (size_t i = 0; i < batch_size; i++) {
    compressed_chunks[i].offset = ptr - data;
    ptr += compressed_chunks[i].comp_size;
  }

  return TRUE;
}
/* GstVideoDecoder::handle_frame implementation.
 *
 * Uploads the compressed input to the device, decompresses it into
 * task->device_uncompressed (through the batched decompressor when
 * priv->batched is set, otherwise through the nvCOMP manager), then moves the
 * result into the output buffer: via CUDA/GL interop when enabled, via a
 * device copy when the output is CUDA memory of the same context, or via a
 * host download otherwise.
 *
 * Returns: the result of gst_video_decoder_finish_frame() on success, the
 * allocation flow return if the output frame cannot be allocated, or
 * GST_FLOW_ERROR (with the frame released) on any other failure.
 *
 * Every failure after the initial allocation goes through the single "error"
 * label, which unmaps the input buffer and pops the CUDA context if needed. */
static GstFlowReturn
gst_nv_comp_video_dec_handle_frame (GstVideoDecoder * decoder,
    GstVideoCodecFrame * frame)
{
  auto self = GST_NV_COMP_VIDEO_DEC (decoder);
  auto priv = self->priv;
  CUstream stream = nullptr;
  GstVideoFrame vframe;
  GstMapInfo map_info;
  CUresult cuda_ret;
  gboolean need_copy = TRUE;
  /* Track what the error path has to undo */
  gboolean ctx_pushed = FALSE;
  gboolean input_mapped = FALSE;
  GstMemory *mem;
  nvcompStatus_t status;
  auto task = priv->task;
  GstFlowReturn ret;

  if (!priv->ctx || !priv->task) {
    GST_ERROR_OBJECT (self, "Context was not configured");
    goto error;
  }

  ret = gst_video_decoder_allocate_output_frame (decoder, frame);
  if (ret != GST_FLOW_OK) {
    gst_video_decoder_release_frame (decoder, frame);
    return ret;
  }

  if (!gst_cuda_context_push (priv->ctx)) {
    GST_ERROR_OBJECT (self, "Couldn't push context");
    goto error;
  }
  ctx_pushed = TRUE;

  stream = gst_cuda_stream_get_handle (priv->stream);

  if (!gst_buffer_map (frame->input_buffer, &map_info, GST_MAP_READ)) {
    GST_ERROR_OBJECT (self, "Couldn't map input buffer");
    goto error;
  }
  input_mapped = TRUE;

  if (priv->batched) {
    g_assert (priv->batched_decomp);

    /* Parse custom header */
    size_t uncompressed_chunk_size;
    size_t max_compressed_chunk_size;
    size_t batch_size;
    std::vector < ChunkData > compressed_chunks;
    guint8 *mapped_data = map_info.data;
    uint8_t *uncompressed;
    size_t total_uncompressed = 0;

    if (!gst_nv_comp_video_dec_parse_header (self, mapped_data,
            map_info.size, uncompressed_chunk_size, max_compressed_chunk_size,
            batch_size, compressed_chunks)) {
      goto error;
    }

    GST_LOG_OBJECT (self, "batch size %" G_GSIZE_FORMAT
        ", uncompressed-chunk-size %" G_GSIZE_FORMAT
        ", compressed-chunk-size %" G_GSIZE_FORMAT,
        batch_size, uncompressed_chunk_size, max_compressed_chunk_size);

    /* Existing resources are too small for this frame, start over */
    if (task->batch_size < batch_size ||
        task->max_uncompressed_chunk_size < uncompressed_chunk_size ||
        task->max_compressed_chunk_size < max_compressed_chunk_size) {
      task->clear_resource ();
    }

    if (task->batch_size == 0) {
      size_t temp_size = 0;

      GST_DEBUG_OBJECT (self, "Allocating resource");

      status = priv->batched_decomp->get_temp_size (batch_size,
          uncompressed_chunk_size, &temp_size);
      if (status != nvcompSuccess) {
        GST_ERROR_OBJECT (self, "Couldn't get temp size");
        goto error;
      }

      if (!task->allocate_batched (batch_size,
              max_compressed_chunk_size, uncompressed_chunk_size, temp_size)) {
        GST_ERROR_OBJECT (self, "Couldn't allocate resource");
        goto error;
      }
    }

    /* The chunk sizes come from the stream. Validate them against the
     * per-chunk capacities before they are used as copy lengths below */
    for (size_t i = 0; i < batch_size; i++) {
      if (compressed_chunks[i].comp_size > task->max_compressed_chunk_size ||
          compressed_chunks[i].uncomp_size >
          task->max_uncompressed_chunk_size) {
        GST_ERROR_OBJECT (self, "Chunk %" G_GSIZE_FORMAT
            " exceeds allocated chunk size", i);
        goto error;
      }

      total_uncompressed += compressed_chunks[i].uncomp_size;
    }

    /* NOTE(review): assumes device_uncompressed holds priv->info.size bytes,
     * consistent with the size check done in the non-batched path below */
    if (total_uncompressed > priv->info.size) {
      GST_ERROR_OBJECT (self, "size mismatch, expected %" G_GSIZE_FORMAT
          ", required %" G_GSIZE_FORMAT, priv->info.size, total_uncompressed);
      goto error;
    }

    /* Stage every chunk in its fixed-size slot of the host buffer */
    for (size_t i = 0; i < batch_size; i++) {
      memcpy (task->host_compressed + (i * task->max_compressed_chunk_size),
          mapped_data + compressed_chunks[i].offset,
          compressed_chunks[i].comp_size);
      task->host_compressed_bytes[i] = compressed_chunks[i].comp_size;
    }

    gst_buffer_unmap (frame->input_buffer, &map_info);
    input_mapped = FALSE;

    for (size_t i = 0; i < batch_size; i++) {
      GST_LOG_OBJECT (self, "Uploading chunk %" G_GSIZE_FORMAT
          ", size %" G_GSIZE_FORMAT, i, compressed_chunks[i].comp_size);
      auto offset = i * task->max_compressed_chunk_size;
      cuda_ret = CuMemcpyHtoDAsync ((CUdeviceptr)
          (task->device_compressed + offset),
          task->host_compressed + offset,
          compressed_chunks[i].comp_size, stream);
      if (!gst_cuda_result (cuda_ret)) {
        GST_ERROR_OBJECT (self, "Couldn't upload compressed chunk");
        goto error;
      }
    }

    cuda_ret = CuMemcpyHtoDAsync ((CUdeviceptr) task->device_compressed_bytes,
        task->host_compressed_bytes, sizeof (size_t) * batch_size, stream);
    if (!gst_cuda_result (cuda_ret)) {
      GST_ERROR_OBJECT (self, "Couldn't upload compressed chunk sizes");
      goto error;
    }

    status = priv->batched_decomp->decompress (task->device_compressed_ptrs,
        task->device_compressed_bytes, task->device_uncompressed_bytes,
        task->device_actual_uncompressed_bytes, batch_size,
        task->temp_ptr, task->temp_size, task->device_uncompressed_ptrs,
        task->device_statuses, (cudaStream_t) stream);
    if (status != nvcompSuccess) {
      GST_ERROR_OBJECT (self, "Couldn't decompress stream, status: %d", status);
      goto error;
    }

    /* Gather the decompressed chunks back to back into the frame buffer */
    uncompressed = task->device_uncompressed;
    for (size_t i = 0; i < batch_size; i++) {
      auto size = compressed_chunks[i].uncomp_size;
      cuda_ret = CuMemcpyDtoDAsync ((CUdeviceptr) uncompressed,
          (CUdeviceptr) task->host_uncompressed_ptrs[i], size, stream);
      if (!gst_cuda_result (cuda_ret)) {
        GST_ERROR_OBJECT (self, "Couldn't copy uncompressed chunk");
        goto error;
      }

      uncompressed += size;
    }
  } else {
    if (map_info.size == 0) {
      GST_ERROR_OBJECT (self, "Empty input buffer");
      goto error;
    }

    if (task->compressed_alloc_size < map_info.size) {
      gsize alloc_size = GST_ROUND_UP_128 (map_info.size);

      if (task->device_compressed)
        CuMemFree ((CUdeviceptr) task->device_compressed);
      task->device_compressed = nullptr;

      if (task->host_compressed)
        CuMemFreeHost (task->host_compressed);
      task->host_compressed = nullptr;

      /* Nothing is allocated at this point. The recorded size is updated
       * only after both allocations succeed, so a failure here is retried
       * on the next frame instead of leaving a stale capacity behind */
      task->compressed_alloc_size = 0;

      cuda_ret = CuMemAlloc ((CUdeviceptr *) & task->device_compressed,
          alloc_size);
      if (!gst_cuda_result (cuda_ret)) {
        GST_ERROR_OBJECT (self, "Couldn't allocate device memory");
        task->device_compressed = nullptr;
        goto error;
      }

      cuda_ret = CuMemAllocHost ((void **) &task->host_compressed, alloc_size);
      if (!gst_cuda_result (cuda_ret)) {
        GST_ERROR_OBJECT (self, "Couldn't allocate host memory");
        task->host_compressed = nullptr;
        goto error;
      }

      task->compressed_alloc_size = alloc_size;
    }

    memcpy (task->host_compressed, map_info.data, map_info.size);
    cuda_ret = CuMemcpyHtoDAsync ((CUdeviceptr) task->device_compressed,
        task->host_compressed, map_info.size, stream);
    gst_buffer_unmap (frame->input_buffer, &map_info);
    input_mapped = FALSE;

    if (!gst_cuda_result (cuda_ret)) {
      GST_ERROR_OBJECT (self, "Couldn't copy compressed memory");
      goto error;
    }

    if (!priv->manager) {
      priv->manager = create_manager (task->device_compressed,
          (cudaStream_t) stream);
      if (!priv->manager) {
        GST_ERROR_OBJECT (self, "Couldn't create manager");
        goto error;
      }
    }

    {
      auto config =
          priv->manager->configure_decompression (task->device_compressed);

      /* The decoded data must be exactly one frame */
      if (config.decomp_data_size != priv->info.size) {
        GST_ERROR_OBJECT (self, "size mismatch, expected %" G_GSIZE_FORMAT
            ", required %" G_GSIZE_FORMAT, priv->info.size,
            config.decomp_data_size);
        goto error;
      }

      priv->manager->decompress (task->device_uncompressed,
          task->device_compressed, config);
    }
  }

  mem = gst_buffer_peek_memory (frame->output_buffer, 0);
#ifdef HAVE_GST_GL
  if (priv->gl_interop && gst_buffer_n_memory (frame->output_buffer) ==
      GST_VIDEO_INFO_N_PLANES (&priv->info)) {
    GLInteropData interop_data;
    interop_data.self = self;
    interop_data.buffer = frame->output_buffer;
    interop_data.ret = FALSE;

    auto gl_mem = (GstGLMemory *) mem;
    gst_gl_context_thread_add (gl_mem->mem.context,
        (GstGLContextThreadFunc) gst_nv_comp_video_dec_download_gl,
        &interop_data);
    if (interop_data.ret) {
      need_copy = FALSE;
      GST_TRACE_OBJECT (self, "CUDA -> GL copy done");
    } else {
      /* Interop failed once, use the regular copy path from now on */
      priv->gl_interop = FALSE;
    }
  }
#endif

  if (need_copy) {
    GstMapFlags map_flags = GST_MAP_WRITE;
    gboolean device_copy = FALSE;
    gboolean do_sync = TRUE;
    gboolean copied;

    if (gst_is_cuda_memory (mem)) {
      auto cmem = GST_CUDA_MEMORY_CAST (mem);
      if (cmem->context == priv->ctx) {
        map_flags = (GstMapFlags) (GST_MAP_WRITE | GST_MAP_CUDA);
        device_copy = TRUE;

        /* No need to wait when the memory is bound to our own stream */
        auto mem_stream = gst_cuda_memory_get_stream (cmem);
        if (mem_stream && mem_stream == priv->stream)
          do_sync = FALSE;
      }
    }

    if (!device_copy) {
      cuda_ret = CuMemcpyDtoHAsync (task->host_uncompressed,
          (CUdeviceptr) task->device_uncompressed, priv->info.size, stream);
      if (!gst_cuda_result (cuda_ret)) {
        GST_ERROR_OBJECT (self, "Couldn't download image");
        goto error;
      }

      /* The host copy below reads host_uncompressed, so the download has to
       * be complete (and successful) first */
      if (!gst_cuda_result (CuStreamSynchronize (stream))) {
        GST_ERROR_OBJECT (self, "Couldn't synchronize stream");
        goto error;
      }
      do_sync = FALSE;
    }

    if (!gst_video_frame_map (&vframe, &priv->info, frame->output_buffer,
            map_flags)) {
      GST_ERROR_OBJECT (self, "Couldn't map output buffer");
      goto error;
    }

    copied = gst_nv_comp_video_dec_download (self, &vframe, stream,
        device_copy);
    if (copied && do_sync && !gst_cuda_result (CuStreamSynchronize (stream))) {
      GST_ERROR_OBJECT (self, "Couldn't synchronize stream");
      copied = FALSE;
    } else if (!copied && do_sync) {
      /* Still drain whatever was queued before unmapping the output */
      CuStreamSynchronize (stream);
    }

    gst_video_frame_unmap (&vframe);

    if (!copied) {
      GST_ERROR_OBJECT (self, "Couldn't copy decoded image to output buffer");
      goto error;
    }
  }

  gst_cuda_context_pop (nullptr);

  return gst_video_decoder_finish_frame (decoder, frame);

error:
  if (input_mapped)
    gst_buffer_unmap (frame->input_buffer, &map_info);
  if (ctx_pushed)
    gst_cuda_context_pop (nullptr);

  gst_video_decoder_release_frame (decoder, frame);

  return GST_FLOW_ERROR;
}