gstreamer/subprojects/gst-plugins-bad/ext/nvcomp/gstnvcompvideoenc.cpp
Seungha Yang 0ed9c39835 nvcomp: Add nvCOMP library based GPU lossless compression plugin
Adding NVIDIA nvCOMP library based plugin for lossless raw video
compression/decompression. To build this plugin, user should
install nvCOMP SDK first and specify the SDK path via
"nvcomp-sdk-path" build option or NVCOMP_SDK_PATH env.

Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/6912>
2024-06-13 18:19:08 +00:00

2014 lines
61 KiB
C++

/* GStreamer
* Copyright (C) 2024 Seungha Yang <seungha@centricular.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301, USA.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "gstnvcompvideoenc.h"
#ifdef HAVE_GST_GL
#include <gst/gl/gl.h>
#include <gst/gl/gstglfuncs.h>
#endif
#include <nvcomp/ans.hpp>
#include <nvcomp/bitcomp.hpp>
#include <nvcomp/cascaded.hpp>
#include <nvcomp/deflate.hpp>
#include <nvcomp/gdeflate.hpp>
#include <nvcomp/lz4.hpp>
#include <nvcomp/snappy.hpp>
#include <nvcomp/zstd.hpp>
#include <memory>
#include <mutex>
#include <condition_variable>
#include <queue>
#include <atomic>
#include <string>
#include <string.h>
/* Debug category of this element; GST_CAT_DEFAULT makes it the category used
 * by the logging macros in this file */
GST_DEBUG_CATEGORY_STATIC (gst_nv_comp_video_enc_debug);
#define GST_CAT_DEFAULT gst_nv_comp_video_enc_debug
/* Sink caps string: all raw video formats with the CUDA memory feature,
 * then (GL-enabled builds only) with the GL memory feature, then without any
 * caps feature */
#ifdef HAVE_GST_GL
#define SINK_CAPS \
GST_VIDEO_CAPS_MAKE_WITH_FEATURES (GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY, \
GST_VIDEO_FORMATS_ALL) ";" \
GST_VIDEO_CAPS_MAKE_WITH_FEATURES (GST_CAPS_FEATURE_MEMORY_GL_MEMORY, \
GST_VIDEO_FORMATS_ALL) ";" \
GST_VIDEO_CAPS_MAKE (GST_VIDEO_FORMATS_ALL)
#else
#define SINK_CAPS \
GST_VIDEO_CAPS_MAKE_WITH_FEATURES (GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY, \
GST_VIDEO_FORMATS_ALL) ";" \
GST_VIDEO_CAPS_MAKE (GST_VIDEO_FORMATS_ALL)
#endif
/* Always-present sink pad accepting SINK_CAPS */
static GstStaticPadTemplate sink_template =
GST_STATIC_PAD_TEMPLATE ("sink", GST_PAD_SINK, GST_PAD_ALWAYS,
GST_STATIC_CAPS (SINK_CAPS));
/* Always-present source pad: the generic "video/x-nvcomp" media type plus
 * one media type carrying an algorithm name as suffix */
static GstStaticPadTemplate src_template =
GST_STATIC_PAD_TEMPLATE ("src", GST_PAD_SRC, GST_PAD_ALWAYS,
GST_STATIC_CAPS ("video/x-nvcomp; video/x-nvcomp-lz4; "
"video/x-nvcomp-snappy; video/x-nvcomp-gdeflate; "
"video/x-nvcomp-deflate; video/x-nvcomp-zstd; video/x-nvcomp-cascaded; "
"video/x-nvcomp-bitcomp; video/x-nvcomp-ans"));
/* Values of the "data-type" property.
 *
 * Except for DEFAULT, each enumerator is defined as the matching
 * NVCOMP_TYPE_* constant, so a value can be cast directly to nvcompType_t. */
enum GstNvCompDataType
{
/* Sentinel: the data type of the nvCOMP option struct is left untouched */
GST_NV_COMP_DATA_TYPE_DEFAULT = -1,
GST_NV_COMP_DATA_TYPE_CHAR = NVCOMP_TYPE_CHAR,
GST_NV_COMP_DATA_TYPE_UCHAR = NVCOMP_TYPE_UCHAR,
GST_NV_COMP_DATA_TYPE_SHORT = NVCOMP_TYPE_SHORT,
GST_NV_COMP_DATA_TYPE_USHORT = NVCOMP_TYPE_USHORT,
GST_NV_COMP_DATA_TYPE_INT = NVCOMP_TYPE_INT,
GST_NV_COMP_DATA_TYPE_UINT = NVCOMP_TYPE_UINT,
GST_NV_COMP_DATA_TYPE_LONGLONG = NVCOMP_TYPE_LONGLONG,
GST_NV_COMP_DATA_TYPE_ULONGLONG = NVCOMP_TYPE_ULONGLONG,
GST_NV_COMP_DATA_TYPE_BITS = NVCOMP_TYPE_BITS,
};
/* GType of GstNvCompDataType, registered on first use */
#define GST_TYPE_NV_COMP_DATA_TYPE (gst_nv_comp_data_type_type())
/* Returns the GType of the GstNvCompDataType enum, registering it with the
 * GObject type system the first time it is requested. */
static GType
gst_nv_comp_data_type_type (void)
{
  /* A function-local static is initialised exactly once, even when several
   * threads race on the first call. */
  static const GType registered_type = [] {
    /* Static storage: the table is handed to g_enum_register_static() */
    static const GEnumValue values[] = {
      {GST_NV_COMP_DATA_TYPE_DEFAULT, "Default", "default"},
      {GST_NV_COMP_DATA_TYPE_CHAR, "CHAR", "char"},
      {GST_NV_COMP_DATA_TYPE_UCHAR, "UCHAR", "uchar"},
      {GST_NV_COMP_DATA_TYPE_SHORT, "SHORT", "short"},
      {GST_NV_COMP_DATA_TYPE_USHORT, "USHORT", "ushort"},
      {GST_NV_COMP_DATA_TYPE_INT, "INT", "int"},
      {GST_NV_COMP_DATA_TYPE_UINT, "UINT", "uint"},
      {GST_NV_COMP_DATA_TYPE_LONGLONG, "LONGLONG", "longlong"},
      {GST_NV_COMP_DATA_TYPE_ULONGLONG, "ULONGLONG", "ulonglong"},
      {GST_NV_COMP_DATA_TYPE_BITS, "BITS", "bits"},
      {0, nullptr, nullptr},
    };

    return g_enum_register_static ("GstNvCompDataType", values);
  } ();

  return registered_type;
}
/* Values of the "deflate-algo" property. The numeric value is what gets
 * stored (cast to int) into the `algo` field of the nvCOMP gdeflate options,
 * so the implicit 0/1/2 numbering must not change. */
enum GstNvCompDeflateAlgo
{
GST_NV_COMP_DEFLATE_HIGH_THROUGHPUT,
GST_NV_COMP_DEFLATE_LOW_THROUGHPUT,
GST_NV_COMP_DEFLATE_HIGHEST_THROUGHPUT,
};
/* GType of GstNvCompDeflateAlgo, registered on first use */
#define GST_TYPE_NV_COMP_DEFLATE_ALGO (gst_nv_comp_deflate_algo_get_type())
/* Returns the GType of the GstNvCompDeflateAlgo enum, registering it with the
 * GObject type system the first time it is requested. */
static GType
gst_nv_comp_deflate_algo_get_type (void)
{
  /* A function-local static is initialised exactly once, even when several
   * threads race on the first call. */
  static const GType registered_type = [] {
    /* Static storage: the table is handed to g_enum_register_static() */
    static const GEnumValue values[] = {
      {GST_NV_COMP_DEFLATE_HIGH_THROUGHPUT,
          "High throughput, low compression ratio", "high-throughput"},
      {GST_NV_COMP_DEFLATE_LOW_THROUGHPUT,
          "Low throughput, high compression ratio", "low-throughput"},
      {GST_NV_COMP_DEFLATE_HIGHEST_THROUGHPUT,
          "Highest throughput, entropy-only compression", "highest-throughput"},
      {0, nullptr, nullptr},
    };

    return g_enum_register_static ("GstNvCompDeflateAlgo", values);
  } ();

  return registered_type;
}
enum GstNvCompBitcompAlgo
{
GST_NV_COMP_BITCOMP_DEFAULT,
GST_NV_COMP_BITCOMP_SPARSE,
};
#define GST_TYPE_NV_COMP_BITCOMP_ALGO (gst_nv_comp_bitcomp_algo_get_type())
/* Returns the GType of the GstNvCompBitcompAlgo enum, registering it with the
 * GObject type system the first time it is requested. */
static GType
gst_nv_comp_bitcomp_algo_get_type (void)
{
  /* A function-local static is initialised exactly once, even when several
   * threads race on the first call. */
  static const GType registered_type = [] {
    /* Static storage: the table is handed to g_enum_register_static() */
    static const GEnumValue values[] = {
      {GST_NV_COMP_BITCOMP_DEFAULT, "Default", "default"},
      {GST_NV_COMP_BITCOMP_SPARSE, "Sparse", "sparse"},
      {0, nullptr, nullptr},
    };

    return g_enum_register_static ("GstNvCompBitcompAlgo", values);
  } ();

  return registered_type;
}
enum
{
PROP_0,
PROP_METHOD,
PROP_DEFLATE_ALGO,
PROP_BITCOMP_ALGO,
PROP_DATA_TYPE,
PROP_CHUNK_SIZE,
PROP_ASYNC_DEPTH,
PROP_BATCHED,
};
#define DEFAULT_METHOD GST_NV_COMP_BITCOMP
#define DEFAULT_DEFLATE_ALGO GST_NV_COMP_DEFLATE_HIGH_THROUGHPUT
#define DEFAULT_BITCOMP_ALGO GST_NV_COMP_BITCOMP_SPARSE
#define DEFAULT_DATA_TYPE GST_NV_COMP_DATA_TYPE_DEFAULT
#define DEFAULT_CHUNK_SIZE 0
#define DEFAULT_BATCHED TRUE
#define DEFAULT_ASYNC_DEPTH 2
/* *INDENT-OFF* */
using namespace nvcomp;
/* Set of CUDA resources used for compressing one frame.
 *
 * Every non-null buffer below is owned by the task and released by the
 * destructor. The release needs a CUDA context: when `ctx` is null nothing is
 * freed, so `ctx` must be set before any buffer is allocated into the task. */
struct EncoderTask
{
  ~EncoderTask ()
  {
    if (ctx) {
      /* All Cu* release calls below operate on the pushed context */
      gst_cuda_context_push (ctx);
      if (event)
        CuEventDestroy (event);
      if (device_uncompressed)
        CuMemFree ((CUdeviceptr) device_uncompressed);
      if (host_uncompressed)
        CuMemFreeHost (host_uncompressed);
      if (device_compressed)
        CuMemFree ((CUdeviceptr) device_compressed);
      if (host_compressed)
        CuMemFreeHost (host_compressed);
      if (device_uncompressed_bytes)
        CuMemFree ((CUdeviceptr) device_uncompressed_bytes);
      if (device_uncompressed_ptrs)
        CuMemFree ((CUdeviceptr) device_uncompressed_ptrs);
      if (device_compressed_bytes)
        CuMemFree ((CUdeviceptr) device_compressed_bytes);
      if (host_uncompressed_bytes)
        CuMemFreeHost (host_uncompressed_bytes);
      if (host_uncompressed_ptrs)
        CuMemFreeHost (host_uncompressed_ptrs);
      if (device_compressed_ptrs)
        CuMemFree ((CUdeviceptr) device_compressed_ptrs);
      if (host_compressed_bytes)
        CuMemFreeHost (host_compressed_bytes);
      if (host_compressed_ptrs)
        CuMemFreeHost (host_compressed_ptrs);
      if (temp_ptr)
        CuMemFree ((CUdeviceptr) temp_ptr);
      gst_cuda_context_pop (nullptr);
      gst_object_unref (ctx);
    }
  }

  /* Owning reference to the CUDA context the buffers were allocated in */
  GstCudaContext *ctx = nullptr;
  CUevent event = nullptr;

  /* Whole-frame staging buffers (device memory and CuMemAllocHost memory) */
  uint8_t *device_uncompressed = nullptr;
  uint8_t *host_uncompressed = nullptr;
  uint8_t *device_compressed = nullptr;
  uint8_t *host_compressed = nullptr;

  /* Per-chunk tables with `batch_size` entries each, only allocated for
   * batched tasks. The host_* tables are the CPU-side copies of the
   * corresponding device_* tables. */
  size_t *device_uncompressed_bytes = nullptr;
  void **device_uncompressed_ptrs = nullptr;
  size_t *host_uncompressed_bytes = nullptr;
  void **host_uncompressed_ptrs = nullptr;
  size_t *device_compressed_bytes = nullptr;
  void **device_compressed_ptrs = nullptr;
  size_t *host_compressed_bytes = nullptr;
  void **host_compressed_ptrs = nullptr;

  /* Device scratch buffer for batched compression, may stay null */
  void *temp_ptr = nullptr;
  size_t temp_size = 0;
  size_t compressed_size = 0;

  /* Layout description. Explicitly zero-initialised: the non-batched
   * allocation path returns before these fields are written, so without
   * initialisers they would hold indeterminate values. */
  gboolean batched = FALSE;
  size_t batch_size = 0;
  size_t chunk_size = 0;
  size_t max_output_chunk_size = 0;
  size_t compressed_alloc_size = 0;
};
/* Type-erased interface over the per-algorithm nvCOMP batched compression
 * entry points, so that the element can hold any algorithm behind a single
 * pointer type. */
struct BatchedCompBase
{
  /* Polymorphic base: make destruction through a base pointer well defined
   * regardless of which smart or raw pointer type owns the object */
  virtual ~BatchedCompBase () = default;

  /* Queries the scratch buffer size needed to compress `batch_size` chunks
   * of at most `max_uncompressed_chunk_bytes` bytes each */
  virtual nvcompStatus_t get_temp_size(
      size_t batch_size,
      size_t max_uncompressed_chunk_bytes,
      size_t * temp_bytes) = 0;

  /* Queries the worst-case compressed size of a single chunk */
  virtual nvcompStatus_t get_max_compressed_chunk_size(
      size_t max_uncompressed_chunk_bytes,
      size_t * max_compressed_bytes) = 0;

  /* Submits compression of the whole batch on `stream`. The pointer and
   * size tables are passed through unchanged to the underlying nvCOMP
   * function. */
  virtual nvcompStatus_t compress(
      void **device_uncompressed_ptrs,
      size_t *device_uncompressed_bytes,
      size_t max_uncompressed_chunk_bytes,
      size_t batch_size,
      void *device_temp_ptr,
      size_t temp_bytes,
      void **device_compressed_ptrs,
      size_t *device_compressed_bytes,
      cudaStream_t stream) = 0;
};
/* Binds one algorithm's batched nvCOMP functions to BatchedCompBase.
 *
 * FormatOptT: algorithm specific option struct, stored by value
 * T: function returning the required temp buffer size
 * O: function returning the maximum compressed chunk size
 * C: function performing the (stream ordered) batched compression
 *
 * Every virtual is marked `override` so that a signature drift between this
 * class and the base becomes a compile error instead of a silent overload. */
template <typename FormatOptT, auto T, auto O, auto C>
class BatchedComp : public BatchedCompBase
{
public:
  BatchedComp (const FormatOptT & opt) : opts_(opt) {}

  nvcompStatus_t get_temp_size(
      size_t batch_size,
      size_t max_uncompressed_chunk_bytes,
      size_t * temp_bytes) override
  {
    return T (batch_size, max_uncompressed_chunk_bytes, opts_, temp_bytes);
  }

  nvcompStatus_t get_max_compressed_chunk_size(
      size_t max_uncompressed_chunk_bytes,
      size_t * max_compressed_bytes) override
  {
    return O (max_uncompressed_chunk_bytes, opts_, max_compressed_bytes);
  }

  nvcompStatus_t compress(
      void **device_uncompressed_ptrs,
      size_t *device_uncompressed_bytes,
      size_t max_uncompressed_chunk_bytes,
      size_t batch_size,
      void *device_temp_ptr,
      size_t temp_bytes,
      void **device_compressed_ptrs,
      size_t *device_compressed_bytes,
      cudaStream_t stream) override
  {
    return C (device_uncompressed_ptrs, device_uncompressed_bytes,
        max_uncompressed_chunk_bytes, batch_size, device_temp_ptr, temp_bytes,
        device_compressed_ptrs, device_compressed_bytes, opts_, stream);
  }

private:
  /* Options forwarded to every call of T, O and C */
  FormatOptT opts_;
};
/* C++ side state of the element, allocated in init() and deleted in
 * finalize() */
struct GstNvCompVideoEncPrivate
{
/* CUDA context, obtained via set_context()/open() and released in close() */
GstCudaContext *ctx = nullptr;
/* CUDA stream created in open(); also offered to upstream CUDA pools */
GstCudaStream *stream = nullptr;
#ifdef HAVE_GST_GL
GstGLDisplay *gl_display = nullptr;
GstGLContext *gl_context = nullptr;
GstGLContext *other_gl_context = nullptr;
#endif
GstBufferPool *pool = nullptr;
/* Reference to the input state passed to the last set_format() call */
GstVideoCodecState *state = nullptr;
/* nvCOMP high-level manager, created for the non-batched mode */
std::shared_ptr<nvcompManagerBase> manager;
std::shared_ptr<CompressionConfig> config;
/* nvCOMP low-level wrapper, created for the batched mode */
std::shared_ptr<BatchedCompBase> batched_comp;
/* TRUE while the negotiated input caps carry the GL memory feature */
gboolean gl_interop = FALSE;
/* Held by set_property()/get_property() and during set_format() */
std::mutex lock;
/* NOTE(review): presumably protects input_task_queue - confirm at its users */
std::mutex input_lock;
std::condition_variable input_cond;
/* Protects output_task_queue (see drain()) */
std::mutex output_lock;
std::condition_variable output_cond;
std::queue<std::shared_ptr<EncoderTask>> input_task_queue;
/* A null entry is pushed by drain() before joining the encode thread */
std::queue<std::shared_ptr<EncoderTask>> output_task_queue;
std::shared_ptr<EncoderTask> cur_task;
/* Joined and cleared by drain() */
GThread *encode_thread = nullptr;
/* Reset to GST_FLOW_OK at the end of drain() */
std::atomic<GstFlowReturn> last_flow = { GST_FLOW_OK };
/* Property values, see the matching PROP_* ids */
GstNvCompMethod method = DEFAULT_METHOD;
GstNvCompDeflateAlgo deflate_algo = DEFAULT_DEFLATE_ALGO;
GstNvCompBitcompAlgo bitcomp_algo = DEFAULT_BITCOMP_ALGO;
GstNvCompDataType data_type = DEFAULT_DATA_TYPE;
guint chunk_size = DEFAULT_CHUNK_SIZE;
gboolean batched = DEFAULT_BATCHED;
guint async_depth = DEFAULT_ASYNC_DEPTH;
};
/* *INDENT-ON* */
/* Instance struct of the element */
struct _GstNvCompVideoEnc
{
/* Parent instance, must stay the first member */
GstVideoEncoder parent;
/* C++ state, created with new in init() and deleted in finalize() */
GstNvCompVideoEncPrivate *priv;
};
/* GObject vfuncs */
static void gst_nv_comp_video_enc_finalize (GObject * object);
static void gst_nv_comp_video_enc_set_property (GObject * object, guint prop_id,
const GValue * value, GParamSpec * pspec);
static void gst_nv_comp_video_enc_get_property (GObject * object, guint prop_id,
GValue * value, GParamSpec * pspec);
/* GstElement vfuncs */
static void gst_nv_comp_video_enc_set_context (GstElement * element,
GstContext * context);
/* GstVideoEncoder vfuncs */
static gboolean gst_nv_comp_video_enc_open (GstVideoEncoder * encoder);
static gboolean gst_nv_comp_video_enc_close (GstVideoEncoder * encoder);
static gboolean gst_nv_comp_video_enc_stop (GstVideoEncoder * encoder);
static gboolean gst_nv_comp_video_enc_flush (GstVideoEncoder * encoder);
static GstFlowReturn gst_nv_comp_video_enc_finish (GstVideoEncoder * encoder);
static gboolean gst_nv_comp_video_enc_sink_query (GstVideoEncoder * encoder,
GstQuery * query);
static gboolean gst_nv_comp_video_enc_src_query (GstVideoEncoder * encoder,
GstQuery * query);
static gboolean
gst_nv_comp_video_enc_propose_allocation (GstVideoEncoder * encoder,
GstQuery * query);
static gboolean gst_nv_comp_video_enc_set_format (GstVideoEncoder * encoder,
GstVideoCodecState * state);
static GstFlowReturn
gst_nv_comp_video_enc_handle_frame (GstVideoEncoder * encoder,
GstVideoCodecFrame * frame);
/* G_DEFINE_TYPE generates gst_nv_comp_video_enc_parent_class; the alias lets
 * the code below refer to it simply as parent_class */
#define gst_nv_comp_video_enc_parent_class parent_class
G_DEFINE_TYPE (GstNvCompVideoEnc,
gst_nv_comp_video_enc, GST_TYPE_VIDEO_ENCODER);
/* Class initialiser: installs the properties, pad templates, element
 * metadata and virtual methods, then sets up the debug category. */
static void
gst_nv_comp_video_enc_class_init (GstNvCompVideoEncClass * klass)
{
  GObjectClass *gobject_klass = G_OBJECT_CLASS (klass);
  GstElementClass *elem_klass = GST_ELEMENT_CLASS (klass);
  GstVideoEncoderClass *venc_klass = GST_VIDEO_ENCODER_CLASS (klass);

  /* The two flag combinations used by the properties below */
  const GParamFlags rw_flags =
      (GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS);
  const GParamFlags rw_ready_flags =
      (GParamFlags) (G_PARAM_READWRITE | GST_PARAM_MUTABLE_READY |
      G_PARAM_STATIC_STRINGS);

  gobject_klass->finalize = gst_nv_comp_video_enc_finalize;
  gobject_klass->set_property = gst_nv_comp_video_enc_set_property;
  gobject_klass->get_property = gst_nv_comp_video_enc_get_property;

  /* Enum properties */
  g_object_class_install_property (gobject_klass, PROP_METHOD,
      g_param_spec_enum ("method", "Method", "Compression method",
          GST_TYPE_NV_COMP_METHOD, DEFAULT_METHOD, rw_flags));
  g_object_class_install_property (gobject_klass, PROP_DEFLATE_ALGO,
      g_param_spec_enum ("deflate-algo", "Deflate Algo",
          "Algorithm to use for deflate and gdeflate methods",
          GST_TYPE_NV_COMP_DEFLATE_ALGO, DEFAULT_DEFLATE_ALGO, rw_flags));
  g_object_class_install_property (gobject_klass, PROP_BITCOMP_ALGO,
      g_param_spec_enum ("bitcomp-algo", "Bitcomp Algo",
          "Algorithm to use for bitcomp method",
          GST_TYPE_NV_COMP_BITCOMP_ALGO, DEFAULT_BITCOMP_ALGO, rw_flags));
  g_object_class_install_property (gobject_klass, PROP_DATA_TYPE,
      g_param_spec_enum ("data-type", "Data Type", "Compression data type",
          GST_TYPE_NV_COMP_DATA_TYPE, DEFAULT_DATA_TYPE, rw_flags));

  /* Properties that additionally carry GST_PARAM_MUTABLE_READY */
  g_object_class_install_property (gobject_klass, PROP_CHUNK_SIZE,
      g_param_spec_uint ("chunk-size", "Chunk Size",
          "Uncompressed chunk size for batched compression (0 = default)",
          0, G_MAXINT32, DEFAULT_CHUNK_SIZE, rw_ready_flags));
  g_object_class_install_property (gobject_klass, PROP_BATCHED,
      g_param_spec_boolean ("batched", "Batched",
          "Use low-level C API for batched operation", DEFAULT_BATCHED,
          rw_ready_flags));
  g_object_class_install_property (gobject_klass, PROP_ASYNC_DEPTH,
      g_param_spec_uint ("async-depth", "Async Depth",
          "Internal resource pool size for threaded encoding",
          1, 4, DEFAULT_ASYNC_DEPTH, rw_ready_flags));

  gst_element_class_add_static_pad_template (elem_klass, &sink_template);
  gst_element_class_add_static_pad_template (elem_klass, &src_template);
  gst_element_class_set_static_metadata (elem_klass,
      "nvCOMP Video Encoder", "Encoder/Video/Hardware",
      "Lossless video compression element based on nvCOMP library",
      "Seungha Yang <seungha@centricular.com>");

  elem_klass->set_context =
      GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_set_context);

  venc_klass->open = GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_open);
  venc_klass->close = GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_close);
  venc_klass->stop = GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_stop);
  venc_klass->flush = GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_flush);
  venc_klass->finish = GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_finish);
  venc_klass->sink_query =
      GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_sink_query);
  venc_klass->src_query = GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_src_query);
  venc_klass->propose_allocation =
      GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_propose_allocation);
  venc_klass->set_format =
      GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_set_format);
  venc_klass->handle_frame =
      GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_handle_frame);

  GST_DEBUG_CATEGORY_INIT (gst_nv_comp_video_enc_debug,
      "nvcompvideoenc", 0, "nvcompvideoenc");
}
/* Instance initialiser: creates the C++ private state with its defaults */
static void
gst_nv_comp_video_enc_init (GstNvCompVideoEnc * self)
{
  auto fresh_state = new GstNvCompVideoEncPrivate ();

  self->priv = fresh_state;
}
/* GObject finalize: destroys the C++ private state, then chains up */
static void
gst_nv_comp_video_enc_finalize (GObject * object)
{
  GstNvCompVideoEnc *enc = GST_NV_COMP_VIDEO_ENC (object);
  GObjectClass *parent_gobject_class = G_OBJECT_CLASS (parent_class);

  delete enc->priv;

  parent_gobject_class->finalize (object);
}
/* GObject property setter. The private lock is held for the whole call so
 * that a value is never stored while another holder of the lock reads the
 * settings. */
static void
gst_nv_comp_video_enc_set_property (GObject * object, guint prop_id,
    const GValue * value, GParamSpec * pspec)
{
  GstNvCompVideoEncPrivate *settings = GST_NV_COMP_VIDEO_ENC (object)->priv;
  std::lock_guard < std::mutex > guard (settings->lock);

  if (prop_id == PROP_METHOD) {
    settings->method = (GstNvCompMethod) g_value_get_enum (value);
  } else if (prop_id == PROP_DEFLATE_ALGO) {
    settings->deflate_algo = (GstNvCompDeflateAlgo) g_value_get_enum (value);
  } else if (prop_id == PROP_BITCOMP_ALGO) {
    settings->bitcomp_algo = (GstNvCompBitcompAlgo) g_value_get_enum (value);
  } else if (prop_id == PROP_DATA_TYPE) {
    settings->data_type = (GstNvCompDataType) g_value_get_enum (value);
  } else if (prop_id == PROP_CHUNK_SIZE) {
    settings->chunk_size = g_value_get_uint (value);
  } else if (prop_id == PROP_BATCHED) {
    settings->batched = g_value_get_boolean (value);
  } else if (prop_id == PROP_ASYNC_DEPTH) {
    settings->async_depth = g_value_get_uint (value);
  } else {
    G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
  }
}
/* GObject property getter. The private lock is held for the whole call so
 * that the value read is consistent with concurrent setters. */
static void
gst_nv_comp_video_enc_get_property (GObject * object, guint prop_id,
    GValue * value, GParamSpec * pspec)
{
  GstNvCompVideoEncPrivate *settings = GST_NV_COMP_VIDEO_ENC (object)->priv;
  std::lock_guard < std::mutex > guard (settings->lock);

  if (prop_id == PROP_METHOD) {
    g_value_set_enum (value, settings->method);
  } else if (prop_id == PROP_DEFLATE_ALGO) {
    g_value_set_enum (value, settings->deflate_algo);
  } else if (prop_id == PROP_BITCOMP_ALGO) {
    g_value_set_enum (value, settings->bitcomp_algo);
  } else if (prop_id == PROP_DATA_TYPE) {
    g_value_set_enum (value, settings->data_type);
  } else if (prop_id == PROP_CHUNK_SIZE) {
    g_value_set_uint (value, settings->chunk_size);
  } else if (prop_id == PROP_BATCHED) {
    g_value_set_boolean (value, settings->batched);
  } else if (prop_id == PROP_ASYNC_DEPTH) {
    g_value_set_uint (value, settings->async_depth);
  } else {
    G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
  }
}
/* GstElement::set_context: offers the context to the CUDA helper and, in GL
 * builds, to the GL helper before chaining up to the parent class. */
static void
gst_nv_comp_video_enc_set_context (GstElement * element, GstContext * context)
{
  GstNvCompVideoEncPrivate *priv = GST_NV_COMP_VIDEO_ENC (element)->priv;

  /* -1: no specific CUDA device is requested */
  gst_cuda_handle_set_context (element, context, -1, &priv->ctx);

#ifdef HAVE_GST_GL
  /* The GL API filter is only applied when the helper accepted the context
   * and a display is available */
  if (gst_gl_handle_set_context (element, context, &priv->gl_display,
          &priv->other_gl_context) && priv->gl_display) {
    gst_gl_display_filter_gl_api (priv->gl_display, GST_GL_API_OPENGL3);
  }
#endif

  GstElementClass *parent_element_class = GST_ELEMENT_CLASS (parent_class);
  parent_element_class->set_context (element, context);
}
/* GstVideoEncoder::open: makes sure a CUDA context is available and creates
 * the CUDA stream of this element.
 *
 * Returns FALSE when no CUDA context could be obtained. */
static gboolean
gst_nv_comp_video_enc_open (GstVideoEncoder * encoder)
{
  GstNvCompVideoEnc *enc = GST_NV_COMP_VIDEO_ENC (encoder);
  GstNvCompVideoEncPrivate *priv = enc->priv;

  /* -1: any CUDA device is acceptable */
  gboolean have_context =
      gst_cuda_ensure_element_context (GST_ELEMENT_CAST (encoder), -1,
      &priv->ctx);

  if (have_context) {
    priv->stream = gst_cuda_stream_new (priv->ctx);
    return TRUE;
  }

  GST_ERROR_OBJECT (enc, "Couldn't get cuda context");
  return FALSE;
}
/* GstVideoEncoder::close: releases everything acquired by open() and
 * set_context().
 *
 * The stream is dropped before the CUDA context it was created from. In GL
 * builds all three GL objects are released; the GL display in particular
 * used to be skipped (gl_context was cleared twice instead), which leaked one
 * display reference per open/close cycle. */
static gboolean
gst_nv_comp_video_enc_close (GstVideoEncoder * encoder)
{
  auto self = GST_NV_COMP_VIDEO_ENC (encoder);
  auto priv = self->priv;

  gst_clear_cuda_stream (&priv->stream);
  gst_clear_object (&priv->ctx);
#ifdef HAVE_GST_GL
  gst_clear_object (&priv->other_gl_context);
  gst_clear_object (&priv->gl_context);
  gst_clear_object (&priv->gl_display);
#endif

  return TRUE;
}
/* Stops the encode thread, if there is one, and waits until it has exited.
 *
 * @locked: TRUE when the caller holds the video encoder stream lock. The
 * lock is then released for the duration of the join and taken again before
 * returning. */
static void
gst_nv_comp_video_enc_drain (GstNvCompVideoEnc * self, gboolean locked)
{
auto priv = self->priv;
/* No encode thread, nothing to wait for */
if (!priv->encode_thread)
return;
/* NOTE(review): presumably the encode thread needs the stream lock to
 * finish frames, hence it is dropped before joining - confirm against the
 * thread function */
if (locked)
GST_VIDEO_ENCODER_STREAM_UNLOCK (self);
{
std::lock_guard < std::mutex > lk (priv->output_lock);
/* A null task is appended after all pending tasks.
 * NOTE(review): presumably the thread function treats it as its exit
 * marker - confirm against the thread function */
priv->output_task_queue.push (nullptr);
priv->output_cond.notify_all ();
}
/* Joins the thread and resets the pointer to null */
g_clear_pointer (&priv->encode_thread, g_thread_join);
if (locked)
GST_VIDEO_ENCODER_STREAM_LOCK (self);
/* Start from a clean flow state */
priv->last_flow = GST_FLOW_OK;
}
/* GstVideoEncoder::stop: drains the encode thread and drops all per-stream
 * resources (nvCOMP manager, tasks, codec state and buffer pool). */
static gboolean
gst_nv_comp_video_enc_stop (GstVideoEncoder * encoder)
{
  GstNvCompVideoEnc *enc = GST_NV_COMP_VIDEO_ENC (encoder);
  GstNvCompVideoEncPrivate *priv = enc->priv;

  /* stop() is not called with the stream lock held, hence FALSE */
  gst_nv_comp_video_enc_drain (enc, FALSE);

  if (priv->ctx != nullptr) {
    /* The manager and the tasks are destroyed with the context pushed */
    gst_cuda_context_push (priv->ctx);
    priv->manager = nullptr;
    priv->cur_task = nullptr;
    priv->input_task_queue = decltype (priv->input_task_queue) ();
    priv->output_task_queue = decltype (priv->output_task_queue) ();
    gst_cuda_context_pop (nullptr);
  }

  g_clear_pointer (&priv->state, gst_video_codec_state_unref);

  if (priv->pool != nullptr) {
    gst_buffer_pool_set_active (priv->pool, FALSE);
    gst_clear_object (&priv->pool);
  }

  return TRUE;
}
/* GstVideoEncoder::flush: drains the encode thread; the stream lock is held
 * by the caller, hence TRUE. Always succeeds. */
static gboolean
gst_nv_comp_video_enc_flush (GstVideoEncoder * encoder)
{
  gst_nv_comp_video_enc_drain (GST_NV_COMP_VIDEO_ENC (encoder), TRUE);

  return TRUE;
}
/* GstVideoEncoder::finish: drains the encode thread; the stream lock is held
 * by the caller, hence TRUE. Always reports GST_FLOW_OK. */
static GstFlowReturn
gst_nv_comp_video_enc_finish (GstVideoEncoder * encoder)
{
  gst_nv_comp_video_enc_drain (GST_NV_COMP_VIDEO_ENC (encoder), TRUE);

  return GST_FLOW_OK;
}
/* Answers a context query from the GL objects (GL builds only) or from the
 * CUDA context of this element.
 *
 * Returns TRUE when one of the helpers handled the query. */
static gboolean
gst_nv_comp_video_enc_handle_context_query (GstNvCompVideoEnc * self,
    GstQuery * query)
{
  GstNvCompVideoEncPrivate *priv = self->priv;

#ifdef HAVE_GST_GL
  {
    /* Take temporary references so the objects stay alive during the call */
    GstGLDisplay *display_ref = priv->gl_display ?
        (GstGLDisplay *) gst_object_ref (priv->gl_display) : nullptr;
    GstGLContext *local_ref = priv->gl_context ?
        (GstGLContext *) gst_object_ref (priv->gl_context) : nullptr;
    GstGLContext *other_ref = priv->other_gl_context ?
        (GstGLContext *) gst_object_ref (priv->other_gl_context) : nullptr;

    gboolean handled = gst_gl_handle_context_query (GST_ELEMENT (self), query,
        display_ref, local_ref, other_ref);

    gst_clear_object (&display_ref);
    gst_clear_object (&other_ref);
    gst_clear_object (&local_ref);

    if (handled)
      return TRUE;
  }
#endif

  return gst_cuda_handle_context_query (GST_ELEMENT (self), query,
      priv->ctx) ? TRUE : FALSE;
}
/* GstVideoEncoder::sink_query: context queries are tried locally first, any
 * query not answered here goes to the parent class. */
static gboolean
gst_nv_comp_video_enc_sink_query (GstVideoEncoder * encoder, GstQuery * query)
{
  GstNvCompVideoEnc *enc = GST_NV_COMP_VIDEO_ENC (encoder);

  if (GST_QUERY_TYPE (query) == GST_QUERY_CONTEXT &&
      gst_nv_comp_video_enc_handle_context_query (enc, query)) {
    return TRUE;
  }

  return GST_VIDEO_ENCODER_CLASS (parent_class)->sink_query (encoder, query);
}
/* GstVideoEncoder::src_query: context queries are tried locally first, any
 * query not answered here goes to the parent class. */
static gboolean
gst_nv_comp_video_enc_src_query (GstVideoEncoder * encoder, GstQuery * query)
{
  GstNvCompVideoEnc *enc = GST_NV_COMP_VIDEO_ENC (encoder);

  if (GST_QUERY_TYPE (query) == GST_QUERY_CONTEXT &&
      gst_nv_comp_video_enc_handle_context_query (enc, query)) {
    return TRUE;
  }

  return GST_VIDEO_ENCODER_CLASS (parent_class)->src_query (encoder, query);
}
#ifdef HAVE_GST_GL
/* Callback handed to gst_gl_context_thread_add(): stores TRUE in @ret when
 * CuGLGetDevices() succeeds and reports at least one device, FALSE
 * otherwise. @context itself is not used. */
static void
check_cuda_device_from_gl_context (GstGLContext * context, gboolean * ret)
{
  CUdevice devices[1] = { 0, };
  guint num_devices = 0;

  *ret = FALSE;

  /* Only one slot is offered: the question is just "is there any device" */
  CUresult status = CuGLGetDevices (&num_devices, devices, 1,
      CU_GL_DEVICE_LIST_ALL);

  if (gst_cuda_result (status) && num_devices != 0)
    *ret = TRUE;
}
/* Makes sure a GL display and GL context are available, then checks from the
 * GL thread whether a CUDA device is reachable through that context.
 *
 * Returns TRUE only when display, context and CUDA device were all found. */
static gboolean
gst_nv_comp_video_enc_ensure_gl_context (GstNvCompVideoEnc * self)
{
  GstNvCompVideoEncPrivate *priv = self->priv;
  gboolean cuda_device_found = FALSE;

  gboolean have_display = gst_gl_ensure_element_data (GST_ELEMENT (self),
      &priv->gl_display, &priv->other_gl_context);
  if (!have_display) {
    GST_DEBUG_OBJECT (self, "Couldn't get GL display");
    return FALSE;
  }

  gst_gl_display_filter_gl_api (priv->gl_display, GST_GL_API_OPENGL3);

  gboolean have_context = gst_gl_display_ensure_context (priv->gl_display,
      priv->other_gl_context, &priv->gl_context, nullptr);
  if (!have_context) {
    GST_DEBUG_OBJECT (self, "Couldn't get GL context");
    return FALSE;
  }

  /* The callback writes its verdict into cuda_device_found */
  gst_gl_context_thread_add (priv->gl_context,
      (GstGLContextThreadFunc) check_cuda_device_from_gl_context,
      &cuda_device_found);

  return cuda_device_found;
}
#endif
/* GstVideoEncoder::propose_allocation: proposes a buffer pool matching the
 * memory type of the caps in the query (CUDA, GL in GL builds, otherwise a
 * plain video pool) together with video meta support.
 *
 * Returns FALSE when the query has no caps, the caps cannot be parsed or the
 * pool rejects its configuration. */
static gboolean
gst_nv_comp_video_enc_propose_allocation (GstVideoEncoder * encoder,
GstQuery * query)
{
auto self = GST_NV_COMP_VIDEO_ENC (encoder);
auto priv = self->priv;
GstBufferPool *pool = nullptr;
guint size;
GstCaps *caps;
gst_query_parse_allocation (query, &caps, nullptr);
if (!caps) {
GST_WARNING_OBJECT (self, "null caps in query");
return FALSE;
}
GstVideoInfo info;
if (!gst_video_info_from_caps (&info, caps)) {
GST_WARNING_OBJECT (self, "Failed to convert caps into info");
return FALSE;
}
/* Pick the pool type from the caps features of the first caps structure */
auto features = gst_caps_get_features (caps, 0);
gboolean use_cuda_pool = FALSE;
if (gst_caps_features_contains (features,
GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY)) {
GST_DEBUG_OBJECT (self, "upstream support CUDA memory");
pool = gst_cuda_buffer_pool_new (priv->ctx);
use_cuda_pool = TRUE;
}
#ifdef HAVE_GST_GL
else if (gst_caps_features_contains (features,
GST_CAPS_FEATURE_MEMORY_GL_MEMORY)) {
/* Without a usable GL context no GL pool is created and the generic
 * video pool below is used instead */
if (!gst_nv_comp_video_enc_ensure_gl_context (self)) {
priv->gl_interop = FALSE;
} else {
pool = gst_gl_buffer_pool_new (priv->gl_context);
}
}
#endif
/* Fallback for every case in which no specialised pool was created */
if (!pool)
pool = gst_video_buffer_pool_new ();
auto config = gst_buffer_pool_get_config (pool);
gst_buffer_pool_config_add_option (config, GST_BUFFER_POOL_OPTION_VIDEO_META);
size = GST_VIDEO_INFO_SIZE (&info);
gst_buffer_pool_config_set_params (config, caps, size, 0, 0);
if (use_cuda_pool && priv->stream) {
/* Set our stream on buffer pool config so that CUstream can be shared */
gst_buffer_pool_config_set_cuda_stream (config, priv->stream);
}
/* This config is handed over to the pool and not touched again here */
if (!gst_buffer_pool_set_config (pool, config)) {
GST_WARNING_OBJECT (self, "Failed to set pool config");
gst_object_unref (pool);
return FALSE;
}
/* Read the buffer size back from the pool, since the value stored in the
 * accepted config is what gets advertised; this second config is only read
 * and is freed right after */
config = gst_buffer_pool_get_config (pool);
gst_buffer_pool_config_get_params (config, nullptr, &size, nullptr, nullptr);
gst_structure_free (config);
gst_query_add_allocation_pool (query, pool, size, 0, 0);
gst_query_add_allocation_meta (query, GST_VIDEO_META_API_TYPE, nullptr);
/* Drop the local pool reference now that the pool was added to the query */
gst_object_unref (pool);
return TRUE;
}
/* Allocates all CUDA resources of @task.
 *
 * @task: task to fill; NOTE(review): presumably called with the task's CUDA
 *   context pushed, as the Cu* calls below need a current context - confirm
 *   at the call site
 * @batched: TRUE to additionally allocate and upload the per-chunk tables
 * @uncompressed_size: size in bytes of one uncompressed frame
 * @compressed_size: size in bytes of the compressed output buffer
 * @batch_size: number of chunks (batched mode only)
 * @chunk_size: uncompressed size of every chunk except the last one
 * @output_chunk_size: distance in bytes between compressed chunks in the
 *   output buffer
 * @temp_size: scratch buffer size, 0 for none
 *
 * Returns FALSE as soon as one CUDA call fails. Buffers allocated up to that
 * point stay in @task and are released by the EncoderTask destructor. */
static gboolean
gst_nv_comp_video_enc_alloc_task (GstNvCompVideoEnc * self, EncoderTask * task,
    gboolean batched, size_t uncompressed_size, size_t compressed_size,
    size_t batch_size, size_t chunk_size, size_t output_chunk_size,
    size_t temp_size)
{
  /* The per-chunk tables hold either byte counts or pointers; size each
   * kind with its own element type instead of assuming both are equal */
  const size_t bytes_table_size = sizeof (size_t) * batch_size;
  const size_t ptrs_table_size = sizeof (void *) * batch_size;
  uint8_t *uncomp_data;
  uint8_t *comp_data;

  auto ret = CuEventCreate (&task->event,
      CU_EVENT_BLOCKING_SYNC | CU_EVENT_DISABLE_TIMING);
  if (!gst_cuda_result (ret))
    return FALSE;

  /* Whole-frame buffers; both sizes are padded to a multiple of 8 bytes */
  auto aligned_uncompressed_size = GST_ROUND_UP_8 (uncompressed_size);
  ret = CuMemAlloc ((CUdeviceptr *) & task->device_uncompressed,
      aligned_uncompressed_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  ret = CuMemAllocHost ((void **) &task->host_uncompressed,
      aligned_uncompressed_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  auto aligned_compressed_size = GST_ROUND_UP_8 (compressed_size);
  ret = CuMemAlloc ((CUdeviceptr *) & task->device_compressed,
      aligned_compressed_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  ret = CuMemAllocHost ((void **) &task->host_compressed,
      aligned_compressed_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  /* Record the mode before the early return below so that the flag is
   * defined for non-batched tasks as well */
  task->batched = batched;

  /* The non-batched mode only needs the whole-frame buffers */
  if (!batched)
    return TRUE;

  ret = CuMemAllocHost ((void **) &task->host_uncompressed_bytes,
      bytes_table_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  ret = CuMemAllocHost ((void **) &task->host_uncompressed_ptrs,
      ptrs_table_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  /* Every chunk is chunk_size bytes, the last one takes the remainder */
  for (size_t i = 0; i < batch_size; i++) {
    if (i + 1 < batch_size)
      task->host_uncompressed_bytes[i] = chunk_size;
    else
      task->host_uncompressed_bytes[i] = (uncompressed_size - (chunk_size * i));
  }

  ret = CuMemAlloc ((CUdeviceptr *) & task->device_uncompressed_bytes,
      bytes_table_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  ret = CuMemAlloc ((CUdeviceptr *) & task->device_uncompressed_ptrs,
      ptrs_table_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  ret = CuMemAlloc ((CUdeviceptr *) & task->device_compressed_bytes,
      bytes_table_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  ret = CuMemAlloc ((CUdeviceptr *) & task->device_compressed_ptrs,
      ptrs_table_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  ret = CuMemAllocHost ((void **) &task->host_compressed_bytes,
      bytes_table_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  ret = CuMemAllocHost ((void **) &task->host_compressed_ptrs,
      ptrs_table_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  if (temp_size > 0) {
    ret = CuMemAlloc ((CUdeviceptr *) & task->temp_ptr, temp_size);
    if (!gst_cuda_result (ret))
      return FALSE;
  }

  task->temp_size = temp_size;

  /* Chunk i starts at i * chunk_size in the uncompressed buffer and at
   * i * output_chunk_size in the compressed buffer */
  uncomp_data = task->device_uncompressed;
  comp_data = task->device_compressed;
  for (size_t i = 0; i < batch_size; i++) {
    task->host_uncompressed_ptrs[i] = uncomp_data;
    uncomp_data += chunk_size;

    task->host_compressed_ptrs[i] = comp_data;
    comp_data += output_chunk_size;
  }

  /* Upload the tables that stay constant for the lifetime of the task */
  ret = CuMemcpyHtoD ((CUdeviceptr) task->device_uncompressed_bytes,
      task->host_uncompressed_bytes, bytes_table_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  ret = CuMemcpyHtoD ((CUdeviceptr) task->device_uncompressed_ptrs,
      task->host_uncompressed_ptrs, ptrs_table_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  ret = CuMemcpyHtoD ((CUdeviceptr) task->device_compressed_ptrs,
      task->host_compressed_ptrs, ptrs_table_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  task->batch_size = batch_size;
  task->chunk_size = chunk_size;
  task->max_output_chunk_size = output_chunk_size;
  task->compressed_alloc_size = aligned_compressed_size;

  return TRUE;
}
static gboolean
gst_nv_comp_video_enc_set_format (GstVideoEncoder * encoder,
GstVideoCodecState * state)
{
auto self = GST_NV_COMP_VIDEO_ENC (encoder);
auto priv = self->priv;
gst_nv_comp_video_enc_drain (self, TRUE);
std::lock_guard < std::mutex > lk (priv->lock);
if (!priv->ctx) {
GST_ERROR_OBJECT (self, "CUDA context was not configured");
return FALSE;
}
if (priv->pool) {
gst_buffer_pool_set_active (priv->pool, FALSE);
gst_clear_object (&priv->pool);
}
g_clear_pointer (&priv->state, gst_video_codec_state_unref);
priv->state = gst_video_codec_state_ref (state);
std::string mime_type = "video/x-nvcomp";
if (!gst_cuda_context_push (priv->ctx)) {
GST_ERROR_OBJECT (self, "Couldn't push context");
return FALSE;
}
priv->gl_interop = FALSE;
#if HAVE_GST_GL
auto features = gst_caps_get_features (state->caps, 0);
if (gst_caps_features_contains (features, GST_CAPS_FEATURE_MEMORY_GL_MEMORY))
priv->gl_interop = TRUE;
#endif
priv->manager = nullptr;
priv->config = nullptr;
priv->batched_comp = nullptr;
priv->input_task_queue = { };
priv->output_task_queue = { };
auto stream = (cudaStream_t) gst_cuda_stream_get_handle (priv->stream);
guint device_id = 0;
g_object_get (priv->ctx, "cuda-device-id", &device_id, nullptr);
size_t chunk_size = priv->chunk_size;
size_t batch_size = 0;
switch (priv->method) {
case GST_NV_COMP_LZ4:
{
nvcompBatchedLZ4Opts_t opts = nvcompBatchedLZ4DefaultOpts;
if (priv->data_type != GST_NV_COMP_DATA_TYPE_DEFAULT)
opts.data_type = (nvcompType_t) priv->data_type;
if (chunk_size == 0)
chunk_size = 65536;
chunk_size = MAX (32768, chunk_size);
chunk_size = GST_ROUND_UP_8 (chunk_size);
chunk_size = MIN (chunk_size, nvcompLZ4CompressionMaxAllowedChunkSize);
if (priv->batched) {
batch_size = (state->info.size + chunk_size - 1) / chunk_size;
priv->batched_comp =
std::make_shared < BatchedComp < nvcompBatchedLZ4Opts_t,
nvcompBatchedLZ4CompressGetTempSize,
nvcompBatchedLZ4CompressGetMaxOutputChunkSize,
nvcompBatchedLZ4CompressAsync >> (opts);
mime_type = "video/x-nvcomp-lz4";
} else {
priv->manager = std::make_shared < LZ4Manager > (chunk_size,
opts, stream, device_id);
}
GST_DEBUG_OBJECT (self, "Using LZ4");
break;
}
case GST_NV_COMP_SNAPPY:
{
if (chunk_size == 0)
chunk_size = 65536;
chunk_size = MAX (32768, chunk_size);
chunk_size = GST_ROUND_UP_8 (chunk_size);
chunk_size = MIN (chunk_size, nvcompSnappyCompressionMaxAllowedChunkSize);
if (priv->batched) {
batch_size = (state->info.size + chunk_size - 1) / chunk_size;
priv->batched_comp =
std::make_shared < BatchedComp < nvcompBatchedSnappyOpts_t,
nvcompBatchedSnappyCompressGetTempSize,
nvcompBatchedSnappyCompressGetMaxOutputChunkSize,
nvcompBatchedSnappyCompressAsync >>
(nvcompBatchedSnappyDefaultOpts);
mime_type = "video/x-nvcomp-snappy";
} else {
priv->manager = std::make_shared < SnappyManager > (chunk_size,
nvcompBatchedSnappyDefaultOpts, stream, device_id);
}
GST_DEBUG_OBJECT (self, "Using SNAPPY");
break;
}
case GST_NV_COMP_GDEFLATE:
{
nvcompBatchedGdeflateOpts_t opts;
opts.algo = (int) priv->deflate_algo;
if (chunk_size == 0)
chunk_size = 65536;
chunk_size = MAX (32768, chunk_size);
chunk_size = GST_ROUND_UP_8 (chunk_size);
chunk_size = MIN (chunk_size,
nvcompGdeflateCompressionMaxAllowedChunkSize);
if (priv->batched) {
batch_size = (state->info.size + chunk_size - 1) / chunk_size;
priv->batched_comp =
std::make_shared < BatchedComp < nvcompBatchedGdeflateOpts_t,
nvcompBatchedGdeflateCompressGetTempSize,
nvcompBatchedGdeflateCompressGetMaxOutputChunkSize,
nvcompBatchedGdeflateCompressAsync >> (opts);
mime_type = "video/x-nvcomp-gdeflate";
} else {
priv->manager = std::make_shared < GdeflateManager > (chunk_size,
opts, stream, device_id);
}
GST_DEBUG_OBJECT (self, "Using GDEFLATE");
break;
}
case GST_NV_COMP_DEFLATE:
{
if (chunk_size == 0)
chunk_size = 65536;
chunk_size = MAX (32768, chunk_size);
chunk_size = GST_ROUND_UP_8 (chunk_size);
chunk_size = MIN (chunk_size,
nvcompDeflateCompressionMaxAllowedChunkSize);
nvcompBatchedDeflateOpts_t opts;
opts.algo = (int) priv->deflate_algo;
if (priv->batched) {
batch_size = (state->info.size + chunk_size - 1) / chunk_size;
priv->batched_comp =
std::make_shared < BatchedComp < nvcompBatchedDeflateOpts_t,
nvcompBatchedDeflateCompressGetTempSize,
nvcompBatchedDeflateCompressGetMaxOutputChunkSize,
nvcompBatchedDeflateCompressAsync >> (opts);
mime_type = "video/x-nvcomp-deflate";
} else {
priv->manager = std::make_shared < DeflateManager > (chunk_size,
opts, stream, device_id);
}
GST_DEBUG_OBJECT (self, "Using DEFLATE");
break;
}
case GST_NV_COMP_ZSTD:
{
if (chunk_size == 0)
chunk_size = 65536;
chunk_size = MAX (32768, chunk_size);
chunk_size = GST_ROUND_UP_8 (chunk_size);
chunk_size = MIN (chunk_size, nvcompZstdCompressionMaxAllowedChunkSize);
if (priv->batched) {
batch_size = (state->info.size + chunk_size - 1) / chunk_size;
priv->batched_comp =
std::make_shared < BatchedComp < nvcompBatchedZstdOpts_t,
nvcompBatchedZstdCompressGetTempSize,
nvcompBatchedZstdCompressGetMaxOutputChunkSize,
nvcompBatchedZstdCompressAsync >> (nvcompBatchedZstdDefaultOpts);
mime_type = "video/x-nvcomp-zstd";
} else {
priv->manager = std::make_shared < ZstdManager > (chunk_size,
nvcompBatchedZstdDefaultOpts, stream, device_id);
}
GST_DEBUG_OBJECT (self, "Using ZSTD");
break;
}
case GST_NV_COMP_CASCADED:
{
if (chunk_size == 0)
chunk_size = 4096;
chunk_size = MAX (512, chunk_size);
chunk_size = GST_ROUND_UP_8 (chunk_size);
chunk_size = MIN (chunk_size, 16384);
nvcompBatchedCascadedOpts_t opts = nvcompBatchedCascadedDefaultOpts;
opts.chunk_size = chunk_size;
if (priv->data_type != GST_NV_COMP_DATA_TYPE_DEFAULT)
opts.type = (nvcompType_t) priv->data_type;
if (priv->batched) {
batch_size = (state->info.size + chunk_size - 1) / chunk_size;
priv->batched_comp =
std::make_shared < BatchedComp < nvcompBatchedCascadedOpts_t,
nvcompBatchedCascadedCompressGetTempSize,
nvcompBatchedCascadedCompressGetMaxOutputChunkSize,
nvcompBatchedCascadedCompressAsync >> (opts);
mime_type = "video/x-nvcomp-cascaded";
} else {
priv->manager = std::make_shared < CascadedManager > (chunk_size,
opts, stream, device_id);
}
GST_DEBUG_OBJECT (self, "Using CASCADED");
break;
}
case GST_NV_COMP_BITCOMP:
{
if (chunk_size == 0)
chunk_size = 65536;
chunk_size = MAX (32768, chunk_size);
chunk_size = GST_ROUND_UP_8 (chunk_size);
chunk_size = MIN (chunk_size,
nvcompBitcompCompressionMaxAllowedChunkSize);
nvcompBatchedBitcompFormatOpts opts = nvcompBatchedBitcompDefaultOpts;
opts.algorithm_type = (int) priv->bitcomp_algo;
if (priv->data_type != GST_NV_COMP_DATA_TYPE_DEFAULT)
opts.data_type = (nvcompType_t) priv->data_type;
if (priv->batched) {
batch_size = (state->info.size + chunk_size - 1) / chunk_size;
priv->batched_comp =
std::make_shared < BatchedComp < nvcompBatchedBitcompFormatOpts,
nvcompBatchedBitcompCompressGetTempSize,
nvcompBatchedBitcompCompressGetMaxOutputChunkSize,
nvcompBatchedBitcompCompressAsync >> (opts);
mime_type = "video/x-nvcomp-bitcomp";
} else {
priv->manager = std::make_shared < BitcompManager > (chunk_size,
opts, stream, device_id);
}
GST_DEBUG_OBJECT (self, "Using BITCOMP");
break;
}
case GST_NV_COMP_ANS:
{
if (chunk_size == 0)
chunk_size = 65536;
chunk_size = MAX (32768, chunk_size);
chunk_size = GST_ROUND_UP_8 (chunk_size);
chunk_size = MIN (chunk_size, nvcompANSCompressionMaxAllowedChunkSize);
if (priv->batched) {
batch_size = (state->info.size + chunk_size - 1) / chunk_size;
priv->batched_comp =
std::make_shared < BatchedComp < nvcompBatchedANSOpts_t,
nvcompBatchedANSCompressGetTempSize,
nvcompBatchedANSCompressGetMaxOutputChunkSize,
nvcompBatchedANSCompressAsync >> (nvcompBatchedANSDefaultOpts);
mime_type = "video/x-nvcomp-ans";
} else {
priv->manager = std::make_shared < ANSManager > (chunk_size,
nvcompBatchedANSDefaultOpts, stream, device_id);
}
GST_DEBUG_OBJECT (self, "Using ANS");
break;
}
default:
g_assert_not_reached ();
return FALSE;
}
size_t max_output_size = 0;
size_t max_output_chunk_size = 0;
size_t temp_size = 0;
if (priv->batched) {
auto status = priv->batched_comp->get_temp_size (batch_size,
chunk_size, &temp_size);
if (status != nvcompSuccess) {
GST_ERROR_OBJECT (self, "Couldn't get temp size");
gst_cuda_context_pop (nullptr);
return FALSE;
}
status = priv->batched_comp->get_max_compressed_chunk_size (chunk_size,
&max_output_chunk_size);
if (status != nvcompSuccess) {
GST_ERROR_OBJECT (self, "Couldn't get max output chunk size");
gst_cuda_context_pop (nullptr);
return FALSE;
}
max_output_chunk_size = GST_ROUND_UP_8 (max_output_chunk_size);
max_output_size = max_output_chunk_size * batch_size;
} else {
priv->config = std::make_shared < CompressionConfig >
(priv->manager->configure_compression (state->info.size));
max_output_size = priv->config->max_compressed_buffer_size;
}
GST_DEBUG_OBJECT (self, "Allocating resource, batched: %d"
", uncompressed size: %" G_GSIZE_FORMAT
", max-output-size: %" G_GSIZE_FORMAT
", batch-size: %" G_GSIZE_FORMAT
", chunk-size: %" G_GSIZE_FORMAT
", max-output-chunk-size: %" G_GSIZE_FORMAT
", temp-size: %" G_GSIZE_FORMAT, priv->batched, state->info.size,
max_output_size, batch_size, chunk_size, max_output_chunk_size,
temp_size);
for (guint i = 0; i < priv->async_depth; i++) {
auto task = std::make_shared < EncoderTask > ();
task->ctx = (GstCudaContext *) gst_object_ref (priv->ctx);
if (!gst_nv_comp_video_enc_alloc_task (self, task.get (), priv->batched,
state->info.size, max_output_size, batch_size, chunk_size,
max_output_chunk_size, temp_size)) {
priv->manager = nullptr;
priv->input_task_queue = { };
task = nullptr;
gst_cuda_context_pop (nullptr);
return FALSE;
}
priv->input_task_queue.push (task);
}
/* In case of batched, custom header is added to signal chunk and batch size */
if (priv->batched) {
/* version */
max_output_size += sizeof (guint32);
/* max uncompressed chunk size */
max_output_size += sizeof (guint32);
/* max compressed chunk size */
max_output_size += sizeof (guint32);
/* batch size */
max_output_size += sizeof (guint32);
/* each uncompressed/compressed chunk size */
max_output_size += (sizeof (guint32) * batch_size * 2);
}
priv->pool = gst_buffer_pool_new ();
auto config = gst_buffer_pool_get_config (priv->pool);
gst_buffer_pool_config_set_params (config, nullptr, max_output_size, 0, 0);
gst_buffer_pool_set_config (priv->pool, config);
gst_buffer_pool_set_active (priv->pool, TRUE);
gst_cuda_context_pop (nullptr);
auto caps = gst_caps_new_simple (mime_type.c_str (), "format", G_TYPE_STRING,
gst_video_format_to_string (GST_VIDEO_INFO_FORMAT (&state->info)),
nullptr);
auto out_state =
gst_video_encoder_set_output_state (GST_VIDEO_ENCODER (encoder),
caps, state);
gst_video_codec_state_unref (out_state);
return TRUE;
}
/* Copies every plane of @frame into the uncompressed staging area of the
 * current task (priv->cur_task), packing it with the strides and offsets of
 * the negotiated video info.
 *
 * @frame: mapped input frame; its plane pointers are used as CUDA device
 *     addresses when @is_device_copy is TRUE and as host addresses otherwise
 * @stream: CUDA stream used for the device-to-device copies
 * @is_device_copy: TRUE to copy into the task's device buffer with CUDA,
 *     FALSE to memcpy() into the task's host buffer
 *
 * Returns: FALSE if any CUDA copy could not be issued, TRUE otherwise */
static gboolean
gst_nv_comp_video_enc_upload (GstNvCompVideoEnc * self, GstVideoFrame * frame,
    CUstream stream, gboolean is_device_copy)
{
  auto priv = self->priv;
  auto info = &priv->state->info;
  auto finfo = info->finfo;
  gint comp[GST_VIDEO_MAX_COMPONENTS];
  CUresult ret = CUDA_SUCCESS;
  auto cur_task = priv->cur_task;

  for (guint i = 0; i < GST_VIDEO_FRAME_N_PLANES (frame); i++) {
    guint8 *sp = (guint8 *) GST_VIDEO_FRAME_PLANE_DATA (frame, i);
    guint8 *dp;
    guint ss, ds;
    guint w, h;

    if (is_device_copy)
      dp = cur_task->device_uncompressed + info->offset[i];
    else
      dp = cur_task->host_uncompressed + info->offset[i];

    if (GST_VIDEO_FORMAT_INFO_HAS_PALETTE (finfo) && i == 1) {
      /* Plane 1 of a paletted format is the palette: 256 entries of
       * 4 bytes each. Nothing is processed after it. */
      if (is_device_copy) {
        ret = CuMemcpyDtoDAsync ((CUdeviceptr) dp, (CUdeviceptr) sp,
            256 * 4, stream);
      } else {
        memcpy (dp, sp, 256 * 4);
      }

      if (!gst_cuda_result (ret)) {
        GST_ERROR_OBJECT (self, "CUDA memcpy failed");
        return FALSE;
      }

      return TRUE;
    }

    ss = GST_VIDEO_FRAME_PLANE_STRIDE (frame, i);
    ds = GST_VIDEO_INFO_PLANE_STRIDE (info, i);

    gst_video_format_info_component (finfo, i, comp);
    w = GST_VIDEO_INFO_COMP_WIDTH (info, comp[0]) *
        GST_VIDEO_INFO_COMP_PSTRIDE (info, comp[0]);
    if (w == 0)
      w = MIN (ss, ds);

    h = GST_VIDEO_INFO_COMP_HEIGHT (info, comp[0]);

    if (GST_VIDEO_FORMAT_INFO_IS_TILED (finfo)) {
      gint tile_size;
      gint sx_tiles, sy_tiles, dx_tiles, dy_tiles;
      GstVideoTileMode mode;

      tile_size = GST_VIDEO_FORMAT_INFO_TILE_SIZE (info->finfo, i);
      mode = GST_VIDEO_FORMAT_INFO_TILE_MODE (info->finfo);

      sx_tiles = GST_VIDEO_TILE_X_TILES (ss);
      sy_tiles = GST_VIDEO_TILE_Y_TILES (ss);
      dx_tiles = GST_VIDEO_TILE_X_TILES (ds);
      dy_tiles = GST_VIDEO_TILE_Y_TILES (ds);

      /* For tiled formats w and h are the number of tiles to copy, not a
       * size in bytes */
      w = MIN (sx_tiles, dx_tiles);
      h = MIN (sy_tiles, dy_tiles);

      for (guint j = 0; j < h; j++) {
        for (guint k = 0; k < w; k++) {
          guint si, di;
          guint8 *cur_dp;
          guint8 *cur_sp;

          si = gst_video_tile_get_index (mode, k, j, sx_tiles, sy_tiles);
          di = gst_video_tile_get_index (mode, k, j, dx_tiles, dy_tiles);

          cur_dp = dp + (di * tile_size);
          cur_sp = sp + (si * tile_size);

          /* Each iteration moves one whole tile, so the copy length is the
           * tile size in bytes (not the tile count held in w) */
          if (is_device_copy) {
            ret = CuMemcpyDtoDAsync ((CUdeviceptr) cur_dp, (CUdeviceptr) cur_sp,
                tile_size, stream);
          } else {
            memcpy (cur_dp, cur_sp, tile_size);
          }

          if (!gst_cuda_result (ret)) {
            GST_ERROR_OBJECT (self, "CUDA memcpy failed");
            return FALSE;
          }
        }
      }
    } else {
      if (is_device_copy) {
        CUDA_MEMCPY2D params = { };

        params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
        params.srcDevice = (CUdeviceptr) sp;
        params.srcPitch = ss;
        params.dstMemoryType = CU_MEMORYTYPE_DEVICE;
        params.dstDevice = (CUdeviceptr) dp;
        params.dstPitch = ds;
        params.WidthInBytes = w;
        params.Height = h;

        ret = CuMemcpy2DAsync (&params, stream);
        if (!gst_cuda_result (ret)) {
          GST_ERROR_OBJECT (self, "CUDA memcpy failed");
          return FALSE;
        }
      } else {
        /* Row by row copy, dropping any padding of the source stride */
        for (guint j = 0; j < h; j++) {
          memcpy (dp, sp, w);
          dp += ds;
          sp += ss;
        }
      }
    }
  }

  return TRUE;
}
#ifdef HAVE_GST_GL
/* Argument bundle handed to the function executed on the GL thread for the
 * GL -> CUDA upload */
struct GLInteropData
{
  /* Encoder instance performing the upload */
  GstNvCompVideoEnc *self { nullptr };
  /* Input buffer to read from */
  GstBuffer *buffer { nullptr };
  /* Result of the upload, TRUE on success */
  gboolean ret { FALSE };
};
/* Looks up the CUDA graphics resource attached to @mem as qdata, creating
 * and registering one for the memory's GL buffer when none is attached yet.
 *
 * Returns: the resource, owned by @mem through its qdata, or nullptr when
 * @mem is not GL PBO memory, cannot be mapped or cannot be registered */
static GstCudaGraphicsResource *
ensure_gl_cuda_resource (GstNvCompVideoEnc * self, GstMemory * mem)
{
  auto priv = self->priv;

  if (!gst_is_gl_memory_pbo (mem)) {
    GST_WARNING_OBJECT (self, "memory is not GL PBO memory, %s",
        mem->allocator->mem_type);
    return nullptr;
  }

  const GQuark res_quark =
      gst_cuda_quark_from_id (GST_CUDA_QUARK_GRAPHICS_RESOURCE);

  /* Fast path: a resource was already attached by an earlier call */
  auto cached = (GstCudaGraphicsResource *)
      gst_mini_object_get_qdata (GST_MINI_OBJECT (mem), res_quark);
  if (cached)
    return cached;

  GstGLBuffer *pbo_buf = ((GstGLMemoryPBO *) mem)->pbo;
  GstMapInfo gl_map;

  /* The memory stays mapped for GL access while it is being registered */
  if (!gst_memory_map (mem, &gl_map,
          (GstMapFlags) (GST_MAP_READ | GST_MAP_GL))) {
    GST_ERROR_OBJECT (self, "Couldn't map gl memory");
    return nullptr;
  }

  auto fresh = gst_cuda_graphics_resource_new (priv->ctx,
      GST_OBJECT (GST_GL_BASE_MEMORY_CAST (mem)->context),
      GST_CUDA_GRAPHICS_RESOURCE_GL_BUFFER);

  GST_LOG_OBJECT (self, "registering gl buffer %d to CUDA", pbo_buf->id);
  const gboolean registered =
      gst_cuda_graphics_resource_register_gl_buffer (fresh, pbo_buf->id,
      CU_GRAPHICS_REGISTER_FLAGS_NONE);

  gst_memory_unmap (mem, &gl_map);

  if (!registered) {
    GST_ERROR_OBJECT (self, "Couldn't register gl buffer %d", pbo_buf->id);
    gst_cuda_graphics_resource_free (fresh);
    return nullptr;
  }

  /* Attach to the memory so that the resource is freed along with it */
  gst_mini_object_set_qdata (GST_MINI_OBJECT (mem), res_quark, fresh,
      (GDestroyNotify) gst_cuda_graphics_resource_free);

  return fresh;
}
/* Function executed via gst_gl_context_thread_add(): maps the GL PBO memory
 * of every plane of data->buffer as a CUDA graphics resource and copies it
 * into the device staging buffer of the current task (priv->cur_task).
 *
 * data->ret is set to TRUE only when all copies were issued and the stream
 * synchronized successfully; otherwise it is left/set to FALSE. */
static void
gst_nv_comp_video_enc_upload_gl (GstGLContext * context, GLInteropData * data)
{
  auto self = data->self;
  auto priv = self->priv;
  auto info = &priv->state->info;
  auto finfo = info->finfo;
  GstCudaGraphicsResource *gst_res[GST_VIDEO_MAX_PLANES] = { nullptr, };
  CUgraphicsResource cuda_res[GST_VIDEO_MAX_PLANES] = { nullptr, };
  CUdeviceptr src_devptr[GST_VIDEO_MAX_PLANES] = { 0, };
  CUstream stream = gst_cuda_stream_get_handle (priv->stream);
  CUresult ret;
  gint comp[GST_VIDEO_MAX_COMPONENTS];
  auto cur_task = priv->cur_task;
  /* Source strides come from the video meta when the buffer carries one */
  GstVideoMeta *meta = gst_buffer_get_video_meta (data->buffer);

  if (!gst_cuda_context_push (priv->ctx)) {
    GST_ERROR_OBJECT (self, "Couldn't push context");
    return;
  }

  /* First pass: obtain a CUDA device pointer for each plane */
  for (guint i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) {
    GstMemory *mem = gst_buffer_peek_memory (data->buffer, i);
    GstGLMemoryPBO *pbo = (GstGLMemoryPBO *) mem;
    gsize src_size;

    if (!gst_is_gl_memory_pbo (mem)) {
      GST_ERROR_OBJECT (self, "Not a GL PBO memory");
      goto out;
    }

    gst_res[i] = ensure_gl_cuda_resource (self, mem);
    if (!gst_res[i]) {
      GST_ERROR_OBJECT (self, "Couldn't get resource %d", i);
      goto out;
    }

    gst_gl_memory_pbo_upload_transfer (pbo);
    gst_gl_memory_pbo_download_transfer (pbo);

    cuda_res[i] = gst_cuda_graphics_resource_map (gst_res[i], stream,
        CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY);
    if (!cuda_res[i]) {
      GST_ERROR_OBJECT (self, "Couldn't map resource");
      goto out;
    }

    ret = CuGraphicsResourceGetMappedPointer (&src_devptr[i],
        &src_size, cuda_res[i]);
    if (!gst_cuda_result (ret)) {
      GST_ERROR_OBJECT (self, "Couldn't get mapped device pointer");
      goto out;
    }
  }

  /* Second pass: device to device copy of each plane */
  for (guint i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) {
    guint8 *sp = (guint8 *) src_devptr[i];
    guint8 *dp = cur_task->device_uncompressed + info->offset[i];
    guint ss, ds;
    guint w, h;

    if (GST_VIDEO_FORMAT_INFO_HAS_PALETTE (finfo) && i == 1) {
      /* Plane 1 of a paletted format is the palette: 256 entries of
       * 4 bytes each. Nothing is processed after it. */
      ret = CuMemcpyDtoDAsync ((CUdeviceptr) dp, (CUdeviceptr) sp,
          256 * 4, stream);
      if (!gst_cuda_result (ret)) {
        GST_ERROR_OBJECT (self, "CUDA memcpy failed");
        goto out;
      }

      data->ret = TRUE;
      goto out;
    }

    if (meta)
      ss = meta->stride[i];
    else
      ss = GST_VIDEO_INFO_PLANE_STRIDE (info, i);

    ds = GST_VIDEO_INFO_PLANE_STRIDE (info, i);

    gst_video_format_info_component (finfo, i, comp);
    w = GST_VIDEO_INFO_COMP_WIDTH (info, comp[0]) *
        GST_VIDEO_INFO_COMP_PSTRIDE (info, comp[0]);
    if (w == 0)
      w = MIN (ss, ds);

    h = GST_VIDEO_INFO_COMP_HEIGHT (info, comp[0]);

    if (GST_VIDEO_FORMAT_INFO_IS_TILED (finfo)) {
      gint tile_size;
      gint sx_tiles, sy_tiles, dx_tiles, dy_tiles;
      GstVideoTileMode mode;

      tile_size = GST_VIDEO_FORMAT_INFO_TILE_SIZE (info->finfo, i);
      mode = GST_VIDEO_FORMAT_INFO_TILE_MODE (info->finfo);

      sx_tiles = GST_VIDEO_TILE_X_TILES (ss);
      sy_tiles = GST_VIDEO_TILE_Y_TILES (ss);
      dx_tiles = GST_VIDEO_TILE_X_TILES (ds);
      dy_tiles = GST_VIDEO_TILE_Y_TILES (ds);

      /* For tiled formats w and h are the number of tiles to copy, not a
       * size in bytes */
      w = MIN (sx_tiles, dx_tiles);
      h = MIN (sy_tiles, dy_tiles);

      for (guint j = 0; j < h; j++) {
        for (guint k = 0; k < w; k++) {
          guint si, di;
          guint8 *cur_dp;
          guint8 *cur_sp;

          si = gst_video_tile_get_index (mode, k, j, sx_tiles, sy_tiles);
          di = gst_video_tile_get_index (mode, k, j, dx_tiles, dy_tiles);

          cur_dp = dp + (di * tile_size);
          cur_sp = sp + (si * tile_size);

          /* Each iteration moves one whole tile, so the copy length is the
           * tile size in bytes (not the tile count held in w) */
          ret = CuMemcpyDtoDAsync ((CUdeviceptr) cur_dp, (CUdeviceptr) cur_sp,
              tile_size, stream);
          if (!gst_cuda_result (ret)) {
            GST_ERROR_OBJECT (self, "CUDA memcpy failed");
            goto out;
          }
        }
      }
    } else {
      CUDA_MEMCPY2D params = { };

      params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
      params.srcDevice = (CUdeviceptr) sp;
      params.srcPitch = ss;
      params.dstMemoryType = CU_MEMORYTYPE_DEVICE;
      params.dstDevice = (CUdeviceptr) dp;
      params.dstPitch = ds;
      params.WidthInBytes = w;
      params.Height = h;

      ret = CuMemcpy2DAsync (&params, stream);
      if (!gst_cuda_result (ret)) {
        GST_ERROR_OBJECT (self, "CUDA memcpy failed");
        goto out;
      }
    }
  }

  data->ret = TRUE;

out:
  /* gst_res[] is filled in plane order, so the first empty slot marks the
   * end. Bounded by the plane count, which never exceeds the array size. */
  for (guint i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) {
    if (!gst_res[i])
      break;

    gst_cuda_graphics_resource_unmap (gst_res[i], stream);
  }

  /* The copies above are asynchronous; report failure if they did not
   * complete so that the caller does not consume an incomplete upload */
  ret = CuStreamSynchronize (stream);
  if (!gst_cuda_result (ret)) {
    GST_ERROR_OBJECT (self, "Couldn't synchronize stream");
    data->ret = FALSE;
  }

  gst_cuda_context_pop (nullptr);
}
#endif
/* Output thread: takes finished tasks from the output queue, waits for the
 * CUDA event recorded on them, copies the compressed data (preceded by a
 * custom header in batched mode) into an output buffer and finishes the
 * oldest pending frame. The task is handed back to the input queue
 * afterwards. An empty task pointer terminates the loop.
 *
 * The result of each iteration is stored in priv->last_flow. */
static gpointer
gst_nv_comp_video_enc_thread_func (GstNvCompVideoEnc * self)
{
  auto encoder = GST_VIDEO_ENCODER (self);
  auto priv = self->priv;

  /* Makes a consumed task available to handle_frame() again */
  auto recycle_task = [priv] (const std::shared_ptr < EncoderTask > &done) {
    std::lock_guard < std::mutex > lk (priv->input_lock);
    priv->input_task_queue.push (done);
    priv->input_cond.notify_all ();
  };

  GST_DEBUG_OBJECT (self, "Entering loop");

  while (1) {
    std::shared_ptr < EncoderTask > task;

    {
      std::unique_lock < std::mutex > lk (priv->output_lock);
      while (priv->output_task_queue.empty ())
        priv->output_cond.wait (lk);

      task = priv->output_task_queue.front ();
      priv->output_task_queue.pop ();
    }

    if (!task) {
      GST_DEBUG_OBJECT (self, "Got empty task, terminate");
      break;
    }

    auto frame = gst_video_encoder_get_oldest_frame (encoder);

    /* Wait until all work queued for this task has completed */
    gst_cuda_context_push (priv->ctx);
    CuEventSynchronize (task->event);
    gst_cuda_context_pop (nullptr);

    if (!frame) {
      /* Nothing to attach the output to; just give the task back */
      GST_WARNING_OBJECT (self, "No pending frame for finished task");
      recycle_task (task);
      continue;
    }

    /* Stays GST_FLOW_OK unless no output buffer could be prepared */
    GstFlowReturn buffer_flow = GST_FLOW_OK;
    GstMapInfo map_info;

    auto acquire_ret = gst_buffer_pool_acquire_buffer (priv->pool,
        &frame->output_buffer, nullptr);
    if (acquire_ret != GST_FLOW_OK) {
      GST_WARNING_OBJECT (self, "Couldn't acquire output buffer, ret %s",
          gst_flow_get_name (acquire_ret));
      gst_clear_buffer (&frame->output_buffer);
      buffer_flow = acquire_ret;
    } else if (!gst_buffer_map (frame->output_buffer, &map_info,
            GST_MAP_WRITE)) {
      GST_ERROR_OBJECT (self, "Couldn't map output buffer");
      gst_clear_buffer (&frame->output_buffer);
      buffer_flow = GST_FLOW_ERROR;
    } else {
      if (task->batched) {
        task->compressed_size = 0;
        auto dst = (uint8_t *) map_info.data;

        /* Write custom header: version, uncompressed chunk size, max
         * compressed chunk size and batch size, followed by one
         * (uncompressed size, compressed size) pair per chunk. All fields
         * are 32 bits little endian. */
        GST_WRITE_UINT32_LE (dst, GST_NV_COMP_HEADER_VERSION);
        dst += sizeof (guint32);
        task->compressed_size += sizeof (guint32);

        GST_WRITE_UINT32_LE (dst, task->chunk_size);
        dst += sizeof (guint32);
        task->compressed_size += sizeof (guint32);

        GST_WRITE_UINT32_LE (dst, task->max_output_chunk_size);
        dst += sizeof (guint32);
        task->compressed_size += sizeof (guint32);

        GST_WRITE_UINT32_LE (dst, task->batch_size);
        dst += sizeof (guint32);
        task->compressed_size += sizeof (guint32);

        for (size_t i = 0; i < task->batch_size; i++) {
          GST_WRITE_UINT32_LE (dst, task->host_uncompressed_bytes[i]);
          dst += sizeof (guint32);
          task->compressed_size += sizeof (guint32);

          GST_WRITE_UINT32_LE (dst, task->host_compressed_bytes[i]);
          dst += sizeof (guint32);
          task->compressed_size += sizeof (guint32);
        }

        /* Write compressed data. Chunks are spaced max_output_chunk_size
         * apart in the staging buffer and get packed tightly here. */
        for (size_t i = 0; i < task->batch_size; i++) {
          auto size = task->host_compressed_bytes[i];
          auto src = task->host_compressed + (i * task->max_output_chunk_size);

          memcpy (dst, src, size);
          dst += size;
          task->compressed_size += size;
        }
      } else {
        memcpy (map_info.data, task->host_compressed, task->compressed_size);
      }

      gst_buffer_unmap (frame->output_buffer, &map_info);

      if (task->compressed_size > 0) {
        gst_buffer_set_size (frame->output_buffer, task->compressed_size);
        frame->dts = frame->pts;

        auto ratio = (double) priv->state->info.size / task->compressed_size;
        GST_LOG_OBJECT (self, "compressed buffer size %" G_GSIZE_FORMAT
            ", ratio %.2f", task->compressed_size, ratio);
      } else {
        GST_ERROR_OBJECT (self, "Zero compressed size");
        gst_clear_buffer (&frame->output_buffer);
      }
    }

    recycle_task (task);

    /* The frame is finished without an output buffer when none could be
     * produced */
    auto finish_ret = gst_video_encoder_finish_frame (encoder, frame);
    priv->last_flow = (buffer_flow != GST_FLOW_OK) ? buffer_flow : finish_ret;
  };

  GST_DEBUG_OBJECT (self, "Leaving loop");

  return nullptr;
}
/* GstVideoEncoder::handle_frame implementation.
 *
 * Takes a free task from the input queue (waiting for one with the stream
 * lock released), uploads the input frame into it, schedules compression
 * and the download of the result on the CUDA stream, records the task's
 * event and hands the task to the output thread.
 *
 * Returns: priv->last_flow when the frame was queued, or GST_FLOW_ERROR
 * after finishing @frame without output when any step failed. */
static GstFlowReturn
gst_nv_comp_video_enc_handle_frame (GstVideoEncoder * encoder,
    GstVideoCodecFrame * frame)
{
  auto self = GST_NV_COMP_VIDEO_ENC (encoder);
  auto priv = self->priv;
  GstMemory *mem;
  CUstream stream = nullptr;
  CUresult cuda_ret = CUDA_SUCCESS;
  GstVideoFrame vframe;
  auto info = &priv->state->info;
  size_t compressed_size = 0;
  gboolean need_copy = TRUE;
  std::shared_ptr < EncoderTask > task;

  /* Gives a task that will not be processed back to the input queue.
   * Declared ahead of the first "goto error" so that no jump bypasses its
   * initialization. */
  auto recycle_task = [priv] (std::shared_ptr < EncoderTask > &unused) {
    std::lock_guard < std::mutex > lk (priv->input_lock);
    priv->input_task_queue.push (std::move (unused));
  };

  if (!priv->ctx || (!priv->manager && !priv->batched_comp)) {
    GST_ERROR_OBJECT (self, "Context was not configured");
    goto error;
  }

  if (priv->last_flow != GST_FLOW_OK) {
    GST_INFO_OBJECT (self, "Last flow was %s",
        gst_flow_get_name (priv->last_flow));
    gst_video_encoder_finish_frame (encoder, frame);

    return priv->last_flow;
  }

  if (!priv->encode_thread) {
    priv->encode_thread = g_thread_new ("nvcompvideoenc",
        (GThreadFunc) gst_nv_comp_video_enc_thread_func, self);
  }

  /* Release the stream lock while blocking for a free task */
  GST_VIDEO_ENCODER_STREAM_UNLOCK (encoder);
  {
    std::unique_lock < std::mutex > lk (priv->input_lock);
    while (priv->input_task_queue.empty ())
      priv->input_cond.wait (lk);

    priv->cur_task = priv->input_task_queue.front ();
    priv->input_task_queue.pop ();
  }
  GST_VIDEO_ENCODER_STREAM_LOCK (encoder);

  mem = gst_buffer_peek_memory (frame->input_buffer, 0);

#ifdef HAVE_GST_GL
  if (priv->gl_interop && gst_is_gl_memory (mem) &&
      gst_buffer_n_memory (frame->input_buffer) ==
      GST_VIDEO_INFO_N_PLANES (info)) {
    GLInteropData interop_data;

    interop_data.self = self;
    interop_data.buffer = frame->input_buffer;
    interop_data.ret = FALSE;

    auto gl_mem = (GstGLMemory *) mem;
    gst_gl_context_thread_add (gl_mem->mem.context,
        (GstGLContextThreadFunc) gst_nv_comp_video_enc_upload_gl,
        &interop_data);

    if (interop_data.ret) {
      need_copy = FALSE;
      GST_TRACE_OBJECT (self, "GL -> CUDA copy done");
    } else {
      /* Do not try the GL path again; use the regular copy from now on */
      priv->gl_interop = FALSE;
    }
  }
#endif

  if (!gst_cuda_context_push (priv->ctx)) {
    GST_ERROR_OBJECT (self, "Couldn't push context");
    recycle_task (priv->cur_task);
    goto error;
  }

  stream = gst_cuda_stream_get_handle (priv->stream);

  if (need_copy) {
    gboolean device_copy = FALSE;

    if (gst_is_cuda_memory (mem)) {
      GstCudaMemory *cmem = GST_CUDA_MEMORY_CAST (mem);

      /* Device to device copy is only used for memory of our own context */
      if (cmem->context == priv->ctx) {
        device_copy = TRUE;

        if (!gst_video_frame_map (&vframe, info, frame->input_buffer,
                (GstMapFlags) (GST_MAP_READ | GST_MAP_CUDA))) {
          GST_ERROR_OBJECT (self, "Couldn't map cuda memory");
          gst_cuda_context_pop (nullptr);
          recycle_task (priv->cur_task);
          goto error;
        }

        if (gst_cuda_memory_get_stream (cmem) != priv->stream) {
          GST_DEBUG_OBJECT (self, "Different stream, need sync");
          gst_cuda_memory_sync (cmem);
        }
      }
    }

    if (!device_copy && !gst_video_frame_map (&vframe,
            info, frame->input_buffer, GST_MAP_READ)) {
      GST_ERROR_OBJECT (self, "Couldn't map input frame");
      gst_cuda_context_pop (nullptr);
      recycle_task (priv->cur_task);
      goto error;
    }

    if (!gst_nv_comp_video_enc_upload (self, &vframe, stream, device_copy)) {
      gst_video_frame_unmap (&vframe);
      gst_cuda_context_pop (nullptr);
      recycle_task (priv->cur_task);
      goto error;
    }

    gst_video_frame_unmap (&vframe);

    if (!device_copy) {
      /* The frame was packed into the host staging buffer; move it to the
       * device */
      cuda_ret =
          CuMemcpyHtoDAsync ((CUdeviceptr) priv->cur_task->device_uncompressed,
          priv->cur_task->host_uncompressed, info->size, stream);
      if (!gst_cuda_result (cuda_ret)) {
        GST_ERROR_OBJECT (self, "Couldn't upload input frame");
        gst_cuda_context_pop (nullptr);
        recycle_task (priv->cur_task);
        goto error;
      }
    }
  }

  task = std::move (priv->cur_task);

  if (task->batched) {
    g_assert (priv->batched_comp);

    auto status = priv->batched_comp->compress (task->device_uncompressed_ptrs,
        task->device_uncompressed_bytes, task->chunk_size, task->batch_size,
        task->temp_ptr, task->temp_size, task->device_compressed_ptrs,
        task->device_compressed_bytes, (cudaStream_t) stream);
    if (status != nvcompSuccess) {
      GST_ERROR_OBJECT (self, "Compression failed, ret %d", status);
      gst_cuda_context_pop (nullptr);
      recycle_task (task);
      goto error;
    }

    /* Per chunk compressed sizes first, then the whole compressed area */
    cuda_ret = CuMemcpyDtoHAsync (task->host_compressed_bytes,
        (CUdeviceptr) task->device_compressed_bytes,
        sizeof (size_t) * task->batch_size, stream);
    if (gst_cuda_result (cuda_ret)) {
      cuda_ret = CuMemcpyDtoHAsync (task->host_compressed,
          (CUdeviceptr) task->device_compressed,
          task->compressed_alloc_size, stream);
    }
  } else {
    g_assert (priv->manager);

    priv->manager->compress (task->device_uncompressed,
        task->device_compressed, *priv->config);
    compressed_size =
        priv->manager->get_compressed_output_size (task->device_compressed);
    task->compressed_size = compressed_size;

    cuda_ret = CuMemcpyDtoHAsync (task->host_compressed,
        (CUdeviceptr) task->device_compressed, compressed_size, stream);
  }

  if (!gst_cuda_result (cuda_ret)) {
    GST_ERROR_OBJECT (self, "Couldn't download compressed data");
    gst_cuda_context_pop (nullptr);
    recycle_task (task);
    goto error;
  }

  /* The output thread waits on this event before reading the host buffers */
  cuda_ret = CuEventRecord (task->event, stream);
  if (!gst_cuda_result (cuda_ret)) {
    GST_ERROR_OBJECT (self, "Couldn't record event");
    gst_cuda_context_pop (nullptr);
    recycle_task (task);
    goto error;
  }

  gst_cuda_context_pop (nullptr);

  {
    std::lock_guard < std::mutex > lk (priv->output_lock);
    priv->output_task_queue.push (std::move (task));
    priv->output_cond.notify_one ();
  }

  /* The output thread takes its own reference via the oldest frame lookup */
  gst_video_codec_frame_unref (frame);

  return priv->last_flow;

error:
  gst_video_encoder_finish_frame (encoder, frame);

  return GST_FLOW_ERROR;
}