/* GStreamer
 * Copyright (C) 2024 Seungha Yang <seungha@centricular.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "gstnvcompvideoenc.h"

#ifdef HAVE_GST_GL
#include <gst/gl/gl.h>
#include <gst/gl/gstglfuncs.h>
#endif

#include <nvcomp/lz4.hpp>
#include <nvcomp/snappy.hpp>
#include <nvcomp/gdeflate.hpp>
#include <nvcomp/deflate.hpp>
#include <nvcomp/zstd.hpp>
#include <nvcomp/cascaded.hpp>
#include <nvcomp/bitcomp.hpp>
#include <nvcomp/ans.hpp>
#include <mutex>
#include <condition_variable>
#include <queue>
#include <memory>
#include <string>
#include <atomic>
#include <string.h>

GST_DEBUG_CATEGORY_STATIC (gst_nv_comp_video_enc_debug);
#define GST_CAT_DEFAULT gst_nv_comp_video_enc_debug

#ifdef HAVE_GST_GL
#define SINK_CAPS \
    GST_VIDEO_CAPS_MAKE_WITH_FEATURES (GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY, \
        GST_VIDEO_FORMATS_ALL) ";" \
    GST_VIDEO_CAPS_MAKE_WITH_FEATURES (GST_CAPS_FEATURE_MEMORY_GL_MEMORY, \
        GST_VIDEO_FORMATS_ALL) ";" \
    GST_VIDEO_CAPS_MAKE (GST_VIDEO_FORMATS_ALL)
#else
#define SINK_CAPS \
    GST_VIDEO_CAPS_MAKE_WITH_FEATURES (GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY, \
        GST_VIDEO_FORMATS_ALL) ";" \
    GST_VIDEO_CAPS_MAKE (GST_VIDEO_FORMATS_ALL)
#endif

static GstStaticPadTemplate sink_template = GST_STATIC_PAD_TEMPLATE ("sink",
    GST_PAD_SINK, GST_PAD_ALWAYS, GST_STATIC_CAPS (SINK_CAPS));

static GstStaticPadTemplate src_template = GST_STATIC_PAD_TEMPLATE ("src",
    GST_PAD_SRC, GST_PAD_ALWAYS,
    GST_STATIC_CAPS ("video/x-nvcomp; video/x-nvcomp-lz4; "
        "video/x-nvcomp-snappy; video/x-nvcomp-gdeflate; "
        "video/x-nvcomp-deflate; video/x-nvcomp-zstd; video/x-nvcomp-cascaded; "
        "video/x-nvcomp-bitcomp; video/x-nvcomp-ans"));

enum GstNvCompDataType
{
  GST_NV_COMP_DATA_TYPE_DEFAULT = -1,
  GST_NV_COMP_DATA_TYPE_CHAR = NVCOMP_TYPE_CHAR,
  GST_NV_COMP_DATA_TYPE_UCHAR = NVCOMP_TYPE_UCHAR,
  GST_NV_COMP_DATA_TYPE_SHORT = NVCOMP_TYPE_SHORT,
  GST_NV_COMP_DATA_TYPE_USHORT = NVCOMP_TYPE_USHORT,
  GST_NV_COMP_DATA_TYPE_INT = NVCOMP_TYPE_INT,
  GST_NV_COMP_DATA_TYPE_UINT = NVCOMP_TYPE_UINT,
  GST_NV_COMP_DATA_TYPE_LONGLONG = NVCOMP_TYPE_LONGLONG,
  GST_NV_COMP_DATA_TYPE_ULONGLONG = NVCOMP_TYPE_ULONGLONG,
  GST_NV_COMP_DATA_TYPE_BITS = NVCOMP_TYPE_BITS,
};

#define GST_TYPE_NV_COMP_DATA_TYPE (gst_nv_comp_data_type_type())
static GType
gst_nv_comp_data_type_type (void)
{
  static GType data_type = 0;
  static std::once_flag once;
  static const GEnumValue types[] = {
    {GST_NV_COMP_DATA_TYPE_DEFAULT, "Default", "default"},
    {GST_NV_COMP_DATA_TYPE_CHAR, "CHAR", "char"},
    {GST_NV_COMP_DATA_TYPE_UCHAR, "UCHAR", "uchar"},
    {GST_NV_COMP_DATA_TYPE_SHORT, "SHORT", "short"},
    {GST_NV_COMP_DATA_TYPE_USHORT, "USHORT", "ushort"},
    {GST_NV_COMP_DATA_TYPE_INT, "INT", "int"},
    {GST_NV_COMP_DATA_TYPE_UINT, "UINT", "uint"},
    {GST_NV_COMP_DATA_TYPE_LONGLONG, "LONGLONG", "longlong"},
    {GST_NV_COMP_DATA_TYPE_ULONGLONG, "ULONGLONG", "ulonglong"},
    {GST_NV_COMP_DATA_TYPE_BITS, "BITS", "bits"},
    {0, nullptr, nullptr},
  };

  std::call_once (once,[&] {
        data_type = g_enum_register_static ("GstNvCompDataType", types);
      });

  return data_type;
}
enum GstNvCompDeflateAlgo
{
  GST_NV_COMP_DEFLATE_HIGH_THROUGHPUT,
  GST_NV_COMP_DEFLATE_LOW_THROUGHPUT,
  GST_NV_COMP_DEFLATE_HIGHEST_THROUGHPUT,
};

#define GST_TYPE_NV_COMP_DEFLATE_ALGO (gst_nv_comp_deflate_algo_get_type())
static GType
gst_nv_comp_deflate_algo_get_type (void)
{
  static GType algo_type = 0;
  static std::once_flag once;
  static const GEnumValue algo[] = {
    {GST_NV_COMP_DEFLATE_HIGH_THROUGHPUT,
        "High throughput, low compression ratio", "high-throughput"},
    {GST_NV_COMP_DEFLATE_LOW_THROUGHPUT,
        "Low throughput, high compression ratio", "low-throughput"},
    {GST_NV_COMP_DEFLATE_HIGHEST_THROUGHPUT,
        "Highest throughput, entropy-only compression", "highest-throughput"},
    {0, nullptr, nullptr},
  };

  std::call_once (once,[&] {
        algo_type = g_enum_register_static ("GstNvCompDeflateAlgo", algo);
      });

  return algo_type;
}

enum GstNvCompBitcompAlgo
{
  GST_NV_COMP_BITCOMP_DEFAULT,
  GST_NV_COMP_BITCOMP_SPARSE,
};

#define GST_TYPE_NV_COMP_BITCOMP_ALGO (gst_nv_comp_bitcomp_algo_get_type())
static GType
gst_nv_comp_bitcomp_algo_get_type (void)
{
  static GType algo_type = 0;
  static std::once_flag once;
  static const GEnumValue algo[] = {
    {GST_NV_COMP_BITCOMP_DEFAULT, "Default", "default"},
    {GST_NV_COMP_BITCOMP_SPARSE, "Sparse", "sparse"},
    {0, nullptr, nullptr},
  };

  std::call_once (once,[&] {
        algo_type = g_enum_register_static ("GstNvCompBitcompAlgo", algo);
      });

  return algo_type;
}

enum
{
  PROP_0,
  PROP_METHOD,
  PROP_DEFLATE_ALGO,
  PROP_BITCOMP_ALGO,
  PROP_DATA_TYPE,
  PROP_CHUNK_SIZE,
  PROP_ASYNC_DEPTH,
  PROP_BATCHED,
};

#define DEFAULT_METHOD GST_NV_COMP_BITCOMP
#define DEFAULT_DEFLATE_ALGO GST_NV_COMP_DEFLATE_HIGH_THROUGHPUT
#define DEFAULT_BITCOMP_ALGO GST_NV_COMP_BITCOMP_SPARSE
#define DEFAULT_DATA_TYPE GST_NV_COMP_DATA_TYPE_DEFAULT
#define DEFAULT_CHUNK_SIZE 0
#define DEFAULT_BATCHED TRUE
#define DEFAULT_ASYNC_DEPTH 2

/* *INDENT-OFF* */
using namespace nvcomp;

struct EncoderTask
{
  ~EncoderTask ()
  {
    if (ctx) {
      gst_cuda_context_push (ctx);
      if (event)
        CuEventDestroy (event);
      if (device_uncompressed)
        CuMemFree ((CUdeviceptr) device_uncompressed);
      if (host_uncompressed)
        CuMemFreeHost (host_uncompressed);
      if (device_compressed)
        CuMemFree ((CUdeviceptr) device_compressed);
      if (host_compressed)
        CuMemFreeHost (host_compressed);
      if (device_uncompressed_bytes)
        CuMemFree ((CUdeviceptr) device_uncompressed_bytes);
      if (device_uncompressed_ptrs)
        CuMemFree ((CUdeviceptr) device_uncompressed_ptrs);
      if (device_compressed_bytes)
        CuMemFree ((CUdeviceptr) device_compressed_bytes);
      if (host_uncompressed_bytes)
        CuMemFreeHost (host_uncompressed_bytes);
      if (host_uncompressed_ptrs)
        CuMemFreeHost (host_uncompressed_ptrs);
      if (device_compressed_ptrs)
        CuMemFree ((CUdeviceptr) device_compressed_ptrs);
      if (host_compressed_bytes)
        CuMemFreeHost (host_compressed_bytes);
      if (host_compressed_ptrs)
        CuMemFreeHost (host_compressed_ptrs);
      if (temp_ptr)
        CuMemFree ((CUdeviceptr) temp_ptr);
      gst_cuda_context_pop (nullptr);
      gst_object_unref (ctx);
    }
  }

  GstCudaContext *ctx = nullptr;
  CUevent event = nullptr;
  uint8_t *device_uncompressed = nullptr;
  uint8_t *host_uncompressed = nullptr;
  uint8_t *device_compressed = nullptr;
  uint8_t *host_compressed = nullptr;
  size_t *device_uncompressed_bytes = nullptr;
  void **device_uncompressed_ptrs = nullptr;
  size_t *host_uncompressed_bytes = nullptr;
  void **host_uncompressed_ptrs = nullptr;
  size_t *device_compressed_bytes = nullptr;
  void **device_compressed_ptrs = nullptr;
  size_t *host_compressed_bytes = nullptr;
  void **host_compressed_ptrs = nullptr;
  void *temp_ptr = nullptr;
  size_t temp_size = 0;
  size_t compressed_size = 0;
  gboolean batched;
  size_t batch_size;
  size_t chunk_size;
  size_t max_output_chunk_size;
  size_t compressed_alloc_size;
};
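/* Type-erased front-end over nvCOMP's batched (low-level) C API. Each
 * compression method exposes three entry points taking a method-specific
 * option struct; BatchedComp below binds a concrete triple behind this
 * common interface so the element can drive any method uniformly. */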
struct BatchedCompBase
{
  virtual nvcompStatus_t get_temp_size (size_t batch_size,
      size_t max_uncompressed_chunk_bytes, size_t * temp_bytes) = 0;
  virtual nvcompStatus_t get_max_compressed_chunk_size (
      size_t max_uncompressed_chunk_bytes, size_t * max_compressed_bytes) = 0;
  virtual nvcompStatus_t compress (void **device_uncompressed_ptrs,
      size_t *device_uncompressed_bytes, size_t max_uncompressed_chunk_bytes,
      size_t batch_size, void *device_temp_ptr, size_t temp_bytes,
      void **device_compressed_ptrs, size_t *device_compressed_bytes,
      cudaStream_t stream) = 0;
};

/* FormatOptT is the per-method option struct. T, O and C are assumed to be
 * the method's CompressGetTempSize, CompressGetMaxOutputChunkSize and
 * CompressAsync entry points, matching the instantiations in set_format() */
template <typename FormatOptT, auto T, auto O, auto C>
class BatchedComp : public BatchedCompBase
{
public:
  BatchedComp (const FormatOptT & opt) : opts_(opt) {}

  nvcompStatus_t get_temp_size (size_t batch_size,
      size_t max_uncompressed_chunk_bytes, size_t * temp_bytes)
  {
    return T (batch_size, max_uncompressed_chunk_bytes, opts_, temp_bytes);
  }

  nvcompStatus_t get_max_compressed_chunk_size (
      size_t max_uncompressed_chunk_bytes, size_t * max_compressed_bytes)
  {
    return O (max_uncompressed_chunk_bytes, opts_, max_compressed_bytes);
  }

  nvcompStatus_t compress (void **device_uncompressed_ptrs,
      size_t *device_uncompressed_bytes, size_t max_uncompressed_chunk_bytes,
      size_t batch_size, void *device_temp_ptr, size_t temp_bytes,
      void **device_compressed_ptrs, size_t *device_compressed_bytes,
      cudaStream_t stream)
  {
    return C (device_uncompressed_ptrs, device_uncompressed_bytes,
        max_uncompressed_chunk_bytes, batch_size, device_temp_ptr, temp_bytes,
        device_compressed_ptrs, device_compressed_bytes, opts_, stream);
  }

private:
  FormatOptT opts_;
};

struct GstNvCompVideoEncPrivate
{
  GstCudaContext *ctx = nullptr;
  GstCudaStream *stream = nullptr;

#ifdef HAVE_GST_GL
  GstGLDisplay *gl_display = nullptr;
  GstGLContext *gl_context = nullptr;
  GstGLContext *other_gl_context = nullptr;
#endif

  GstBufferPool *pool = nullptr;

  GstVideoCodecState *state = nullptr;

  std::shared_ptr<nvcompManagerBase> manager;
  std::shared_ptr<CompressionConfig> config;
  std::shared_ptr<BatchedCompBase> batched_comp;

  gboolean gl_interop = FALSE;

  std::mutex lock;
  std::mutex input_lock;
  std::condition_variable input_cond;
  std::mutex output_lock;
  std::condition_variable output_cond;
  std::queue<std::shared_ptr<EncoderTask>> input_task_queue;
  std::queue<std::shared_ptr<EncoderTask>> output_task_queue;
  std::shared_ptr<EncoderTask> cur_task;
  GThread *encode_thread = nullptr;
  std::atomic<GstFlowReturn> last_flow = { GST_FLOW_OK };

  GstNvCompMethod method = DEFAULT_METHOD;
  GstNvCompDeflateAlgo deflate_algo = DEFAULT_DEFLATE_ALGO;
  GstNvCompBitcompAlgo bitcomp_algo = DEFAULT_BITCOMP_ALGO;
  GstNvCompDataType data_type = DEFAULT_DATA_TYPE;
  guint chunk_size = DEFAULT_CHUNK_SIZE;
  gboolean batched = DEFAULT_BATCHED;
  guint async_depth = DEFAULT_ASYNC_DEPTH;
};
/* *INDENT-ON* */

struct _GstNvCompVideoEnc
{
  GstVideoEncoder parent;

  GstNvCompVideoEncPrivate *priv;
};

static void gst_nv_comp_video_enc_finalize (GObject * object);
static void gst_nv_comp_video_enc_set_property (GObject * object,
    guint prop_id, const GValue * value, GParamSpec * pspec);
static void gst_nv_comp_video_enc_get_property (GObject * object,
    guint prop_id, GValue * value, GParamSpec * pspec);
static void gst_nv_comp_video_enc_set_context (GstElement * element,
    GstContext * context);
static gboolean gst_nv_comp_video_enc_open (GstVideoEncoder * encoder);
static gboolean gst_nv_comp_video_enc_close (GstVideoEncoder * encoder);
static gboolean gst_nv_comp_video_enc_stop (GstVideoEncoder * encoder);
static gboolean gst_nv_comp_video_enc_flush (GstVideoEncoder * encoder);
static GstFlowReturn gst_nv_comp_video_enc_finish (GstVideoEncoder * encoder);
static gboolean gst_nv_comp_video_enc_sink_query (GstVideoEncoder * encoder,
    GstQuery * query);
static gboolean gst_nv_comp_video_enc_src_query (GstVideoEncoder * encoder,
    GstQuery * query);
static gboolean gst_nv_comp_video_enc_propose_allocation (GstVideoEncoder *
    encoder, GstQuery * query);
static gboolean gst_nv_comp_video_enc_set_format (GstVideoEncoder * encoder,
    GstVideoCodecState * state);
static GstFlowReturn gst_nv_comp_video_enc_handle_frame (GstVideoEncoder *
    encoder, GstVideoCodecFrame * frame);

#define gst_nv_comp_video_enc_parent_class parent_class
G_DEFINE_TYPE (GstNvCompVideoEnc, gst_nv_comp_video_enc,
    GST_TYPE_VIDEO_ENCODER);

static void
gst_nv_comp_video_enc_class_init (GstNvCompVideoEncClass * klass)
{
  auto object_class = G_OBJECT_CLASS (klass);
  auto element_class = GST_ELEMENT_CLASS (klass);
  auto encoder_class = GST_VIDEO_ENCODER_CLASS (klass);

  object_class->finalize = gst_nv_comp_video_enc_finalize;
  object_class->set_property = gst_nv_comp_video_enc_set_property;
  object_class->get_property = gst_nv_comp_video_enc_get_property;

  g_object_class_install_property (object_class, PROP_METHOD,
      g_param_spec_enum ("method", "Method", "Compression method",
          GST_TYPE_NV_COMP_METHOD, DEFAULT_METHOD,
          (GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)));
  g_object_class_install_property (object_class, PROP_DEFLATE_ALGO,
      g_param_spec_enum ("deflate-algo", "Deflate Algo",
          "Algorithm to use for deflate and gdeflate methods",
          GST_TYPE_NV_COMP_DEFLATE_ALGO, DEFAULT_DEFLATE_ALGO,
          (GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)));
  g_object_class_install_property (object_class, PROP_BITCOMP_ALGO,
      g_param_spec_enum ("bitcomp-algo", "Bitcomp Algo",
          "Algorithm to use for bitcomp method",
          GST_TYPE_NV_COMP_BITCOMP_ALGO, DEFAULT_BITCOMP_ALGO,
          (GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)));
  g_object_class_install_property (object_class, PROP_DATA_TYPE,
      g_param_spec_enum ("data-type", "Data Type", "Compression data type",
          GST_TYPE_NV_COMP_DATA_TYPE, DEFAULT_DATA_TYPE,
          (GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)));
  g_object_class_install_property (object_class, PROP_CHUNK_SIZE,
      g_param_spec_uint ("chunk-size", "Chunk Size",
          "Uncompressed chunk size for batched compression (0 = default)",
          0, G_MAXINT32, DEFAULT_CHUNK_SIZE,
          (GParamFlags) (G_PARAM_READWRITE | GST_PARAM_MUTABLE_READY |
              G_PARAM_STATIC_STRINGS)));
  g_object_class_install_property (object_class, PROP_BATCHED,
      g_param_spec_boolean ("batched", "Batched",
          "Use low-level C API for batched operation", DEFAULT_BATCHED,
          (GParamFlags) (G_PARAM_READWRITE | GST_PARAM_MUTABLE_READY |
              G_PARAM_STATIC_STRINGS)));
  g_object_class_install_property (object_class, PROP_ASYNC_DEPTH,
      g_param_spec_uint ("async-depth", "Async Depth",
          "Internal resource pool size for threaded encoding",
          1, 4, DEFAULT_ASYNC_DEPTH,
          (GParamFlags) (G_PARAM_READWRITE | GST_PARAM_MUTABLE_READY |
              G_PARAM_STATIC_STRINGS)));

  gst_element_class_add_static_pad_template (element_class, &sink_template);
  gst_element_class_add_static_pad_template (element_class, &src_template);

  gst_element_class_set_static_metadata (element_class,
      "nvCOMP Video Encoder", "Encoder/Video/Hardware",
      "Lossless video compression element based on nvCOMP library",
      "Seungha Yang <seungha@centricular.com>");

  element_class->set_context =
      GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_set_context);

  encoder_class->open = GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_open);
  encoder_class->close = GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_close);
  encoder_class->stop = GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_stop);
  encoder_class->flush = GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_flush);
  encoder_class->finish = GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_finish);
  encoder_class->sink_query =
      GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_sink_query);
  encoder_class->src_query =
      GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_src_query);
  encoder_class->propose_allocation =
      GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_propose_allocation);
  encoder_class->set_format =
      GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_set_format);
  encoder_class->handle_frame =
      GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_handle_frame);

  GST_DEBUG_CATEGORY_INIT (gst_nv_comp_video_enc_debug, "nvcompvideoenc", 0,
      "nvcompvideoenc");
}

static void
gst_nv_comp_video_enc_init (GstNvCompVideoEnc * self)
{
  self->priv = new GstNvCompVideoEncPrivate ();
}

static void
gst_nv_comp_video_enc_finalize (GObject * object)
{
  auto self = GST_NV_COMP_VIDEO_ENC (object);

  delete self->priv;

  G_OBJECT_CLASS (parent_class)->finalize (object);
}

static void
gst_nv_comp_video_enc_set_property (GObject * object, guint prop_id,
    const GValue * value, GParamSpec * pspec)
{
  auto self = GST_NV_COMP_VIDEO_ENC (object);
  auto priv = self->priv;
  std::lock_guard < std::mutex > lk (priv->lock);

  switch (prop_id) {
    case PROP_METHOD:
      priv->method = (GstNvCompMethod) g_value_get_enum (value);
      break;
    case PROP_DEFLATE_ALGO:
      priv->deflate_algo = (GstNvCompDeflateAlgo) g_value_get_enum (value);
      break;
    case PROP_BITCOMP_ALGO:
      priv->bitcomp_algo = (GstNvCompBitcompAlgo) g_value_get_enum (value);
      break;
    case PROP_DATA_TYPE:
      priv->data_type = (GstNvCompDataType) g_value_get_enum (value);
      break;
    case PROP_CHUNK_SIZE:
      priv->chunk_size = g_value_get_uint (value);
      break;
    case PROP_BATCHED:
      priv->batched = g_value_get_boolean (value);
      break;
    case PROP_ASYNC_DEPTH:
      priv->async_depth = g_value_get_uint (value);
      break;
    default:
      G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
      break;
  }
}

static void
gst_nv_comp_video_enc_get_property (GObject * object, guint prop_id,
    GValue * value, GParamSpec * pspec)
{
  auto self = GST_NV_COMP_VIDEO_ENC (object);
  auto priv = self->priv;
  std::lock_guard < std::mutex > lk (priv->lock);

  switch (prop_id) {
    case PROP_METHOD:
      g_value_set_enum (value, priv->method);
      break;
    case PROP_DEFLATE_ALGO:
      g_value_set_enum (value, priv->deflate_algo);
      break;
    case PROP_BITCOMP_ALGO:
      g_value_set_enum (value, priv->bitcomp_algo);
      break;
    case PROP_DATA_TYPE:
      g_value_set_enum (value, priv->data_type);
      break;
    case PROP_CHUNK_SIZE:
      g_value_set_uint (value, priv->chunk_size);
      break;
    case PROP_BATCHED:
      g_value_set_boolean (value, priv->batched);
      break;
    case PROP_ASYNC_DEPTH:
      g_value_set_uint (value, priv->async_depth);
      break;
    default:
      G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
      break;
  }
}

static void
gst_nv_comp_video_enc_set_context (GstElement * element, GstContext * context)
{
  auto self = GST_NV_COMP_VIDEO_ENC (element);
  auto priv = self->priv;

  gst_cuda_handle_set_context (element, context, -1, &priv->ctx);

#ifdef HAVE_GST_GL
  if (gst_gl_handle_set_context (element, context, &priv->gl_display,
          &priv->other_gl_context)) {
    if (priv->gl_display)
      gst_gl_display_filter_gl_api (priv->gl_display, GST_GL_API_OPENGL3);
  }
#endif

  GST_ELEMENT_CLASS (parent_class)->set_context (element, context);
}

static gboolean
gst_nv_comp_video_enc_open (GstVideoEncoder * encoder)
{
  auto self = GST_NV_COMP_VIDEO_ENC (encoder);
  auto priv = self->priv;

  if (!gst_cuda_ensure_element_context (GST_ELEMENT_CAST (encoder), -1,
          &priv->ctx)) {
    GST_ERROR_OBJECT (self, "Couldn't get cuda context");
    return FALSE;
  }

  priv->stream = gst_cuda_stream_new (priv->ctx);

  return TRUE;
}
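/* Teardown helpers: drain() below terminates the encode output thread by
 * pushing a null task into the output queue and joining it; close() then
 * releases the CUDA and GL resources acquired in open()/set_context(). */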
static gboolean
gst_nv_comp_video_enc_close (GstVideoEncoder * encoder)
{
  auto self = GST_NV_COMP_VIDEO_ENC (encoder);
  auto priv = self->priv;

  gst_clear_cuda_stream (&priv->stream);
  gst_clear_object (&priv->ctx);

#ifdef HAVE_GST_GL
  gst_clear_object (&priv->other_gl_context);
  gst_clear_object (&priv->gl_context);
  gst_clear_object (&priv->gl_display);
#endif

  return TRUE;
}

static void
gst_nv_comp_video_enc_drain (GstNvCompVideoEnc * self, gboolean locked)
{
  auto priv = self->priv;

  if (!priv->encode_thread)
    return;

  if (locked)
    GST_VIDEO_ENCODER_STREAM_UNLOCK (self);

  {
    std::lock_guard < std::mutex > lk (priv->output_lock);
    priv->output_task_queue.push (nullptr);
    priv->output_cond.notify_all ();
  }

  g_clear_pointer (&priv->encode_thread, g_thread_join);

  if (locked)
    GST_VIDEO_ENCODER_STREAM_LOCK (self);

  priv->last_flow = GST_FLOW_OK;
}

static gboolean
gst_nv_comp_video_enc_stop (GstVideoEncoder * encoder)
{
  auto self = GST_NV_COMP_VIDEO_ENC (encoder);
  auto priv = self->priv;

  gst_nv_comp_video_enc_drain (self, FALSE);

  if (priv->ctx) {
    gst_cuda_context_push (priv->ctx);
    priv->manager = nullptr;
    priv->cur_task = nullptr;
    priv->input_task_queue = { };
    priv->output_task_queue = { };
    gst_cuda_context_pop (nullptr);
  }

  g_clear_pointer (&priv->state, gst_video_codec_state_unref);

  if (priv->pool) {
    gst_buffer_pool_set_active (priv->pool, FALSE);
    gst_clear_object (&priv->pool);
  }

  return TRUE;
}

static gboolean
gst_nv_comp_video_enc_flush (GstVideoEncoder * encoder)
{
  auto self = GST_NV_COMP_VIDEO_ENC (encoder);

  gst_nv_comp_video_enc_drain (self, TRUE);

  return TRUE;
}

static GstFlowReturn
gst_nv_comp_video_enc_finish (GstVideoEncoder * encoder)
{
  auto self = GST_NV_COMP_VIDEO_ENC (encoder);

  gst_nv_comp_video_enc_drain (self, TRUE);

  return GST_FLOW_OK;
}

static gboolean
gst_nv_comp_video_enc_handle_context_query (GstNvCompVideoEnc * self,
    GstQuery * query)
{
  auto priv = self->priv;

#ifdef HAVE_GST_GL
  {
    GstGLDisplay *display = nullptr;
    GstGLContext *other = nullptr;
    GstGLContext *local = nullptr;

    if (priv->gl_display)
      display = (GstGLDisplay *) gst_object_ref (priv->gl_display);
    if (priv->gl_context)
      local = (GstGLContext *) gst_object_ref (priv->gl_context);
    if (priv->other_gl_context)
      other = (GstGLContext *) gst_object_ref (priv->other_gl_context);

    auto ret = gst_gl_handle_context_query (GST_ELEMENT (self), query,
        display, local, other);
    gst_clear_object (&display);
    gst_clear_object (&other);
    gst_clear_object (&local);

    if (ret)
      return TRUE;
  }
#endif

  if (gst_cuda_handle_context_query (GST_ELEMENT (self), query, priv->ctx))
    return TRUE;

  return FALSE;
}

static gboolean
gst_nv_comp_video_enc_sink_query (GstVideoEncoder * encoder, GstQuery * query)
{
  auto self = GST_NV_COMP_VIDEO_ENC (encoder);

  switch (GST_QUERY_TYPE (query)) {
    case GST_QUERY_CONTEXT:
      if (gst_nv_comp_video_enc_handle_context_query (self, query))
        return TRUE;
      break;
    default:
      break;
  }

  return GST_VIDEO_ENCODER_CLASS (parent_class)->sink_query (encoder, query);
}

static gboolean
gst_nv_comp_video_enc_src_query (GstVideoEncoder * encoder, GstQuery * query)
{
  auto self = GST_NV_COMP_VIDEO_ENC (encoder);

  switch (GST_QUERY_TYPE (query)) {
    case GST_QUERY_CONTEXT:
      if (gst_nv_comp_video_enc_handle_context_query (self, query))
        return TRUE;
      break;
    default:
      break;
  }

  return GST_VIDEO_ENCODER_CLASS (parent_class)->src_query (encoder, query);
}
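/* GL interop is only usable when the GL context lives on a CUDA-capable
 * device; probe that from the GL thread via cuGLGetDevices before
 * committing to the GL upload path. */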
#ifdef HAVE_GST_GL
static void
check_cuda_device_from_gl_context (GstGLContext * context, gboolean * ret)
{
  guint device_count = 0;
  CUdevice device_list[1] = { 0, };
  CUresult cuda_ret;

  *ret = FALSE;

  cuda_ret = CuGLGetDevices (&device_count, device_list, 1,
      CU_GL_DEVICE_LIST_ALL);
  if (!gst_cuda_result (cuda_ret) || device_count == 0)
    return;

  *ret = TRUE;
}

static gboolean
gst_nv_comp_video_enc_ensure_gl_context (GstNvCompVideoEnc * self)
{
  auto priv = self->priv;
  gboolean ret = FALSE;

  if (!gst_gl_ensure_element_data (GST_ELEMENT (self), &priv->gl_display,
          &priv->other_gl_context)) {
    GST_DEBUG_OBJECT (self, "Couldn't get GL display");
    return FALSE;
  }

  gst_gl_display_filter_gl_api (priv->gl_display, GST_GL_API_OPENGL3);

  if (!gst_gl_display_ensure_context (priv->gl_display,
          priv->other_gl_context, &priv->gl_context, nullptr)) {
    GST_DEBUG_OBJECT (self, "Couldn't get GL context");
    return FALSE;
  }

  gst_gl_context_thread_add (priv->gl_context,
      (GstGLContextThreadFunc) check_cuda_device_from_gl_context, &ret);

  return ret;
}
#endif

static gboolean
gst_nv_comp_video_enc_propose_allocation (GstVideoEncoder * encoder,
    GstQuery * query)
{
  auto self = GST_NV_COMP_VIDEO_ENC (encoder);
  auto priv = self->priv;
  GstBufferPool *pool = nullptr;
  guint size;
  GstCaps *caps;

  gst_query_parse_allocation (query, &caps, nullptr);
  if (!caps) {
    GST_WARNING_OBJECT (self, "null caps in query");
    return FALSE;
  }

  GstVideoInfo info;
  if (!gst_video_info_from_caps (&info, caps)) {
    GST_WARNING_OBJECT (self, "Failed to convert caps into info");
    return FALSE;
  }

  auto features = gst_caps_get_features (caps, 0);
  gboolean use_cuda_pool = FALSE;
  if (gst_caps_features_contains (features,
          GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY)) {
    GST_DEBUG_OBJECT (self, "upstream support CUDA memory");
    pool = gst_cuda_buffer_pool_new (priv->ctx);
    use_cuda_pool = TRUE;
  }
#ifdef HAVE_GST_GL
  else if (gst_caps_features_contains (features,
          GST_CAPS_FEATURE_MEMORY_GL_MEMORY)) {
    if (!gst_nv_comp_video_enc_ensure_gl_context (self)) {
      priv->gl_interop = FALSE;
    } else {
      pool = gst_gl_buffer_pool_new (priv->gl_context);
    }
  }
#endif

  if (!pool)
    pool = gst_video_buffer_pool_new ();

  auto config = gst_buffer_pool_get_config (pool);
  gst_buffer_pool_config_add_option (config, GST_BUFFER_POOL_OPTION_VIDEO_META);

  size = GST_VIDEO_INFO_SIZE (&info);
  gst_buffer_pool_config_set_params (config, caps, size, 0, 0);

  if (use_cuda_pool && priv->stream) {
    /* Set our stream on buffer pool config so that CUstream can be shared */
    gst_buffer_pool_config_set_cuda_stream (config, priv->stream);
  }

  if (!gst_buffer_pool_set_config (pool, config)) {
    GST_WARNING_OBJECT (self, "Failed to set pool config");
    gst_object_unref (pool);
    return FALSE;
  }

  config = gst_buffer_pool_get_config (pool);
  gst_buffer_pool_config_get_params (config, nullptr, &size, nullptr, nullptr);
  gst_structure_free (config);

  gst_query_add_allocation_pool (query, pool, size, 0, 0);
  gst_query_add_allocation_meta (query, GST_VIDEO_META_API_TYPE, nullptr);
  gst_object_unref (pool);

  return TRUE;
}
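/* Per-task staging resources: one contiguous device/host buffer pair for
 * the raw frame and one for the compressed output. In batched mode the
 * frame is additionally split into chunk_size pieces, with per-chunk
 * pointer and size arrays mirrored on host and device. */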
static gboolean
gst_nv_comp_video_enc_alloc_task (GstNvCompVideoEnc * self, EncoderTask * task,
    gboolean batched, size_t uncompressed_size, size_t compressed_size,
    size_t batch_size, size_t chunk_size, size_t output_chunk_size,
    size_t temp_size)
{
  size_t alloc_size = sizeof (size_t) * batch_size;
  uint8_t *uncomp_data;
  uint8_t *comp_data;

  auto ret = CuEventCreate (&task->event,
      CU_EVENT_BLOCKING_SYNC | CU_EVENT_DISABLE_TIMING);
  if (!gst_cuda_result (ret))
    return FALSE;

  auto aligned_uncompressed_size = uncompressed_size;
  ret = CuMemAlloc ((CUdeviceptr *) & task->device_uncompressed,
      aligned_uncompressed_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  ret = CuMemAllocHost ((void **) &task->host_uncompressed,
      aligned_uncompressed_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  auto aligned_compressed_size = GST_ROUND_UP_8 (compressed_size);
  ret = CuMemAlloc ((CUdeviceptr *) & task->device_compressed,
      aligned_compressed_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  ret = CuMemAllocHost ((void **) &task->host_compressed,
      aligned_compressed_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  if (!batched)
    return TRUE;

  ret = CuMemAllocHost ((void **) &task->host_uncompressed_bytes, alloc_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  ret = CuMemAllocHost ((void **) &task->host_uncompressed_ptrs, alloc_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  /* Every chunk is chunk_size bytes, except the last which holds the
   * remainder of the frame */
  for (size_t i = 0; i < batch_size; i++) {
    if (i + 1 < batch_size)
      task->host_uncompressed_bytes[i] = chunk_size;
    else
      task->host_uncompressed_bytes[i] = (uncompressed_size - (chunk_size * i));
  }

  ret = CuMemAlloc ((CUdeviceptr *) & task->device_uncompressed_bytes,
      alloc_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  ret = CuMemAlloc ((CUdeviceptr *) & task->device_uncompressed_ptrs,
      alloc_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  ret = CuMemAlloc ((CUdeviceptr *) & task->device_compressed_bytes,
      alloc_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  ret = CuMemAlloc ((CUdeviceptr *) & task->device_compressed_ptrs,
      alloc_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  ret = CuMemAllocHost ((void **) &task->host_compressed_bytes, alloc_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  ret = CuMemAllocHost ((void **) &task->host_compressed_ptrs, alloc_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  if (temp_size > 0) {
    ret = CuMemAlloc ((CUdeviceptr *) & task->temp_ptr, temp_size);
    if (!gst_cuda_result (ret))
      return FALSE;
  }

  task->temp_size = temp_size;

  uncomp_data = task->device_uncompressed;
  comp_data = task->device_compressed;
  for (size_t i = 0; i < batch_size; i++) {
    task->host_uncompressed_ptrs[i] = uncomp_data;
    uncomp_data += chunk_size;

    task->host_compressed_ptrs[i] = comp_data;
    comp_data += output_chunk_size;
  }

  ret = CuMemcpyHtoD ((CUdeviceptr) task->device_uncompressed_bytes,
      task->host_uncompressed_bytes, alloc_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  ret = CuMemcpyHtoD ((CUdeviceptr) task->device_uncompressed_ptrs,
      task->host_uncompressed_ptrs, alloc_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  ret = CuMemcpyHtoD ((CUdeviceptr) task->device_compressed_ptrs,
      task->host_compressed_ptrs, alloc_size);
  if (!gst_cuda_result (ret))
    return FALSE;

  task->batched = batched;
  task->batch_size = batch_size;
  task->chunk_size = chunk_size;
  task->max_output_chunk_size = output_chunk_size;
  task->compressed_alloc_size = aligned_compressed_size;

  return TRUE;
}

static gboolean
gst_nv_comp_video_enc_set_format (GstVideoEncoder * encoder,
    GstVideoCodecState * state)
{
  auto self = GST_NV_COMP_VIDEO_ENC (encoder);
  auto priv = self->priv;

  gst_nv_comp_video_enc_drain (self, TRUE);

  std::lock_guard < std::mutex > lk (priv->lock);
  if (!priv->ctx) {
    GST_ERROR_OBJECT (self, "CUDA context was not configured");
    return FALSE;
  }

  if (priv->pool) {
    gst_buffer_pool_set_active (priv->pool, FALSE);
    gst_clear_object (&priv->pool);
  }

  g_clear_pointer (&priv->state, gst_video_codec_state_unref);
  priv->state = gst_video_codec_state_ref (state);

  std::string mime_type = "video/x-nvcomp";

  if (!gst_cuda_context_push (priv->ctx)) {
    GST_ERROR_OBJECT (self, "Couldn't push context");
    return FALSE;
  }

  priv->gl_interop = FALSE;
#ifdef HAVE_GST_GL
  auto features = gst_caps_get_features (state->caps, 0);
  if (gst_caps_features_contains (features, GST_CAPS_FEATURE_MEMORY_GL_MEMORY))
    priv->gl_interop = TRUE;
#endif
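  /* Reset per-method state, then select the compression backend. Each
   * method clamps the configured chunk-size to its own allowed range and,
   * in batched mode, derives batch_size = ceil(frame_size / chunk_size). */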
  priv->manager = nullptr;
  priv->config = nullptr;
  priv->batched_comp = nullptr;
  priv->input_task_queue = { };
  priv->output_task_queue = { };

  auto stream = (cudaStream_t) gst_cuda_stream_get_handle (priv->stream);
  guint device_id = 0;
  g_object_get (priv->ctx, "cuda-device-id", &device_id, nullptr);
  size_t chunk_size = priv->chunk_size;
  size_t batch_size = 0;

  switch (priv->method) {
    case GST_NV_COMP_LZ4:
    {
      nvcompBatchedLZ4Opts_t opts = nvcompBatchedLZ4DefaultOpts;
      if (priv->data_type != GST_NV_COMP_DATA_TYPE_DEFAULT)
        opts.data_type = (nvcompType_t) priv->data_type;

      if (chunk_size == 0)
        chunk_size = 65536;
      chunk_size = MAX (32768, chunk_size);
      chunk_size = GST_ROUND_UP_8 (chunk_size);
      chunk_size = MIN (chunk_size, nvcompLZ4CompressionMaxAllowedChunkSize);

      if (priv->batched) {
        batch_size = (state->info.size + chunk_size - 1) / chunk_size;
        priv->batched_comp = std::make_shared < BatchedComp <
            nvcompBatchedLZ4Opts_t,
            nvcompBatchedLZ4CompressGetTempSize,
            nvcompBatchedLZ4CompressGetMaxOutputChunkSize,
            nvcompBatchedLZ4CompressAsync >> (opts);
        mime_type = "video/x-nvcomp-lz4";
      } else {
        priv->manager = std::make_shared < LZ4Manager > (chunk_size, opts,
            stream, device_id);
      }

      GST_DEBUG_OBJECT (self, "Using LZ4");
      break;
    }
    case GST_NV_COMP_SNAPPY:
    {
      if (chunk_size == 0)
        chunk_size = 65536;
      chunk_size = MAX (32768, chunk_size);
      chunk_size = GST_ROUND_UP_8 (chunk_size);
      chunk_size = MIN (chunk_size, nvcompSnappyCompressionMaxAllowedChunkSize);

      if (priv->batched) {
        batch_size = (state->info.size + chunk_size - 1) / chunk_size;
        priv->batched_comp = std::make_shared < BatchedComp <
            nvcompBatchedSnappyOpts_t,
            nvcompBatchedSnappyCompressGetTempSize,
            nvcompBatchedSnappyCompressGetMaxOutputChunkSize,
            nvcompBatchedSnappyCompressAsync >> (nvcompBatchedSnappyDefaultOpts);
        mime_type = "video/x-nvcomp-snappy";
      } else {
        priv->manager = std::make_shared < SnappyManager > (chunk_size,
            nvcompBatchedSnappyDefaultOpts, stream, device_id);
      }

      GST_DEBUG_OBJECT (self, "Using SNAPPY");
      break;
    }
    case GST_NV_COMP_GDEFLATE:
    {
      nvcompBatchedGdeflateOpts_t opts;
      opts.algo = (int) priv->deflate_algo;

      if (chunk_size == 0)
        chunk_size = 65536;
      chunk_size = MAX (32768, chunk_size);
      chunk_size = GST_ROUND_UP_8 (chunk_size);
      chunk_size = MIN (chunk_size,
          nvcompGdeflateCompressionMaxAllowedChunkSize);

      if (priv->batched) {
        batch_size = (state->info.size + chunk_size - 1) / chunk_size;
        priv->batched_comp = std::make_shared < BatchedComp <
            nvcompBatchedGdeflateOpts_t,
            nvcompBatchedGdeflateCompressGetTempSize,
            nvcompBatchedGdeflateCompressGetMaxOutputChunkSize,
            nvcompBatchedGdeflateCompressAsync >> (opts);
        mime_type = "video/x-nvcomp-gdeflate";
      } else {
        priv->manager = std::make_shared < GdeflateManager > (chunk_size,
            opts, stream, device_id);
      }

      GST_DEBUG_OBJECT (self, "Using GDEFLATE");
      break;
    }
    case GST_NV_COMP_DEFLATE:
    {
      if (chunk_size == 0)
        chunk_size = 65536;
      chunk_size = MAX (32768, chunk_size);
      chunk_size = GST_ROUND_UP_8 (chunk_size);
      chunk_size = MIN (chunk_size,
          nvcompDeflateCompressionMaxAllowedChunkSize);

      nvcompBatchedDeflateOpts_t opts;
      opts.algo = (int) priv->deflate_algo;

      if (priv->batched) {
        batch_size = (state->info.size + chunk_size - 1) / chunk_size;
        priv->batched_comp = std::make_shared < BatchedComp <
            nvcompBatchedDeflateOpts_t,
            nvcompBatchedDeflateCompressGetTempSize,
            nvcompBatchedDeflateCompressGetMaxOutputChunkSize,
            nvcompBatchedDeflateCompressAsync >> (opts);
        mime_type = "video/x-nvcomp-deflate";
      } else {
        priv->manager = std::make_shared < DeflateManager > (chunk_size,
            opts, stream, device_id);
      }
      GST_DEBUG_OBJECT (self, "Using DEFLATE");
      break;
    }
    case GST_NV_COMP_ZSTD:
    {
      if (chunk_size == 0)
        chunk_size = 65536;
      chunk_size = MAX (32768, chunk_size);
      chunk_size = GST_ROUND_UP_8 (chunk_size);
      chunk_size = MIN (chunk_size, nvcompZstdCompressionMaxAllowedChunkSize);

      if (priv->batched) {
        batch_size = (state->info.size + chunk_size - 1) / chunk_size;
        priv->batched_comp = std::make_shared < BatchedComp <
            nvcompBatchedZstdOpts_t,
            nvcompBatchedZstdCompressGetTempSize,
            nvcompBatchedZstdCompressGetMaxOutputChunkSize,
            nvcompBatchedZstdCompressAsync >> (nvcompBatchedZstdDefaultOpts);
        mime_type = "video/x-nvcomp-zstd";
      } else {
        priv->manager = std::make_shared < ZstdManager > (chunk_size,
            nvcompBatchedZstdDefaultOpts, stream, device_id);
      }

      GST_DEBUG_OBJECT (self, "Using ZSTD");
      break;
    }
    case GST_NV_COMP_CASCADED:
    {
      if (chunk_size == 0)
        chunk_size = 4096;
      chunk_size = MAX (512, chunk_size);
      chunk_size = GST_ROUND_UP_8 (chunk_size);
      chunk_size = MIN (chunk_size, 16384);

      nvcompBatchedCascadedOpts_t opts = nvcompBatchedCascadedDefaultOpts;
      opts.chunk_size = chunk_size;
      if (priv->data_type != GST_NV_COMP_DATA_TYPE_DEFAULT)
        opts.type = (nvcompType_t) priv->data_type;

      if (priv->batched) {
        batch_size = (state->info.size + chunk_size - 1) / chunk_size;
        priv->batched_comp = std::make_shared < BatchedComp <
            nvcompBatchedCascadedOpts_t,
            nvcompBatchedCascadedCompressGetTempSize,
            nvcompBatchedCascadedCompressGetMaxOutputChunkSize,
            nvcompBatchedCascadedCompressAsync >> (opts);
        mime_type = "video/x-nvcomp-cascaded";
      } else {
        priv->manager = std::make_shared < CascadedManager > (chunk_size,
            opts, stream, device_id);
      }

      GST_DEBUG_OBJECT (self, "Using CASCADED");
      break;
    }
    case GST_NV_COMP_BITCOMP:
    {
      if (chunk_size == 0)
        chunk_size = 65536;
      chunk_size = MAX (32768, chunk_size);
      chunk_size = GST_ROUND_UP_8 (chunk_size);
      chunk_size = MIN (chunk_size,
          nvcompBitcompCompressionMaxAllowedChunkSize);

      nvcompBatchedBitcompFormatOpts opts = nvcompBatchedBitcompDefaultOpts;
      opts.algorithm_type = (int) priv->bitcomp_algo;
      if (priv->data_type != GST_NV_COMP_DATA_TYPE_DEFAULT)
        opts.data_type = (nvcompType_t) priv->data_type;

      if (priv->batched) {
        batch_size = (state->info.size + chunk_size - 1) / chunk_size;
        priv->batched_comp = std::make_shared < BatchedComp <
            nvcompBatchedBitcompFormatOpts,
            nvcompBatchedBitcompCompressGetTempSize,
            nvcompBatchedBitcompCompressGetMaxOutputChunkSize,
            nvcompBatchedBitcompCompressAsync >> (opts);
        mime_type = "video/x-nvcomp-bitcomp";
      } else {
        priv->manager = std::make_shared < BitcompManager > (chunk_size,
            opts, stream, device_id);
      }

      GST_DEBUG_OBJECT (self, "Using BITCOMP");
      break;
    }
    case GST_NV_COMP_ANS:
    {
      if (chunk_size == 0)
        chunk_size = 65536;
      chunk_size = MAX (32768, chunk_size);
      chunk_size = GST_ROUND_UP_8 (chunk_size);
      chunk_size = MIN (chunk_size, nvcompANSCompressionMaxAllowedChunkSize);

      if (priv->batched) {
        batch_size = (state->info.size + chunk_size - 1) / chunk_size;
        priv->batched_comp = std::make_shared < BatchedComp <
            nvcompBatchedANSOpts_t,
            nvcompBatchedANSCompressGetTempSize,
            nvcompBatchedANSCompressGetMaxOutputChunkSize,
            nvcompBatchedANSCompressAsync >> (nvcompBatchedANSDefaultOpts);
        mime_type = "video/x-nvcomp-ans";
      } else {
        priv->manager = std::make_shared < ANSManager > (chunk_size,
            nvcompBatchedANSDefaultOpts, stream, device_id);
      }

      GST_DEBUG_OBJECT (self, "Using ANS");
      break;
    }
    default:
      g_assert_not_reached ();
      return FALSE;
  }
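  /* Query the required scratch and worst-case output sizes up front so
   * every pooled task can be allocated once and reused per frame. */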
  size_t max_output_size = 0;
  size_t max_output_chunk_size = 0;
  size_t temp_size = 0;
  if (priv->batched) {
    auto status = priv->batched_comp->get_temp_size (batch_size, chunk_size,
        &temp_size);
    if (status != nvcompSuccess) {
      GST_ERROR_OBJECT (self, "Couldn't get temp size");
      gst_cuda_context_pop (nullptr);
      return FALSE;
    }

    status = priv->batched_comp->get_max_compressed_chunk_size (chunk_size,
        &max_output_chunk_size);
    if (status != nvcompSuccess) {
      GST_ERROR_OBJECT (self, "Couldn't get max output chunk size");
      gst_cuda_context_pop (nullptr);
      return FALSE;
    }

    max_output_chunk_size = GST_ROUND_UP_8 (max_output_chunk_size);
    max_output_size = max_output_chunk_size * batch_size;
  } else {
    priv->config = std::make_shared < CompressionConfig >
        (priv->manager->configure_compression (state->info.size));
    max_output_size = priv->config->max_compressed_buffer_size;
  }

  GST_DEBUG_OBJECT (self, "Allocating resource, batched: %d"
      ", uncompressed size: %" G_GSIZE_FORMAT ", max-output-size: %"
      G_GSIZE_FORMAT ", batch-size: %" G_GSIZE_FORMAT ", chunk-size: %"
      G_GSIZE_FORMAT ", max-output-chunk-size: %" G_GSIZE_FORMAT
      ", temp-size: %" G_GSIZE_FORMAT, priv->batched, state->info.size,
      max_output_size, batch_size, chunk_size, max_output_chunk_size,
      temp_size);

  for (guint i = 0; i < priv->async_depth; i++) {
    auto task = std::make_shared < EncoderTask > ();
    task->ctx = (GstCudaContext *) gst_object_ref (priv->ctx);
    if (!gst_nv_comp_video_enc_alloc_task (self, task.get (), priv->batched,
            state->info.size, max_output_size, batch_size, chunk_size,
            max_output_chunk_size, temp_size)) {
      priv->manager = nullptr;
      priv->input_task_queue = { };
      task = nullptr;
      gst_cuda_context_pop (nullptr);
      return FALSE;
    }

    priv->input_task_queue.push (task);
  }

  /* In case of batched, custom header is added to signal chunk and batch
   * size */
  if (priv->batched) {
    /* version */
    max_output_size += sizeof (guint32);
    /* max uncompressed chunk size */
    max_output_size += sizeof (guint32);
    /* max compressed chunk size */
    max_output_size += sizeof (guint32);
    /* batch size */
    max_output_size += sizeof (guint32);
    /* each uncompressed/compressed chunk size */
    max_output_size += (sizeof (guint32) * batch_size * 2);
  }

  priv->pool = gst_buffer_pool_new ();
  auto config = gst_buffer_pool_get_config (priv->pool);
  gst_buffer_pool_config_set_params (config, nullptr, max_output_size, 0, 0);
  gst_buffer_pool_set_config (priv->pool, config);
  gst_buffer_pool_set_active (priv->pool, TRUE);

  gst_cuda_context_pop (nullptr);

  auto caps = gst_caps_new_simple (mime_type.c_str (),
      "format", G_TYPE_STRING,
      gst_video_format_to_string (GST_VIDEO_INFO_FORMAT (&state->info)),
      nullptr);
  auto out_state = gst_video_encoder_set_output_state (GST_VIDEO_ENCODER
      (encoder), caps, state);
  gst_video_codec_state_unref (out_state);

  return TRUE;
}
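/* Copies one mapped video frame into the current task's contiguous
 * uncompressed buffer, plane by plane. Handles paletted formats, tiled
 * layouts and pitched 2D copies; is_device_copy selects async CUDA
 * device-to-device copies vs. CPU memcpy into the pinned host staging
 * buffer (uploaded to the device afterwards by the caller). */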
static gboolean
gst_nv_comp_video_enc_upload (GstNvCompVideoEnc * self, GstVideoFrame * frame,
    CUstream stream, gboolean is_device_copy)
{
  auto priv = self->priv;
  auto info = &priv->state->info;
  auto finfo = info->finfo;
  gint comp[GST_VIDEO_MAX_COMPONENTS];
  CUresult ret = CUDA_SUCCESS;
  auto cur_task = priv->cur_task;

  for (guint i = 0; i < GST_VIDEO_FRAME_N_PLANES (frame); i++) {
    guint8 *sp = (guint8 *) GST_VIDEO_FRAME_PLANE_DATA (frame, i);
    guint8 *dp;
    if (is_device_copy)
      dp = cur_task->device_uncompressed + info->offset[i];
    else
      dp = cur_task->host_uncompressed + info->offset[i];
    guint ss, ds;
    guint w, h;

    if (GST_VIDEO_FORMAT_INFO_HAS_PALETTE (finfo) && i == 1) {
      if (is_device_copy) {
        ret = CuMemcpyDtoDAsync ((CUdeviceptr) dp, (CUdeviceptr) sp,
            256 * 4, stream);
      } else {
        memcpy (dp, sp, 256 * 4);
      }

      if (!gst_cuda_result (ret)) {
        GST_ERROR_OBJECT (self, "CUDA memcpy failed");
        return FALSE;
      }

      return TRUE;
    }

    ss = GST_VIDEO_FRAME_PLANE_STRIDE (frame, i);
    ds = GST_VIDEO_INFO_PLANE_STRIDE (info, i);

    gst_video_format_info_component (finfo, i, comp);
    w = GST_VIDEO_INFO_COMP_WIDTH (info, comp[0]) *
        GST_VIDEO_INFO_COMP_PSTRIDE (info, comp[0]);
    if (w == 0)
      w = MIN (ss, ds);

    h = GST_VIDEO_INFO_COMP_HEIGHT (info, comp[0]);

    if (GST_VIDEO_FORMAT_INFO_IS_TILED (finfo)) {
      gint tile_size;
      gint sx_tiles, sy_tiles, dx_tiles, dy_tiles;
      GstVideoTileMode mode;

      tile_size = GST_VIDEO_FORMAT_INFO_TILE_SIZE (info->finfo, i);
      mode = GST_VIDEO_FORMAT_INFO_TILE_MODE (info->finfo);
      sx_tiles = GST_VIDEO_TILE_X_TILES (ss);
      sy_tiles = GST_VIDEO_TILE_Y_TILES (ss);
      dx_tiles = GST_VIDEO_TILE_X_TILES (ds);
      dy_tiles = GST_VIDEO_TILE_Y_TILES (ds);

      w = MIN (sx_tiles, dx_tiles);
      h = MIN (sy_tiles, dy_tiles);

      for (guint j = 0; j < h; j++) {
        for (guint k = 0; k < w; k++) {
          guint si, di;
          guint8 *cur_dp;
          guint8 *cur_sp;

          si = gst_video_tile_get_index (mode, k, j, sx_tiles, sy_tiles);
          di = gst_video_tile_get_index (mode, k, j, dx_tiles, dy_tiles);

          cur_dp = dp + (di * tile_size);
          cur_sp = sp + (si * tile_size);

          if (is_device_copy) {
            ret = CuMemcpyDtoDAsync ((CUdeviceptr) cur_dp,
                (CUdeviceptr) cur_sp, w, stream);
          } else {
            memcpy (cur_dp, cur_sp, w);
          }

          if (!gst_cuda_result (ret)) {
            GST_ERROR_OBJECT (self, "CUDA memcpy failed");
            return FALSE;
          }
        }
      }
    } else {
      if (is_device_copy) {
        CUDA_MEMCPY2D params = { };
        params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
        params.srcDevice = (CUdeviceptr) sp;
        params.srcPitch = ss;
        params.dstMemoryType = CU_MEMORYTYPE_DEVICE;
        params.dstDevice = (CUdeviceptr) dp;
        params.dstPitch = ds;
        params.WidthInBytes = w;
        params.Height = h;

        ret = CuMemcpy2DAsync (&params, stream);
        if (!gst_cuda_result (ret)) {
          GST_ERROR_OBJECT (self, "CUDA memcpy failed");
          return FALSE;
        }
      } else {
        for (guint j = 0; j < h; j++) {
          memcpy (dp, sp, w);
          dp += ds;
          sp += ss;
        }
      }
    }
  }

  return TRUE;
}

#ifdef HAVE_GST_GL
struct GLInteropData
{
  GstNvCompVideoEnc *self = nullptr;
  GstBuffer *buffer = nullptr;
  gboolean ret = FALSE;
};

static GstCudaGraphicsResource *
ensure_gl_cuda_resource (GstNvCompVideoEnc * self, GstMemory * mem)
{
  auto priv = self->priv;
  GstCudaGraphicsResource *resource;
  GQuark quark;

  if (!gst_is_gl_memory_pbo (mem)) {
    GST_WARNING_OBJECT (self, "memory is not GL PBO memory, %s",
        mem->allocator->mem_type);
    return nullptr;
  }

  quark = gst_cuda_quark_from_id (GST_CUDA_QUARK_GRAPHICS_RESOURCE);
  resource = (GstCudaGraphicsResource *)
      gst_mini_object_get_qdata (GST_MINI_OBJECT (mem), quark);

  if (!resource) {
    GstMapInfo map_info;
    GstGLMemoryPBO *pbo = (GstGLMemoryPBO *) mem;
    GstGLBuffer *gl_buf = pbo->pbo;
    gboolean ret;

    if (!gst_memory_map (mem, &map_info,
            (GstMapFlags) (GST_MAP_READ | GST_MAP_GL))) {
      GST_ERROR_OBJECT (self, "Couldn't map gl memory");
      return nullptr;
    }

    resource = gst_cuda_graphics_resource_new (priv->ctx,
        GST_OBJECT (GST_GL_BASE_MEMORY_CAST (mem)->context),
        GST_CUDA_GRAPHICS_RESOURCE_GL_BUFFER);

    GST_LOG_OBJECT (self, "registering gl buffer %d to CUDA", gl_buf->id);
    ret = gst_cuda_graphics_resource_register_gl_buffer (resource,
        gl_buf->id, CU_GRAPHICS_REGISTER_FLAGS_NONE);
    gst_memory_unmap (mem, &map_info);

    if (!ret) {
      GST_ERROR_OBJECT (self, "Couldn't register gl buffer %d", gl_buf->id);
      gst_cuda_graphics_resource_free (resource);
      return nullptr;
    }

    gst_mini_object_set_qdata (GST_MINI_OBJECT (mem), quark, resource,
        (GDestroyNotify) gst_cuda_graphics_resource_free);
  }

  return resource;
}
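/* Runs on the GL thread: registers and maps each plane's PBO as a CUDA
 * graphics resource, then copies the planes straight into the task's
 * device buffer, avoiding a round-trip through system memory. */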
static void
gst_nv_comp_video_enc_upload_gl (GstGLContext * context, GLInteropData * data)
{
  auto self = data->self;
  auto priv = self->priv;
  auto info = &priv->state->info;
  auto finfo = info->finfo;
  GstCudaGraphicsResource *gst_res[GST_VIDEO_MAX_PLANES] = { nullptr, };
  CUgraphicsResource cuda_res[GST_VIDEO_MAX_PLANES] = { nullptr, };
  CUdeviceptr src_devptr[GST_VIDEO_MAX_PLANES] = { 0, };
  CUstream stream = gst_cuda_stream_get_handle (priv->stream);
  CUresult ret;
  gint comp[GST_VIDEO_MAX_COMPONENTS];
  auto cur_task = priv->cur_task;

  if (!gst_cuda_context_push (priv->ctx)) {
    GST_ERROR_OBJECT (self, "Couldn't push context");
    return;
  }

  for (guint i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) {
    GstMemory *mem = gst_buffer_peek_memory (data->buffer, i);
    GstGLMemoryPBO *pbo = (GstGLMemoryPBO *) mem;
    gsize src_size;

    if (!gst_is_gl_memory_pbo (mem)) {
      GST_ERROR_OBJECT (self, "Not a GL PBO memory");
      goto out;
    }

    gst_res[i] = ensure_gl_cuda_resource (self, mem);
    if (!gst_res[i]) {
      GST_ERROR_OBJECT (self, "Couldn't get resource %d", i);
      goto out;
    }

    gst_gl_memory_pbo_upload_transfer (pbo);
    gst_gl_memory_pbo_download_transfer (pbo);

    cuda_res[i] = gst_cuda_graphics_resource_map (gst_res[i], stream,
        CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY);
    if (!cuda_res[i]) {
      GST_ERROR_OBJECT (self, "Couldn't map resource");
      goto out;
    }

    ret = CuGraphicsResourceGetMappedPointer (&src_devptr[i], &src_size,
        cuda_res[i]);
    if (!gst_cuda_result (ret)) {
      GST_ERROR_OBJECT (self, "Couldn't get mapped device pointer");
      goto out;
    }
  }

  for (guint i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) {
    guint8 *sp = (guint8 *) src_devptr[i];
    guint8 *dp = cur_task->device_uncompressed + info->offset[i];
    guint ss, ds;
    guint w, h;

    if (GST_VIDEO_FORMAT_INFO_HAS_PALETTE (finfo) && i == 1) {
      ret = CuMemcpyDtoDAsync ((CUdeviceptr) dp, (CUdeviceptr) sp,
          256 * 4, stream);
      if (!gst_cuda_result (ret)) {
        GST_ERROR_OBJECT (self, "CUDA memcpy failed");
        goto out;
      }

      data->ret = TRUE;
      goto out;
    }

    auto meta = gst_buffer_get_video_meta (data->buffer);
    if (meta)
      ss = meta->stride[i];
    else
      ss = GST_VIDEO_INFO_PLANE_STRIDE (info, i);

    ds = GST_VIDEO_INFO_PLANE_STRIDE (info, i);

    gst_video_format_info_component (finfo, i, comp);
    w = GST_VIDEO_INFO_COMP_WIDTH (info, comp[0]) *
        GST_VIDEO_INFO_COMP_PSTRIDE (info, comp[0]);
    if (w == 0)
      w = MIN (ss, ds);

    h = GST_VIDEO_INFO_COMP_HEIGHT (info, comp[0]);

    if (GST_VIDEO_FORMAT_INFO_IS_TILED (finfo)) {
      gint tile_size;
      gint sx_tiles, sy_tiles, dx_tiles, dy_tiles;
      GstVideoTileMode mode;

      tile_size = GST_VIDEO_FORMAT_INFO_TILE_SIZE (info->finfo, i);
      mode = GST_VIDEO_FORMAT_INFO_TILE_MODE (info->finfo);
      sx_tiles = GST_VIDEO_TILE_X_TILES (ss);
      sy_tiles = GST_VIDEO_TILE_Y_TILES (ss);
      dx_tiles = GST_VIDEO_TILE_X_TILES (ds);
      dy_tiles = GST_VIDEO_TILE_Y_TILES (ds);

      w = MIN (sx_tiles, dx_tiles);
      h = MIN (sy_tiles, dy_tiles);

      for (guint j = 0; j < h; j++) {
        for (guint k = 0; k < w; k++) {
          guint si, di;
          guint8 *cur_dp;
          guint8 *cur_sp;

          si = gst_video_tile_get_index (mode, k, j, sx_tiles, sy_tiles);
          di = gst_video_tile_get_index (mode, k, j, dx_tiles, dy_tiles);

          cur_dp = dp + (di * tile_size);
          cur_sp = sp + (si * tile_size);

          ret = CuMemcpyDtoDAsync ((CUdeviceptr) cur_dp,
              (CUdeviceptr) cur_sp, w, stream);
          if (!gst_cuda_result (ret)) {
            GST_ERROR_OBJECT (self, "CUDA memcpy failed");
            goto out;
          }
        }
      }
    } else {
      CUDA_MEMCPY2D params = { };
      params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
      params.srcDevice = (CUdeviceptr) sp;
      params.srcPitch = ss;
      params.dstMemoryType = CU_MEMORYTYPE_DEVICE;
      params.dstDevice = (CUdeviceptr) dp;
      params.dstPitch = ds;
      params.WidthInBytes = w;
      params.Height = h;

      ret = CuMemcpy2DAsync (&params, stream);
      if (!gst_cuda_result (ret)) {
        GST_ERROR_OBJECT (self, "CUDA memcpy failed");
        goto out;
      }
    }
  }

  data->ret = TRUE;

out:
  for (guint i = 0; i < gst_buffer_n_memory (data->buffer); i++) {
    if (!gst_res[i])
      break;

    gst_cuda_graphics_resource_unmap (gst_res[i], stream);
  }

  CuStreamSynchronize (stream);
  gst_cuda_context_pop (nullptr);
}
#endif
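/* Output thread: waits for completed tasks, synchronizes on the task's
 * CUDA event, packs the bitstream and finishes the frame.
 *
 * Batched bitstream layout (little-endian guint32 fields):
 *   [version][chunk-size][max-output-chunk-size][batch-size]
 *   [uncompressed-bytes[i]][compressed-bytes[i]] x batch-size
 * followed by the tightly packed compressed chunks. */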
(self, "CUDA memcpy failed"); goto out; } } } data->ret = TRUE; out: for (guint i = 0; i < gst_buffer_n_memory (data->buffer); i++) { if (!gst_res[i]) break; gst_cuda_graphics_resource_unmap (gst_res[i], stream); } CuStreamSynchronize (stream); gst_cuda_context_pop (nullptr); } #endif static gpointer gst_nv_comp_video_enc_thread_func (GstNvCompVideoEnc * self) { auto encoder = GST_VIDEO_ENCODER (self); auto priv = self->priv; GST_DEBUG_OBJECT (self, "Entering loop"); while (1) { std::shared_ptr < EncoderTask > task; { std::unique_lock < std::mutex > lk (priv->output_lock); while (priv->output_task_queue.empty ()) priv->output_cond.wait (lk); task = priv->output_task_queue.front (); priv->output_task_queue.pop (); } if (!task) { GST_DEBUG_OBJECT (self, "Got empty task, terminate"); break; } auto frame = gst_video_encoder_get_oldest_frame (encoder); gst_cuda_context_push (priv->ctx); CuEventSynchronize (task->event); gst_cuda_context_pop (nullptr); gst_buffer_pool_acquire_buffer (priv->pool, &frame->output_buffer, nullptr); GstMapInfo map_info; gst_buffer_map (frame->output_buffer, &map_info, GST_MAP_WRITE); if (task->batched) { task->compressed_size = 0; auto dst = (uint8_t *) map_info.data; /* Write custom header */ GST_WRITE_UINT32_LE (dst, GST_NV_COMP_HEADER_VERSION); dst += sizeof (guint32); task->compressed_size += sizeof (guint32); GST_WRITE_UINT32_LE (dst, task->chunk_size); dst += sizeof (guint32); task->compressed_size += sizeof (guint32); GST_WRITE_UINT32_LE (dst, task->max_output_chunk_size); dst += sizeof (guint32); task->compressed_size += sizeof (guint32); GST_WRITE_UINT32_LE (dst, task->batch_size); dst += sizeof (guint32); task->compressed_size += sizeof (guint32); for (size_t i = 0; i < task->batch_size; i++) { GST_WRITE_UINT32_LE (dst, task->host_uncompressed_bytes[i]); dst += sizeof (guint32); task->compressed_size += sizeof (guint32); GST_WRITE_UINT32_LE (dst, task->host_compressed_bytes[i]); dst += sizeof (guint32); task->compressed_size += sizeof (guint32); } /* Write compressed data */ for (size_t i = 0; i < task->batch_size; i++) { auto size = task->host_compressed_bytes[i]; auto src = task->host_compressed + (i * task->max_output_chunk_size); memcpy (dst, src, size); dst += size; task->compressed_size += size; } } else { memcpy (map_info.data, task->host_compressed, task->compressed_size); } gst_buffer_unmap (frame->output_buffer, &map_info); if (task->compressed_size > 0) { gst_buffer_set_size (frame->output_buffer, task->compressed_size); frame->dts = frame->pts; auto ratio = (double) priv->state->info.size / task->compressed_size; GST_LOG_OBJECT (self, "compressed buffer size %" G_GSIZE_FORMAT ", ratio %.2f", task->compressed_size, ratio); } else { GST_ERROR_OBJECT (self, "Zero compressed size"); gst_clear_buffer (&frame->output_buffer); } { std::lock_guard < std::mutex > lk (priv->input_lock); priv->input_task_queue.push (task); priv->input_cond.notify_all (); } priv->last_flow = gst_video_encoder_finish_frame (encoder, frame); }; GST_DEBUG_OBJECT (self, "Leaving loop"); return nullptr; } static GstFlowReturn gst_nv_comp_video_enc_handle_frame (GstVideoEncoder * encoder, GstVideoCodecFrame * frame) { auto self = GST_NV_COMP_VIDEO_ENC (encoder); auto priv = self->priv; GstMemory *mem; CUstream stream = nullptr; GstVideoFrame vframe; auto info = &priv->state->info; size_t compressed_size = 0; gboolean need_copy = TRUE; std::shared_ptr < EncoderTask > task; if (!priv->ctx || (!priv->manager && !priv->batched_comp)) { GST_ERROR_OBJECT (self, "Context was not 
configured"); goto error; } if (priv->last_flow != GST_FLOW_OK) { GST_INFO_OBJECT (self, "Last flow was %s", gst_flow_get_name (priv->last_flow)); gst_video_encoder_finish_frame (encoder, frame); return priv->last_flow; } if (!priv->encode_thread) { priv->encode_thread = g_thread_new ("nvcompvideoenc", (GThreadFunc) gst_nv_comp_video_enc_thread_func, self); } GST_VIDEO_ENCODER_STREAM_UNLOCK (encoder); { std::unique_lock < std::mutex > lk (priv->input_lock); while (priv->input_task_queue.empty ()) priv->input_cond.wait (lk); priv->cur_task = priv->input_task_queue.front (); priv->input_task_queue.pop (); } GST_VIDEO_ENCODER_STREAM_LOCK (encoder); mem = gst_buffer_peek_memory (frame->input_buffer, 0); #ifdef HAVE_GST_GL if (priv->gl_interop && gst_is_gl_memory (mem) && gst_buffer_n_memory (frame->input_buffer) == GST_VIDEO_INFO_N_PLANES (info)) { GLInteropData interop_data; interop_data.self = self; interop_data.buffer = frame->input_buffer; interop_data.ret = FALSE; auto gl_mem = (GstGLMemory *) mem; gst_gl_context_thread_add (gl_mem->mem.context, (GstGLContextThreadFunc) gst_nv_comp_video_enc_upload_gl, &interop_data); if (interop_data.ret) { need_copy = FALSE; GST_TRACE_OBJECT (self, "GL -> CUDA copy done"); } else { priv->gl_interop = FALSE; } } #endif if (!gst_cuda_context_push (priv->ctx)) { GST_ERROR_OBJECT (self, "Couldn't push context"); std::lock_guard < std::mutex > lk (priv->input_lock); priv->input_task_queue.push (std::move (priv->cur_task)); goto error; } stream = gst_cuda_stream_get_handle (priv->stream); if (need_copy) { gboolean device_copy = FALSE; if (gst_is_cuda_memory (mem)) { GstCudaMemory *cmem = GST_CUDA_MEMORY_CAST (mem); if (cmem->context == priv->ctx) { device_copy = TRUE; if (!gst_video_frame_map (&vframe, info, frame->input_buffer, (GstMapFlags) (GST_MAP_READ | GST_MAP_CUDA))) { GST_ERROR_OBJECT (self, "Couldn't map cuda memory"); gst_cuda_context_pop (nullptr); std::lock_guard < std::mutex > lk (priv->input_lock); priv->input_task_queue.push (std::move (priv->cur_task)); goto error; } if (gst_cuda_memory_get_stream (cmem) != priv->stream) { GST_DEBUG_OBJECT (self, "Different stream, need sync"); gst_cuda_memory_sync (cmem); } } } if (!device_copy && !gst_video_frame_map (&vframe, info, frame->input_buffer, GST_MAP_READ)) { GST_ERROR_OBJECT (self, "Couldn't map input frame"); gst_cuda_context_pop (nullptr); std::lock_guard < std::mutex > lk (priv->input_lock); priv->input_task_queue.push (std::move (priv->cur_task)); goto error; } if (!gst_nv_comp_video_enc_upload (self, &vframe, stream, device_copy)) { gst_video_frame_unmap (&vframe); gst_cuda_context_pop (nullptr); std::lock_guard < std::mutex > lk (priv->input_lock); priv->input_task_queue.push (std::move (priv->cur_task)); goto error; } gst_video_frame_unmap (&vframe); if (!device_copy) { CuMemcpyHtoDAsync ((CUdeviceptr) priv->cur_task->device_uncompressed, priv->cur_task->host_uncompressed, info->size, stream); } } task = std::move (priv->cur_task); if (task->batched) { g_assert (priv->batched_comp); auto status = priv->batched_comp->compress (task->device_uncompressed_ptrs, task->device_uncompressed_bytes, task->chunk_size, task->batch_size, task->temp_ptr, task->temp_size, task->device_compressed_ptrs, task->device_compressed_bytes, (cudaStream_t) stream); if (status != nvcompSuccess) { GST_ERROR_OBJECT (self, "Compression failed, ret %d", status); gst_cuda_context_pop (nullptr); std::lock_guard < std::mutex > lk (priv->input_lock); priv->input_task_queue.push (std::move (task)); goto error; } 
    CuMemcpyDtoHAsync (task->host_compressed_bytes,
        (CUdeviceptr) task->device_compressed_bytes,
        sizeof (size_t) * task->batch_size, stream);
    CuMemcpyDtoHAsync (task->host_compressed,
        (CUdeviceptr) task->device_compressed, task->compressed_alloc_size,
        stream);
  } else {
    g_assert (priv->manager);
    priv->manager->compress (task->device_uncompressed,
        task->device_compressed, *priv->config);
    compressed_size =
        priv->manager->get_compressed_output_size (task->device_compressed);
    task->compressed_size = compressed_size;
    CuMemcpyDtoHAsync (task->host_compressed,
        (CUdeviceptr) task->device_compressed, compressed_size, stream);
  }

  CuEventRecord (task->event, stream);
  gst_cuda_context_pop (nullptr);

  {
    std::lock_guard < std::mutex > lk (priv->output_lock);
    priv->output_task_queue.push (std::move (task));
    priv->output_cond.notify_one ();
  }

  gst_video_codec_frame_unref (frame);

  return priv->last_flow;

error:
  gst_video_encoder_finish_frame (encoder, frame);

  return GST_FLOW_ERROR;
}