mirror of
https://gitlab.freedesktop.org/gstreamer/gstreamer.git
synced 2024-11-27 04:01:08 +00:00
0ed9c39835
Adding NVIDIA nvCOMP library based plugin for lossless raw video compression/decompression. To build this plugin, user should install nvCOMP SDK first and specify the SDK path via "nvcomp-sdk-path" build option or NVCOMP_SDK_PATH env. Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/6912>
2014 lines
61 KiB
C++
2014 lines
61 KiB
C++
/* GStreamer
|
|
* Copyright (C) 2024 Seungha Yang <seungha@centricular.com>
|
|
*
|
|
* This library is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Library General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2 of the License, or (at your option) any later version.
|
|
*
|
|
* This library is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Library General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Library General Public
|
|
* License along with this library; if not, write to the
|
|
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
|
|
* Boston, MA 02110-1301, USA.
|
|
*/
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
#include "config.h"
|
|
#endif
|
|
|
|
#include "gstnvcompvideoenc.h"
|
|
#ifdef HAVE_GST_GL
|
|
#include <gst/gl/gl.h>
|
|
#include <gst/gl/gstglfuncs.h>
|
|
#endif
|
|
|
|
#include <nvcomp/ans.hpp>
|
|
#include <nvcomp/bitcomp.hpp>
|
|
#include <nvcomp/cascaded.hpp>
|
|
#include <nvcomp/deflate.hpp>
|
|
#include <nvcomp/gdeflate.hpp>
|
|
#include <nvcomp/lz4.hpp>
|
|
#include <nvcomp/snappy.hpp>
|
|
#include <nvcomp/zstd.hpp>
|
|
#include <memory>
|
|
#include <mutex>
|
|
#include <condition_variable>
|
|
#include <queue>
|
|
#include <atomic>
|
|
#include <string>
|
|
#include <string.h>
|
|
|
|
GST_DEBUG_CATEGORY_STATIC (gst_nv_comp_video_enc_debug);
|
|
#define GST_CAT_DEFAULT gst_nv_comp_video_enc_debug
|
|
|
|
#ifdef HAVE_GST_GL
|
|
#define SINK_CAPS \
|
|
GST_VIDEO_CAPS_MAKE_WITH_FEATURES (GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY, \
|
|
GST_VIDEO_FORMATS_ALL) ";" \
|
|
GST_VIDEO_CAPS_MAKE_WITH_FEATURES (GST_CAPS_FEATURE_MEMORY_GL_MEMORY, \
|
|
GST_VIDEO_FORMATS_ALL) ";" \
|
|
GST_VIDEO_CAPS_MAKE (GST_VIDEO_FORMATS_ALL)
|
|
#else
|
|
#define SINK_CAPS \
|
|
GST_VIDEO_CAPS_MAKE_WITH_FEATURES (GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY, \
|
|
GST_VIDEO_FORMATS_ALL) ";" \
|
|
GST_VIDEO_CAPS_MAKE (GST_VIDEO_FORMATS_ALL)
|
|
#endif
|
|
|
|
static GstStaticPadTemplate sink_template =
|
|
GST_STATIC_PAD_TEMPLATE ("sink", GST_PAD_SINK, GST_PAD_ALWAYS,
|
|
GST_STATIC_CAPS (SINK_CAPS));
|
|
|
|
static GstStaticPadTemplate src_template =
|
|
GST_STATIC_PAD_TEMPLATE ("src", GST_PAD_SRC, GST_PAD_ALWAYS,
|
|
GST_STATIC_CAPS ("video/x-nvcomp; video/x-nvcomp-lz4; "
|
|
"video/x-nvcomp-snappy; video/x-nvcomp-gdeflate; "
|
|
"video/x-nvcomp-deflate; video/x-nvcomp-zstd; video/x-nvcomp-cascaded; "
|
|
"video/x-nvcomp-bitcomp; video/x-nvcomp-ans"));
|
|
|
|
enum GstNvCompDataType
|
|
{
|
|
GST_NV_COMP_DATA_TYPE_DEFAULT = -1,
|
|
GST_NV_COMP_DATA_TYPE_CHAR = NVCOMP_TYPE_CHAR,
|
|
GST_NV_COMP_DATA_TYPE_UCHAR = NVCOMP_TYPE_UCHAR,
|
|
GST_NV_COMP_DATA_TYPE_SHORT = NVCOMP_TYPE_SHORT,
|
|
GST_NV_COMP_DATA_TYPE_USHORT = NVCOMP_TYPE_USHORT,
|
|
GST_NV_COMP_DATA_TYPE_INT = NVCOMP_TYPE_INT,
|
|
GST_NV_COMP_DATA_TYPE_UINT = NVCOMP_TYPE_UINT,
|
|
GST_NV_COMP_DATA_TYPE_LONGLONG = NVCOMP_TYPE_LONGLONG,
|
|
GST_NV_COMP_DATA_TYPE_ULONGLONG = NVCOMP_TYPE_ULONGLONG,
|
|
GST_NV_COMP_DATA_TYPE_BITS = NVCOMP_TYPE_BITS,
|
|
};
|
|
|
|
#define GST_TYPE_NV_COMP_DATA_TYPE (gst_nv_comp_data_type_type())
|
|
static GType
|
|
gst_nv_comp_data_type_type (void)
|
|
{
|
|
static GType data_type = 0;
|
|
static std::once_flag once;
|
|
static const GEnumValue types[] = {
|
|
{GST_NV_COMP_DATA_TYPE_DEFAULT, "Default", "default"},
|
|
{GST_NV_COMP_DATA_TYPE_CHAR, "CHAR", "char"},
|
|
{GST_NV_COMP_DATA_TYPE_UCHAR, "UCHAR", "uchar"},
|
|
{GST_NV_COMP_DATA_TYPE_SHORT, "SHORT", "short"},
|
|
{GST_NV_COMP_DATA_TYPE_USHORT, "USHORT", "ushort"},
|
|
{GST_NV_COMP_DATA_TYPE_INT, "INT", "int"},
|
|
{GST_NV_COMP_DATA_TYPE_UINT, "UINT", "uint"},
|
|
{GST_NV_COMP_DATA_TYPE_LONGLONG, "LONGLONG", "longlong"},
|
|
{GST_NV_COMP_DATA_TYPE_ULONGLONG, "ULONGLONG", "ulonglong"},
|
|
{GST_NV_COMP_DATA_TYPE_BITS, "BITS", "bits"},
|
|
{0, nullptr, nullptr},
|
|
};
|
|
|
|
std::call_once (once,[&] {
|
|
data_type = g_enum_register_static ("GstNvCompDataType", types);
|
|
});
|
|
|
|
return data_type;
|
|
}
|
|
|
|
enum GstNvCompDeflateAlgo
|
|
{
|
|
GST_NV_COMP_DEFLATE_HIGH_THROUGHPUT,
|
|
GST_NV_COMP_DEFLATE_LOW_THROUGHPUT,
|
|
GST_NV_COMP_DEFLATE_HIGHEST_THROUGHPUT,
|
|
};
|
|
|
|
#define GST_TYPE_NV_COMP_DEFLATE_ALGO (gst_nv_comp_deflate_algo_get_type())
|
|
static GType
|
|
gst_nv_comp_deflate_algo_get_type (void)
|
|
{
|
|
static GType algo_type = 0;
|
|
static std::once_flag once;
|
|
static const GEnumValue algo[] = {
|
|
{GST_NV_COMP_DEFLATE_HIGH_THROUGHPUT,
|
|
"High throughput, low compression ratio", "high-throughput"},
|
|
{GST_NV_COMP_DEFLATE_LOW_THROUGHPUT,
|
|
"Low throughput, high compression ratio", "low-throughput"},
|
|
{GST_NV_COMP_DEFLATE_HIGHEST_THROUGHPUT,
|
|
"Highest throughput, entropy-only compression", "highest-throughput"},
|
|
{0, nullptr, nullptr},
|
|
};
|
|
|
|
std::call_once (once,[&] {
|
|
algo_type = g_enum_register_static ("GstNvCompDeflateAlgo", algo);
|
|
});
|
|
|
|
return algo_type;
|
|
}
|
|
|
|
enum GstNvCompBitcompAlgo
|
|
{
|
|
GST_NV_COMP_BITCOMP_DEFAULT,
|
|
GST_NV_COMP_BITCOMP_SPARSE,
|
|
};
|
|
|
|
#define GST_TYPE_NV_COMP_BITCOMP_ALGO (gst_nv_comp_bitcomp_algo_get_type())
|
|
static GType
|
|
gst_nv_comp_bitcomp_algo_get_type (void)
|
|
{
|
|
static GType algo_type = 0;
|
|
static std::once_flag once;
|
|
static const GEnumValue algo[] = {
|
|
{GST_NV_COMP_BITCOMP_DEFAULT, "Default", "default"},
|
|
{GST_NV_COMP_BITCOMP_SPARSE, "Sparse", "sparse"},
|
|
{0, nullptr, nullptr},
|
|
};
|
|
|
|
std::call_once (once,[&] {
|
|
algo_type = g_enum_register_static ("GstNvCompBitcompAlgo", algo);
|
|
});
|
|
|
|
return algo_type;
|
|
}
|
|
|
|
enum
|
|
{
|
|
PROP_0,
|
|
PROP_METHOD,
|
|
PROP_DEFLATE_ALGO,
|
|
PROP_BITCOMP_ALGO,
|
|
PROP_DATA_TYPE,
|
|
PROP_CHUNK_SIZE,
|
|
PROP_ASYNC_DEPTH,
|
|
PROP_BATCHED,
|
|
};
|
|
|
|
#define DEFAULT_METHOD GST_NV_COMP_BITCOMP
|
|
#define DEFAULT_DEFLATE_ALGO GST_NV_COMP_DEFLATE_HIGH_THROUGHPUT
|
|
#define DEFAULT_BITCOMP_ALGO GST_NV_COMP_BITCOMP_SPARSE
|
|
#define DEFAULT_DATA_TYPE GST_NV_COMP_DATA_TYPE_DEFAULT
|
|
#define DEFAULT_CHUNK_SIZE 0
|
|
#define DEFAULT_BATCHED TRUE
|
|
#define DEFAULT_ASYNC_DEPTH 2
|
|
|
|
/* *INDENT-OFF* */
|
|
using namespace nvcomp;
|
|
|
|
struct EncoderTask
|
|
{
|
|
~EncoderTask ()
|
|
{
|
|
if (ctx) {
|
|
gst_cuda_context_push (ctx);
|
|
if (event)
|
|
CuEventDestroy (event);
|
|
if (device_uncompressed)
|
|
CuMemFree ((CUdeviceptr) device_uncompressed);
|
|
if (host_uncompressed)
|
|
CuMemFreeHost (host_uncompressed);
|
|
if (device_compressed)
|
|
CuMemFree ((CUdeviceptr) device_compressed);
|
|
if (host_compressed)
|
|
CuMemFreeHost (host_compressed);
|
|
if (device_uncompressed_bytes)
|
|
CuMemFree ((CUdeviceptr) device_uncompressed_bytes);
|
|
if (device_uncompressed_ptrs)
|
|
CuMemFree ((CUdeviceptr) device_uncompressed_ptrs);
|
|
if (device_compressed_bytes)
|
|
CuMemFree ((CUdeviceptr) device_compressed_bytes);
|
|
if (host_uncompressed_bytes)
|
|
CuMemFreeHost (host_uncompressed_bytes);
|
|
if (host_uncompressed_ptrs)
|
|
CuMemFreeHost (host_uncompressed_ptrs);
|
|
if (device_compressed_ptrs)
|
|
CuMemFree ((CUdeviceptr) device_compressed_ptrs);
|
|
if (host_compressed_bytes)
|
|
CuMemFreeHost (host_compressed_bytes);
|
|
if (host_compressed_ptrs)
|
|
CuMemFreeHost (host_compressed_ptrs);
|
|
if (temp_ptr)
|
|
CuMemFree ((CUdeviceptr) temp_ptr);
|
|
|
|
gst_cuda_context_pop (nullptr);
|
|
gst_object_unref (ctx);
|
|
}
|
|
}
|
|
|
|
GstCudaContext *ctx = nullptr;
|
|
CUevent event = nullptr;
|
|
uint8_t *device_uncompressed = nullptr;
|
|
uint8_t *host_uncompressed = nullptr;
|
|
|
|
uint8_t *device_compressed = nullptr;
|
|
uint8_t *host_compressed = nullptr;
|
|
|
|
size_t *device_uncompressed_bytes = nullptr;
|
|
void **device_uncompressed_ptrs = nullptr;
|
|
|
|
size_t *host_uncompressed_bytes = nullptr;
|
|
void **host_uncompressed_ptrs = nullptr;
|
|
|
|
size_t *device_compressed_bytes = nullptr;
|
|
void **device_compressed_ptrs = nullptr;
|
|
|
|
size_t *host_compressed_bytes = nullptr;
|
|
void **host_compressed_ptrs = nullptr;
|
|
|
|
void *temp_ptr = nullptr;
|
|
size_t temp_size = 0;
|
|
|
|
size_t compressed_size = 0;
|
|
|
|
gboolean batched;
|
|
size_t batch_size;
|
|
size_t chunk_size;
|
|
size_t max_output_chunk_size;
|
|
size_t compressed_alloc_size;
|
|
};
|
|
|
|
struct BatchedCompBase
|
|
{
|
|
virtual nvcompStatus_t get_temp_size(
|
|
size_t batch_size,
|
|
size_t max_uncompressed_chunk_bytes,
|
|
size_t * temp_bytes) = 0;
|
|
|
|
virtual nvcompStatus_t get_max_compressed_chunk_size(
|
|
size_t max_uncompressed_chunk_bytes,
|
|
size_t * max_compressed_bytes) = 0;
|
|
|
|
virtual nvcompStatus_t compress(
|
|
void **device_uncompressed_ptrs,
|
|
size_t *device_uncompressed_bytes,
|
|
size_t max_uncompressed_chunk_bytes,
|
|
size_t batch_size,
|
|
void *device_temp_ptr,
|
|
size_t temp_bytes,
|
|
void **device_compressed_ptrs,
|
|
size_t *device_compressed_bytes,
|
|
cudaStream_t stream) = 0;
|
|
};
|
|
|
|
template <typename FormatOptT, auto T, auto O, auto C>
|
|
class BatchedComp : public BatchedCompBase
|
|
{
|
|
public:
|
|
BatchedComp (const FormatOptT & opt) : opts_(opt) {}
|
|
|
|
nvcompStatus_t get_temp_size(
|
|
size_t batch_size,
|
|
size_t max_uncompressed_chunk_bytes,
|
|
size_t * temp_bytes)
|
|
{
|
|
return T (batch_size, max_uncompressed_chunk_bytes, opts_, temp_bytes);
|
|
}
|
|
|
|
nvcompStatus_t get_max_compressed_chunk_size(
|
|
size_t max_uncompressed_chunk_bytes,
|
|
size_t * max_compressed_bytes)
|
|
{
|
|
return O (max_uncompressed_chunk_bytes, opts_, max_compressed_bytes);
|
|
}
|
|
|
|
nvcompStatus_t compress(
|
|
void **device_uncompressed_ptrs,
|
|
size_t *device_uncompressed_bytes,
|
|
size_t max_uncompressed_chunk_bytes,
|
|
size_t batch_size,
|
|
void *device_temp_ptr,
|
|
size_t temp_bytes,
|
|
void **device_compressed_ptrs,
|
|
size_t *device_compressed_bytes,
|
|
cudaStream_t stream)
|
|
{
|
|
return C (device_uncompressed_ptrs, device_uncompressed_bytes,
|
|
max_uncompressed_chunk_bytes, batch_size, device_temp_ptr, temp_bytes,
|
|
device_compressed_ptrs, device_compressed_bytes, opts_, stream);
|
|
}
|
|
|
|
private:
|
|
FormatOptT opts_;
|
|
};
|
|
|
|
struct GstNvCompVideoEncPrivate
|
|
{
|
|
GstCudaContext *ctx = nullptr;
|
|
GstCudaStream *stream = nullptr;
|
|
|
|
#ifdef HAVE_GST_GL
|
|
GstGLDisplay *gl_display = nullptr;
|
|
GstGLContext *gl_context = nullptr;
|
|
GstGLContext *other_gl_context = nullptr;
|
|
#endif
|
|
|
|
GstBufferPool *pool = nullptr;
|
|
|
|
GstVideoCodecState *state = nullptr;
|
|
std::shared_ptr<nvcompManagerBase> manager;
|
|
std::shared_ptr<CompressionConfig> config;
|
|
std::shared_ptr<BatchedCompBase> batched_comp;
|
|
|
|
gboolean gl_interop = FALSE;
|
|
|
|
std::mutex lock;
|
|
std::mutex input_lock;
|
|
std::condition_variable input_cond;
|
|
std::mutex output_lock;
|
|
std::condition_variable output_cond;
|
|
|
|
std::queue<std::shared_ptr<EncoderTask>> input_task_queue;
|
|
std::queue<std::shared_ptr<EncoderTask>> output_task_queue;
|
|
std::shared_ptr<EncoderTask> cur_task;
|
|
GThread *encode_thread = nullptr;
|
|
std::atomic<GstFlowReturn> last_flow = { GST_FLOW_OK };
|
|
|
|
GstNvCompMethod method = DEFAULT_METHOD;
|
|
GstNvCompDeflateAlgo deflate_algo = DEFAULT_DEFLATE_ALGO;
|
|
GstNvCompBitcompAlgo bitcomp_algo = DEFAULT_BITCOMP_ALGO;
|
|
GstNvCompDataType data_type = DEFAULT_DATA_TYPE;
|
|
guint chunk_size = DEFAULT_CHUNK_SIZE;
|
|
gboolean batched = DEFAULT_BATCHED;
|
|
guint async_depth = DEFAULT_ASYNC_DEPTH;
|
|
};
|
|
/* *INDENT-ON* */
|
|
|
|
struct _GstNvCompVideoEnc
|
|
{
|
|
GstVideoEncoder parent;
|
|
GstNvCompVideoEncPrivate *priv;
|
|
};
|
|
|
|
static void gst_nv_comp_video_enc_finalize (GObject * object);
|
|
static void gst_nv_comp_video_enc_set_property (GObject * object, guint prop_id,
|
|
const GValue * value, GParamSpec * pspec);
|
|
static void gst_nv_comp_video_enc_get_property (GObject * object, guint prop_id,
|
|
GValue * value, GParamSpec * pspec);
|
|
|
|
static void gst_nv_comp_video_enc_set_context (GstElement * element,
|
|
GstContext * context);
|
|
|
|
static gboolean gst_nv_comp_video_enc_open (GstVideoEncoder * encoder);
|
|
static gboolean gst_nv_comp_video_enc_close (GstVideoEncoder * encoder);
|
|
static gboolean gst_nv_comp_video_enc_stop (GstVideoEncoder * encoder);
|
|
static gboolean gst_nv_comp_video_enc_flush (GstVideoEncoder * encoder);
|
|
static GstFlowReturn gst_nv_comp_video_enc_finish (GstVideoEncoder * encoder);
|
|
static gboolean gst_nv_comp_video_enc_sink_query (GstVideoEncoder * encoder,
|
|
GstQuery * query);
|
|
static gboolean gst_nv_comp_video_enc_src_query (GstVideoEncoder * encoder,
|
|
GstQuery * query);
|
|
static gboolean
|
|
gst_nv_comp_video_enc_propose_allocation (GstVideoEncoder * encoder,
|
|
GstQuery * query);
|
|
static gboolean gst_nv_comp_video_enc_set_format (GstVideoEncoder * encoder,
|
|
GstVideoCodecState * state);
|
|
static GstFlowReturn
|
|
gst_nv_comp_video_enc_handle_frame (GstVideoEncoder * encoder,
|
|
GstVideoCodecFrame * frame);
|
|
|
|
#define gst_nv_comp_video_enc_parent_class parent_class
|
|
G_DEFINE_TYPE (GstNvCompVideoEnc,
|
|
gst_nv_comp_video_enc, GST_TYPE_VIDEO_ENCODER);
|
|
|
|
static void
|
|
gst_nv_comp_video_enc_class_init (GstNvCompVideoEncClass * klass)
|
|
{
|
|
auto object_class = G_OBJECT_CLASS (klass);
|
|
auto element_class = GST_ELEMENT_CLASS (klass);
|
|
auto encoder_class = GST_VIDEO_ENCODER_CLASS (klass);
|
|
|
|
object_class->finalize = gst_nv_comp_video_enc_finalize;
|
|
object_class->set_property = gst_nv_comp_video_enc_set_property;
|
|
object_class->get_property = gst_nv_comp_video_enc_get_property;
|
|
|
|
g_object_class_install_property (object_class, PROP_METHOD,
|
|
g_param_spec_enum ("method", "Method",
|
|
"Compression method",
|
|
GST_TYPE_NV_COMP_METHOD, DEFAULT_METHOD,
|
|
(GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)));
|
|
g_object_class_install_property (object_class, PROP_DEFLATE_ALGO,
|
|
g_param_spec_enum ("deflate-algo", "Deflate Algo",
|
|
"Algorithm to use for deflate and gdeflate methods",
|
|
GST_TYPE_NV_COMP_DEFLATE_ALGO, DEFAULT_DEFLATE_ALGO,
|
|
(GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)));
|
|
g_object_class_install_property (object_class, PROP_BITCOMP_ALGO,
|
|
g_param_spec_enum ("bitcomp-algo", "Bitcomp Algo",
|
|
"Algorithm to use for bitcomp method",
|
|
GST_TYPE_NV_COMP_BITCOMP_ALGO, DEFAULT_BITCOMP_ALGO,
|
|
(GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)));
|
|
g_object_class_install_property (object_class, PROP_DATA_TYPE,
|
|
g_param_spec_enum ("data-type", "Data Type",
|
|
"Compression data type",
|
|
GST_TYPE_NV_COMP_DATA_TYPE, DEFAULT_DATA_TYPE,
|
|
(GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)));
|
|
g_object_class_install_property (object_class, PROP_CHUNK_SIZE,
|
|
g_param_spec_uint ("chunk-size", "Chunk Size",
|
|
"Uncompressed chunk size for batched compression (0 = default)",
|
|
0, G_MAXINT32, DEFAULT_CHUNK_SIZE,
|
|
(GParamFlags) (G_PARAM_READWRITE | GST_PARAM_MUTABLE_READY |
|
|
G_PARAM_STATIC_STRINGS)));
|
|
g_object_class_install_property (object_class, PROP_BATCHED,
|
|
g_param_spec_boolean ("batched", "Batched",
|
|
"Use low-level C API for batched operation", DEFAULT_BATCHED,
|
|
(GParamFlags) (G_PARAM_READWRITE | GST_PARAM_MUTABLE_READY |
|
|
G_PARAM_STATIC_STRINGS)));
|
|
g_object_class_install_property (object_class, PROP_ASYNC_DEPTH,
|
|
g_param_spec_uint ("async-depth", "Async Depth",
|
|
"Internal resource pool size for threaded encoding",
|
|
1, 4, DEFAULT_ASYNC_DEPTH,
|
|
(GParamFlags) (G_PARAM_READWRITE | GST_PARAM_MUTABLE_READY |
|
|
G_PARAM_STATIC_STRINGS)));
|
|
|
|
gst_element_class_add_static_pad_template (element_class, &sink_template);
|
|
gst_element_class_add_static_pad_template (element_class, &src_template);
|
|
|
|
gst_element_class_set_static_metadata (element_class,
|
|
"nvCOMP Video Encoder", "Encoder/Video/Hardware",
|
|
"Lossless video compression element based on nvCOMP library",
|
|
"Seungha Yang <seungha@centricular.com>");
|
|
|
|
element_class->set_context =
|
|
GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_set_context);
|
|
|
|
encoder_class->open = GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_open);
|
|
encoder_class->close = GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_close);
|
|
encoder_class->stop = GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_stop);
|
|
encoder_class->flush = GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_flush);
|
|
encoder_class->finish = GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_finish);
|
|
encoder_class->sink_query =
|
|
GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_sink_query);
|
|
encoder_class->src_query =
|
|
GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_src_query);
|
|
encoder_class->propose_allocation =
|
|
GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_propose_allocation);
|
|
encoder_class->set_format =
|
|
GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_set_format);
|
|
encoder_class->handle_frame =
|
|
GST_DEBUG_FUNCPTR (gst_nv_comp_video_enc_handle_frame);
|
|
|
|
GST_DEBUG_CATEGORY_INIT (gst_nv_comp_video_enc_debug,
|
|
"nvcompvideoenc", 0, "nvcompvideoenc");
|
|
}
|
|
|
|
static void
|
|
gst_nv_comp_video_enc_init (GstNvCompVideoEnc * self)
|
|
{
|
|
self->priv = new GstNvCompVideoEncPrivate ();
|
|
}
|
|
|
|
static void
|
|
gst_nv_comp_video_enc_finalize (GObject * object)
|
|
{
|
|
auto self = GST_NV_COMP_VIDEO_ENC (object);
|
|
|
|
delete self->priv;
|
|
|
|
G_OBJECT_CLASS (parent_class)->finalize (object);
|
|
}
|
|
|
|
static void
|
|
gst_nv_comp_video_enc_set_property (GObject * object, guint prop_id,
|
|
const GValue * value, GParamSpec * pspec)
|
|
{
|
|
auto self = GST_NV_COMP_VIDEO_ENC (object);
|
|
auto priv = self->priv;
|
|
|
|
std::lock_guard < std::mutex > lk (priv->lock);
|
|
|
|
switch (prop_id) {
|
|
case PROP_METHOD:
|
|
priv->method = (GstNvCompMethod) g_value_get_enum (value);
|
|
break;
|
|
case PROP_DEFLATE_ALGO:
|
|
priv->deflate_algo = (GstNvCompDeflateAlgo) g_value_get_enum (value);
|
|
break;
|
|
case PROP_BITCOMP_ALGO:
|
|
priv->bitcomp_algo = (GstNvCompBitcompAlgo) g_value_get_enum (value);
|
|
break;
|
|
case PROP_DATA_TYPE:
|
|
priv->data_type = (GstNvCompDataType) g_value_get_enum (value);
|
|
break;
|
|
case PROP_CHUNK_SIZE:
|
|
priv->chunk_size = g_value_get_uint (value);
|
|
break;
|
|
case PROP_BATCHED:
|
|
priv->batched = g_value_get_boolean (value);
|
|
break;
|
|
case PROP_ASYNC_DEPTH:
|
|
priv->async_depth = g_value_get_uint (value);
|
|
break;
|
|
default:
|
|
G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
|
|
break;
|
|
}
|
|
}
|
|
|
|
static void
|
|
gst_nv_comp_video_enc_get_property (GObject * object, guint prop_id,
|
|
GValue * value, GParamSpec * pspec)
|
|
{
|
|
auto self = GST_NV_COMP_VIDEO_ENC (object);
|
|
auto priv = self->priv;
|
|
|
|
std::lock_guard < std::mutex > lk (priv->lock);
|
|
|
|
switch (prop_id) {
|
|
case PROP_METHOD:
|
|
g_value_set_enum (value, priv->method);
|
|
break;
|
|
case PROP_DEFLATE_ALGO:
|
|
g_value_set_enum (value, priv->deflate_algo);
|
|
break;
|
|
case PROP_BITCOMP_ALGO:
|
|
g_value_set_enum (value, priv->bitcomp_algo);
|
|
break;
|
|
case PROP_DATA_TYPE:
|
|
g_value_set_enum (value, priv->data_type);
|
|
break;
|
|
case PROP_CHUNK_SIZE:
|
|
g_value_set_uint (value, priv->chunk_size);
|
|
break;
|
|
case PROP_BATCHED:
|
|
g_value_set_boolean (value, priv->batched);
|
|
break;
|
|
case PROP_ASYNC_DEPTH:
|
|
g_value_set_uint (value, priv->async_depth);
|
|
break;
|
|
default:
|
|
G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
|
|
break;
|
|
}
|
|
}
|
|
|
|
static void
|
|
gst_nv_comp_video_enc_set_context (GstElement * element, GstContext * context)
|
|
{
|
|
auto self = GST_NV_COMP_VIDEO_ENC (element);
|
|
auto priv = self->priv;
|
|
|
|
gst_cuda_handle_set_context (element, context, -1, &priv->ctx);
|
|
#ifdef HAVE_GST_GL
|
|
if (gst_gl_handle_set_context (element, context, &priv->gl_display,
|
|
&priv->other_gl_context)) {
|
|
if (priv->gl_display)
|
|
gst_gl_display_filter_gl_api (priv->gl_display, GST_GL_API_OPENGL3);
|
|
}
|
|
#endif
|
|
|
|
GST_ELEMENT_CLASS (parent_class)->set_context (element, context);
|
|
}
|
|
|
|
static gboolean
|
|
gst_nv_comp_video_enc_open (GstVideoEncoder * encoder)
|
|
{
|
|
auto self = GST_NV_COMP_VIDEO_ENC (encoder);
|
|
auto priv = self->priv;
|
|
|
|
if (!gst_cuda_ensure_element_context (GST_ELEMENT_CAST (encoder),
|
|
-1, &priv->ctx)) {
|
|
GST_ERROR_OBJECT (self, "Couldn't get cuda context");
|
|
return FALSE;
|
|
}
|
|
|
|
priv->stream = gst_cuda_stream_new (priv->ctx);
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
static gboolean
|
|
gst_nv_comp_video_enc_close (GstVideoEncoder * encoder)
|
|
{
|
|
auto self = GST_NV_COMP_VIDEO_ENC (encoder);
|
|
auto priv = self->priv;
|
|
|
|
gst_clear_cuda_stream (&priv->stream);
|
|
gst_clear_object (&priv->ctx);
|
|
|
|
#ifdef HAVE_GST_GL
|
|
gst_clear_object (&priv->other_gl_context);
|
|
gst_clear_object (&priv->gl_context);
|
|
gst_clear_object (&priv->gl_context);
|
|
#endif
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
static void
|
|
gst_nv_comp_video_enc_drain (GstNvCompVideoEnc * self, gboolean locked)
|
|
{
|
|
auto priv = self->priv;
|
|
if (!priv->encode_thread)
|
|
return;
|
|
|
|
if (locked)
|
|
GST_VIDEO_ENCODER_STREAM_UNLOCK (self);
|
|
|
|
{
|
|
std::lock_guard < std::mutex > lk (priv->output_lock);
|
|
priv->output_task_queue.push (nullptr);
|
|
priv->output_cond.notify_all ();
|
|
}
|
|
|
|
g_clear_pointer (&priv->encode_thread, g_thread_join);
|
|
|
|
if (locked)
|
|
GST_VIDEO_ENCODER_STREAM_LOCK (self);
|
|
|
|
priv->last_flow = GST_FLOW_OK;
|
|
}
|
|
|
|
static gboolean
|
|
gst_nv_comp_video_enc_stop (GstVideoEncoder * encoder)
|
|
{
|
|
auto self = GST_NV_COMP_VIDEO_ENC (encoder);
|
|
auto priv = self->priv;
|
|
|
|
gst_nv_comp_video_enc_drain (self, FALSE);
|
|
|
|
if (priv->ctx) {
|
|
gst_cuda_context_push (priv->ctx);
|
|
priv->manager = nullptr;
|
|
priv->cur_task = nullptr;
|
|
priv->input_task_queue = { };
|
|
priv->output_task_queue = { };
|
|
|
|
gst_cuda_context_pop (nullptr);
|
|
}
|
|
|
|
g_clear_pointer (&priv->state, gst_video_codec_state_unref);
|
|
|
|
if (priv->pool) {
|
|
gst_buffer_pool_set_active (priv->pool, FALSE);
|
|
gst_clear_object (&priv->pool);
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
static gboolean
|
|
gst_nv_comp_video_enc_flush (GstVideoEncoder * encoder)
|
|
{
|
|
auto self = GST_NV_COMP_VIDEO_ENC (encoder);
|
|
|
|
gst_nv_comp_video_enc_drain (self, TRUE);
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
static GstFlowReturn
|
|
gst_nv_comp_video_enc_finish (GstVideoEncoder * encoder)
|
|
{
|
|
auto self = GST_NV_COMP_VIDEO_ENC (encoder);
|
|
|
|
gst_nv_comp_video_enc_drain (self, TRUE);
|
|
|
|
return GST_FLOW_OK;
|
|
}
|
|
|
|
static gboolean
|
|
gst_nv_comp_video_enc_handle_context_query (GstNvCompVideoEnc * self,
|
|
GstQuery * query)
|
|
{
|
|
auto priv = self->priv;
|
|
|
|
#ifdef HAVE_GST_GL
|
|
{
|
|
GstGLDisplay *display = nullptr;
|
|
GstGLContext *other = nullptr;
|
|
GstGLContext *local = nullptr;
|
|
|
|
if (priv->gl_display)
|
|
display = (GstGLDisplay *) gst_object_ref (priv->gl_display);
|
|
if (priv->gl_context)
|
|
local = (GstGLContext *) gst_object_ref (priv->gl_context);
|
|
if (priv->other_gl_context)
|
|
other = (GstGLContext *) gst_object_ref (priv->other_gl_context);
|
|
|
|
auto ret = gst_gl_handle_context_query (GST_ELEMENT (self), query,
|
|
display, local, other);
|
|
gst_clear_object (&display);
|
|
gst_clear_object (&other);
|
|
gst_clear_object (&local);
|
|
|
|
if (ret)
|
|
return TRUE;
|
|
}
|
|
#endif
|
|
|
|
if (gst_cuda_handle_context_query (GST_ELEMENT (self), query, priv->ctx))
|
|
return TRUE;
|
|
|
|
return FALSE;
|
|
}
|
|
|
|
static gboolean
|
|
gst_nv_comp_video_enc_sink_query (GstVideoEncoder * encoder, GstQuery * query)
|
|
{
|
|
auto self = GST_NV_COMP_VIDEO_ENC (encoder);
|
|
|
|
switch (GST_QUERY_TYPE (query)) {
|
|
case GST_QUERY_CONTEXT:
|
|
if (gst_nv_comp_video_enc_handle_context_query (self, query))
|
|
return TRUE;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return GST_VIDEO_ENCODER_CLASS (parent_class)->sink_query (encoder, query);
|
|
}
|
|
|
|
static gboolean
|
|
gst_nv_comp_video_enc_src_query (GstVideoEncoder * encoder, GstQuery * query)
|
|
{
|
|
auto self = GST_NV_COMP_VIDEO_ENC (encoder);
|
|
|
|
switch (GST_QUERY_TYPE (query)) {
|
|
case GST_QUERY_CONTEXT:
|
|
if (gst_nv_comp_video_enc_handle_context_query (self, query))
|
|
return TRUE;
|
|
break;
|
|
default:
|
|
break;
|
|
}
|
|
|
|
return GST_VIDEO_ENCODER_CLASS (parent_class)->src_query (encoder, query);
|
|
}
|
|
|
|
#ifdef HAVE_GST_GL
|
|
static void
|
|
check_cuda_device_from_gl_context (GstGLContext * context, gboolean * ret)
|
|
{
|
|
guint device_count = 0;
|
|
CUdevice device_list[1] = { 0, };
|
|
CUresult cuda_ret;
|
|
|
|
*ret = FALSE;
|
|
cuda_ret = CuGLGetDevices (&device_count,
|
|
device_list, 1, CU_GL_DEVICE_LIST_ALL);
|
|
|
|
if (!gst_cuda_result (cuda_ret) || device_count == 0)
|
|
return;
|
|
|
|
*ret = TRUE;
|
|
}
|
|
|
|
static gboolean
|
|
gst_nv_comp_video_enc_ensure_gl_context (GstNvCompVideoEnc * self)
|
|
{
|
|
auto priv = self->priv;
|
|
gboolean ret = FALSE;
|
|
|
|
if (!gst_gl_ensure_element_data (GST_ELEMENT (self), &priv->gl_display,
|
|
&priv->other_gl_context)) {
|
|
GST_DEBUG_OBJECT (self, "Couldn't get GL display");
|
|
return FALSE;
|
|
}
|
|
|
|
gst_gl_display_filter_gl_api (priv->gl_display, GST_GL_API_OPENGL3);
|
|
|
|
if (!gst_gl_display_ensure_context (priv->gl_display, priv->other_gl_context,
|
|
&priv->gl_context, nullptr)) {
|
|
GST_DEBUG_OBJECT (self, "Couldn't get GL context");
|
|
return FALSE;
|
|
}
|
|
|
|
gst_gl_context_thread_add (priv->gl_context,
|
|
(GstGLContextThreadFunc) check_cuda_device_from_gl_context, &ret);
|
|
|
|
return ret;
|
|
}
|
|
#endif
|
|
|
|
static gboolean
|
|
gst_nv_comp_video_enc_propose_allocation (GstVideoEncoder * encoder,
|
|
GstQuery * query)
|
|
{
|
|
auto self = GST_NV_COMP_VIDEO_ENC (encoder);
|
|
auto priv = self->priv;
|
|
GstBufferPool *pool = nullptr;
|
|
guint size;
|
|
|
|
GstCaps *caps;
|
|
gst_query_parse_allocation (query, &caps, nullptr);
|
|
if (!caps) {
|
|
GST_WARNING_OBJECT (self, "null caps in query");
|
|
return FALSE;
|
|
}
|
|
|
|
GstVideoInfo info;
|
|
if (!gst_video_info_from_caps (&info, caps)) {
|
|
GST_WARNING_OBJECT (self, "Failed to convert caps into info");
|
|
return FALSE;
|
|
}
|
|
|
|
auto features = gst_caps_get_features (caps, 0);
|
|
gboolean use_cuda_pool = FALSE;
|
|
if (gst_caps_features_contains (features,
|
|
GST_CAPS_FEATURE_MEMORY_CUDA_MEMORY)) {
|
|
GST_DEBUG_OBJECT (self, "upstream support CUDA memory");
|
|
pool = gst_cuda_buffer_pool_new (priv->ctx);
|
|
use_cuda_pool = TRUE;
|
|
}
|
|
#ifdef HAVE_GST_GL
|
|
else if (gst_caps_features_contains (features,
|
|
GST_CAPS_FEATURE_MEMORY_GL_MEMORY)) {
|
|
if (!gst_nv_comp_video_enc_ensure_gl_context (self)) {
|
|
priv->gl_interop = FALSE;
|
|
} else {
|
|
pool = gst_gl_buffer_pool_new (priv->gl_context);
|
|
}
|
|
}
|
|
#endif
|
|
|
|
if (!pool)
|
|
pool = gst_video_buffer_pool_new ();
|
|
|
|
auto config = gst_buffer_pool_get_config (pool);
|
|
gst_buffer_pool_config_add_option (config, GST_BUFFER_POOL_OPTION_VIDEO_META);
|
|
|
|
size = GST_VIDEO_INFO_SIZE (&info);
|
|
gst_buffer_pool_config_set_params (config, caps, size, 0, 0);
|
|
if (use_cuda_pool && priv->stream) {
|
|
/* Set our stream on buffer pool config so that CUstream can be shared */
|
|
gst_buffer_pool_config_set_cuda_stream (config, priv->stream);
|
|
}
|
|
|
|
if (!gst_buffer_pool_set_config (pool, config)) {
|
|
GST_WARNING_OBJECT (self, "Failed to set pool config");
|
|
gst_object_unref (pool);
|
|
return FALSE;
|
|
}
|
|
|
|
config = gst_buffer_pool_get_config (pool);
|
|
gst_buffer_pool_config_get_params (config, nullptr, &size, nullptr, nullptr);
|
|
gst_structure_free (config);
|
|
|
|
gst_query_add_allocation_pool (query, pool, size, 0, 0);
|
|
gst_query_add_allocation_meta (query, GST_VIDEO_META_API_TYPE, nullptr);
|
|
gst_object_unref (pool);
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
static gboolean
|
|
gst_nv_comp_video_enc_alloc_task (GstNvCompVideoEnc * self, EncoderTask * task,
|
|
gboolean batched, size_t uncompressed_size, size_t compressed_size,
|
|
size_t batch_size, size_t chunk_size, size_t output_chunk_size,
|
|
size_t temp_size)
|
|
{
|
|
size_t alloc_size = sizeof (size_t) * batch_size;
|
|
uint8_t *uncomp_data;
|
|
uint8_t *comp_data;
|
|
|
|
auto ret = CuEventCreate (&task->event,
|
|
CU_EVENT_BLOCKING_SYNC | CU_EVENT_DISABLE_TIMING);
|
|
if (!gst_cuda_result (ret))
|
|
return FALSE;
|
|
|
|
auto aligned_uncompressed_size = uncompressed_size;
|
|
ret = CuMemAlloc ((CUdeviceptr *) & task->device_uncompressed,
|
|
aligned_uncompressed_size);
|
|
if (!gst_cuda_result (ret))
|
|
return FALSE;
|
|
|
|
ret = CuMemAllocHost ((void **) &task->host_uncompressed,
|
|
aligned_uncompressed_size);
|
|
if (!gst_cuda_result (ret))
|
|
return FALSE;
|
|
|
|
auto aligned_compressed_size = GST_ROUND_UP_8 (compressed_size);
|
|
ret = CuMemAlloc ((CUdeviceptr *) & task->device_compressed,
|
|
aligned_compressed_size);
|
|
if (!gst_cuda_result (ret))
|
|
return FALSE;
|
|
|
|
ret = CuMemAllocHost ((void **) &task->host_compressed,
|
|
aligned_compressed_size);
|
|
if (!gst_cuda_result (ret))
|
|
return FALSE;
|
|
|
|
if (!batched)
|
|
return TRUE;
|
|
|
|
ret = CuMemAllocHost ((void **) &task->host_uncompressed_bytes, alloc_size);
|
|
if (!gst_cuda_result (ret))
|
|
return FALSE;
|
|
|
|
ret = CuMemAllocHost ((void **) &task->host_uncompressed_ptrs, alloc_size);
|
|
if (!gst_cuda_result (ret))
|
|
return FALSE;
|
|
|
|
for (size_t i = 0; i < batch_size; i++) {
|
|
if (i + 1 < batch_size)
|
|
task->host_uncompressed_bytes[i] = chunk_size;
|
|
else
|
|
task->host_uncompressed_bytes[i] = (uncompressed_size - (chunk_size * i));
|
|
}
|
|
|
|
ret = CuMemAlloc ((CUdeviceptr *) & task->device_uncompressed_bytes,
|
|
alloc_size);
|
|
if (!gst_cuda_result (ret))
|
|
return FALSE;
|
|
|
|
ret = CuMemAlloc ((CUdeviceptr *) & task->device_uncompressed_ptrs,
|
|
alloc_size);
|
|
if (!gst_cuda_result (ret))
|
|
return FALSE;
|
|
|
|
ret = CuMemAlloc ((CUdeviceptr *) & task->device_compressed_bytes,
|
|
alloc_size);
|
|
if (!gst_cuda_result (ret))
|
|
return FALSE;
|
|
|
|
ret = CuMemAlloc ((CUdeviceptr *) & task->device_compressed_ptrs, alloc_size);
|
|
if (!gst_cuda_result (ret))
|
|
return FALSE;
|
|
|
|
ret = CuMemAllocHost ((void **) &task->host_compressed_bytes, alloc_size);
|
|
if (!gst_cuda_result (ret))
|
|
return FALSE;
|
|
|
|
ret = CuMemAllocHost ((void **) &task->host_compressed_ptrs, alloc_size);
|
|
if (!gst_cuda_result (ret))
|
|
return FALSE;
|
|
|
|
if (temp_size > 0) {
|
|
ret = CuMemAlloc ((CUdeviceptr *) & task->temp_ptr, temp_size);
|
|
if (!gst_cuda_result (ret))
|
|
return FALSE;
|
|
}
|
|
|
|
task->temp_size = temp_size;
|
|
|
|
uncomp_data = task->device_uncompressed;
|
|
comp_data = task->device_compressed;
|
|
for (size_t i = 0; i < batch_size; i++) {
|
|
task->host_uncompressed_ptrs[i] = uncomp_data;
|
|
uncomp_data += chunk_size;
|
|
|
|
task->host_compressed_ptrs[i] = comp_data;
|
|
comp_data += output_chunk_size;
|
|
}
|
|
|
|
ret = CuMemcpyHtoD ((CUdeviceptr) task->device_uncompressed_bytes,
|
|
task->host_uncompressed_bytes, alloc_size);
|
|
if (!gst_cuda_result (ret))
|
|
return FALSE;
|
|
|
|
ret = CuMemcpyHtoD ((CUdeviceptr) task->device_uncompressed_ptrs,
|
|
task->host_uncompressed_ptrs, alloc_size);
|
|
if (!gst_cuda_result (ret))
|
|
return FALSE;
|
|
|
|
ret = CuMemcpyHtoD ((CUdeviceptr) task->device_compressed_ptrs,
|
|
task->host_compressed_ptrs, alloc_size);
|
|
if (!gst_cuda_result (ret))
|
|
return FALSE;
|
|
|
|
task->batched = batched;
|
|
task->batch_size = batch_size;
|
|
task->chunk_size = chunk_size;
|
|
task->max_output_chunk_size = output_chunk_size;
|
|
task->compressed_alloc_size = aligned_compressed_size;
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
static gboolean
|
|
gst_nv_comp_video_enc_set_format (GstVideoEncoder * encoder,
|
|
GstVideoCodecState * state)
|
|
{
|
|
auto self = GST_NV_COMP_VIDEO_ENC (encoder);
|
|
auto priv = self->priv;
|
|
|
|
gst_nv_comp_video_enc_drain (self, TRUE);
|
|
|
|
std::lock_guard < std::mutex > lk (priv->lock);
|
|
|
|
if (!priv->ctx) {
|
|
GST_ERROR_OBJECT (self, "CUDA context was not configured");
|
|
return FALSE;
|
|
}
|
|
|
|
if (priv->pool) {
|
|
gst_buffer_pool_set_active (priv->pool, FALSE);
|
|
gst_clear_object (&priv->pool);
|
|
}
|
|
|
|
g_clear_pointer (&priv->state, gst_video_codec_state_unref);
|
|
priv->state = gst_video_codec_state_ref (state);
|
|
|
|
std::string mime_type = "video/x-nvcomp";
|
|
|
|
if (!gst_cuda_context_push (priv->ctx)) {
|
|
GST_ERROR_OBJECT (self, "Couldn't push context");
|
|
return FALSE;
|
|
}
|
|
|
|
priv->gl_interop = FALSE;
|
|
#if HAVE_GST_GL
|
|
auto features = gst_caps_get_features (state->caps, 0);
|
|
if (gst_caps_features_contains (features, GST_CAPS_FEATURE_MEMORY_GL_MEMORY))
|
|
priv->gl_interop = TRUE;
|
|
#endif
|
|
|
|
priv->manager = nullptr;
|
|
priv->config = nullptr;
|
|
priv->batched_comp = nullptr;
|
|
priv->input_task_queue = { };
|
|
priv->output_task_queue = { };
|
|
auto stream = (cudaStream_t) gst_cuda_stream_get_handle (priv->stream);
|
|
guint device_id = 0;
|
|
g_object_get (priv->ctx, "cuda-device-id", &device_id, nullptr);
|
|
size_t chunk_size = priv->chunk_size;
|
|
size_t batch_size = 0;
|
|
|
|
switch (priv->method) {
|
|
case GST_NV_COMP_LZ4:
|
|
{
|
|
nvcompBatchedLZ4Opts_t opts = nvcompBatchedLZ4DefaultOpts;
|
|
if (priv->data_type != GST_NV_COMP_DATA_TYPE_DEFAULT)
|
|
opts.data_type = (nvcompType_t) priv->data_type;
|
|
|
|
if (chunk_size == 0)
|
|
chunk_size = 65536;
|
|
|
|
chunk_size = MAX (32768, chunk_size);
|
|
chunk_size = GST_ROUND_UP_8 (chunk_size);
|
|
chunk_size = MIN (chunk_size, nvcompLZ4CompressionMaxAllowedChunkSize);
|
|
|
|
if (priv->batched) {
|
|
batch_size = (state->info.size + chunk_size - 1) / chunk_size;
|
|
|
|
priv->batched_comp =
|
|
std::make_shared < BatchedComp < nvcompBatchedLZ4Opts_t,
|
|
nvcompBatchedLZ4CompressGetTempSize,
|
|
nvcompBatchedLZ4CompressGetMaxOutputChunkSize,
|
|
nvcompBatchedLZ4CompressAsync >> (opts);
|
|
mime_type = "video/x-nvcomp-lz4";
|
|
} else {
|
|
priv->manager = std::make_shared < LZ4Manager > (chunk_size,
|
|
opts, stream, device_id);
|
|
}
|
|
GST_DEBUG_OBJECT (self, "Using LZ4");
|
|
break;
|
|
}
|
|
case GST_NV_COMP_SNAPPY:
|
|
{
|
|
if (chunk_size == 0)
|
|
chunk_size = 65536;
|
|
|
|
chunk_size = MAX (32768, chunk_size);
|
|
chunk_size = GST_ROUND_UP_8 (chunk_size);
|
|
chunk_size = MIN (chunk_size, nvcompSnappyCompressionMaxAllowedChunkSize);
|
|
|
|
if (priv->batched) {
|
|
batch_size = (state->info.size + chunk_size - 1) / chunk_size;
|
|
|
|
priv->batched_comp =
|
|
std::make_shared < BatchedComp < nvcompBatchedSnappyOpts_t,
|
|
nvcompBatchedSnappyCompressGetTempSize,
|
|
nvcompBatchedSnappyCompressGetMaxOutputChunkSize,
|
|
nvcompBatchedSnappyCompressAsync >>
|
|
(nvcompBatchedSnappyDefaultOpts);
|
|
mime_type = "video/x-nvcomp-snappy";
|
|
} else {
|
|
priv->manager = std::make_shared < SnappyManager > (chunk_size,
|
|
nvcompBatchedSnappyDefaultOpts, stream, device_id);
|
|
}
|
|
GST_DEBUG_OBJECT (self, "Using SNAPPY");
|
|
break;
|
|
}
|
|
case GST_NV_COMP_GDEFLATE:
|
|
{
|
|
nvcompBatchedGdeflateOpts_t opts;
|
|
opts.algo = (int) priv->deflate_algo;
|
|
|
|
if (chunk_size == 0)
|
|
chunk_size = 65536;
|
|
|
|
chunk_size = MAX (32768, chunk_size);
|
|
chunk_size = GST_ROUND_UP_8 (chunk_size);
|
|
chunk_size = MIN (chunk_size,
|
|
nvcompGdeflateCompressionMaxAllowedChunkSize);
|
|
|
|
|
|
if (priv->batched) {
|
|
batch_size = (state->info.size + chunk_size - 1) / chunk_size;
|
|
|
|
priv->batched_comp =
|
|
std::make_shared < BatchedComp < nvcompBatchedGdeflateOpts_t,
|
|
nvcompBatchedGdeflateCompressGetTempSize,
|
|
nvcompBatchedGdeflateCompressGetMaxOutputChunkSize,
|
|
nvcompBatchedGdeflateCompressAsync >> (opts);
|
|
mime_type = "video/x-nvcomp-gdeflate";
|
|
} else {
|
|
priv->manager = std::make_shared < GdeflateManager > (chunk_size,
|
|
opts, stream, device_id);
|
|
}
|
|
GST_DEBUG_OBJECT (self, "Using GDEFLATE");
|
|
break;
|
|
}
|
|
case GST_NV_COMP_DEFLATE:
|
|
{
|
|
if (chunk_size == 0)
|
|
chunk_size = 65536;
|
|
|
|
chunk_size = MAX (32768, chunk_size);
|
|
chunk_size = GST_ROUND_UP_8 (chunk_size);
|
|
chunk_size = MIN (chunk_size,
|
|
nvcompDeflateCompressionMaxAllowedChunkSize);
|
|
|
|
nvcompBatchedDeflateOpts_t opts;
|
|
opts.algo = (int) priv->deflate_algo;
|
|
if (priv->batched) {
|
|
batch_size = (state->info.size + chunk_size - 1) / chunk_size;
|
|
|
|
priv->batched_comp =
|
|
std::make_shared < BatchedComp < nvcompBatchedDeflateOpts_t,
|
|
nvcompBatchedDeflateCompressGetTempSize,
|
|
nvcompBatchedDeflateCompressGetMaxOutputChunkSize,
|
|
nvcompBatchedDeflateCompressAsync >> (opts);
|
|
mime_type = "video/x-nvcomp-deflate";
|
|
} else {
|
|
priv->manager = std::make_shared < DeflateManager > (chunk_size,
|
|
opts, stream, device_id);
|
|
}
|
|
GST_DEBUG_OBJECT (self, "Using DEFLATE");
|
|
break;
|
|
}
|
|
case GST_NV_COMP_ZSTD:
|
|
{
|
|
if (chunk_size == 0)
|
|
chunk_size = 65536;
|
|
|
|
chunk_size = MAX (32768, chunk_size);
|
|
chunk_size = GST_ROUND_UP_8 (chunk_size);
|
|
chunk_size = MIN (chunk_size, nvcompZstdCompressionMaxAllowedChunkSize);
|
|
|
|
if (priv->batched) {
|
|
batch_size = (state->info.size + chunk_size - 1) / chunk_size;
|
|
|
|
priv->batched_comp =
|
|
std::make_shared < BatchedComp < nvcompBatchedZstdOpts_t,
|
|
nvcompBatchedZstdCompressGetTempSize,
|
|
nvcompBatchedZstdCompressGetMaxOutputChunkSize,
|
|
nvcompBatchedZstdCompressAsync >> (nvcompBatchedZstdDefaultOpts);
|
|
mime_type = "video/x-nvcomp-zstd";
|
|
} else {
|
|
priv->manager = std::make_shared < ZstdManager > (chunk_size,
|
|
nvcompBatchedZstdDefaultOpts, stream, device_id);
|
|
}
|
|
GST_DEBUG_OBJECT (self, "Using ZSTD");
|
|
break;
|
|
}
|
|
case GST_NV_COMP_CASCADED:
|
|
{
|
|
if (chunk_size == 0)
|
|
chunk_size = 4096;
|
|
|
|
chunk_size = MAX (512, chunk_size);
|
|
chunk_size = GST_ROUND_UP_8 (chunk_size);
|
|
chunk_size = MIN (chunk_size, 16384);
|
|
|
|
nvcompBatchedCascadedOpts_t opts = nvcompBatchedCascadedDefaultOpts;
|
|
opts.chunk_size = chunk_size;
|
|
if (priv->data_type != GST_NV_COMP_DATA_TYPE_DEFAULT)
|
|
opts.type = (nvcompType_t) priv->data_type;
|
|
|
|
if (priv->batched) {
|
|
batch_size = (state->info.size + chunk_size - 1) / chunk_size;
|
|
|
|
priv->batched_comp =
|
|
std::make_shared < BatchedComp < nvcompBatchedCascadedOpts_t,
|
|
nvcompBatchedCascadedCompressGetTempSize,
|
|
nvcompBatchedCascadedCompressGetMaxOutputChunkSize,
|
|
nvcompBatchedCascadedCompressAsync >> (opts);
|
|
mime_type = "video/x-nvcomp-cascaded";
|
|
} else {
|
|
priv->manager = std::make_shared < CascadedManager > (chunk_size,
|
|
opts, stream, device_id);
|
|
}
|
|
GST_DEBUG_OBJECT (self, "Using CASCADED");
|
|
break;
|
|
}
|
|
case GST_NV_COMP_BITCOMP:
|
|
{
|
|
if (chunk_size == 0)
|
|
chunk_size = 65536;
|
|
|
|
chunk_size = MAX (32768, chunk_size);
|
|
chunk_size = GST_ROUND_UP_8 (chunk_size);
|
|
chunk_size = MIN (chunk_size,
|
|
nvcompBitcompCompressionMaxAllowedChunkSize);
|
|
|
|
nvcompBatchedBitcompFormatOpts opts = nvcompBatchedBitcompDefaultOpts;
|
|
opts.algorithm_type = (int) priv->bitcomp_algo;
|
|
if (priv->data_type != GST_NV_COMP_DATA_TYPE_DEFAULT)
|
|
opts.data_type = (nvcompType_t) priv->data_type;
|
|
|
|
if (priv->batched) {
|
|
batch_size = (state->info.size + chunk_size - 1) / chunk_size;
|
|
|
|
priv->batched_comp =
|
|
std::make_shared < BatchedComp < nvcompBatchedBitcompFormatOpts,
|
|
nvcompBatchedBitcompCompressGetTempSize,
|
|
nvcompBatchedBitcompCompressGetMaxOutputChunkSize,
|
|
nvcompBatchedBitcompCompressAsync >> (opts);
|
|
mime_type = "video/x-nvcomp-bitcomp";
|
|
} else {
|
|
priv->manager = std::make_shared < BitcompManager > (chunk_size,
|
|
opts, stream, device_id);
|
|
}
|
|
GST_DEBUG_OBJECT (self, "Using BITCOMP");
|
|
break;
|
|
}
|
|
case GST_NV_COMP_ANS:
|
|
{
|
|
if (chunk_size == 0)
|
|
chunk_size = 65536;
|
|
|
|
chunk_size = MAX (32768, chunk_size);
|
|
chunk_size = GST_ROUND_UP_8 (chunk_size);
|
|
chunk_size = MIN (chunk_size, nvcompANSCompressionMaxAllowedChunkSize);
|
|
|
|
if (priv->batched) {
|
|
batch_size = (state->info.size + chunk_size - 1) / chunk_size;
|
|
priv->batched_comp =
|
|
std::make_shared < BatchedComp < nvcompBatchedANSOpts_t,
|
|
nvcompBatchedANSCompressGetTempSize,
|
|
nvcompBatchedANSCompressGetMaxOutputChunkSize,
|
|
nvcompBatchedANSCompressAsync >> (nvcompBatchedANSDefaultOpts);
|
|
mime_type = "video/x-nvcomp-ans";
|
|
} else {
|
|
priv->manager = std::make_shared < ANSManager > (chunk_size,
|
|
nvcompBatchedANSDefaultOpts, stream, device_id);
|
|
}
|
|
|
|
GST_DEBUG_OBJECT (self, "Using ANS");
|
|
break;
|
|
}
|
|
default:
|
|
g_assert_not_reached ();
|
|
return FALSE;
|
|
}
|
|
|
|
size_t max_output_size = 0;
|
|
size_t max_output_chunk_size = 0;
|
|
size_t temp_size = 0;
|
|
if (priv->batched) {
|
|
auto status = priv->batched_comp->get_temp_size (batch_size,
|
|
chunk_size, &temp_size);
|
|
if (status != nvcompSuccess) {
|
|
GST_ERROR_OBJECT (self, "Couldn't get temp size");
|
|
gst_cuda_context_pop (nullptr);
|
|
return FALSE;
|
|
}
|
|
|
|
status = priv->batched_comp->get_max_compressed_chunk_size (chunk_size,
|
|
&max_output_chunk_size);
|
|
if (status != nvcompSuccess) {
|
|
GST_ERROR_OBJECT (self, "Couldn't get max output chunk size");
|
|
gst_cuda_context_pop (nullptr);
|
|
return FALSE;
|
|
}
|
|
|
|
max_output_chunk_size = GST_ROUND_UP_8 (max_output_chunk_size);
|
|
max_output_size = max_output_chunk_size * batch_size;
|
|
} else {
|
|
priv->config = std::make_shared < CompressionConfig >
|
|
(priv->manager->configure_compression (state->info.size));
|
|
max_output_size = priv->config->max_compressed_buffer_size;
|
|
}
|
|
|
|
GST_DEBUG_OBJECT (self, "Allocating resource, batched: %d"
|
|
", uncompressed size: %" G_GSIZE_FORMAT
|
|
", max-output-size: %" G_GSIZE_FORMAT
|
|
", batch-size: %" G_GSIZE_FORMAT
|
|
", chunk-size: %" G_GSIZE_FORMAT
|
|
", max-output-chunk-size: %" G_GSIZE_FORMAT
|
|
", temp-size: %" G_GSIZE_FORMAT, priv->batched, state->info.size,
|
|
max_output_size, batch_size, chunk_size, max_output_chunk_size,
|
|
temp_size);
|
|
|
|
for (guint i = 0; i < priv->async_depth; i++) {
|
|
auto task = std::make_shared < EncoderTask > ();
|
|
task->ctx = (GstCudaContext *) gst_object_ref (priv->ctx);
|
|
|
|
if (!gst_nv_comp_video_enc_alloc_task (self, task.get (), priv->batched,
|
|
state->info.size, max_output_size, batch_size, chunk_size,
|
|
max_output_chunk_size, temp_size)) {
|
|
priv->manager = nullptr;
|
|
priv->input_task_queue = { };
|
|
task = nullptr;
|
|
gst_cuda_context_pop (nullptr);
|
|
return FALSE;
|
|
}
|
|
|
|
priv->input_task_queue.push (task);
|
|
}
|
|
|
|
/* In case of batched, custom header is added to signal chunk and batch size */
|
|
if (priv->batched) {
|
|
/* version */
|
|
max_output_size += sizeof (guint32);
|
|
|
|
/* max uncompressed chunk size */
|
|
max_output_size += sizeof (guint32);
|
|
|
|
/* max compressed chunk size */
|
|
max_output_size += sizeof (guint32);
|
|
|
|
/* batch size */
|
|
max_output_size += sizeof (guint32);
|
|
|
|
/* each uncompressed/compressed chunk size */
|
|
max_output_size += (sizeof (guint32) * batch_size * 2);
|
|
}
|
|
|
|
priv->pool = gst_buffer_pool_new ();
|
|
auto config = gst_buffer_pool_get_config (priv->pool);
|
|
gst_buffer_pool_config_set_params (config, nullptr, max_output_size, 0, 0);
|
|
gst_buffer_pool_set_config (priv->pool, config);
|
|
gst_buffer_pool_set_active (priv->pool, TRUE);
|
|
|
|
gst_cuda_context_pop (nullptr);
|
|
|
|
auto caps = gst_caps_new_simple (mime_type.c_str (), "format", G_TYPE_STRING,
|
|
gst_video_format_to_string (GST_VIDEO_INFO_FORMAT (&state->info)),
|
|
nullptr);
|
|
auto out_state =
|
|
gst_video_encoder_set_output_state (GST_VIDEO_ENCODER (encoder),
|
|
caps, state);
|
|
gst_video_codec_state_unref (out_state);
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
static gboolean
|
|
gst_nv_comp_video_enc_upload (GstNvCompVideoEnc * self, GstVideoFrame * frame,
|
|
CUstream stream, gboolean is_device_copy)
|
|
{
|
|
auto priv = self->priv;
|
|
auto info = &priv->state->info;
|
|
auto finfo = info->finfo;
|
|
gint comp[GST_VIDEO_MAX_COMPONENTS];
|
|
CUresult ret = CUDA_SUCCESS;
|
|
auto cur_task = priv->cur_task;
|
|
|
|
for (guint i = 0; i < GST_VIDEO_FRAME_N_PLANES (frame); i++) {
|
|
guint8 *sp = (guint8 *) GST_VIDEO_FRAME_PLANE_DATA (frame, i);
|
|
guint8 *dp;
|
|
if (is_device_copy)
|
|
dp = cur_task->device_uncompressed + info->offset[i];
|
|
else
|
|
dp = cur_task->host_uncompressed + info->offset[i];
|
|
|
|
guint ss, ds;
|
|
guint w, h;
|
|
|
|
if (GST_VIDEO_FORMAT_INFO_HAS_PALETTE (finfo) && i == 1) {
|
|
if (is_device_copy) {
|
|
ret = CuMemcpyDtoDAsync ((CUdeviceptr) dp, (CUdeviceptr) sp,
|
|
256 * 4, stream);
|
|
} else {
|
|
memcpy (dp, sp, 256 * 4);
|
|
}
|
|
|
|
if (!gst_cuda_result (ret)) {
|
|
GST_ERROR_OBJECT (self, "CUDA memcpy failed");
|
|
return FALSE;
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
ss = GST_VIDEO_FRAME_PLANE_STRIDE (frame, i);
|
|
ds = GST_VIDEO_INFO_PLANE_STRIDE (info, i);
|
|
|
|
gst_video_format_info_component (finfo, i, comp);
|
|
|
|
w = GST_VIDEO_INFO_COMP_WIDTH (info, comp[0]) *
|
|
GST_VIDEO_INFO_COMP_PSTRIDE (info, comp[0]);
|
|
if (w == 0)
|
|
w = MIN (ss, ds);
|
|
|
|
h = GST_VIDEO_INFO_COMP_HEIGHT (info, comp[0]);
|
|
|
|
if (GST_VIDEO_FORMAT_INFO_IS_TILED (finfo)) {
|
|
gint tile_size;
|
|
gint sx_tiles, sy_tiles, dx_tiles, dy_tiles;
|
|
GstVideoTileMode mode;
|
|
|
|
tile_size = GST_VIDEO_FORMAT_INFO_TILE_SIZE (info->finfo, i);
|
|
|
|
mode = GST_VIDEO_FORMAT_INFO_TILE_MODE (info->finfo);
|
|
|
|
sx_tiles = GST_VIDEO_TILE_X_TILES (ss);
|
|
sy_tiles = GST_VIDEO_TILE_Y_TILES (ss);
|
|
|
|
dx_tiles = GST_VIDEO_TILE_X_TILES (ds);
|
|
dy_tiles = GST_VIDEO_TILE_Y_TILES (ds);
|
|
|
|
w = MIN (sx_tiles, dx_tiles);
|
|
h = MIN (sy_tiles, dy_tiles);
|
|
|
|
for (guint j = 0; j < h; j++) {
|
|
for (guint k = 0; k < w; k++) {
|
|
guint si, di;
|
|
guint8 *cur_dp;
|
|
guint8 *cur_sp;
|
|
|
|
si = gst_video_tile_get_index (mode, k, j, sx_tiles, sy_tiles);
|
|
di = gst_video_tile_get_index (mode, k, j, dx_tiles, dy_tiles);
|
|
|
|
cur_dp = dp + (di * tile_size);
|
|
cur_sp = sp + (si * tile_size);
|
|
|
|
if (is_device_copy) {
|
|
ret = CuMemcpyDtoDAsync ((CUdeviceptr) cur_dp, (CUdeviceptr) cur_sp,
|
|
w, stream);
|
|
} else {
|
|
memcpy (cur_dp, cur_sp, w);
|
|
}
|
|
|
|
if (!gst_cuda_result (ret)) {
|
|
GST_ERROR_OBJECT (self, "CUDA memcpy failed");
|
|
return FALSE;
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
if (is_device_copy) {
|
|
CUDA_MEMCPY2D params = { };
|
|
params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
|
|
params.srcDevice = (CUdeviceptr) sp;
|
|
params.srcPitch = ss;
|
|
|
|
params.dstMemoryType = CU_MEMORYTYPE_DEVICE;
|
|
params.dstDevice = (CUdeviceptr) dp;
|
|
params.dstPitch = ds;
|
|
|
|
params.WidthInBytes = w;
|
|
params.Height = h;
|
|
|
|
ret = CuMemcpy2DAsync (¶ms, stream);
|
|
|
|
if (!gst_cuda_result (ret)) {
|
|
GST_ERROR_OBJECT (self, "CUDA memcpy failed");
|
|
return FALSE;
|
|
}
|
|
} else {
|
|
for (guint j = 0; j < h; j++) {
|
|
memcpy (dp, sp, w);
|
|
dp += ds;
|
|
sp += ss;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return TRUE;
|
|
}
|
|
|
|
#ifdef HAVE_GST_GL
|
|
struct GLInteropData
|
|
{
|
|
GstNvCompVideoEnc *self = nullptr;
|
|
GstBuffer *buffer = nullptr;
|
|
gboolean ret = FALSE;
|
|
};
|
|
|
|
static GstCudaGraphicsResource *
|
|
ensure_gl_cuda_resource (GstNvCompVideoEnc * self, GstMemory * mem)
|
|
{
|
|
auto priv = self->priv;
|
|
GstCudaGraphicsResource *resource;
|
|
GQuark quark;
|
|
|
|
if (!gst_is_gl_memory_pbo (mem)) {
|
|
GST_WARNING_OBJECT (self, "memory is not GL PBO memory, %s",
|
|
mem->allocator->mem_type);
|
|
return nullptr;
|
|
}
|
|
|
|
quark = gst_cuda_quark_from_id (GST_CUDA_QUARK_GRAPHICS_RESOURCE);
|
|
resource = (GstCudaGraphicsResource *)
|
|
gst_mini_object_get_qdata (GST_MINI_OBJECT (mem), quark);
|
|
|
|
if (!resource) {
|
|
GstMapInfo map_info;
|
|
GstGLMemoryPBO *pbo = (GstGLMemoryPBO *) mem;
|
|
GstGLBuffer *gl_buf = pbo->pbo;
|
|
gboolean ret;
|
|
|
|
if (!gst_memory_map (mem, &map_info,
|
|
(GstMapFlags) (GST_MAP_READ | GST_MAP_GL))) {
|
|
GST_ERROR_OBJECT (self, "Couldn't map gl memory");
|
|
return nullptr;
|
|
}
|
|
|
|
resource = gst_cuda_graphics_resource_new (priv->ctx,
|
|
GST_OBJECT (GST_GL_BASE_MEMORY_CAST (mem)->context),
|
|
GST_CUDA_GRAPHICS_RESOURCE_GL_BUFFER);
|
|
|
|
GST_LOG_OBJECT (self, "registering gl buffer %d to CUDA", gl_buf->id);
|
|
ret = gst_cuda_graphics_resource_register_gl_buffer (resource, gl_buf->id,
|
|
CU_GRAPHICS_REGISTER_FLAGS_NONE);
|
|
gst_memory_unmap (mem, &map_info);
|
|
|
|
if (!ret) {
|
|
GST_ERROR_OBJECT (self, "Couldn't register gl buffer %d", gl_buf->id);
|
|
gst_cuda_graphics_resource_free (resource);
|
|
return nullptr;
|
|
}
|
|
|
|
gst_mini_object_set_qdata (GST_MINI_OBJECT (mem), quark, resource,
|
|
(GDestroyNotify) gst_cuda_graphics_resource_free);
|
|
}
|
|
|
|
return resource;
|
|
}
|
|
|
|
static void
|
|
gst_nv_comp_video_enc_upload_gl (GstGLContext * context, GLInteropData * data)
|
|
{
|
|
auto self = data->self;
|
|
auto priv = self->priv;
|
|
auto info = &priv->state->info;
|
|
auto finfo = info->finfo;
|
|
GstCudaGraphicsResource *gst_res[GST_VIDEO_MAX_PLANES] = { nullptr, };
|
|
CUgraphicsResource cuda_res[GST_VIDEO_MAX_PLANES] = { nullptr, };
|
|
CUdeviceptr src_devptr[GST_VIDEO_MAX_PLANES] = { 0, };
|
|
CUstream stream = gst_cuda_stream_get_handle (priv->stream);
|
|
CUresult ret;
|
|
gint comp[GST_VIDEO_MAX_COMPONENTS];
|
|
auto cur_task = priv->cur_task;
|
|
|
|
if (!gst_cuda_context_push (priv->ctx)) {
|
|
GST_ERROR_OBJECT (self, "Couldn't push context");
|
|
return;
|
|
}
|
|
|
|
for (guint i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) {
|
|
GstMemory *mem = gst_buffer_peek_memory (data->buffer, i);
|
|
GstGLMemoryPBO *pbo = (GstGLMemoryPBO *) mem;
|
|
gsize src_size;
|
|
|
|
if (!gst_is_gl_memory_pbo (mem)) {
|
|
GST_ERROR_OBJECT (self, "Not a GL PBO memory");
|
|
goto out;
|
|
}
|
|
|
|
gst_res[i] = ensure_gl_cuda_resource (self, mem);
|
|
if (!gst_res[i]) {
|
|
GST_ERROR_OBJECT (self, "Couldn't get resource %d", i);
|
|
goto out;
|
|
}
|
|
|
|
gst_gl_memory_pbo_upload_transfer (pbo);
|
|
gst_gl_memory_pbo_download_transfer (pbo);
|
|
|
|
cuda_res[i] = gst_cuda_graphics_resource_map (gst_res[i], stream,
|
|
CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY);
|
|
if (!cuda_res[i]) {
|
|
GST_ERROR_OBJECT (self, "Couldn't map resource");
|
|
goto out;
|
|
}
|
|
|
|
ret = CuGraphicsResourceGetMappedPointer (&src_devptr[i],
|
|
&src_size, cuda_res[i]);
|
|
if (!gst_cuda_result (ret)) {
|
|
GST_ERROR_OBJECT (self, "Couldn't get mapped device pointer");
|
|
goto out;
|
|
}
|
|
}
|
|
|
|
|
|
for (guint i = 0; i < GST_VIDEO_INFO_N_PLANES (info); i++) {
|
|
guint8 *sp = (guint8 *) src_devptr[i];
|
|
guint8 *dp = cur_task->device_uncompressed + info->offset[i];
|
|
guint ss, ds;
|
|
guint w, h;
|
|
|
|
if (GST_VIDEO_FORMAT_INFO_HAS_PALETTE (finfo) && i == 1) {
|
|
ret = CuMemcpyDtoDAsync ((CUdeviceptr) dp, (CUdeviceptr) sp,
|
|
256 * 4, stream);
|
|
|
|
if (!gst_cuda_result (ret)) {
|
|
GST_ERROR_OBJECT (self, "CUDA memcpy failed");
|
|
goto out;
|
|
}
|
|
|
|
data->ret = TRUE;
|
|
goto out;
|
|
}
|
|
|
|
auto meta = gst_buffer_get_video_meta (data->buffer);
|
|
if (meta)
|
|
ss = meta->stride[i];
|
|
else
|
|
ss = GST_VIDEO_INFO_PLANE_STRIDE (info, i);
|
|
|
|
ds = GST_VIDEO_INFO_PLANE_STRIDE (info, i);
|
|
|
|
gst_video_format_info_component (finfo, i, comp);
|
|
|
|
w = GST_VIDEO_INFO_COMP_WIDTH (info, comp[0]) *
|
|
GST_VIDEO_INFO_COMP_PSTRIDE (info, comp[0]);
|
|
if (w == 0)
|
|
w = MIN (ss, ds);
|
|
|
|
h = GST_VIDEO_INFO_COMP_HEIGHT (info, comp[0]);
|
|
|
|
if (GST_VIDEO_FORMAT_INFO_IS_TILED (finfo)) {
|
|
gint tile_size;
|
|
gint sx_tiles, sy_tiles, dx_tiles, dy_tiles;
|
|
GstVideoTileMode mode;
|
|
|
|
tile_size = GST_VIDEO_FORMAT_INFO_TILE_SIZE (info->finfo, i);
|
|
|
|
mode = GST_VIDEO_FORMAT_INFO_TILE_MODE (info->finfo);
|
|
|
|
sx_tiles = GST_VIDEO_TILE_X_TILES (ss);
|
|
sy_tiles = GST_VIDEO_TILE_Y_TILES (ss);
|
|
|
|
dx_tiles = GST_VIDEO_TILE_X_TILES (ds);
|
|
dy_tiles = GST_VIDEO_TILE_Y_TILES (ds);
|
|
|
|
w = MIN (sx_tiles, dx_tiles);
|
|
h = MIN (sy_tiles, dy_tiles);
|
|
|
|
for (guint j = 0; j < h; j++) {
|
|
for (guint k = 0; k < w; k++) {
|
|
guint si, di;
|
|
guint8 *cur_dp;
|
|
guint8 *cur_sp;
|
|
|
|
si = gst_video_tile_get_index (mode, k, j, sx_tiles, sy_tiles);
|
|
di = gst_video_tile_get_index (mode, k, j, dx_tiles, dy_tiles);
|
|
|
|
cur_dp = dp + (di * tile_size);
|
|
cur_sp = sp + (si * tile_size);
|
|
|
|
ret = CuMemcpyDtoDAsync ((CUdeviceptr) cur_dp, (CUdeviceptr) cur_sp,
|
|
w, stream);
|
|
|
|
if (!gst_cuda_result (ret)) {
|
|
GST_ERROR_OBJECT (self, "CUDA memcpy failed");
|
|
goto out;
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
CUDA_MEMCPY2D params = { };
|
|
params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
|
|
params.srcDevice = (CUdeviceptr) sp;
|
|
params.srcPitch = ss;
|
|
|
|
params.dstMemoryType = CU_MEMORYTYPE_DEVICE;
|
|
params.dstDevice = (CUdeviceptr) dp;
|
|
params.dstPitch = ds;
|
|
|
|
params.WidthInBytes = w;
|
|
params.Height = h;
|
|
|
|
ret = CuMemcpy2DAsync (¶ms, stream);
|
|
|
|
if (!gst_cuda_result (ret)) {
|
|
GST_ERROR_OBJECT (self, "CUDA memcpy failed");
|
|
goto out;
|
|
}
|
|
}
|
|
}
|
|
|
|
data->ret = TRUE;
|
|
|
|
out:
|
|
for (guint i = 0; i < gst_buffer_n_memory (data->buffer); i++) {
|
|
if (!gst_res[i])
|
|
break;
|
|
|
|
gst_cuda_graphics_resource_unmap (gst_res[i], stream);
|
|
}
|
|
|
|
CuStreamSynchronize (stream);
|
|
gst_cuda_context_pop (nullptr);
|
|
}
|
|
#endif
|
|
|
|
static gpointer
|
|
gst_nv_comp_video_enc_thread_func (GstNvCompVideoEnc * self)
|
|
{
|
|
auto encoder = GST_VIDEO_ENCODER (self);
|
|
auto priv = self->priv;
|
|
|
|
GST_DEBUG_OBJECT (self, "Entering loop");
|
|
|
|
while (1) {
|
|
std::shared_ptr < EncoderTask > task;
|
|
|
|
{
|
|
std::unique_lock < std::mutex > lk (priv->output_lock);
|
|
while (priv->output_task_queue.empty ())
|
|
priv->output_cond.wait (lk);
|
|
|
|
task = priv->output_task_queue.front ();
|
|
priv->output_task_queue.pop ();
|
|
}
|
|
|
|
if (!task) {
|
|
GST_DEBUG_OBJECT (self, "Got empty task, terminate");
|
|
break;
|
|
}
|
|
|
|
auto frame = gst_video_encoder_get_oldest_frame (encoder);
|
|
|
|
gst_cuda_context_push (priv->ctx);
|
|
CuEventSynchronize (task->event);
|
|
gst_cuda_context_pop (nullptr);
|
|
|
|
gst_buffer_pool_acquire_buffer (priv->pool, &frame->output_buffer, nullptr);
|
|
GstMapInfo map_info;
|
|
gst_buffer_map (frame->output_buffer, &map_info, GST_MAP_WRITE);
|
|
if (task->batched) {
|
|
task->compressed_size = 0;
|
|
auto dst = (uint8_t *) map_info.data;
|
|
|
|
/* Write custom header */
|
|
GST_WRITE_UINT32_LE (dst, GST_NV_COMP_HEADER_VERSION);
|
|
dst += sizeof (guint32);
|
|
task->compressed_size += sizeof (guint32);
|
|
|
|
GST_WRITE_UINT32_LE (dst, task->chunk_size);
|
|
dst += sizeof (guint32);
|
|
task->compressed_size += sizeof (guint32);
|
|
|
|
GST_WRITE_UINT32_LE (dst, task->max_output_chunk_size);
|
|
dst += sizeof (guint32);
|
|
task->compressed_size += sizeof (guint32);
|
|
|
|
GST_WRITE_UINT32_LE (dst, task->batch_size);
|
|
dst += sizeof (guint32);
|
|
task->compressed_size += sizeof (guint32);
|
|
|
|
for (size_t i = 0; i < task->batch_size; i++) {
|
|
GST_WRITE_UINT32_LE (dst, task->host_uncompressed_bytes[i]);
|
|
dst += sizeof (guint32);
|
|
task->compressed_size += sizeof (guint32);
|
|
|
|
GST_WRITE_UINT32_LE (dst, task->host_compressed_bytes[i]);
|
|
dst += sizeof (guint32);
|
|
task->compressed_size += sizeof (guint32);
|
|
}
|
|
|
|
/* Write compressed data */
|
|
for (size_t i = 0; i < task->batch_size; i++) {
|
|
auto size = task->host_compressed_bytes[i];
|
|
auto src = task->host_compressed + (i * task->max_output_chunk_size);
|
|
memcpy (dst, src, size);
|
|
dst += size;
|
|
task->compressed_size += size;
|
|
}
|
|
} else {
|
|
memcpy (map_info.data, task->host_compressed, task->compressed_size);
|
|
}
|
|
gst_buffer_unmap (frame->output_buffer, &map_info);
|
|
|
|
if (task->compressed_size > 0) {
|
|
gst_buffer_set_size (frame->output_buffer, task->compressed_size);
|
|
frame->dts = frame->pts;
|
|
|
|
auto ratio = (double) priv->state->info.size / task->compressed_size;
|
|
GST_LOG_OBJECT (self, "compressed buffer size %" G_GSIZE_FORMAT
|
|
", ratio %.2f", task->compressed_size, ratio);
|
|
} else {
|
|
GST_ERROR_OBJECT (self, "Zero compressed size");
|
|
gst_clear_buffer (&frame->output_buffer);
|
|
}
|
|
|
|
{
|
|
std::lock_guard < std::mutex > lk (priv->input_lock);
|
|
priv->input_task_queue.push (task);
|
|
priv->input_cond.notify_all ();
|
|
}
|
|
|
|
priv->last_flow = gst_video_encoder_finish_frame (encoder, frame);
|
|
};
|
|
|
|
GST_DEBUG_OBJECT (self, "Leaving loop");
|
|
|
|
return nullptr;
|
|
}
|
|
|
|
static GstFlowReturn
|
|
gst_nv_comp_video_enc_handle_frame (GstVideoEncoder * encoder,
|
|
GstVideoCodecFrame * frame)
|
|
{
|
|
auto self = GST_NV_COMP_VIDEO_ENC (encoder);
|
|
auto priv = self->priv;
|
|
GstMemory *mem;
|
|
CUstream stream = nullptr;
|
|
GstVideoFrame vframe;
|
|
auto info = &priv->state->info;
|
|
size_t compressed_size = 0;
|
|
gboolean need_copy = TRUE;
|
|
std::shared_ptr < EncoderTask > task;
|
|
|
|
if (!priv->ctx || (!priv->manager && !priv->batched_comp)) {
|
|
GST_ERROR_OBJECT (self, "Context was not configured");
|
|
goto error;
|
|
}
|
|
|
|
if (priv->last_flow != GST_FLOW_OK) {
|
|
GST_INFO_OBJECT (self, "Last flow was %s",
|
|
gst_flow_get_name (priv->last_flow));
|
|
gst_video_encoder_finish_frame (encoder, frame);
|
|
return priv->last_flow;
|
|
}
|
|
|
|
if (!priv->encode_thread) {
|
|
priv->encode_thread = g_thread_new ("nvcompvideoenc",
|
|
(GThreadFunc) gst_nv_comp_video_enc_thread_func, self);
|
|
}
|
|
|
|
GST_VIDEO_ENCODER_STREAM_UNLOCK (encoder);
|
|
{
|
|
std::unique_lock < std::mutex > lk (priv->input_lock);
|
|
while (priv->input_task_queue.empty ())
|
|
priv->input_cond.wait (lk);
|
|
|
|
priv->cur_task = priv->input_task_queue.front ();
|
|
priv->input_task_queue.pop ();
|
|
}
|
|
GST_VIDEO_ENCODER_STREAM_LOCK (encoder);
|
|
|
|
mem = gst_buffer_peek_memory (frame->input_buffer, 0);
|
|
#ifdef HAVE_GST_GL
|
|
if (priv->gl_interop && gst_is_gl_memory (mem) &&
|
|
gst_buffer_n_memory (frame->input_buffer) ==
|
|
GST_VIDEO_INFO_N_PLANES (info)) {
|
|
GLInteropData interop_data;
|
|
interop_data.self = self;
|
|
interop_data.buffer = frame->input_buffer;
|
|
interop_data.ret = FALSE;
|
|
|
|
auto gl_mem = (GstGLMemory *) mem;
|
|
gst_gl_context_thread_add (gl_mem->mem.context,
|
|
(GstGLContextThreadFunc) gst_nv_comp_video_enc_upload_gl,
|
|
&interop_data);
|
|
if (interop_data.ret) {
|
|
need_copy = FALSE;
|
|
GST_TRACE_OBJECT (self, "GL -> CUDA copy done");
|
|
} else {
|
|
priv->gl_interop = FALSE;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
if (!gst_cuda_context_push (priv->ctx)) {
|
|
GST_ERROR_OBJECT (self, "Couldn't push context");
|
|
std::lock_guard < std::mutex > lk (priv->input_lock);
|
|
priv->input_task_queue.push (std::move (priv->cur_task));
|
|
goto error;
|
|
}
|
|
|
|
stream = gst_cuda_stream_get_handle (priv->stream);
|
|
|
|
if (need_copy) {
|
|
gboolean device_copy = FALSE;
|
|
if (gst_is_cuda_memory (mem)) {
|
|
GstCudaMemory *cmem = GST_CUDA_MEMORY_CAST (mem);
|
|
if (cmem->context == priv->ctx) {
|
|
device_copy = TRUE;
|
|
if (!gst_video_frame_map (&vframe, info, frame->input_buffer,
|
|
(GstMapFlags) (GST_MAP_READ | GST_MAP_CUDA))) {
|
|
GST_ERROR_OBJECT (self, "Couldn't map cuda memory");
|
|
gst_cuda_context_pop (nullptr);
|
|
std::lock_guard < std::mutex > lk (priv->input_lock);
|
|
priv->input_task_queue.push (std::move (priv->cur_task));
|
|
goto error;
|
|
}
|
|
|
|
if (gst_cuda_memory_get_stream (cmem) != priv->stream) {
|
|
GST_DEBUG_OBJECT (self, "Different stream, need sync");
|
|
gst_cuda_memory_sync (cmem);
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!device_copy && !gst_video_frame_map (&vframe,
|
|
info, frame->input_buffer, GST_MAP_READ)) {
|
|
GST_ERROR_OBJECT (self, "Couldn't map input frame");
|
|
gst_cuda_context_pop (nullptr);
|
|
std::lock_guard < std::mutex > lk (priv->input_lock);
|
|
priv->input_task_queue.push (std::move (priv->cur_task));
|
|
goto error;
|
|
}
|
|
|
|
if (!gst_nv_comp_video_enc_upload (self, &vframe, stream, device_copy)) {
|
|
gst_video_frame_unmap (&vframe);
|
|
gst_cuda_context_pop (nullptr);
|
|
std::lock_guard < std::mutex > lk (priv->input_lock);
|
|
priv->input_task_queue.push (std::move (priv->cur_task));
|
|
goto error;
|
|
}
|
|
|
|
gst_video_frame_unmap (&vframe);
|
|
|
|
if (!device_copy) {
|
|
CuMemcpyHtoDAsync ((CUdeviceptr) priv->cur_task->device_uncompressed,
|
|
priv->cur_task->host_uncompressed, info->size, stream);
|
|
}
|
|
}
|
|
|
|
task = std::move (priv->cur_task);
|
|
if (task->batched) {
|
|
g_assert (priv->batched_comp);
|
|
|
|
auto status = priv->batched_comp->compress (task->device_uncompressed_ptrs,
|
|
task->device_uncompressed_bytes, task->chunk_size, task->batch_size,
|
|
task->temp_ptr, task->temp_size, task->device_compressed_ptrs,
|
|
task->device_compressed_bytes, (cudaStream_t) stream);
|
|
if (status != nvcompSuccess) {
|
|
GST_ERROR_OBJECT (self, "Compression failed, ret %d", status);
|
|
gst_cuda_context_pop (nullptr);
|
|
std::lock_guard < std::mutex > lk (priv->input_lock);
|
|
priv->input_task_queue.push (std::move (task));
|
|
goto error;
|
|
}
|
|
|
|
CuMemcpyDtoHAsync (task->host_compressed_bytes,
|
|
(CUdeviceptr) task->device_compressed_bytes,
|
|
sizeof (size_t) * task->batch_size, stream);
|
|
CuMemcpyDtoHAsync (task->host_compressed,
|
|
(CUdeviceptr) task->device_compressed,
|
|
task->compressed_alloc_size, stream);
|
|
} else {
|
|
g_assert (priv->manager);
|
|
|
|
priv->manager->compress (task->device_uncompressed,
|
|
task->device_compressed, *priv->config);
|
|
|
|
compressed_size =
|
|
priv->manager->get_compressed_output_size (task->device_compressed);
|
|
|
|
task->compressed_size = compressed_size;
|
|
CuMemcpyDtoHAsync (task->host_compressed,
|
|
(CUdeviceptr) task->device_compressed, compressed_size, stream);
|
|
}
|
|
|
|
CuEventRecord (task->event, stream);
|
|
gst_cuda_context_pop (nullptr);
|
|
|
|
{
|
|
std::lock_guard < std::mutex > lk (priv->output_lock);
|
|
priv->output_task_queue.push (std::move (task));
|
|
priv->output_cond.notify_one ();
|
|
}
|
|
|
|
gst_video_codec_frame_unref (frame);
|
|
|
|
return priv->last_flow;
|
|
|
|
error:
|
|
gst_video_encoder_finish_frame (encoder, frame);
|
|
|
|
return GST_FLOW_ERROR;
|
|
}
|