gst-plugins-rs/net/rtp/src/vp9/depay/imp.rs

//
// Copyright (C) 2023 Sebastian Dröge <sebastian@centricular.com>
//
// This Source Code Form is subject to the terms of the Mozilla Public License, v2.0.
// If a copy of the MPL was not distributed with this file, You can obtain one at
// <https://mozilla.org/MPL/2.0/>.
//
// SPDX-License-Identifier: MPL-2.0

/**
 * SECTION:element-rtpvp9depay2
 * @see_also: rtpvp9pay2, vp9enc, vp9dec
 *
 * Depayload a VP9 video stream from RTP packets as per [draft-ietf-payload-vp9][draft-ietf-payload-vp9].
 *
 * [draft-ietf-payload-vp9]:https://datatracker.ietf.org/doc/html/draft-ietf-payload-vp9-16#section-4
 *
 * ## Example pipeline
 *
 * ```shell
 * gst-launch-1.0 udpsrc address=127.0.0.1 port=5555 caps='application/x-rtp,media=video,clock-rate=90000,encoding-name=VP9' ! rtpjitterbuffer latency=100 ! rtpvp9depay2 ! decodebin3 ! videoconvertscale ! autovideosink
 * ```
 *
 * This will depayload and decode an incoming RTP VP9 video stream. You can use the #rtpvp9pay2
 * and #vp9enc elements to create such an RTP stream.
 *
 * Since: plugins-rs-0.13.0
 */
use std::{io::Cursor, mem, sync::Mutex};

use atomic_refcell::AtomicRefCell;
use bitstream_io::{BigEndian, BitRead as _, BitReader, ByteRead as _, ByteReader};

use gst::{glib, prelude::*, subclass::prelude::*};

use once_cell::sync::Lazy;

use crate::basedepay::{PacketToBufferRelation, RtpBaseDepay2Ext};
use crate::vp9::frame_header::FrameHeader;
use crate::vp9::payload_descriptor::{PayloadDescriptor, PictureId};

#[derive(Clone, Default)]
struct Settings {
    request_keyframe: bool,
    wait_for_keyframe: bool,
}

struct State {
    /// Last extended RTP timestamp.
    last_timestamp: Option<u64>,

    /// Last picture ID, if any.
    ///
    /// This is the picture ID from the last picture and is reset
    /// to `None` also if a picture doesn't have any ID.
    last_picture_id: Option<PictureId>,

    /// Payload descriptor of the first packet of the last key picture.
    ///
    /// If this is not set then we did not see a keyframe yet.
    last_key_picture_payload_descriptor: Option<PayloadDescriptor>,

    /// Frame header of the last keyframe.
    ///
    /// For scalable streams this is set to the last frame of the picture.
    last_keyframe_frame_header: Option<FrameHeader>,

    /// Frame header of the current keyframe, if any.
    ///
    /// For scalable streams this is set to the last frame of the picture.
    ///
    /// This is only set if the current picture is a key picture and is reset whenever a picture is
    /// pushed downstream.
    current_keyframe_frame_header: Option<FrameHeader>,

    /// Payload descriptor of the first packet of the current picture.
    ///
    /// This is reset whenever the current picture is pushed downstream.
    current_picture_payload_descriptor: Option<PayloadDescriptor>,

    /// Currently queued data for the current picture.
    pending_picture_ext_seqnum: u64,
    pending_picture: Vec<u8>,

    /// Set to `true` if the next outgoing buffer should have the `DISCONT` flag set.
    needs_discont: bool,
}

impl Default for State {
    fn default() -> Self {
        State {
            last_timestamp: None,
            last_picture_id: None,
            last_key_picture_payload_descriptor: None,
            last_keyframe_frame_header: None,
            current_keyframe_frame_header: None,
            current_picture_payload_descriptor: None,
            pending_picture_ext_seqnum: 0,
            pending_picture: Vec::default(),
            needs_discont: true,
        }
    }
}

#[derive(Default)]
pub struct RtpVp9Depay {
    state: AtomicRefCell<State>,
    settings: Mutex<Settings>,
}

static CAT: Lazy<gst::DebugCategory> = Lazy::new(|| {
    gst::DebugCategory::new(
        "rtpvp9depay2",
        gst::DebugColorFlags::empty(),
        Some("RTP VP9 Depayloader"),
    )
});

impl RtpVp9Depay {
    fn reset(&self, state: &mut State) {
        gst::debug!(CAT, imp: self, "resetting state");

        *state = State::default()
    }
}

#[glib::object_subclass]
impl ObjectSubclass for RtpVp9Depay {
    const NAME: &'static str = "GstRtpVp9Depay2";
    type Type = super::RtpVp9Depay;
    type ParentType = crate::basedepay::RtpBaseDepay2;
}

impl ObjectImpl for RtpVp9Depay {
    fn properties() -> &'static [glib::ParamSpec] {
        static PROPERTIES: Lazy<Vec<glib::ParamSpec>> = Lazy::new(|| {
            vec![
                glib::ParamSpecBoolean::builder("request-keyframe")
                    .nick("Request Keyframe")
                    .blurb("Request new keyframe when packet loss is detected")
                    .default_value(Settings::default().request_keyframe)
                    .mutable_ready()
                    .build(),
                glib::ParamSpecBoolean::builder("wait-for-keyframe")
                    .nick("Wait For Keyframe")
                    .blurb("Wait for the next keyframe after packet loss")
                    .default_value(Settings::default().wait_for_keyframe)
                    .mutable_ready()
                    .build(),
            ]
        });

        PROPERTIES.as_ref()
    }

    fn set_property(&self, _id: usize, value: &glib::Value, pspec: &glib::ParamSpec) {
        match pspec.name() {
            "request-keyframe" => {
                self.settings.lock().unwrap().request_keyframe = value.get().unwrap();
            }
            "wait-for-keyframe" => {
                self.settings.lock().unwrap().wait_for_keyframe = value.get().unwrap();
            }
            _ => unimplemented!(),
        };
    }

    fn property(&self, _id: usize, pspec: &glib::ParamSpec) -> glib::Value {
        match pspec.name() {
            "request-keyframe" => self.settings.lock().unwrap().request_keyframe.to_value(),
            "wait-for-keyframe" => self.settings.lock().unwrap().wait_for_keyframe.to_value(),
            _ => unimplemented!(),
        }
    }
}

impl GstObjectImpl for RtpVp9Depay {}

impl ElementImpl for RtpVp9Depay {
    fn metadata() -> Option<&'static gst::subclass::ElementMetadata> {
        static ELEMENT_METADATA: Lazy<gst::subclass::ElementMetadata> = Lazy::new(|| {
            gst::subclass::ElementMetadata::new(
                "RTP VP9 Depayloader",
                "Codec/Depayloader/Network/RTP",
                "Depayload VP9 from RTP packets",
                "Sebastian Dröge <sebastian@centricular.com>",
            )
        });

        Some(&*ELEMENT_METADATA)
    }

    fn pad_templates() -> &'static [gst::PadTemplate] {
        static PAD_TEMPLATES: Lazy<Vec<gst::PadTemplate>> = Lazy::new(|| {
            let sink_pad_template = gst::PadTemplate::new(
                "sink",
                gst::PadDirection::Sink,
                gst::PadPresence::Always,
                &gst::Caps::builder("application/x-rtp")
                    .field("media", "video")
                    .field("clock-rate", 90_000i32)
                    .field(
                        "encoding-name",
                        gst::List::new(["VP9", "VP9-DRAFT-IETF-01"]),
                    )
                    .build(),
            )
            .unwrap();

            let src_pad_template = gst::PadTemplate::new(
                "src",
                gst::PadDirection::Src,
                gst::PadPresence::Always,
                &gst::Caps::builder("video/x-vp9").build(),
            )
            .unwrap();

            vec![src_pad_template, sink_pad_template]
        });

        PAD_TEMPLATES.as_ref()
    }
}

impl crate::basedepay::RtpBaseDepay2Impl for RtpVp9Depay {
    const ALLOWED_META_TAGS: &'static [&'static str] = &["video"];

    fn start(&self) -> Result<(), gst::ErrorMessage> {
        let mut state = self.state.borrow_mut();
        self.reset(&mut state);

        Ok(())
    }

    fn stop(&self) -> Result<(), gst::ErrorMessage> {
        let mut state = self.state.borrow_mut();
        self.reset(&mut state);

        Ok(())
    }

    fn drain(&self) -> Result<gst::FlowSuccess, gst::FlowError> {
        // TODO: Could forward all complete layers here if any are queued up
        Ok(gst::FlowSuccess::Ok)
    }

    fn flush(&self) {
        let mut state = self.state.borrow_mut();
        self.reset(&mut state);
    }

    // TODO: Might want to send lost events (and possibly ignore the ones from upstream) if there
    // are discontinuities (either in the seqnum or otherwise detected). This is especially useful
    // in case of ULPFEC as that breaks seqnum-based discontinuity detecetion.
    //
    // rtpvp9depay does this but it feels like the whole approach needs some redesign.
    fn handle_packet(
        &self,
        packet: &crate::basedepay::Packet,
    ) -> Result<gst::FlowSuccess, gst::FlowError> {
        let settings = self.settings.lock().unwrap().clone();

        gst::trace!(CAT, imp: self, "Handling RTP packet {packet:?}");
        let mut state = self.state.borrow_mut();

        let payload = packet.payload();
        let mut cursor = Cursor::new(payload);

        let mut r = ByteReader::endian(&mut cursor, BigEndian);
        let payload_descriptor = match r.parse::<PayloadDescriptor>() {
            Ok(payload_descriptor) => payload_descriptor,
            Err(err) => {
                gst::warning!(CAT, imp: self, "Invalid VP9 RTP packet: {err}");
                // TODO: Could potentially drain here?
                self.reset(&mut state);
                self.obj().drop_packet(packet);
                return Ok(gst::FlowSuccess::Ok);
            }
        };

        let payload_start_index = cursor.position() as usize;

        gst::trace!(CAT, imp: self, "VP9 RTP payload descriptor size: {}", payload_start_index);
        gst::trace!(CAT, imp: self, "Received VP9 RTP payload descriptor: {payload_descriptor:?}");

        // This is the start of a picture if this is the start of the frame and either there is no
        // layer information or this is the first spatial layer.
        let is_start_of_picture = payload_descriptor.start_of_frame
            && payload_descriptor
                .layer_index
                .as_ref()
                .map_or(true, |layer_index| layer_index.spatial_layer_id == 0);

        // Additionally, this is a key picture if it is not an inter predicted picture.
        let is_key_picture =
            !payload_descriptor.inter_picture_predicted_frame && is_start_of_picture;

        // If the timestamp or picture ID is changing we assume that a new picture is starting.
        // Any previously queued picture data needs to be drained now.
        if is_start_of_picture
            || state.last_timestamp != Some(packet.ext_timestamp())
            || state.last_picture_id.map_or(false, |picture_id| {
                Some(picture_id) != payload_descriptor.picture_id
            })
        {
            // Missed the marker packet for the last picture
            if state.current_picture_payload_descriptor.is_some() {
                gst::warning!(CAT, imp: self, "Packet is part of a new picture but didn't receive last packet of previous picture");
                // TODO: Could potentially drain here?
                self.reset(&mut state);
            }
            // Else cleanly starting a new picture here
        }

        // Validate payload descriptor
        if let Some(ref last_keyframe_payloader_descriptor) =
            state.last_key_picture_payload_descriptor
        {
            // Section 4.2, I flag
            //
            // > If the V bit was set in the stream's most recent start of a keyframe (i.e. the SS
            // > field was present) and the F bit is set to 0 (i.e. non-flexible scalability mode is
            // > in use), then this bit MUST be set on every packet.
            //
            // This check is extended here to not just check for presence of the SS field but check
            // that there are multiple spatial layers. If there is only one then we treat it as if
            // the field wasn't set.
            if last_keyframe_payloader_descriptor
                .scalability_structure
                .as_ref()
                .map_or(false, |scalability_structure| {
                    scalability_structure.num_spatial_layers > 1
                })
                && !payload_descriptor.flexible_mode
                && payload_descriptor.picture_id.is_none()
            {
                gst::warning!(
                    CAT,
                    imp: self,
                    "Scalability structure present and non-flexible scalability mode used but no picture ID present",
                );
                // TODO: Could potentially drain here?
                self.reset(&mut state);
                self.obj().drop_packet(packet);
                return Ok(gst::FlowSuccess::Ok);
            }

            // In other words, picture IDs are only optional if non-flexible scalability mode is
            // used and there was no scalability structure in the keyframe.

            // Section 4.2, F flag
            //
            // > The value of this F bit MUST only change on the first packet of a key picture.  A
            // > key picture is a picture whose base spatial layer frame is a key frame, and which
            // > thus completely resets the encoder state.  This packet will have its P bit equal to
            // > zero, SID or L bit (described below) equal to zero, and B bit (described below)
            // > equal to 1.
            if !is_key_picture
                && last_keyframe_payloader_descriptor.flexible_mode
                    != payload_descriptor.flexible_mode
            {
                gst::warning!(CAT, imp: self, "Flexible scalability mode can only change on key pictures");

                // TODO: Could potentially drain here?
                self.reset(&mut state);
                self.obj().drop_packet(packet);
                return Ok(gst::FlowSuccess::Ok);
            }
        }

        // Section 4.2, P flag
        //
        // > When P is set to zero, the TID field (described below) MUST also be set to 0 (if
        // > present).
        if !payload_descriptor.inter_picture_predicted_frame
            && payload_descriptor
                .layer_index
                .as_ref()
                .map_or(false, |layer_index| layer_index.temporal_layer_id != 0)
        {
            gst::warning!(CAT, imp: self, "Temporal layer ID of non-inter-predicted frame must be 0");
            // TODO: Could potentially drain here?
            self.reset(&mut state);
            self.obj().drop_packet(packet);
            return Ok(gst::FlowSuccess::Ok);
        }

        // Section 4.2, F flag
        //
        // > This MUST only be set to 1 if the I bit is also set to one; if the I bit is set to
        // > zero, then this MUST also be set to zero and ignored by receivers.
        if payload_descriptor.flexible_mode && payload_descriptor.picture_id.is_none() {
            gst::warning!(CAT, imp: self, "Flexible scalability mode but no picture ID present");
            // TODO: Could potentially drain here?
            self.reset(&mut state);
            self.obj().drop_packet(packet);
            return Ok(gst::FlowSuccess::Ok);
        }

        // If this is not the start of a picture then we have to wait for one
        if state.current_picture_payload_descriptor.is_none() && !is_start_of_picture {
            if state.last_timestamp.is_some() {
                gst::warning!(CAT, imp: self, "Waiting for start of picture");
            } else {
                gst::trace!(CAT, imp: self, "Waiting for start of picture");
            }
            // TODO: Could potentially drain here?
            self.obj().drop_packet(packet);
            self.reset(&mut state);
            return Ok(gst::FlowSuccess::Ok);
        }

        // If necessary wait for a key picture if we never saw one so far and/or request one
        // from upstream.
        if is_start_of_picture
            && !is_key_picture
            && state.last_key_picture_payload_descriptor.is_none()
        {
            if settings.request_keyframe {
                gst::debug!(CAT, imp: self, "Requesting keyframe from upstream");
                let event = gst_video::UpstreamForceKeyUnitEvent::builder()
                    .all_headers(true)
                    .build();
                let _ = self.obj().sink_pad().push_event(event);
            }

            if settings.wait_for_keyframe {
                gst::trace!(CAT, imp: self, "Waiting for keyframe");
                // TODO: Could potentially drain here?
                self.reset(&mut state);
                self.obj().drop_packet(packet);
                return Ok(gst::FlowSuccess::Ok);
            }
        }

        // Update state tracking
        if is_start_of_picture {
            assert!(state.pending_picture.is_empty());
            state.pending_picture_ext_seqnum = packet.ext_seqnum();
            state.current_picture_payload_descriptor = Some(payload_descriptor.clone());
            state.last_timestamp = Some(packet.ext_timestamp());
            if let Some(picture_id) = payload_descriptor.picture_id {
                state.last_picture_id = Some(picture_id);
            } else {
                state.last_picture_id = None;
            }

            if is_key_picture {
                state.last_key_picture_payload_descriptor = Some(payload_descriptor.clone());
            }
        }

        // If this is the start of a frame in a key picture then parse the frame header. We always
        // keep the last one around as that should theoretically be the one with the highest
        // resolution and profile.
        if payload_descriptor.start_of_frame
            && state.current_picture_payload_descriptor.as_ref().map_or(
                false,
                |current_picture_payload_descriptor| {
                    !current_picture_payload_descriptor.inter_picture_predicted_frame
                },
            )
        {
            let mut r = BitReader::endian(&mut cursor, BigEndian);
            // We assume that the beginning of the frame header fits into the first packet
            match r.parse::<FrameHeader>() {
                Ok(frame_header) => {
                    gst::trace!(CAT, imp: self, "Parsed frame header: {frame_header:?}");
                    state.current_keyframe_frame_header = Some(frame_header);
                }
                Err(err) => {
                    // Don't consider this a fatal error
                    gst::warning!(CAT, imp: self, "Failed to read frame header: {err}");
                }
            };
        }

        state
            .pending_picture
            .extend_from_slice(&payload[payload_start_index..]);

        // The marker bit is set for the last packet of a picture.
        if !packet.marker_bit() {
            return Ok(gst::FlowSuccess::Ok);
        }

        let current_picture_payload_descriptor =
            state.current_picture_payload_descriptor.take().unwrap();

        if let Some(current_keyframe_frame_header) = state.current_keyframe_frame_header.take() {
            // TODO: Could also add more information to the caps
            if current_keyframe_frame_header.keyframe_info.is_some()
                && state.last_keyframe_frame_header.as_ref().map_or(
                    true,
                    |last_keyframe_frame_header| {
                        last_keyframe_frame_header.profile != current_keyframe_frame_header.profile
                            || last_keyframe_frame_header
                                .keyframe_info
                                .as_ref()
                                .map(|keyframe_info| keyframe_info.render_size())
                                != current_keyframe_frame_header
                                    .keyframe_info
                                    .as_ref()
                                    .map(|keyframe_info| keyframe_info.render_size())
                    },
                )
            {
                let render_size = current_keyframe_frame_header
                    .keyframe_info
                    .as_ref()
                    .map(|keyframe_info| keyframe_info.render_size())
                    .unwrap();

                let caps = gst::Caps::builder("video/x-vp9")
                    .field(
                        "profile",
                        format!("{}", current_keyframe_frame_header.profile),
                    )
                    .field("width", render_size.0 as i32)
                    .field("height", render_size.1 as i32)
                    .build();

                self.obj().set_src_caps(&caps);
            }
            state.last_keyframe_frame_header = Some(current_keyframe_frame_header);
        }

        let mut buffer = gst::Buffer::from_mut_slice(mem::take(&mut state.pending_picture));
        {
            let buffer = buffer.get_mut().unwrap();

            if current_picture_payload_descriptor.inter_picture_predicted_frame {
                buffer.set_flags(gst::BufferFlags::DELTA_UNIT);
                gst::trace!(CAT, imp: self, "Finishing delta-frame");
            } else {
                gst::trace!(CAT, imp: self, "Finishing keyframe");
            }

            if state.needs_discont {
                gst::trace!(CAT, imp: self, "Setting DISCONT");
                buffer.set_flags(gst::BufferFlags::DISCONT);
                state.needs_discont = false;
            }

            // Set MARKER flag on the output so that the parser knows that this buffer ends a full
            // picture and potentially can operate a bit faster.
            buffer.set_flags(gst::BufferFlags::MARKER);
        }

        state.current_picture_payload_descriptor = None;
        state.current_keyframe_frame_header = None;

        // Set fallback caps if the first complete frame we have is not a keyframe. For keyframes,
        // caps with profile and resolution would've been set above already.
        //
        // If a keyframe is received in the future then the caps are updated above.
        if !self.obj().src_pad().has_current_caps() {
            self.obj()
                .set_src_caps(&self.obj().src_pad().pad_template_caps());
        }

        self.obj().queue_buffer(
            PacketToBufferRelation::Seqnums(state.pending_picture_ext_seqnum..=packet.ext_seqnum()),
            buffer,
        )?;

        Ok(gst::FlowSuccess::Ok)
    }
}