av1parse: Correct the pts for frames and OBUs inside a TU.

When the output alignment is smaller than the input alignment, for example when the output alignment is "FRAME" and the parser is likely connected to a decoder, the current PTS setting for AV1 frames inside a TU is not correct. For example, a TU may begin with non-displayed frames and end with a displayed frame. The current code assigns the PTS to the first non-displayed frame, which is a decode-only frame whose PTS will be discarded by the video decoder, while the last, displayed frame is left with an invalid PTS, so the video decoder needs to guess its PTS based on the frame rate and the previous frame's PTS. This is neither a decent nor a robust approach. More importantly, when the previous frames provide a DTS, the video decoder will also guess the PTS based on the previous frames' DTS and trigger a warning like: gstvideodecoder.c:3147:gst_video_decoder_prepare_finish_frame: \ <vavp9dec0> decreasing timestamp. It then sets reordered_output and puts the decoder in free run mode. We should correct the PTS for a TU: the non-displayed frames get no PTS, while the displayed one gets the correct PTS. Also, when the AV1 stream has multiple spatial layers, there is more than one displayed frame inside one TU, all with the same PTS. Note: if the input alignment is not TU aligned, we cannot know the exact PTS of this TU, so we just clear the PTS of the decode-only frames and leave the others unchanged. We also correct all the timestamps if the output is OBU aligned: every OBU gets the input buffer's PTS and DTS, and no duration. Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/3182>
2025-09-03 02:33:53 +00:00 · 2022-10-14 16:05:28 +08:00 · 2022-10-14 16:05:28 +08:00 · 242401915f
commit 242401915f
parent a1f352196f
1 changed files with 78 additions and 0 deletions
--- a/subprojects/gst-plugins-bad/gst/videoparsers/gstav1parse.c
+++ b/subprojects/gst-plugins-bad/gst/videoparsers/gstav1parse.c
@ -127,6 +127,10 @@ struct _GstAV1Parse
  gboolean header;
  gboolean keyframe;
  gboolean show_frame;
+
+  GstClockTime buffer_pts;
+  GstClockTime buffer_dts;
+  GstClockTime buffer_duration;
 };

 static GstStaticPadTemplate sinktemplate = GST_STATIC_PAD_TEMPLATE ("sink",
@ -273,6 +277,8 @@ static gboolean gst_av1_parse_set_sink_caps (GstBaseParse * parse,
    GstCaps * caps);
 static GstCaps *gst_av1_parse_get_sink_caps (GstBaseParse * parse,
    GstCaps * filter);
+static GstFlowReturn gst_av1_parse_pre_push_frame (GstBaseParse * parse,
+    GstBaseParseFrame * frame);

 /* Clear the parse state related to data kind OBUs. */
 static void
@ -283,6 +289,14 @@ gst_av1_parse_reset_obu_data_state (GstAV1Parse * self)
  self->within_one_frame = FALSE;
 }

+/* Forget the timestamps cached from the input buffer: the stored PTS, DTS
+   and duration are all set back to GST_CLOCK_TIME_NONE. */
+static void
+gst_av1_parse_reset_tu_timestamp (GstAV1Parse * self)
+{
+  self->buffer_pts = self->buffer_dts = self->buffer_duration =
+      GST_CLOCK_TIME_NONE;
+}
+
 static void
 gst_av1_parse_reset (GstAV1Parse * self)
 {
@ -307,6 +321,7 @@ gst_av1_parse_reset (GstAV1Parse * self)
  g_clear_pointer (&self->parser, gst_av1_parser_free);
  gst_adapter_clear (self->cache_out);
  gst_adapter_clear (self->frame_cache);
+  gst_av1_parse_reset_tu_timestamp (self);
 }

 static void
@ -345,6 +360,8 @@ gst_av1_parse_class_init (GstAV1ParseClass * klass)
  parse_class->start = GST_DEBUG_FUNCPTR (gst_av1_parse_start);
  parse_class->stop = GST_DEBUG_FUNCPTR (gst_av1_parse_stop);
  parse_class->handle_frame = GST_DEBUG_FUNCPTR (gst_av1_parse_handle_frame);
+  parse_class->pre_push_frame =
+      GST_DEBUG_FUNCPTR (gst_av1_parse_pre_push_frame);
  parse_class->set_sink_caps = GST_DEBUG_FUNCPTR (gst_av1_parse_set_sink_caps);
  parse_class->get_sink_caps = GST_DEBUG_FUNCPTR (gst_av1_parse_get_sink_caps);

@ -1594,11 +1611,17 @@ gst_av1_parse_handle_to_small_and_equal_align (GstBaseParse * parse,
    return GST_FLOW_ERROR;
  }

+  self->buffer_pts = GST_BUFFER_PTS (buffer);
+  self->buffer_dts = GST_BUFFER_DTS (buffer);
+  self->buffer_duration = GST_BUFFER_DURATION (buffer);
+
  consumed_before_push = 0;
  offset = 0;
  frame_complete = FALSE;
 again:
  while (offset < map_info.size) {
+    GST_BUFFER_OFFSET (buffer) = offset;
+
    res = gst_av1_parser_identify_one_obu (self->parser,
        map_info.data + offset, map_info.size - offset, &obu, &consumed);
    if (res == GST_AV1_PARSER_OK)
@ -1700,6 +1723,7 @@ again:
 out:
  gst_buffer_unmap (buffer, &map_info);
  gst_buffer_unref (buffer);
+  gst_av1_parse_reset_tu_timestamp (self);
  return ret;
 }

@ -2055,3 +2079,57 @@ gst_av1_parse_handle_frame (GstBaseParse * parse,

  return ret;
 }
+
+/* GstBaseParse pre_push_frame hook.
+ *
+ * Always requests clipping for the frame, then rewrites the timestamps of
+ * the outgoing buffer from the values cached in self->buffer_pts,
+ * self->buffer_dts and self->buffer_duration, depending on the output
+ * and input alignment:
+ *
+ *  - FRAME output: decode-only buffers lose PTS and duration. If the input
+ *    is TU aligned (plain or annex-b), shown buffers take the cached PTS
+ *    and duration, and every buffer takes the cached DTS.
+ *  - OBU output with input alignment >= FRAME: PTS and DTS come from the
+ *    cache and the duration is cleared.
+ *
+ * Always returns GST_FLOW_OK. */
+static GstFlowReturn
+gst_av1_parse_pre_push_frame (GstBaseParse * parse, GstBaseParseFrame * frame)
+{
+  GstAV1Parse *self = GST_AV1_PARSE (parse);
+  GstBuffer *outbuf;
+
+  frame->flags |= GST_BASE_PARSE_FRAME_FLAG_CLIP;
+
+  outbuf = frame->buffer;
+  if (outbuf == NULL)
+    return GST_FLOW_OK;
+
+  switch (self->align) {
+    case GST_AV1_PARSE_ALIGN_FRAME:{
+      /* A TU aligned input buffer may hold several frames. When it is
+         split into frames, only the first one inherits the PTS and the
+         rest are left without one. What we want instead is no PTS on
+         decode-only frames and the TU's PTS on every showable frame. */
+      gboolean input_is_tu =
+          (self->in_align == GST_AV1_PARSE_ALIGN_TEMPORAL_UNIT
+          || self->in_align == GST_AV1_PARSE_ALIGN_TEMPORAL_UNIT_ANNEX_B);
+      gboolean decode_only =
+          GST_BUFFER_FLAG_IS_SET (outbuf, GST_BUFFER_FLAG_DECODE_ONLY);
+
+      if (decode_only) {
+        GST_BUFFER_PTS (outbuf) = GST_CLOCK_TIME_NONE;
+        GST_BUFFER_DURATION (outbuf) = GST_CLOCK_TIME_NONE;
+      } else if (input_is_tu) {
+        GST_BUFFER_PTS (outbuf) = self->buffer_pts;
+        GST_BUFFER_DURATION (outbuf) = self->buffer_duration;
+      }
+
+      /* Without TU aligned input the cached values are not applied, so
+         the DTS of the buffer is left as it is. */
+      if (input_is_tu)
+        GST_BUFFER_DTS (outbuf) = self->buffer_dts;
+      break;
+    }
+    case GST_AV1_PARSE_ALIGN_OBU:
+      /* OBUs split out of a bigger frame or TU all share the PTS and DTS
+         of the input buffer and carry no duration. */
+      if (self->in_align >= GST_AV1_PARSE_ALIGN_FRAME) {
+        GST_BUFFER_PTS (outbuf) = self->buffer_pts;
+        GST_BUFFER_DTS (outbuf) = self->buffer_dts;
+        GST_BUFFER_DURATION (outbuf) = GST_CLOCK_TIME_NONE;
+      }
+      break;
+    default:
+      /* Other output alignments keep their timestamps untouched. */
+      break;
+  }
+
+  GST_LOG_OBJECT (parse, "Adjust the frame buffer PTS/DTS/duration."
+      " The buffer of size %" G_GSIZE_FORMAT " now with dts %"
+      GST_TIME_FORMAT ", pts %" GST_TIME_FORMAT ", duration %"
+      GST_TIME_FORMAT, gst_buffer_get_size (outbuf),
+      GST_TIME_ARGS (GST_BUFFER_DTS (outbuf)),
+      GST_TIME_ARGS (GST_BUFFER_PTS (outbuf)),
+      GST_TIME_ARGS (GST_BUFFER_DURATION (outbuf)));
+
+  return GST_FLOW_OK;
+}