qtmux: Error out on discontinuities/gaps when muxing raw audio

When muxing raw audio, we have no way of storing timestamps but are just storing a continuous stream of audio samples. If the difference between the expected and the real timestamp becomes to big, we should error out instead of silently creating files with wrong A/V sync. https://bugzilla.gnome.org/show_bug.cgi?id=780679
2025-06-05 23:18:52 +00:00 · 2017-03-29 14:01:25 +03:00 · 2017-03-29 14:01:25 +03:00 · f0163a016c
commit f0163a016c
parent e4cbefcb6c
2 changed files with 53 additions and 2 deletions
--- a/gst/isomp4/gstqtmux.c
+++ b/gst/isomp4/gstqtmux.c
@ -146,6 +146,10 @@
 GST_DEBUG_CATEGORY_STATIC (gst_qt_mux_debug);
 #define GST_CAT_DEFAULT gst_qt_mux_debug

+#ifndef ABSDIFF
+#define ABSDIFF(a, b) ((a) > (b) ? (a) - (b) : (b) - (a))
+#endif
+
 /* Hacker notes.
 *
 * The basic building blocks of MP4 files are:
@ -270,6 +274,7 @@ enum
  PROP_DO_CTTS,
  PROP_INTERLEAVE_BYTES,
  PROP_INTERLEAVE_TIME,
+  PROP_MAX_RAW_AUDIO_DRIFT,
 };

 /* some spare for header size as well */
@ -291,6 +296,7 @@ enum
 #define DEFAULT_RESERVED_BYTES_PER_SEC_PER_TRAK 550
 #define DEFAULT_INTERLEAVE_BYTES 0
 #define DEFAULT_INTERLEAVE_TIME 250*GST_MSECOND
+#define DEFAULT_MAX_RAW_AUDIO_DRIFT 40 * GST_MSECOND

 static void gst_qt_mux_finalize (GObject * object);

@ -493,6 +499,11 @@ gst_qt_mux_class_init (GstQTMuxClass * klass)
          "Interleave between streams in nanoseconds",
          0, G_MAXUINT64, DEFAULT_INTERLEAVE_TIME,
          G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS));
+  g_object_class_install_property (gobject_class, PROP_MAX_RAW_AUDIO_DRIFT,
+      g_param_spec_uint64 ("max-raw-audio-drift", "Max Raw Audio Drift",
+          "Maximum allowed drift of raw audio samples vs. timestamps in nanoseconds",
+          0, G_MAXUINT64, DEFAULT_MAX_RAW_AUDIO_DRIFT,
+          G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS));

  gstelement_class->request_new_pad =
      GST_DEBUG_FUNCPTR (gst_qt_mux_request_new_pad);
@ -508,6 +519,7 @@ gst_qt_mux_pad_reset (GstQTPad * qtpad)
  qtpad->sample_size = 0;
  qtpad->sync = FALSE;
  qtpad->last_dts = 0;
+  qtpad->sample_offset = 0;
  qtpad->dts_adjustment = GST_CLOCK_TIME_NONE;
  qtpad->first_ts = GST_CLOCK_TIME_NONE;
  qtpad->first_dts = GST_CLOCK_TIME_NONE;
@ -655,6 +667,7 @@ gst_qt_mux_init (GstQTMux * qtmux, GstQTMuxClass * qtmux_klass)
      DEFAULT_RESERVED_BYTES_PER_SEC_PER_TRAK;
  qtmux->interleave_bytes = DEFAULT_INTERLEAVE_BYTES;
  qtmux->interleave_time = DEFAULT_INTERLEAVE_TIME;
+  qtmux->max_raw_audio_drift = DEFAULT_MAX_RAW_AUDIO_DRIFT;

  /* always need this */
  qtmux->context =
@ -3318,14 +3331,27 @@ gst_qt_mux_add_buffer (GstQTMux * qtmux, GstQTPad * pad, GstBuffer * buf)

  /* fragments only deal with 1 buffer == 1 chunk (== 1 sample) */
  if (pad->sample_size && !qtmux->fragment_sequence) {
+    GstClockTime expected_timestamp;
+
    /* Constant size packets: usually raw audio (with many samples per
       buffer (= chunk)), but can also be fixed-packet-size codecs like ADPCM
     */
    sample_size = pad->sample_size;
    if (gst_buffer_get_size (last_buf) % sample_size != 0)
      goto fragmented_sample;
+
    /* note: qt raw audio storage warps it implicitly into a timewise
-     * perfect stream, discarding buffer times */
+     * perfect stream, discarding buffer times.
+     * If the difference between the current PTS and the expected one
+     * becomes too big, we error out: there was a gap and we have no way to
+     * represent that, causing A/V sync to be off */
+    expected_timestamp =
+        gst_util_uint64_scale (pad->sample_offset, GST_SECOND,
+        atom_trak_get_timescale (pad->trak)) + pad->first_ts;
+    if (ABSDIFF (GST_BUFFER_DTS_OR_PTS (last_buf),
+            expected_timestamp) > qtmux->max_raw_audio_drift)
+      goto raw_audio_timestamp_drift;
+
    if (GST_BUFFER_DURATION (last_buf) != GST_CLOCK_TIME_NONE) {
      nsamples = gst_util_uint64_scale_round (GST_BUFFER_DURATION (last_buf),
          atom_trak_get_timescale (pad->trak), GST_SECOND);
@ -3339,7 +3365,9 @@ gst_qt_mux_add_buffer (GstQTMux * qtmux, GstQTPad * pad, GstBuffer * buf)

    /* timescale = samplerate */
    scaled_duration = 1;
-    pad->last_dts += duration;
+    pad->last_dts =
+        pad->first_dts + gst_util_uint64_scale_round (pad->sample_offset +
+        nsamples, GST_SECOND, atom_trak_get_timescale (pad->trak));
  } else {
    nsamples = 1;
    sample_size = gst_buffer_get_size (last_buf);
@ -3371,6 +3399,8 @@ gst_qt_mux_add_buffer (GstQTMux * qtmux, GstQTPad * pad, GstBuffer * buf)
    }
  }

+  pad->sample_offset += nsamples;
+
  /* for computing the avg bitrate */
  pad->total_bytes += gst_buffer_get_size (last_buf);
  pad->total_duration += duration;
@ -3473,6 +3503,18 @@ fragmented_sample:
        ("Audio buffer contains fragmented sample."));
    goto bail;
  }
+raw_audio_timestamp_drift:
+  {
+    /* TODO: Could in theory be implemented with edit lists */
+    GST_ELEMENT_ERROR (qtmux, STREAM, MUX, (NULL),
+        ("Audio stream timestamps are drifting (got %" GST_TIME_FORMAT
+            ", expected %" GST_TIME_FORMAT "). This is not supported yet!",
+            GST_TIME_ARGS (GST_BUFFER_DTS_OR_PTS (last_buf)),
+            GST_TIME_ARGS (gst_util_uint64_scale (pad->sample_offset,
+                    GST_SECOND,
+                    atom_trak_get_timescale (pad->trak)) + pad->first_ts)));
+    goto bail;
+  }
 no_pts:
  {
    GST_ELEMENT_ERROR (qtmux, STREAM, MUX, (NULL), ("Buffer has no PTS."));
@ -4923,6 +4965,9 @@ gst_qt_mux_get_property (GObject * object,
    case PROP_INTERLEAVE_TIME:
      g_value_set_uint64 (value, qtmux->interleave_time);
      break;
+    case PROP_MAX_RAW_AUDIO_DRIFT:
+      g_value_set_uint64 (value, qtmux->max_raw_audio_drift);
+      break;
    default:
      G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
      break;
@ -5008,6 +5053,9 @@ gst_qt_mux_set_property (GObject * object,
      qtmux->interleave_time = g_value_get_uint64 (value);
      qtmux->interleave_time_set = TRUE;
      break;
+    case PROP_MAX_RAW_AUDIO_DRIFT:
+      qtmux->max_raw_audio_drift = g_value_get_uint64 (value);
+      break;
    default:
      G_OBJECT_WARN_INVALID_PROPERTY_ID (object, prop_id, pspec);
      break;
--- a/gst/isomp4/gstqtmux.h
+++ b/gst/isomp4/gstqtmux.h
@ -108,6 +108,7 @@ struct _GstQTPad
  GstBuffer *last_buf;
  /* dts of last_buf */
  GstClockTime last_dts;
+  guint64 sample_offset;

  /* This is compensate for CTTS */
  GstClockTime dts_adjustment;
@ -254,6 +255,8 @@ struct _GstQTMux
  GstClockTime interleave_time;
  gboolean interleave_bytes_set, interleave_time_set;

+  GstClockTime max_raw_audio_drift;
+
  /* Reserved minimum MOOV size in bytes
   * This is converted from reserved_max_duration
   * using the bytes/trak/sec estimate */