qtdemux: Add audio clipping meta when playing gapless m4a content

Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/4200>
Author:    Carlos Rafael Giani
Date:      2023-03-03 12:10:38 +01:00
Committed: GStreamer Marge Bot
Parent:    51ebda4df5
Commit:    0071c97128
7 changed files with 775 additions and 3 deletions
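For context, a minimal sketch (hypothetical helper, not part of this commit) of how a downstream element can read the GstAudioClippingMeta that qtdemux now attaches to outgoing audio buffers; the start/end figures tell the consumer how many priming / trailing padding samples to drop from the decoded output:

/* hypothetical helper, illustrative only */
#include <gst/gst.h>
#include <gst/audio/audio.h>

static void
print_clipping_info (GstBuffer * buf)
{
  GstAudioClippingMeta *cmeta = gst_buffer_get_audio_clipping_meta (buf);

  if (cmeta != NULL && cmeta->format == GST_FORMAT_DEFAULT) {
    /* start / end: number of samples to clip from the head / tail of the
     * audio that this buffer decodes to */
    g_print ("clip %" G_GUINT64_FORMAT " leading / %" G_GUINT64_FORMAT
        " trailing samples\n", cmeta->start, cmeta->end);
  }
}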


@@ -386,6 +386,8 @@ static gboolean gst_qtdemux_stream_update_segment (GstQTDemux * qtdemux,
static void gst_qtdemux_send_gap_for_segment (GstQTDemux * demux,
QtDemuxStream * stream, gint segment_index, GstClockTime pos);
static void qtdemux_check_if_is_gapless_audio (GstQTDemux * qtdemux);
static gboolean qtdemux_pull_mfro_mfra (GstQTDemux * qtdemux);
static void check_update_duration (GstQTDemux * qtdemux, GstClockTime duration);
@@ -659,7 +661,12 @@ gst_qtdemux_get_duration (GstQTDemux * qtdemux, GstClockTime * duration)
if (qtdemux->duration != 0 &&
qtdemux->duration != G_MAXINT64 && qtdemux->timescale != 0) {
*duration = QTTIME_TO_GSTTIME (qtdemux, qtdemux->duration);
/* If this is single-stream audio media with gapless data,
* report the duration of the valid subset of the overall data. */
if (qtdemux->gapless_audio_info.type != GAPLESS_AUDIO_INFO_TYPE_NONE)
*duration = qtdemux->gapless_audio_info.valid_duration;
else
*duration = QTTIME_TO_GSTTIME (qtdemux, qtdemux->duration);
res = TRUE;
} else {
*duration = GST_CLOCK_TIME_NONE;
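A worked example of the adjusted duration report, with figures borrowed from the gapless test files added at the end of this commit (the helper example_gapless_duration() is hypothetical, not part of the patch):

/* hypothetical helper, illustrative only */
#include <gst/gst.h>

static GstClockTime
example_gapless_duration (void)
{
  guint64 num_valid_pcm_frames = 200000;   /* 198 * 1024 - 2112 - 640 */
  gint sample_rate = 48000;

  /* ~4.1667 s (4166666666 ns); without the gapless info the query would
   * report the full 202752-sample length, i.e. 4.224 s. */
  return gst_util_uint64_scale_int (num_valid_pcm_frames, GST_SECOND,
      sample_rate);
}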
@@ -2048,6 +2055,11 @@ gst_qtdemux_reset (GstQTDemux * qtdemux, gboolean hard)
qtdemux->have_group_id = FALSE;
qtdemux->group_id = G_MAXUINT;
qtdemux->gapless_audio_info.type = GAPLESS_AUDIO_INFO_TYPE_NONE;
qtdemux->gapless_audio_info.num_start_padding_pcm_frames = 0;
qtdemux->gapless_audio_info.num_end_padding_pcm_frames = 0;
qtdemux->gapless_audio_info.num_valid_pcm_frames = 0;
g_queue_clear_full (&qtdemux->protection_event_queue,
(GDestroyNotify) gst_event_unref);
@@ -5507,6 +5519,14 @@ gst_qtdemux_stream_update_segment (GstQTDemux * qtdemux, QtDemuxStream * stream,
stream->segment.time = time;
stream->segment.position = stream->segment.start;
/* Gapless audio requires adjustments to the segment
* to reflect the actual playtime length. In
* particular, this must exclude padding data. */
if (qtdemux->gapless_audio_info.type != GAPLESS_AUDIO_INFO_TYPE_NONE) {
stream->segment.stop = stream->segment.start +
qtdemux->gapless_audio_info.valid_duration;
}
GST_DEBUG_OBJECT (stream->pad, "New segment: %" GST_SEGMENT_FORMAT,
&stream->segment);
@@ -6414,6 +6434,83 @@ gst_qtdemux_push_buffer (GstQTDemux * qtdemux, QtDemuxStream * stream,
GST_ERROR_OBJECT (qtdemux, "failed to attach aavd metadata to buffer");
}
if (qtdemux->gapless_audio_info.type != GAPLESS_AUDIO_INFO_TYPE_NONE) {
guint64 num_start_padding_pcm_frames;
guint64 audio_sample_offset;
guint64 audio_sample_offset_end;
guint64 start_of_trailing_padding;
guint64 start_clip = 0, end_clip = 0;
guint64 total_num_clipped_samples;
GstClockTime timestamp_decrement;
/* Attach GstAudioClippingMeta to exclude padding data. */
num_start_padding_pcm_frames =
qtdemux->gapless_audio_info.num_start_padding_pcm_frames;
audio_sample_offset = stream->sample_index * stream->stts_duration;
audio_sample_offset_end = audio_sample_offset + stream->stts_duration;
start_of_trailing_padding = num_start_padding_pcm_frames +
qtdemux->gapless_audio_info.num_valid_pcm_frames;
if (audio_sample_offset < num_start_padding_pcm_frames) {
guint64 num_padding_audio_samples =
num_start_padding_pcm_frames - audio_sample_offset;
start_clip = MIN (num_padding_audio_samples, stream->stts_duration);
}
timestamp_decrement = qtdemux->gapless_audio_info.start_padding_duration;
if (audio_sample_offset >= start_of_trailing_padding) {
/* This case happens when the buffer is located fully past
* the beginning of the padding area at the end of the stream.
* Add the end padding to the decrement amount to ensure
* continuous timestamps when transitioning from one piece of
* gapless media to the next. */
end_clip = stream->stts_duration;
timestamp_decrement += qtdemux->gapless_audio_info.end_padding_duration;
} else if (audio_sample_offset_end >= start_of_trailing_padding) {
/* This case happens when the beginning of the padding area that
* is located at the end of the stream intersects the buffer. */
end_clip = audio_sample_offset_end - start_of_trailing_padding;
}
total_num_clipped_samples = start_clip + end_clip;
if (total_num_clipped_samples != 0) {
GST_DEBUG_OBJECT (qtdemux, "adding audio clipping meta: start / "
"end clip: %" G_GUINT64_FORMAT " / %" G_GUINT64_FORMAT,
start_clip, end_clip);
gst_buffer_add_audio_clipping_meta (buf, GST_FORMAT_DEFAULT,
start_clip, end_clip);
if (total_num_clipped_samples >= stream->stts_duration) {
GST_BUFFER_DURATION (buf) = 0;
GST_BUFFER_FLAG_SET (buf, GST_BUFFER_FLAG_DECODE_ONLY);
GST_BUFFER_FLAG_SET (buf, GST_BUFFER_FLAG_DROPPABLE);
} else {
guint64 num_valid_samples =
stream->stts_duration - total_num_clipped_samples;
GST_BUFFER_DURATION (buf) =
QTSTREAMTIME_TO_GSTTIME (stream, num_valid_samples);
}
}
/* The timestamps need to be shifted to factor in the skipped padding data. */
if (GST_BUFFER_PTS_IS_VALID (buf)) {
GstClockTime ts = GST_BUFFER_PTS (buf);
GST_BUFFER_PTS (buf) =
(ts >= timestamp_decrement) ? (ts - timestamp_decrement) : 0;
}
if (GST_BUFFER_DTS_IS_VALID (buf)) {
GstClockTime ts = GST_BUFFER_DTS (buf);
GST_BUFFER_DTS (buf) =
(ts >= timestamp_decrement) ? (ts - timestamp_decrement) : 0;
}
}
if (stream->protected && (stream->protection_scheme_type == FOURCC_cenc
|| stream->protection_scheme_type == FOURCC_cbcs)) {
GstStructure *crypto_info;
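To make the gapless clipping arithmetic in the hunk above concrete, here is a standalone restatement using the figures of the iTunes test file added at the end of this commit (1024 samples per frame, 2112 priming samples, 200000 valid samples, 640 trailing samples); the helper example_clip() and its constants are illustrative only:

/* hypothetical helper, illustrative only */
#include <glib.h>

static void
example_clip (guint64 frame_index)
{
  const guint64 stts_duration = 1024;       /* samples per AAC frame */
  const guint64 start_padding = 2112;       /* priming samples */
  const guint64 num_valid = 200000;         /* valid PCM frames */
  guint64 offset = frame_index * stts_duration;
  guint64 offset_end = offset + stts_duration;
  guint64 start_of_trailing_padding = start_padding + num_valid;  /* 202112 */
  guint64 start_clip = 0, end_clip = 0;

  if (offset < start_padding)
    start_clip = MIN (start_padding - offset, stts_duration);

  if (offset >= start_of_trailing_padding)
    end_clip = stts_duration;
  else if (offset_end >= start_of_trailing_padding)
    end_clip = offset_end - start_of_trailing_padding;

  /* frames 0, 1: start_clip = 1024 (fully clipped)
   * frame 2:     start_clip = 64   (960 valid samples survive)
   * frame 197:   end_clip   = 640  (384 valid samples survive) */
  g_print ("frame %" G_GUINT64_FORMAT ": clip %" G_GUINT64_FORMAT " / %"
      G_GUINT64_FORMAT "\n", frame_index, start_clip, end_clip);
}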
@@ -7565,6 +7662,129 @@ gst_qtdemux_send_gap_for_segment (GstQTDemux * demux,
}
}
static void
qtdemux_check_if_is_gapless_audio (GstQTDemux * qtdemux)
{
QtDemuxStream *stream;
if (QTDEMUX_N_STREAMS (qtdemux) != 1)
goto incompatible_stream;
stream = QTDEMUX_NTH_STREAM (qtdemux, 0);
if (stream->subtype != FOURCC_soun || stream->n_segments != 1)
goto incompatible_stream;
/* Gapless audio info from revdns tags (most notably iTunSMPB) is
* detected in the main udta node. If it isn't present, try as
* fallback to recognize the encoder name, and apply known priming
* and padding quantities specific to the encoder. */
if (qtdemux->gapless_audio_info.type == GAPLESS_AUDIO_INFO_TYPE_NONE) {
const gchar *orig_encoder_name = NULL;
if (gst_tag_list_peek_string_index (qtdemux->tag_list, GST_TAG_ENCODER, 0,
&orig_encoder_name) && orig_encoder_name != NULL) {
gchar *lowercase_encoder_name = g_ascii_strdown (orig_encoder_name, -1);
if (strstr (lowercase_encoder_name, "nero") != NULL)
qtdemux->gapless_audio_info.type = GAPLESS_AUDIO_INFO_TYPE_NERO;
g_free (lowercase_encoder_name);
switch (qtdemux->gapless_audio_info.type) {
case GAPLESS_AUDIO_INFO_TYPE_NERO:{
guint64 total_length;
guint64 valid_length;
guint64 start_padding;
/* The Nero AAC encoder always uses a lead-in of 1600 PCM frames.
* Also, in Nero AAC's case, stream->duration contains the number
* of PCM frames with start padding but without end padding.
* The decoder delay equals 1 frame length, which is covered by
* factoring stream->stts_duration into the start padding. */
start_padding = 1600 + stream->stts_duration;
if (G_UNLIKELY (stream->duration < start_padding)) {
GST_ERROR_OBJECT (qtdemux, "stream duration is %" G_GUINT64_FORMAT
" but start_padding is %" G_GUINT64_FORMAT, stream->duration,
start_padding);
goto invalid_gapless_audio_info;
}
valid_length = stream->duration - start_padding;
qtdemux->gapless_audio_info.num_start_padding_pcm_frames =
start_padding;
qtdemux->gapless_audio_info.num_valid_pcm_frames = valid_length;
total_length = stream->n_samples * stream->stts_duration;
if (G_LIKELY (total_length >= valid_length)) {
guint64 total_padding = total_length - valid_length;
if (G_UNLIKELY (total_padding < start_padding)) {
GST_ERROR_OBJECT (qtdemux, "total_padding is %" G_GUINT64_FORMAT
" but start_padding is %" G_GUINT64_FORMAT, total_padding,
start_padding);
goto invalid_gapless_audio_info;
}
qtdemux->gapless_audio_info.num_end_padding_pcm_frames =
total_padding - start_padding;
} else {
qtdemux->gapless_audio_info.num_end_padding_pcm_frames = 0;
}
GST_DEBUG_OBJECT (qtdemux, "media was encoded with Nero AAC encoder; "
"using encoder specific lead-in and padding figures");
}
default:
break;
}
}
}
if (qtdemux->gapless_audio_info.type != GAPLESS_AUDIO_INFO_TYPE_NONE) {
qtdemux->gapless_audio_info.start_padding_duration =
QTSTREAMTIME_TO_GSTTIME (stream,
qtdemux->gapless_audio_info.num_start_padding_pcm_frames);
qtdemux->gapless_audio_info.end_padding_duration =
QTSTREAMTIME_TO_GSTTIME (stream,
qtdemux->gapless_audio_info.num_end_padding_pcm_frames);
qtdemux->gapless_audio_info.valid_duration =
QTSTREAMTIME_TO_GSTTIME (stream,
qtdemux->gapless_audio_info.num_valid_pcm_frames);
}
GST_DEBUG_OBJECT (qtdemux, "found valid gapless audio info: num start / end "
"PCM padding frames: %" G_GUINT64_FORMAT " / %" G_GUINT64_FORMAT "; "
"start / end padding durations: %" GST_TIME_FORMAT " / %" GST_TIME_FORMAT
"; num valid PCM frames: %" G_GUINT64_FORMAT "; valid duration: %"
GST_TIME_FORMAT, qtdemux->gapless_audio_info.num_start_padding_pcm_frames,
qtdemux->gapless_audio_info.num_end_padding_pcm_frames,
GST_TIME_ARGS (qtdemux->gapless_audio_info.start_padding_duration),
GST_TIME_ARGS (qtdemux->gapless_audio_info.end_padding_duration),
qtdemux->gapless_audio_info.num_valid_pcm_frames,
GST_TIME_ARGS (qtdemux->gapless_audio_info.valid_duration));
return;
incompatible_stream:
if (G_UNLIKELY (qtdemux->gapless_audio_info.type !=
GAPLESS_AUDIO_INFO_TYPE_NONE)) {
GST_WARNING_OBJECT (qtdemux,
"media contains gapless audio info, but it is not suitable for "
"gapless audio playback (media must be audio-only, single-stream, "
"single-segment; ignoring unusable gapless info");
qtdemux->gapless_audio_info.type = GAPLESS_AUDIO_INFO_TYPE_NONE;
}
return;
invalid_gapless_audio_info:
GST_WARNING_OBJECT (qtdemux,
"media contains invalid/unusable gapless audio info");
return;
}
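A worked example of the Nero branch in qtdemux_check_if_is_gapless_audio() above, with figures matching the Nero test files added at the end of this commit (stts_duration 1024, stream->duration 202624, 198 frames); the helper example_nero_padding() is a hypothetical restatement, not part of the patch:

/* hypothetical helper, illustrative only */
#include <glib.h>

static void
example_nero_padding (void)
{
  const guint64 stts_duration = 1024;
  const guint64 stream_duration = 202624;   /* start padding + valid frames */
  const guint64 n_samples = 198;            /* number of AAC frames */

  guint64 start_padding = 1600 + stts_duration;               /* 2624 */
  guint64 valid_length = stream_duration - start_padding;     /* 200000 */
  guint64 total_length = n_samples * stts_duration;           /* 202752 */
  guint64 end_padding =
      (total_length - valid_length) - start_padding;          /* 128 */

  g_print ("start / end padding: %" G_GUINT64_FORMAT " / %" G_GUINT64_FORMAT
      ", valid: %" G_GUINT64_FORMAT "\n", start_padding, end_padding,
      valid_length);
}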
static GstFlowReturn
gst_qtdemux_chain (GstPad * sinkpad, GstObject * parent, GstBuffer * inbuf)
{
@@ -14009,6 +14229,8 @@ qtdemux_prepare_streams (GstQTDemux * qtdemux)
}
}
qtdemux_check_if_is_gapless_audio (qtdemux);
return ret;
}


@@ -54,6 +54,7 @@ typedef struct _QtDemuxSample QtDemuxSample;
typedef struct _QtDemuxSegment QtDemuxSegment;
typedef struct _QtDemuxRandomAccessEntry QtDemuxRandomAccessEntry;
typedef struct _QtDemuxStreamStsdEntry QtDemuxStreamStsdEntry;
typedef struct _QtDemuxGaplessAudioInfo QtDemuxGaplessAudioInfo;
typedef GstBuffer * (*QtDemuxProcessFunc)(GstQTDemux * qtdemux, QtDemuxStream * stream, GstBuffer * buf);
@@ -78,6 +79,36 @@ typedef enum {
VARIANT_MSS_FRAGMENTED,
} Variant;
typedef enum {
/* No valid gapless audio info present. Types other than this one
* are used only if all of these apply:
*
* 1. There is embedded gapless audio information available
* 2. Only one stream exists
* 3. Said stream has only one segment
* 4. Said stream is an audio stream
*/
GAPLESS_AUDIO_INFO_TYPE_NONE,
/* Using information from the iTunes iTunSMPB revdns tag. */
GAPLESS_AUDIO_INFO_TYPE_ITUNES,
/* Using known Nero encoder delay information. */
GAPLESS_AUDIO_INFO_TYPE_NERO
} QtDemuxGaplessAudioInfoType;
/* Gapless audio information, only used for single-stream audio-only media. */
struct _QtDemuxGaplessAudioInfo {
QtDemuxGaplessAudioInfoType type;
guint64 num_start_padding_pcm_frames;
guint64 num_end_padding_pcm_frames;
guint64 num_valid_pcm_frames;
/* PCM frame amounts converted to nanoseconds. */
GstClockTime start_padding_duration;
GstClockTime end_padding_duration;
GstClockTime valid_duration;
};
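As an illustration (hypothetical initializer, not part of the patch): how this struct ends up filled for the iTunes test file used in the unit tests below (2112 priming / 640 trailing / 200000 valid samples at 48 kHz):

/* hypothetical example values, assuming the struct definition above */
static const QtDemuxGaplessAudioInfo example_info = {
  .type = GAPLESS_AUDIO_INFO_TYPE_ITUNES,
  .num_start_padding_pcm_frames = 2112,
  .num_end_padding_pcm_frames = 640,
  .num_valid_pcm_frames = 200000,
  .start_padding_duration = 44000000,     /* 2112 / 48000 s = 44 ms */
  .end_padding_duration = 13333333,       /* 640 / 48000 s ~= 13.3 ms */
  .valid_duration = 4166666666,           /* 200000 / 48000 s ~= 4.1667 s */
};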
struct _GstQTDemux {
GstElement element;
@@ -177,6 +208,8 @@ struct _GstQTDemux {
gint64 chapters_track_id;
QtDemuxGaplessAudioInfo gapless_audio_info;
/* protection support */
GPtrArray *protection_system_ids; /* Holds identifiers of all content protection systems for all tracks */
GQueue protection_event_queue; /* holds copy of upstream protection events */


@@ -747,12 +747,111 @@ qtdemux_tag_add_revdns (GstQTDemux * demux, GstTagList * taglist,
break;
}
}
if (i == G_N_ELEMENTS (tags))
goto unknown_tag;
/* Some tags might not actually be used for metadata about the media,
* but for other purposes. One such tag is iTunSMPB, which contains
* padding information for gapless playback. Scan these separately. */
if (i == G_N_ELEMENTS (tags)) {
if (!g_ascii_strncasecmp ("iTunSMPB", namestr, 8)) {
/* iTunSMPB tag format goes as follows:
*
* " 00000000 xxxxxxxx yyyyyyyy zzzzzzzzzzzzzzzz 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000"
*
* The data is actually an ASCII string containing these hex fields.
* The description above is _not_ a description of a binary format!
* These need to be parsed with g_ascii_strtoull() and base 16.
*
* (The quotes are not part of it; they just emphasize the
* whitespace at the beginning of the string).
*
* Only the fields marked with x/y/z are of interest here.
*
* The x field is the priming, in samples.
* These are the padding samples at the beginning of the stream.
*
* The y field is the remainder, in samples.
* These are the padding samples at the end of the stream.
*
* The z field is the number of valid PCM frames, excluding the
* priming and remainder. (In other words, the number of PCM
* frames that make up the actual audio, without the padding.)
*
* The data starts at offset 16. All access to it must therefore skip
* the first 16 bytes.
*/
const gsize start_offset = 16;
const gsize priming_offset = start_offset + 10;
const gsize remainder_offset = start_offset + 19;
const gsize num_valid_pcm_frames_offset = start_offset + 28;
const gsize total_length = 44;
const gchar *str;
guint64 priming;
guint64 remainder;
guint64 num_valid_pcm_frames;
/* Temporary buffer for g_ascii_strtoull() calls.
* Add extra +1 space for nullbyte. */
gchar tmp[16 + 1];
/* Use the iTunSMPB info if no other info has been found yet. */
if (demux->gapless_audio_info.type != GAPLESS_AUDIO_INFO_TYPE_NONE) {
GST_DEBUG_OBJECT (demux, "iTunSMPB information found, "
"but other gapless audio info was already read");
goto finish;
}
if (G_UNLIKELY (datasize < (start_offset + total_length))) {
GST_WARNING_OBJECT (demux,
"iTunSMPB tag data size too small - not parsing");
goto finish;
}
str = (gchar *) ((guint8 *) data->data);
#define PARSE_ITUNSMPB_FIELD(FIELD_NAME, NUM_DIGITS) \
G_STMT_START \
{ \
gint str_idx; \
\
for (str_idx = 0; str_idx < (NUM_DIGITS); ++str_idx) { \
gchar ch = str[FIELD_NAME ## _offset + str_idx]; \
if (!g_ascii_isxdigit (ch)) { \
GST_WARNING_OBJECT (demux, #FIELD_NAME " field in iTunSMPB " \
"tag data has invalid character '%c'", ch); \
goto finish; \
} \
tmp[str_idx] = ch; \
} \
tmp[NUM_DIGITS] = 0; \
\
FIELD_NAME = g_ascii_strtoull (tmp, NULL, 16); \
} \
G_STMT_END
PARSE_ITUNSMPB_FIELD (priming, 8);
PARSE_ITUNSMPB_FIELD (remainder, 8);
PARSE_ITUNSMPB_FIELD (num_valid_pcm_frames, 16);
#undef PARSE_ITUNSMPB_FIELD
GST_DEBUG_OBJECT (demux, "iTunSMPB information: priming %"
G_GUINT64_FORMAT " remainder %" G_GUINT64_FORMAT
" num valid PCM frames %" G_GUINT64_FORMAT, priming, remainder,
num_valid_pcm_frames);
demux->gapless_audio_info.type = GAPLESS_AUDIO_INFO_TYPE_ITUNES;
demux->gapless_audio_info.num_start_padding_pcm_frames = priming;
demux->gapless_audio_info.num_end_padding_pcm_frames = remainder;
demux->gapless_audio_info.num_valid_pcm_frames = num_valid_pcm_frames;
} else {
goto unknown_tag;
}
}
} else {
goto unknown_tag;
}
finish:
return;
/* errors */


@@ -27,6 +27,8 @@
#include <glib/gprintf.h>
#include <gst/check/check.h>
#include <gst/app/gstappsink.h>
#include <gst/audio/audio.h>
#define TEST_FILE_PREFIX GST_TEST_FILES_PATH G_DIR_SEPARATOR_S
@@ -1200,6 +1202,419 @@ GST_START_TEST (test_qtdemux_mss_fragment)
GST_END_TEST;
typedef struct
{
const gchar *filename;
/* Total number of AAC frames, including any and all dummy/empty/padding frames. */
guint num_aac_frames;
/* In AAC, this is 1024 in the vast majority of the cases.
* AAC can also use 960 samples per frame, but this is rare. */
guint num_samples_per_frame;
/* How many padding samples to expect at the beginning and the end.
* The number of padding samples can exceed the size of a frame.
* This means that the first and/or last N frame(s) can consist
* entirely of padding samples and thus need to be thrown away. */
guint num_start_padding_samples;
guint num_end_padding_samples;
guint sample_rate;
/* Some encoders produce data whose last frame uses a different
* (smaller) stts value to handle the padding at the end. Data
* produced by such encoders will not get a clipmeta added at the
* end. When using test data produced by such an encoder, this
* must be set to FALSE, otherwise it must be set to TRUE.
* Notably, anything that produces an iTunSMPB tag (iTunes itself
* as well as newer Nero encoders for example) will cause such
* a clipmeta to be added. */
gboolean expect_clipmeta_at_end;
/* Total number of samples available, with / without padding
* samples factored in. */
guint64 num_samples_with_padding;
guint64 num_samples_without_padding;
/* The index of the first / last frame that contains valid samples.
* Indices start at 0. Valid range is [0, num_aac_frames - 1].
* In virtually all cases, when the AAC data was encoded with iTunes,
* the first and last valid frames will be partially clipped. */
guint first_frame_with_valid_samples;
guint last_frame_with_valid_samples;
guint64 num_samples_in_first_valid_frame;
guint64 num_samples_in_last_valid_frame;
GstClockTime total_duration_without_padding;
GstElement *appsink;
} GaplessTestInfo;
static void
precalculate_gapless_test_factors (GaplessTestInfo * info)
{
info->num_samples_with_padding = info->num_aac_frames *
info->num_samples_per_frame;
info->num_samples_without_padding = info->num_samples_with_padding -
info->num_start_padding_samples - info->num_end_padding_samples;
info->first_frame_with_valid_samples = info->num_start_padding_samples /
info->num_samples_per_frame;
info->last_frame_with_valid_samples = (info->num_samples_with_padding -
info->num_end_padding_samples) / info->num_samples_per_frame;
info->num_samples_in_first_valid_frame =
(info->first_frame_with_valid_samples + 1) * info->num_samples_per_frame -
info->num_start_padding_samples;
info->num_samples_in_last_valid_frame =
(info->num_samples_with_padding - info->num_end_padding_samples) -
info->last_frame_with_valid_samples * info->num_samples_per_frame;
/* The total actual playtime duration. */
info->total_duration_without_padding =
gst_util_uint64_scale_int (info->num_samples_without_padding, GST_SECOND,
info->sample_rate);
GST_DEBUG ("num_samples_with_padding %" G_GUINT64_FORMAT
" num_samples_without_padding %" G_GUINT64_FORMAT
" first_frame_with_valid_samples %u"
" last_frame_with_valid_samples %u"
" num_samples_in_first_valid_frame %" G_GUINT64_FORMAT
" num_samples_in_last_valid_frame %" G_GUINT64_FORMAT
" total_duration_without_padding %" G_GUINT64_FORMAT,
info->num_samples_with_padding, info->num_samples_without_padding,
info->first_frame_with_valid_samples, info->last_frame_with_valid_samples,
info->num_samples_in_first_valid_frame,
info->num_samples_in_last_valid_frame,
info->total_duration_without_padding);
}
static void
setup_gapless_itunes_test_info (GaplessTestInfo * info)
{
info->filename =
"sine-1kHztone-48kHzrate-mono-s32le-200000samples-itunes.m4a";
info->num_aac_frames = 198;
info->num_samples_per_frame = 1024;
info->sample_rate = 48000;
info->expect_clipmeta_at_end = TRUE;
info->num_start_padding_samples = 2112;
info->num_end_padding_samples = 640;
precalculate_gapless_test_factors (info);
}
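For the iTunes file above, precalculate_gapless_test_factors() therefore resolves to the following values (worked out here for reference, not additional test code):

/*   num_samples_with_padding         = 198 * 1024            = 202752
 *   num_samples_without_padding      = 202752 - 2112 - 640   = 200000
 *   first_frame_with_valid_samples   = 2112 / 1024           = 2
 *   last_frame_with_valid_samples    = (202752 - 640) / 1024 = 197
 *   num_samples_in_first_valid_frame = 3 * 1024 - 2112       = 960
 *   num_samples_in_last_valid_frame  = 202112 - 197 * 1024   = 384
 *   total_duration_without_padding   = 200000 / 48000 s      ~= 4.1667 s  */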
static void
setup_gapless_nero_with_itunsmpb_test_info (GaplessTestInfo * info)
{
info->filename =
"sine-1kHztone-48kHzrate-mono-s32le-200000samples-nero-with-itunsmpb.m4a";
info->num_aac_frames = 198;
info->num_samples_per_frame = 1024;
info->sample_rate = 48000;
info->expect_clipmeta_at_end = TRUE;
info->num_start_padding_samples = 2624;
info->num_end_padding_samples = 128;
precalculate_gapless_test_factors (info);
}
static void
setup_gapless_nero_without_itunsmpb_test_info (GaplessTestInfo * info)
{
info->filename =
"sine-1kHztone-48kHzrate-mono-s32le-200000samples-nero-without-itunsmpb.m4a";
info->num_aac_frames = 198;
info->num_samples_per_frame = 1024;
info->sample_rate = 48000;
/* Older Nero AAC encoders produce a different stts value for the
* last frame to skip padding data. In this file, all frames except
* the last one use an stts value of 1024, while the last frame
* uses an stts value of 896. Consequently, the logic inside qtdemux
* won't deem it necessary to add an audioclipmeta - there are no
* padding samples to clip. */
info->expect_clipmeta_at_end = FALSE;
info->num_start_padding_samples = 2624;
info->num_end_padding_samples = 128;
precalculate_gapless_test_factors (info);
}
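The stts arithmetic behind expect_clipmeta_at_end = FALSE above, worked out for reference (not test code):

/* 197 frames * 1024 samples + 1 frame * 896 samples = 202624 samples covered
 * by the stts table, i.e. the 1024 - 896 = 128 trailing padding samples are
 * already excluded by the container itself (2624 + 200000 = 202624), so
 * qtdemux has nothing left to clip at the end and adds no clipping meta
 * there. */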
static void
check_parsed_aac_frame (GaplessTestInfo * info, guint frame_num)
{
GstClockTime expected_pts = GST_CLOCK_TIME_NONE;
GstClockTime expected_duration = GST_CLOCK_TIME_NONE;
GstClockTimeDiff ts_delta;
guint64 expected_sample_offset;
guint64 expected_num_samples;
gboolean expect_audioclipmeta = FALSE;
guint64 expected_audioclipmeta_start = 0;
guint64 expected_audioclipmeta_end = 0;
GstSample *sample;
GstBuffer *buffer;
GstAudioClippingMeta *audioclip_meta;
if (frame_num < info->first_frame_with_valid_samples) {
/* Frame is at the beginning and is fully clipped. */
expected_sample_offset = 0;
expected_num_samples = 0;
expected_audioclipmeta_start = info->num_samples_per_frame;
expected_audioclipmeta_end = 0;
} else if (frame_num == info->first_frame_with_valid_samples) {
/* Frame is at the beginning and is partially clipped. */
expected_sample_offset = 0;
expected_num_samples = info->num_samples_in_first_valid_frame;
expected_audioclipmeta_start = info->num_samples_per_frame -
info->num_samples_in_first_valid_frame;
expected_audioclipmeta_end = 0;
} else if (frame_num < info->last_frame_with_valid_samples) {
/* Regular, unclipped frame. */
expected_sample_offset = info->num_samples_in_first_valid_frame +
info->num_samples_per_frame * (frame_num -
info->first_frame_with_valid_samples - 1);
expected_num_samples = info->num_samples_per_frame;
} else if (frame_num == info->last_frame_with_valid_samples) {
/* The first frame at the end with padding samples. This one will have
* the last few valid samples, followed by the first padding samples. */
expected_sample_offset = info->num_samples_in_first_valid_frame +
info->num_samples_per_frame * (frame_num -
info->first_frame_with_valid_samples - 1);
expected_num_samples = info->num_samples_in_last_valid_frame;
if (info->expect_clipmeta_at_end) {
expect_audioclipmeta = TRUE;
expected_audioclipmeta_start = 0;
expected_audioclipmeta_end =
info->num_samples_per_frame - expected_num_samples;
}
} else {
/* A fully clipped frame at the end of the stream. */
expected_sample_offset = info->num_samples_in_first_valid_frame +
info->num_samples_without_padding;
expected_num_samples = 0;
if (info->expect_clipmeta_at_end) {
expect_audioclipmeta = TRUE;
expected_audioclipmeta_start = 0;
expected_audioclipmeta_end = info->num_samples_per_frame;
}
}
/* Pull the frame from appsink so we can check it. */
sample = gst_app_sink_pull_sample (GST_APP_SINK (info->appsink));
fail_if (sample == NULL);
fail_unless (GST_IS_SAMPLE (sample));
expected_pts = gst_util_uint64_scale_int (expected_sample_offset,
GST_SECOND, info->sample_rate);
expected_duration = gst_util_uint64_scale_int (expected_num_samples,
GST_SECOND, info->sample_rate);
buffer = gst_sample_get_buffer (sample);
fail_if (buffer == NULL);
/* Verify the sample's PTS and duration. Allow for 1 nanosecond difference
* to account for rounding errors in sample <-> timestamp conversions. */
ts_delta = GST_CLOCK_DIFF (GST_BUFFER_PTS (buffer), expected_pts);
fail_unless (ABS (ts_delta) <= 1);
ts_delta = GST_CLOCK_DIFF (GST_BUFFER_DURATION (buffer), expected_duration);
fail_unless (ABS (ts_delta) <= 1);
/* Check if there's audio clip metadata, and verify it if it exists. */
if (expect_audioclipmeta) {
audioclip_meta = gst_buffer_get_audio_clipping_meta (buffer);
fail_if (audioclip_meta == NULL);
fail_unless_equals_uint64 (audioclip_meta->start,
expected_audioclipmeta_start);
fail_unless_equals_uint64 (audioclip_meta->end, expected_audioclipmeta_end);
}
gst_sample_unref (sample);
}
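As a concrete instance of the expectations above: frame 197 of the iTunes file (its last frame, clipped at the tail) resolves to the following values (worked example only, not extra test code):

/* frame_num = 197 = last_frame_with_valid_samples:
 *   expected_sample_offset = 960 + 1024 * (197 - 2 - 1) = 199616
 *   expected_num_samples   = 384
 *   expected PTS           = 199616 / 48000 s ~= 4.158667 s
 *   expected duration      = 384 / 48000 s     = 8 ms
 *   clipping meta          = start 0, end 1024 - 384 = 640  */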
static void
qtdemux_pad_added_cb_for_gapless (GstElement * demux, GstPad * pad,
GaplessTestInfo * info)
{
GstPad *appsink_pad;
GstPadLinkReturn ret;
appsink_pad = gst_element_get_static_pad (info->appsink, "sink");
if (gst_pad_is_linked (appsink_pad))
goto finish;
ret = gst_pad_link (pad, appsink_pad);
if (GST_PAD_LINK_FAILED (ret)) {
GST_ERROR ("Could not link qtdemux and appsink: %s",
gst_pad_link_get_name (ret));
}
finish:
gst_object_unref (GST_OBJECT (appsink_pad));
}
static void
perform_gapless_test (GaplessTestInfo * info)
{
GstElement *source, *demux, *appsink, *pipeline;
GstStateChangeReturn state_ret;
guint frame_num;
pipeline = gst_pipeline_new (NULL);
source = gst_element_factory_make ("filesrc", NULL);
demux = gst_element_factory_make ("qtdemux", NULL);
appsink = gst_element_factory_make ("appsink", NULL);
info->appsink = appsink;
g_signal_connect (demux, "pad-added", (GCallback)
qtdemux_pad_added_cb_for_gapless, info);
gst_bin_add_many (GST_BIN (pipeline), source, demux, appsink, NULL);
gst_element_link (source, demux);
{
char *full_filename =
g_build_filename (GST_TEST_FILES_PATH, info->filename, NULL);
g_object_set (G_OBJECT (source), "location", full_filename, NULL);
g_free (full_filename);
}
g_object_set (G_OBJECT (appsink), "async", FALSE, "sync", FALSE,
"max-buffers", 1, "enable-last-sample", FALSE, "processing-deadline",
G_MAXUINT64, NULL);
state_ret = gst_element_set_state (pipeline, GST_STATE_PLAYING);
fail_unless (state_ret != GST_STATE_CHANGE_FAILURE);
if (state_ret == GST_STATE_CHANGE_ASYNC) {
GST_LOG ("waiting for pipeline to reach PAUSED state");
state_ret = gst_element_get_state (pipeline, NULL, NULL, -1);
fail_unless_equals_int (state_ret, GST_STATE_CHANGE_SUCCESS);
}
/* Verify all frames from the test signal. */
for (frame_num = 0; frame_num < info->num_aac_frames; ++frame_num)
check_parsed_aac_frame (info, frame_num);
/* Check what duration is returned by a query. This duration must exclude
* the padding samples. */
{
GstQuery *query;
gint64 duration;
GstFormat format;
query = gst_query_new_duration (GST_FORMAT_TIME);
fail_unless (gst_element_query (pipeline, query));
gst_query_parse_duration (query, &format, &duration);
fail_unless_equals_int (format, GST_FORMAT_TIME);
fail_unless_equals_uint64 ((guint64) duration,
info->total_duration_without_padding);
gst_query_unref (query);
}
/* Seek tests: Here we seek to a certain position that corresponds to a
* certain frame. Then we check if we indeed got that frame. */
/* Seek back to the first frame. This will _not_ be the first valid frame.
* Instead, it will be a fully clipped, decode-only frame with a duration
* of zero. Other zero-duration frames may follow, until the first frame
* with valid data is encountered. This means that when the user seeks
* to position 0, downstream will subsequently get a number of buffers
* with PTS 0, and all of those buffers except the last will have a
* duration of 0. */
{
fail_unless_equals_int (gst_element_set_state (pipeline, GST_STATE_PAUSED),
GST_STATE_CHANGE_SUCCESS);
gst_element_seek_simple (pipeline, GST_FORMAT_TIME, GST_SEEK_FLAG_FLUSH, 0);
fail_unless_equals_int (gst_element_set_state (pipeline, GST_STATE_PLAYING),
GST_STATE_CHANGE_SUCCESS);
check_parsed_aac_frame (info, 0);
}
/* Now move to the frame past the very first one that contained valid samples.
* This very first frame will usually be clipped, and be output as the last
* buffer at PTS 0 (see above). */
{
GstClockTime position;
position =
gst_util_uint64_scale_int (info->num_samples_in_first_valid_frame,
GST_SECOND, info->sample_rate);
fail_unless_equals_int (gst_element_set_state (pipeline, GST_STATE_PAUSED),
GST_STATE_CHANGE_SUCCESS);
gst_element_seek_simple (pipeline, GST_FORMAT_TIME, GST_SEEK_FLAG_FLUSH,
position);
fail_unless_equals_int (gst_element_set_state (pipeline, GST_STATE_PLAYING),
GST_STATE_CHANGE_SUCCESS);
check_parsed_aac_frame (info, info->first_frame_with_valid_samples + 1);
}
/* Seek to the last frame with valid samples (= the first frame with padding
* samples at the end of the stream). */
{
GstClockTime position;
position =
gst_util_uint64_scale_int (info->num_samples_in_first_valid_frame +
info->num_samples_without_padding - info->num_samples_per_frame,
GST_SECOND, info->sample_rate);
fail_unless_equals_int (gst_element_set_state (pipeline, GST_STATE_PAUSED),
GST_STATE_CHANGE_SUCCESS);
gst_element_seek_simple (pipeline, GST_FORMAT_TIME, GST_SEEK_FLAG_FLUSH,
position);
fail_unless_equals_int (gst_element_set_state (pipeline, GST_STATE_PLAYING),
GST_STATE_CHANGE_SUCCESS);
check_parsed_aac_frame (info, info->last_frame_with_valid_samples);
}
gst_element_set_state (pipeline, GST_STATE_NULL);
gst_object_unref (pipeline);
}
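For the iTunes file, the position used by the second seek above works out as follows (reference note only):

/* position = 960 / 48000 s = 20 ms, which is exactly the PTS of the frame
 * following the first partially valid frame; that is why the subsequent
 * check expects first_frame_with_valid_samples + 1 (= frame 3). */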
GST_START_TEST (test_qtdemux_gapless_itunes_data)
{
GaplessTestInfo info;
setup_gapless_itunes_test_info (&info);
perform_gapless_test (&info);
}
GST_END_TEST;
GST_START_TEST (test_qtdemux_gapless_nero_data_with_itunsmpb)
{
GaplessTestInfo info;
setup_gapless_nero_with_itunsmpb_test_info (&info);
perform_gapless_test (&info);
}
GST_END_TEST;
GST_START_TEST (test_qtdemux_gapless_nero_data_without_itunsmpb)
{
GaplessTestInfo info;
setup_gapless_nero_without_itunsmpb_test_info (&info);
perform_gapless_test (&info);
}
GST_END_TEST;
static Suite *
qtdemux_suite (void)
{
@@ -1215,6 +1630,9 @@ qtdemux_suite (void)
tcase_add_test (tc_chain, test_qtdemux_pad_names);
tcase_add_test (tc_chain, test_qtdemux_compensate_data_offset);
tcase_add_test (tc_chain, test_qtdemux_mss_fragment);
tcase_add_test (tc_chain, test_qtdemux_gapless_itunes_data);
tcase_add_test (tc_chain, test_qtdemux_gapless_nero_data_with_itunsmpb);
tcase_add_test (tc_chain, test_qtdemux_gapless_nero_data_without_itunsmpb);
return s;
}