mpegaudioparse: Support gapless playback

Gapless playback is handled by adjusting buffer timestamps & durations
and by adding GstAudioClippingMeta.

Support for "Frankenstein" streams (= poorly stitched together streams)
is also added, so that gapless playback support doesn't prevent those
from being properly played.
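
As an illustrative example (not prescribed by this commit), the result can
be exercised with a pipeline along these lines, given an mp3 with a LAME
tag and a decoder that honors GstAudioClippingMeta:

  gst-launch-1.0 filesrc location=test.mp3 ! mpegaudioparse ! \
      mpg123audiodec ! audioconvert ! autoaudiosink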

Co-authored-by: Sebastian Dröge <sebastian@centricular.com>
Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/1028>
Authored by Carlos Rafael Giani on 2019-09-07 19:15:42 +02:00, committed by Sebastian Dröge
parent 2db283499e
commit 0431a0845c
4 changed files with 816 additions and 9 deletions

gst/audioparsers/gstmpegaudioparse.c

@@ -35,6 +35,84 @@
*
*/
/* Notes about gapless playback, "Frankenstein" streams, and the Xing header frame:
*
* Gapless playback is based on the LAME tag, which is located in the Xing
* header frame. The tag contains the encoder delay and encoder padding.
* The encoder delay specifies how many padding nullsamples have been prepended
* by the encoder at the start of the mp3 stream, while the encoder padding
* specifies how many padding nullsamples got added at the end of the stream.
*
* In addition, there is also a "decoder delay". This affects all existing
* mp3 decoders - they themselves introduce a delay into the signal due to
* the way mp3 decoding works. This delay is 529 samples long in all known
* decoders. Unlike the encoder delay, the decoder delay is not specified
* anywhere in the mp3 stream. Players/decoders therefore hardcode the
* decoder delay as 529 samples.
*
* (The LAME tech FAQ mentions 528 samples instead of 529, but LAME seems to
* use 529 samples. Also, decoders like mpg123 use 529 samples instead of 528.
* The situation is a little unclear, but 529 samples seems to be standard.)
*
* For proper gapless playback, both mpegaudioparse and a downstream MPEG
* audio decoder must do their part. mpegaudioparse adjusts buffer PTS/DTS
* and durations, and adds GstAudioClippingMeta to outgoing buffers if
* clipping is necessary. MPEG decoders then clip decoded frames according
* to that meta (if present).
*
* To detect when to add GstAudioClippingMeta and when to adjust PTS/DTS/
* durations, the number of the current frame is retrieved. Based on that, the
* current stream position in samples is calculated. With the sample position,
* it is determined whether the current playback position is still inside the
* actual playback range (= the playback range of the stream that excludes
* padding samples), already fully outside of it, or partially outside.
*
* start_of_actual_samples and end_of_actual_samples define the start/end
* of this actual playback range, in samples. So:
* If sample_pos >= start_of_actual_samples and sample_pos < end_of_actual_samples
* -> sample_pos is inside the actual playback range.
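*
* As a worked example, using the figures from the test signal in the
* accompanying unit test: with an encoder delay of 576 samples, an
* encoder padding of 1696 samples, the 529-sample decoder delay, and
* 30 audio frames of 1152 samples each (= 34560 samples total):
*
*   start_of_actual_samples = 576 + 529          = 1105
*   end_of_actual_samples   = 34560 + 529 - 1696 = 33393
*
* This leaves 33393 - 1105 = 32288 actual samples, or 1009 ms at 32 kHz.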
*
* (The decoder delay could in theory be left for the decoder to worry
* about. But then, the decoder would also have to adjust PTS/DTS/durations
* of decoded buffers, which is not something a GstAudioDecoder based element
* should have to deal with. So, for convenience, mpegaudioparse also factors
* that delay into its calculations.)
*
*
* "Frankenstein" streams are MPEG streams which have streams beyond
* what the Xing metadata indicates. Such streams typically are the
* result of poorly stitching individual mp3s together, like this:
*
* cat first.mp3 second.mp3 > joined.mp3
*
* The resulting mp3 is not guaranteed to be valid. In particular, this can
* cause confusion when first.mp3 contains a Xing header frame. Its length
* indicator then does not match the actual length (which is bigger). When
* this is detected, a log line about this being a Frankenstein stream is
* generated.
*
*
* Xing header frames are empty dummy MPEG frames. They only exist for
* supplying metadata. They are encoded as valid silent MPEG frames for
* backwards compatibility with older hardware MP3 players, but can be safely
* dropped.
*
* For more about Xing header frames, see:
* https://www.codeproject.com/Articles/8295/MPEG-Audio-Frame-Header#XINGHeader
* https://www.compuphase.com/mp3/mp3loops.htm#PADDING_DELAYS
*
* To facilitate gapless playback and ensure that MPEG audio decoders don't
* actually decode this frame as an empty MPEG frame, it is marked here as
* GST_BUFFER_FLAG_DECODE_ONLY / GST_BUFFER_FLAG_DROPPABLE in mpegaudioparse
* after its metadata has been extracted. It is also marked as such if it is
* encountered again, for example after the user has seeked back to the
* beginning of the mp3 stream. Its duration is also set to zero to make sure
* that the frame does not cause baseparse to increment the timestamp of the
* frame that follows it.
*
*/
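/* For reference, a minimal sketch of how a downstream decoder might apply
* GstAudioClippingMeta to a decoded buffer. This is illustrative only and
* not part of this element; "outbuf", "num_samples", "bpf" (bytes per
* sample frame) and "rate" are hypothetical variables of such a decoder:
*
*   GstAudioClippingMeta *cmeta = gst_buffer_get_audio_clipping_meta (outbuf);
*   if (cmeta != NULL && cmeta->format == GST_FORMAT_DEFAULT) {
*     guint64 clipped = cmeta->start + cmeta->end;
*     if (clipped < num_samples) {
*       gst_buffer_resize (outbuf, cmeta->start * bpf,
*           (num_samples - clipped) * bpf);
*       GST_BUFFER_DURATION (outbuf) =
*           gst_util_uint64_scale_int (num_samples - clipped, GST_SECOND, rate);
*     }
*   }
*/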
/* FIXME: we should make the base class (GstBaseParse) aware of the
* XING seek table somehow, so it can use it properly for things like
* accurate seeks. Currently it can only do a lookup via the convert function,
@@ -98,12 +176,20 @@ static GstFlowReturn gst_mpeg_audio_parse_handle_frame (GstBaseParse * parse,
GstBaseParseFrame * frame, gint * skipsize);
static GstFlowReturn gst_mpeg_audio_parse_pre_push_frame (GstBaseParse * parse,
GstBaseParseFrame * frame);
static gboolean gst_mpeg_audio_parse_src_query (GstBaseParse * parse,
GstQuery * query);
static gboolean gst_mpeg_audio_parse_sink_event (GstBaseParse * parse,
GstEvent * event);
static gboolean gst_mpeg_audio_parse_convert (GstBaseParse * parse,
GstFormat src_format, gint64 src_value,
GstFormat dest_format, gint64 * dest_value);
static GstCaps *gst_mpeg_audio_parse_get_sink_caps (GstBaseParse * parse,
GstCaps * filter);
static gboolean
gst_mpeg_audio_parse_check_if_is_xing_header_frame (GstMpegAudioParse *
mp3parse, GstBuffer * buf);
static void gst_mpeg_audio_parse_handle_first_frame (GstMpegAudioParse *
mp3parse, GstBuffer * buf);
@@ -166,6 +252,8 @@ gst_mpeg_audio_parse_class_init (GstMpegAudioParseClass * klass)
GST_DEBUG_FUNCPTR (gst_mpeg_audio_parse_handle_frame);
parse_class->pre_push_frame =
GST_DEBUG_FUNCPTR (gst_mpeg_audio_parse_pre_push_frame);
parse_class->src_query = GST_DEBUG_FUNCPTR (gst_mpeg_audio_parse_src_query);
parse_class->sink_event = GST_DEBUG_FUNCPTR (gst_mpeg_audio_parse_sink_event);
parse_class->convert = GST_DEBUG_FUNCPTR (gst_mpeg_audio_parse_convert);
parse_class->get_sink_caps =
GST_DEBUG_FUNCPTR (gst_mpeg_audio_parse_get_sink_caps);
@@ -194,12 +282,16 @@ gst_mpeg_audio_parse_class_init (GstMpegAudioParseClass * klass)
static void
gst_mpeg_audio_parse_reset (GstMpegAudioParse * mp3parse)
{
mp3parse->upstream_format = GST_FORMAT_UNDEFINED;
mp3parse->channels = -1;
mp3parse->rate = -1;
mp3parse->sent_codec_tag = FALSE;
mp3parse->last_posted_crc = CRC_UNKNOWN;
mp3parse->last_posted_channel_mode = MPEG_AUDIO_CHANNEL_MODE_UNKNOWN;
mp3parse->freerate = 0;
mp3parse->spf = 0;
mp3parse->outgoing_frame_is_xing_header = FALSE;
mp3parse->hdr_bitrate = 0;
mp3parse->bitrate_is_constant = TRUE;
@@ -224,6 +316,12 @@ gst_mpeg_audio_parse_reset (GstMpegAudioParse * mp3parse)
mp3parse->encoder_delay = 0;
mp3parse->encoder_padding = 0;
mp3parse->decoder_delay = 0;
mp3parse->start_of_actual_samples = 0;
mp3parse->end_of_actual_samples = 0;
mp3parse->total_padding_time = GST_CLOCK_TIME_NONE;
mp3parse->start_padding_time = GST_CLOCK_TIME_NONE;
mp3parse->end_padding_time = GST_CLOCK_TIME_NONE;
}
static void
@@ -745,6 +843,11 @@ gst_mpeg_audio_parse_handle_frame (GstBaseParse * parse,
mp3parse->spf = 576;
}
/* We need the frame duration for calculating the frame number later
* in gst_mpeg_audio_parse_pre_push_frame (). */
mp3parse->frame_duration = gst_util_uint64_scale (GST_SECOND,
mp3parse->spf, mp3parse->rate);
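/* For example, at 44100 Hz with 1152 samples per frame, this yields a
* frame duration of ~26.12 ms. */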
/* lead_in:
* We start pushing 9 frames earlier (29 frames for MPEG2) than
* segment start to be able to decode the first frame we want.
@@ -764,6 +867,21 @@ gst_mpeg_audio_parse_handle_frame (GstBaseParse * parse,
}
mp3parse->hdr_bitrate = bitrate;
/* While during normal playback, the Xing header frame is seen only once
* (right at the beginning), we may see it again if the user seeked back
* to the beginning. To make sure it is dropped again and NOT pushed
* downstream, we have to check every frame for Xing IDs.
*
* (sent_codec_tag is TRUE after this Xing frame got parsed.) */
if (G_LIKELY (mp3parse->sent_codec_tag)) {
if (G_UNLIKELY (gst_mpeg_audio_parse_check_if_is_xing_header_frame
(mp3parse, buf))) {
GST_DEBUG_OBJECT (mp3parse, "This is a Xing header frame, which "
"contains no meaningful audio data, and can be safely dropped");
mp3parse->outgoing_frame_is_xing_header = TRUE;
}
}
/* For first frame; check for seek tables and output a codec tag */
gst_mpeg_audio_parse_handle_first_frame (mp3parse, buf);
@@ -774,6 +892,17 @@ gst_mpeg_audio_parse_handle_frame (GstBaseParse * parse,
cleanup:
gst_buffer_unmap (buf, &map);
/* We don't actually drop the frame right here, but rather in
* gst_mpeg_audio_parse_pre_push_frame (), since it is still important
* to let other code bits do their work there even if we want to drop
* the current frame. */
if (G_UNLIKELY (mp3parse->outgoing_frame_is_xing_header)) {
frame->flags |= GST_BASE_PARSE_FRAME_FLAG_NO_FRAME;
/* Set duration to zero to prevent the baseparse class
* from incrementing outgoing timestamps */
GST_BUFFER_DURATION (frame->buffer) = 0;
}
if (res && bpf <= map.size) {
return gst_base_parse_finish_frame (parse, frame, bpf);
}
@@ -781,6 +910,54 @@
return GST_FLOW_OK;
}
static gboolean
gst_mpeg_audio_parse_check_if_is_xing_header_frame (GstMpegAudioParse *
mp3parse, GstBuffer * buf)
{
/* TODO: get rid of code duplication
* (see gst_mpeg_audio_parse_handle_first_frame ()) */
const guint32 xing_id = 0x58696e67; /* 'Xing' in hex */
const guint32 info_id = 0x496e666f; /* 'Info' in hex - found in LAME CBR files */
gint offset_xing;
GstMapInfo map;
guint8 *data;
guint64 avail;
guint32 read_id_xing = 0;
gboolean ret = FALSE;
/* Check first frame for Xing info */
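/* The Xing/Info ID is located right after the side information, whose
* size depends on the MPEG version and channel count: 17 (0x11) bytes
* for MPEG-1 mono, 32 (0x20) for MPEG-1 stereo, 9 (0x09) for MPEG-2
* mono, and 17 (0x11) for MPEG-2 stereo. */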
if (mp3parse->version == 1) { /* MPEG-1 file */
if (mp3parse->channels == 1)
offset_xing = 0x11;
else
offset_xing = 0x20;
} else { /* MPEG-2 header */
if (mp3parse->channels == 1)
offset_xing = 0x09;
else
offset_xing = 0x11;
}
/* Skip the 4 bytes of the MP3 header too */
offset_xing += 4;
/* Check if we have enough data to read the Xing header */
gst_buffer_map (buf, &map, GST_MAP_READ);
data = map.data;
avail = map.size;
if (avail >= offset_xing + 4) {
read_id_xing = GST_READ_UINT32_BE (data + offset_xing);
ret = (read_id_xing == xing_id || read_id_xing == info_id);
}
gst_buffer_unmap (buf, &map);
return ret;
}
static void
gst_mpeg_audio_parse_handle_first_frame (GstMpegAudioParse * mp3parse,
GstBuffer * buf)
@@ -841,10 +1018,15 @@ gst_mpeg_audio_parse_handle_first_frame (GstMpegAudioParse * mp3parse,
guint32 xing_flags;
guint bytes_needed = offset_xing + 8;
gint64 total_bytes;
guint64 num_xing_samples = 0;
GstClockTime total_time;
GST_DEBUG_OBJECT (mp3parse, "Found Xing header marker 0x%x", xing_id);
GST_DEBUG_OBJECT (mp3parse, "This is a Xing header frame, which contains "
"no meaningful audio data, and can be safely dropped");
mp3parse->outgoing_frame_is_xing_header = TRUE;
/* Move data after Xing header */
data += offset_xing + 4;
@@ -875,9 +1057,9 @@ gst_mpeg_audio_parse_handle_first_frame (GstMpegAudioParse * mp3parse,
"Invalid number of frames in Xing header");
mp3parse->xing_flags &= ~XING_FRAMES_FLAG;
} else {
num_xing_samples = (guint64) (mp3parse->xing_frames) * (mp3parse->spf);
mp3parse->xing_total_time = gst_util_uint64_scale (GST_SECOND,
num_xing_samples, mp3parse->rate);
}
data += 4;
@@ -886,6 +1068,10 @@ gst_mpeg_audio_parse_handle_first_frame (GstMpegAudioParse * mp3parse,
mp3parse->xing_total_time = 0;
}
/* Store the entire time as actual total time for now. Should there be
* any padding present, this value will get adjusted accordingly. */
mp3parse->xing_actual_total_time = mp3parse->xing_total_time;
if (xing_flags & XING_BYTES_FLAG) {
mp3parse->xing_bytes = GST_READ_UINT32_BE (data);
if (mp3parse->xing_bytes == 0) {
@@ -967,8 +1153,10 @@ gst_mpeg_audio_parse_handle_first_frame (GstMpegAudioParse * mp3parse,
} else
mp3parse->xing_vbr_scale = 0;
GST_DEBUG_OBJECT (mp3parse, "Xing header reported %u frames, time %"
GST_TIME_FORMAT ", %u bytes, vbr scale %u", mp3parse->xing_frames,
GST_DEBUG_OBJECT (mp3parse, "Xing header reported %u frames, %"
G_GUINT64_FORMAT " samples, time %" GST_TIME_FORMAT
" (this includes potentially present padding data), %u bytes,"
" vbr scale %u", mp3parse->xing_frames, num_xing_samples,
GST_TIME_ARGS (mp3parse->xing_total_time), mp3parse->xing_bytes,
mp3parse->xing_vbr_scale);
@@ -986,6 +1174,8 @@ gst_mpeg_audio_parse_handle_first_frame (GstMpegAudioParse * mp3parse,
gchar lame_version[10] = { 0, };
guint tag_rev;
guint32 encoder_delay, encoder_padding;
guint64 total_padding_samples;
guint64 actual_num_xing_samples;
memcpy (lame_version, data, 9);
data += 9;
@@ -1001,11 +1191,63 @@ gst_mpeg_audio_parse_handle_first_frame (GstMpegAudioParse * mp3parse,
encoder_padding = GST_READ_UINT24_BE (data);
encoder_padding &= 0x000fff;
total_padding_samples = encoder_delay + encoder_padding;
mp3parse->encoder_delay = encoder_delay;
mp3parse->encoder_padding = encoder_padding;
GST_DEBUG_OBJECT (mp3parse, "Encoder delay %u, encoder padding %u",
encoder_delay, encoder_padding);
/* As mentioned in the overview at the beginning of this source
* file, decoders exhibit a delay of 529 samples. */
mp3parse->decoder_delay = 529;
/* Where the actual, non-padding samples start & end, in sample offsets. */
mp3parse->start_of_actual_samples = mp3parse->encoder_delay +
mp3parse->decoder_delay;
mp3parse->end_of_actual_samples = num_xing_samples +
mp3parse->decoder_delay - mp3parse->encoder_padding;
/* Length of padding at the start and at the end of the stream,
* in nanoseconds. */
mp3parse->start_padding_time = gst_util_uint64_scale_int (GST_SECOND,
mp3parse->start_of_actual_samples, mp3parse->rate);
mp3parse->end_padding_time = mp3parse->xing_total_time -
gst_util_uint64_scale_int (mp3parse->end_of_actual_samples,
GST_SECOND, mp3parse->rate);
/* Total length of all combined padding samples, in nanoseconds. */
mp3parse->total_padding_time = gst_util_uint64_scale_int (GST_SECOND,
total_padding_samples, mp3parse->rate);
/* Length of media, in samples, without the number of padding samples. */
actual_num_xing_samples = (num_xing_samples >= total_padding_samples) ?
(num_xing_samples - total_padding_samples) : 0;
/* Length of media, converted to nanoseconds. This is used for setting
* baseparse's duration. */
mp3parse->xing_actual_total_time = gst_util_uint64_scale (GST_SECOND,
actual_num_xing_samples, mp3parse->rate);
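/* With the example figures from the overview comment at the top of this
* file (32 kHz, start_of_actual_samples 1105, end_of_actual_samples
* 33393, 34560 Xing samples), this yields a start_padding_time of
* ~34.53 ms, an end_padding_time of ~36.47 ms, and an
* xing_actual_total_time of 1009 ms. */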
GST_DEBUG_OBJECT (mp3parse, "Encoder delay: %u samples",
mp3parse->encoder_delay);
GST_DEBUG_OBJECT (mp3parse, "Encoder padding: %u samples",
mp3parse->encoder_padding);
GST_DEBUG_OBJECT (mp3parse, "Decoder delay: %u samples",
mp3parse->decoder_delay);
GST_DEBUG_OBJECT (mp3parse, "Start of actual samples: %"
G_GUINT64_FORMAT, mp3parse->start_of_actual_samples);
GST_DEBUG_OBJECT (mp3parse, "End of actual samples: %"
G_GUINT64_FORMAT, mp3parse->end_of_actual_samples);
GST_DEBUG_OBJECT (mp3parse, "Total padding samples: %" G_GUINT64_FORMAT,
total_padding_samples);
GST_DEBUG_OBJECT (mp3parse, "Start padding time: %" GST_TIME_FORMAT,
GST_TIME_ARGS (mp3parse->start_padding_time));
GST_DEBUG_OBJECT (mp3parse, "End padding time: %" GST_TIME_FORMAT,
GST_TIME_ARGS (mp3parse->end_padding_time));
GST_DEBUG_OBJECT (mp3parse, "Total padding time: %" GST_TIME_FORMAT,
GST_TIME_ARGS (mp3parse->total_padding_time));
GST_DEBUG_OBJECT (mp3parse, "Actual total media samples: %"
G_GUINT64_FORMAT, actual_num_xing_samples);
GST_DEBUG_OBJECT (mp3parse, "Actual total media length: %"
GST_TIME_FORMAT, GST_TIME_ARGS (mp3parse->xing_actual_total_time));
}
} else if (read_id_vbri == vbri_id) {
gint64 total_bytes, total_frames;
@@ -1143,7 +1385,7 @@ gst_mpeg_audio_parse_handle_first_frame (GstMpegAudioParse * mp3parse,
/* set duration if tables provided a valid one */
if (mp3parse->xing_flags & XING_FRAMES_FLAG) {
gst_base_parse_set_duration (GST_BASE_PARSE (mp3parse), GST_FORMAT_TIME,
mp3parse->xing_actual_total_time, 0);
}
if (mp3parse->vbri_total_time != 0 && mp3parse->vbri_valid) {
gst_base_parse_set_duration (GST_BASE_PARSE (mp3parse), GST_FORMAT_TIME,
@@ -1318,6 +1560,91 @@ gst_mpeg_audio_parse_bytepos_to_time (GstMpegAudioParse * mp3parse,
return FALSE;
}
static gboolean
gst_mpeg_audio_parse_src_query (GstBaseParse * parse, GstQuery * query)
{
gboolean res = FALSE;
GstMpegAudioParse *mp3parse = GST_MPEG_AUDIO_PARSE (parse);
res = GST_BASE_PARSE_CLASS (parent_class)->src_query (parse, query);
if (!res)
return FALSE;
/* If upstream operates in BYTE format then consider any parsed Xing/LAME
* header to remove encoder/decoder delay and padding samples from the
* position query. */
if (mp3parse->upstream_format == GST_FORMAT_BYTES
|| GST_PAD_MODE (GST_BASE_PARSE_SINK_PAD (parse)) == GST_PAD_MODE_PULL) {
switch (GST_QUERY_TYPE (query)) {
case GST_QUERY_POSITION:{
GstFormat format;
gint64 position, new_position;
GstClockTime duration_to_skip;
gst_query_parse_position (query, &format, &position);
/* Adjust the position to exclude padding samples. */
if ((position < 0) || (format != GST_FORMAT_TIME))
break;
duration_to_skip = mp3parse->frame_duration +
mp3parse->start_padding_time;
if (position < duration_to_skip)
new_position = 0;
else
new_position = position - duration_to_skip;
if (new_position > (mp3parse->xing_actual_total_time))
new_position = mp3parse->xing_actual_total_time;
GST_LOG_OBJECT (mp3parse, "applying gapless padding info to position "
"query response: %" GST_TIME_FORMAT " -> %" GST_TIME_FORMAT,
GST_TIME_ARGS (position), GST_TIME_ARGS (new_position));
gst_query_set_position (query, GST_FORMAT_TIME, new_position);
break;
}
default:
break;
}
}
return res;
}
static gboolean
gst_mpeg_audio_parse_sink_event (GstBaseParse * parse, GstEvent * event)
{
gboolean res = FALSE;
GstMpegAudioParse *mp3parse = GST_MPEG_AUDIO_PARSE (parse);
res =
GST_BASE_PARSE_CLASS (parent_class)->sink_event (parse,
gst_event_ref (event));
if (!res) {
gst_event_unref (event);
return FALSE;
}
switch (GST_EVENT_TYPE (event)) {
case GST_EVENT_SEGMENT:{
const GstSegment *segment;
gst_event_parse_segment (event, &segment);
mp3parse->upstream_format = segment->format;
}
default:
break;
}
gst_event_unref (event);
return res;
}
static gboolean
gst_mpeg_audio_parse_convert (GstBaseParse * parse, GstFormat src_format,
gint64 src_value, GstFormat dest_format, gint64 * dest_value)
@@ -1418,6 +1745,179 @@ gst_mpeg_audio_parse_pre_push_frame (GstBaseParse * parse,
gst_tag_list_unref (taglist);
}
/* adjust buffer PTS/DTS/durations according to gapless playback info */
if ((mp3parse->upstream_format == GST_FORMAT_BYTES
|| GST_PAD_MODE (GST_BASE_PARSE_SINK_PAD (parse)) ==
GST_PAD_MODE_PULL)
&& GST_CLOCK_TIME_IS_VALID (mp3parse->total_padding_time)) {
guint64 frame_nr;
GstClockTime pts, dts;
gboolean add_clipping_meta = FALSE;
guint32 start_clip = 0, end_clip = 0;
GstClockTime timestamp_decrement;
guint64 sample_pos;
guint64 sample_pos_end;
/* Get the number of the current frame so we can determine where we
* currently are in the MPEG stream.
*
* Gapless playback is best done based on samples, not timestamps,
* to avoid potential rounding errors that can otherwise cause a few
* samples to be incorrectly clipped or not clipped.
*
* TODO: At the moment, there is no dedicated baseparse API for finding
* out what frame we are currently in. The frame number is calculated
* from the PTS of the current frame. Each frame has the same duration,
* and at this point, the buffer's PTS has not been adjusted to exclude
* the padding samples, so the PTS will be an integer multiple of
* frame_duration. However, this is not an ideal solution. Investigate
* how to properly implement this. */
frame_nr = GST_BUFFER_PTS (frame->buffer) / mp3parse->frame_duration;
GST_LOG_OBJECT (mp3parse, "Handling MP3 frame #%" G_GUINT64_FORMAT,
frame_nr);
/* By default, we subtract the start_padding_time from the timestamps.
* start_padding_time specifies the duration of the padding samples
* at the beginning of the MPEG stream. To factor out these padding
* samples, we have to shift the timestamps back, which is done with
* this decrement. */
timestamp_decrement = mp3parse->start_padding_time;
pts = GST_BUFFER_PTS (frame->buffer);
dts = GST_BUFFER_DTS (frame->buffer);
/* sample_pos specifies the current position of the beginning of the
* current frame, while sample_pos_end specifies the current position
* of 1 sample past the end of the current frame. Both values are
* in samples. */
sample_pos = frame_nr * mp3parse->spf;
sample_pos_end = sample_pos + mp3parse->spf;
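/* For instance, with spf = 1152, frame #3 covers sample positions
* [3456, 4608). */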
/* Check if the frame is not (fully) within the actual playback range. */
if (G_UNLIKELY (sample_pos <= mp3parse->start_of_actual_samples ||
(sample_pos_end >= mp3parse->end_of_actual_samples))) {
if (G_UNLIKELY (frame_nr >= mp3parse->xing_frames)) {
/* Test #1: Check if the current position lies past the length
* that is specified by the Xing frame header. This normally does
* not happen, but does occur with "Frankenstein" streams (see
* the explanation at the beginning of this source file for more).
* Do this first, since the other test may yield false positives
* in this case. */
GST_LOG_OBJECT (mp3parse, "There are frames beyond what the Xing "
"metadata indicates; this is a Frankenstein stream!");
/* The frames past the "officially" last one (= the last one according
* to the Xing header frame) are located past the padding samples
* that follow the actual playback range. The length of these
* padding samples in nanoseconds is stored in end_padding_time.
* We need to shift the PTS to compensate for these padding samples,
* otherwise there would be a timestamp discontinuity between the
* last "official" frame and the first "Frankenstein" frame. */
timestamp_decrement += mp3parse->end_padding_time;
} else if (sample_pos_end <= mp3parse->start_of_actual_samples) {
/* Test #2: Check if the frame lies completely before the actual
* playback range. This happens if the number of padding samples
* at the start of the stream exceeds the size of a frame, meaning
* that the entire frame will be filled with padding samples.
* This has not been observed so far. However, it is in theory
* possible, so handle it here. */
/* We want to clip all samples in the frame. Since this is a frame
* at the start of the stream, set start_clip to the frame size.
* Also set the buffer duration to 0 to make sure baseparse does not
* increment timestamps after this current frame is finished. */
start_clip = mp3parse->spf;
GST_BUFFER_DURATION (frame->buffer) = 0;
add_clipping_meta = TRUE;
} else if (sample_pos <= mp3parse->start_of_actual_samples) {
/* Test #3: Check if a portion of the frame lies before the actual
* playback range. Set the duration to the number of samples that
* remain after clipping. */
start_clip = mp3parse->start_of_actual_samples - sample_pos;
GST_BUFFER_DURATION (frame->buffer) =
gst_util_uint64_scale_int (sample_pos_end -
mp3parse->start_of_actual_samples, GST_SECOND, mp3parse->rate);
add_clipping_meta = TRUE;
} else if (sample_pos >= mp3parse->end_of_actual_samples) {
/* Test #4: Check if the frame lies completely after the actual
* playback range. Similar to test #2, this happens if the number
* of padding samples at the end of the stream exceeds the size of
* a frame, meaning that the entire frame will be filled with padding
* samples. Unlike test #2, this has been observed in mp3s several
* times: The penultimate frame is partially clipped, the final
* frame is fully clipped. */
GstClockTime padding_ns;
/* We want to clip all samples in the frame. Since this is a frame
* at the end of the stream, set end_clip to the frame size.
* Also set the buffer duration to 0 to make sure baseparse does not
* increment timestamps after this current frame is finished. */
end_clip = mp3parse->spf;
GST_BUFFER_DURATION (frame->buffer) = 0;
/* Even though this frame will be fully clipped, we still have to
* make sure its timestamps are not discontinuous with the preceding
* ones. To that end, it is necessary to subtract the time range
* between the current position and the last valid playback range
* position from the PTS and DTS. */
padding_ns = gst_util_uint64_scale_int (sample_pos -
mp3parse->end_of_actual_samples, GST_SECOND, mp3parse->rate);
timestamp_decrement += padding_ns;
add_clipping_meta = TRUE;
} else if (sample_pos_end >= mp3parse->end_of_actual_samples) {
/* Test #5: Check if a portion of the frame lies after the actual
* playback range. Set the duration to the number of samples that
* remain after clipping. */
end_clip = sample_pos_end - mp3parse->end_of_actual_samples;
GST_BUFFER_DURATION (frame->buffer) =
gst_util_uint64_scale_int (mp3parse->end_of_actual_samples -
sample_pos, GST_SECOND, mp3parse->rate);
add_clipping_meta = TRUE;
}
}
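/* Worked example with the unit test signal figures (32 kHz, spf 1152,
* start_of_actual_samples 1105): since the Xing header frame does not
* advance timestamps, the first audio frame has sample_pos 0 and
* sample_pos_end 1152, so test #3 applies; start_clip becomes
* 1105 - 0 = 1105 and the duration is set to the remaining
* 1152 - 1105 = 47 samples (~1.47 ms). */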
if (G_UNLIKELY (add_clipping_meta)) {
GST_DEBUG_OBJECT (mp3parse, "Adding clipping meta: start %"
G_GUINT32_FORMAT " end %" G_GUINT32_FORMAT, start_clip, end_clip);
gst_buffer_add_audio_clipping_meta (frame->buffer, GST_FORMAT_DEFAULT,
start_clip, end_clip);
}
/* Adjust the timestamps by subtracting from them. The decrement
* is computed above. */
GST_BUFFER_PTS (frame->buffer) = (pts >= timestamp_decrement) ? (pts -
timestamp_decrement) : 0;
GST_BUFFER_DTS (frame->buffer) = (dts >= timestamp_decrement) ? (dts -
timestamp_decrement) : 0;
/* NOTE: We do not adjust the size here, just the timestamps and duration.
* We also do not drop fully clipped frames. This is because downstream
* MPEG audio decoders still need the data of the frame, even if it gets
* fully clipped later. They do need these frames for their decoding process.
* If these frames were dropped, the decoders would not fully decode all
* of the data from the MPEG stream. */
/* TODO: Should offset/offset_end also be adjusted? */
}
/* Check if this frame can safely be dropped (for example, because it is an
* empty Xing header frame). */
if (G_UNLIKELY (mp3parse->outgoing_frame_is_xing_header)) {
GST_DEBUG_OBJECT (mp3parse, "Marking frame as decode-only / droppable");
mp3parse->outgoing_frame_is_xing_header = FALSE;
GST_BUFFER_DURATION (frame->buffer) = 0;
GST_BUFFER_FLAG_SET (frame->buffer, GST_BUFFER_FLAG_DECODE_ONLY);
GST_BUFFER_FLAG_SET (frame->buffer, GST_BUFFER_FLAG_DROPPABLE);
}
/* usual clipping applies */
frame->flags |= GST_BASE_PARSE_FRAME_FLAG_CLIP;

gst/audioparsers/gstmpegaudioparse.h

@@ -51,14 +51,19 @@ struct _GstMpegAudioParse {
GstBaseParse baseparse;
/*< private >*/
GstFormat upstream_format;
gint rate;
gint channels;
gint layer;
gint version;
GstClockTime max_bitreservoir;
/* Samples per frame */
gint spf;
GstClockTime frame_duration;
gint freerate;
@@ -67,6 +72,8 @@ struct _GstMpegAudioParse {
gint last_posted_crc, last_crc;
guint last_posted_channel_mode, last_mode;
gboolean outgoing_frame_is_xing_header;
/* Bitrate from non-vbr headers */
guint32 hdr_bitrate;
gboolean bitrate_is_constant;
@@ -75,6 +82,7 @@ struct _GstMpegAudioParse {
guint32 xing_flags;
guint32 xing_frames;
GstClockTime xing_total_time;
GstClockTime xing_actual_total_time;
guint32 xing_bytes;
/* percent -> filepos mapping */
guchar xing_seek_table[100];
@@ -95,6 +103,14 @@ struct _GstMpegAudioParse {
/* LAME info */
guint32 encoder_delay;
guint32 encoder_padding;
/* Gapless playback states */
guint32 decoder_delay;
guint64 start_of_actual_samples;
guint64 end_of_actual_samples;
GstClockTime start_padding_time;
GstClockTime end_padding_time;
GstClockTime total_padding_time;
};
/**

tests/check/elements/mpegaudioparse.c

@@ -24,6 +24,8 @@
*/
#include <gst/check/gstcheck.h>
#include <gst/app/gstappsink.h>
#include <gst/audio/audio.h>
#include "parser.h"
#define SRC_CAPS_TMPL "audio/mpeg, parsed=(boolean)false, mpegversion=(int)1"
@@ -123,6 +125,294 @@ GST_START_TEST (test_parse_detect_stream)
GST_END_TEST;
/* Gapless tests are performed using a test signal that contains 30 MPEG
* audio frames (plus a leading Xing header frame), has padding samples at
* the beginning and at the end, a LAME tag that describes said padding
* samples, a sample rate of 32 kHz, and 1 channel. The test signal is
* 1009 ms long. setup_gapless_test_info() fills the GaplessTestInfo struct
* with details about this test signal. */
typedef struct
{
const gchar *filename;
guint num_mpeg_frames;
guint num_samples_per_frame;
guint num_start_padding_samples;
guint num_end_padding_samples;
guint sample_rate;
guint first_padded_end_frame;
guint64 num_samples_with_padding;
guint64 num_samples_without_padding;
GstClockTime first_frame_duration;
GstClockTime regular_frame_duration;
GstClockTime total_duration_without_padding;
GstElement *appsink;
GstElement *parser;
} GaplessTestInfo;
static void
setup_gapless_test_info (GaplessTestInfo * info)
{
info->filename = "sine-1009ms-1ch-32000hz-gapless-with-lame-tag.mp3";
info->num_mpeg_frames = 31;
info->num_samples_per_frame = 1152; /* standard for MP3s */
info->sample_rate = 32000;
/* Note that these start and end padding figures are not exactly like
* those that we get from the LAME tag. That's because that tag only
* contains the _encoder_ delay & padding. In the figures below, the
* _decoder_ delay is also factored in (529 samples). mpegaudioparse
* does the same, so we have to apply it here. */
info->num_start_padding_samples = 1105;
info->num_end_padding_samples = 1167;
/* In MP3s with LAME tags, the first frame is a frame made of Xing/LAME
* metadata and dummy nullsamples (this is for backwards compatibility).
* num_start_padding_samples defines how many padding samples are there
* (this does not include the nullsamples from the first dummy frame).
* Likewise, num_end_padding_samples defines how many padding samples
* are there at the end of the MP3 stream.
* There may be more padding samples than the size of one frame, meaning
* that there may be frames that are made entirely of padding samples.
* Such frames are output by mpegaudioparse, but their duration is set
* to 0, and their PTS corresponds to the last valid PTS in the stream
* (= the last PTS that is within the actual media data).
* For this reason, we cannot just assume that the last frame is the
* one containing padding - there may be more. So, calculate the number
* of the first frame that contains padding samples from the _end_ of
* the stream. We'll need that later for buffer PTS and duration checks. */
info->first_padded_end_frame = (info->num_mpeg_frames - 1 -
info->num_end_padding_samples / info->num_samples_per_frame);
info->num_samples_with_padding = (info->num_mpeg_frames - 1) *
info->num_samples_per_frame;
info->num_samples_without_padding = info->num_samples_with_padding -
info->num_start_padding_samples - info->num_end_padding_samples;
/* The first frame (excluding the dummy frame at the beginning) will be
* clipped due to the padding samples at the start of the stream, so we
* have to calculate this separately. */
info->first_frame_duration =
gst_util_uint64_scale_int (info->num_samples_per_frame -
info->num_start_padding_samples, GST_SECOND, info->sample_rate);
/* Regular, unclipped MPEG frame duration. */
info->regular_frame_duration =
gst_util_uint64_scale_int (info->num_samples_per_frame, GST_SECOND,
info->sample_rate);
/* The total actual playtime duration. */
info->total_duration_without_padding =
gst_util_uint64_scale_int (info->num_samples_without_padding, GST_SECOND,
info->sample_rate);
}
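/* With the figures above: num_samples_with_padding = 30 * 1152 = 34560,
* num_samples_without_padding = 34560 - 1105 - 1167 = 32288 (= 1009 ms
* at 32 kHz), and first_padded_end_frame = 31 - 1 - (1167 / 1152) = 29. */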
static void
check_parsed_mpeg_frame (GaplessTestInfo * info, guint frame_num)
{
GstClockTime expected_pts = GST_CLOCK_TIME_NONE;
GstClockTime expected_duration = GST_CLOCK_TIME_NONE;
gboolean expect_audioclipmeta = FALSE;
guint64 expected_audioclipmeta_start = 0;
guint64 expected_audioclipmeta_end = 0;
GstSample *sample;
GstBuffer *buffer;
GstAudioClippingMeta *audioclip_meta;
GST_DEBUG ("checking frame %u", frame_num);
/* This is called after the frame with the given number has been output by
* mpegaudioparse. We can then pull that frame from appsink, and check its
* PTS, duration, and audioclipmeta (if we expect it to be there). */
if (frame_num == 0) {
expected_pts = 0;
expected_duration = 0;
expect_audioclipmeta = FALSE;
} else if (frame_num == 1) {
/* First frame (excluding the dummy metadata frame at the beginning of
* the MPEG stream that mpegaudioparse internally drops). This one will be
* clipped due to the padding samples at the beginning, so we expect a
* clipping meta to be there. Also, its duration will be smaller than that
* of regular, unclipped frames. */
expected_pts = 0;
expected_duration = info->first_frame_duration;
expect_audioclipmeta = TRUE;
expected_audioclipmeta_start = info->num_start_padding_samples;
expected_audioclipmeta_end = 0;
} else if (frame_num > 1 && frame_num < info->first_padded_end_frame) {
/* Regular, unclipped frame. */
expected_pts = info->first_frame_duration + (frame_num - 2) *
info->regular_frame_duration;
expected_duration = info->regular_frame_duration;
} else if (frame_num == info->first_padded_end_frame) {
/* The first frame at the end with padding samples. This one will have
* the last few valid samples, followed by the first padding samples. */
guint64 num_valid_samples = (info->num_samples_with_padding -
info->num_end_padding_samples) - (frame_num - 1) *
info->num_samples_per_frame;
guint64 num_padding_samples = info->num_samples_per_frame -
num_valid_samples;
expected_pts = info->first_frame_duration + (frame_num - 2) *
info->regular_frame_duration;
expected_duration = gst_util_uint64_scale_int (num_valid_samples,
GST_SECOND, info->sample_rate);
expect_audioclipmeta = TRUE;
expected_audioclipmeta_start = 0;
expected_audioclipmeta_end = num_padding_samples;
} else {
/* A fully clipped frame at the end of the stream. */
expected_pts = info->total_duration_without_padding;
expected_duration = 0;
expect_audioclipmeta = TRUE;
expected_audioclipmeta_start = 0;
expected_audioclipmeta_end = info->num_samples_per_frame;
}
/* Pull the frame from appsink so we can check it. */
sample = gst_app_sink_pull_sample (GST_APP_SINK (info->appsink));
fail_if (sample == NULL);
fail_unless (GST_IS_SAMPLE (sample));
buffer = gst_sample_get_buffer (sample);
fail_if (buffer == NULL);
/* Verify the sample's PTS and duration. */
fail_unless_equals_uint64 (GST_BUFFER_PTS (buffer), expected_pts);
fail_unless_equals_uint64 (GST_BUFFER_DURATION (buffer), expected_duration);
/* Check if there's audio clip metadata, and verify it if it exists. */
if (expect_audioclipmeta) {
audioclip_meta = gst_buffer_get_audio_clipping_meta (buffer);
fail_if (audioclip_meta == NULL);
fail_unless_equals_uint64 (audioclip_meta->start,
expected_audioclipmeta_start);
fail_unless_equals_uint64 (audioclip_meta->end, expected_audioclipmeta_end);
}
gst_sample_unref (sample);
}
GST_START_TEST (test_parse_gapless_and_skip_padding_samples)
{
GstElement *source, *parser, *appsink, *pipeline;
GstStateChangeReturn state_ret;
guint frame_num;
GaplessTestInfo info;
setup_gapless_test_info (&info);
pipeline = gst_pipeline_new (NULL);
source = gst_element_factory_make ("filesrc", NULL);
parser = gst_element_factory_make ("mpegaudioparse", NULL);
appsink = gst_element_factory_make ("appsink", NULL);
info.appsink = appsink;
info.parser = parser;
gst_bin_add_many (GST_BIN (pipeline), source, parser, appsink, NULL);
gst_element_link_many (source, parser, appsink, NULL);
{
char *full_filename =
g_build_filename (GST_TEST_FILES_PATH, info.filename, NULL);
g_object_set (G_OBJECT (source), "location", full_filename, NULL);
g_free (full_filename);
}
g_object_set (G_OBJECT (appsink), "async", FALSE, "sync", FALSE,
"max-buffers", 1, "enable-last-sample", FALSE, "processing-deadline",
G_MAXUINT64, NULL);
state_ret = gst_element_set_state (pipeline, GST_STATE_PLAYING);
fail_unless (state_ret != GST_STATE_CHANGE_FAILURE);
if (state_ret == GST_STATE_CHANGE_ASYNC) {
GST_LOG ("waiting for pipeline to reach PAUSED state");
state_ret = gst_element_get_state (pipeline, NULL, NULL, -1);
fail_unless_equals_int (state_ret, GST_STATE_CHANGE_SUCCESS);
}
/* Verify all frames from the test signal. */
for (frame_num = 0; frame_num < info.num_mpeg_frames; ++frame_num)
check_parsed_mpeg_frame (&info, frame_num);
/* Check what duration is returned by a query. This duration must exclude
* the padding samples. */
{
GstQuery *query;
gint64 duration;
GstFormat format;
query = gst_query_new_duration (GST_FORMAT_TIME);
fail_unless (gst_element_query (pipeline, query));
gst_query_parse_duration (query, &format, &duration);
fail_unless_equals_int (format, GST_FORMAT_TIME);
fail_unless_equals_uint64 ((guint64) duration,
info.total_duration_without_padding);
gst_query_unref (query);
}
/* Seek tests: Here we seek to a certain position that corresponds to a
* certain frame. Then we check if we indeed got that frame. */
/* Seek back to the first frame. */
{
fail_unless_equals_int (gst_element_set_state (pipeline, GST_STATE_PAUSED),
GST_STATE_CHANGE_SUCCESS);
gst_element_seek_simple (pipeline, GST_FORMAT_TIME, GST_SEEK_FLAG_FLUSH |
GST_SEEK_FLAG_KEY_UNIT, 0);
fail_unless_equals_int (gst_element_set_state (pipeline, GST_STATE_PLAYING),
GST_STATE_CHANGE_SUCCESS);
check_parsed_mpeg_frame (&info, 1);
}
/* Seek to the second frame. */
{
fail_unless_equals_int (gst_element_set_state (pipeline, GST_STATE_PAUSED),
GST_STATE_CHANGE_SUCCESS);
gst_element_seek_simple (pipeline, GST_FORMAT_TIME, GST_SEEK_FLAG_FLUSH |
GST_SEEK_FLAG_KEY_UNIT, info.first_frame_duration);
fail_unless_equals_int (gst_element_set_state (pipeline, GST_STATE_PLAYING),
GST_STATE_CHANGE_SUCCESS);
check_parsed_mpeg_frame (&info, 2);
}
/* Seek to the last frame with valid samples (= the first frame with padding
* samples at the end of the stream). */
{
GstClockTime pts = info.first_frame_duration +
(info.first_padded_end_frame - 2) * info.regular_frame_duration;
fail_unless_equals_int (gst_element_set_state (pipeline, GST_STATE_PAUSED),
GST_STATE_CHANGE_SUCCESS);
gst_element_seek_simple (pipeline, GST_FORMAT_TIME, GST_SEEK_FLAG_FLUSH |
GST_SEEK_FLAG_KEY_UNIT, pts);
fail_unless_equals_int (gst_element_set_state (pipeline, GST_STATE_PLAYING),
GST_STATE_CHANGE_SUCCESS);
check_parsed_mpeg_frame (&info, info.first_padded_end_frame);
}
gst_element_set_state (pipeline, GST_STATE_NULL);
gst_object_unref (pipeline);
}
GST_END_TEST;
static Suite *
mpegaudioparse_suite (void)
{
@@ -142,6 +432,7 @@ mpegaudioparse_suite (void)
tcase_add_test (tc_chain, test_parse_split);
tcase_add_test (tc_chain, test_parse_skip_garbage);
tcase_add_test (tc_chain, test_parse_detect_stream);
tcase_add_test (tc_chain, test_parse_gapless_and_skip_padding_samples);
return s;
}