qtdemux: Add support for wvtt (WebVTT) subtitles.

WebVTT in ISO MP4 is specified in ISO 14496-30
and is needed for DASH support. It is stored in an
MP4-specific format. To handle it compatibly,
the wvtt boxes are converted back into WebVTT text
and pushed as application/x-subtitle-vtt.

Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/1182>
This commit is contained in:
Jan Schmidt 2021-06-05 03:13:52 +10:00 committed by GStreamer Marge Bot
parent 41d6f47f2b
commit 6cada5b064
6 changed files with 327 additions and 9 deletions

View file

@ -110,6 +110,7 @@ G_BEGIN_DECLS
#define FOURCC_cprt GST_MAKE_FOURCC('c','p','r','t') #define FOURCC_cprt GST_MAKE_FOURCC('c','p','r','t')
#define FOURCC_crgn GST_MAKE_FOURCC('c','r','g','n') #define FOURCC_crgn GST_MAKE_FOURCC('c','r','g','n')
#define FOURCC_ctab GST_MAKE_FOURCC('c','t','a','b') #define FOURCC_ctab GST_MAKE_FOURCC('c','t','a','b')
#define FOURCC_ctim GST_MAKE_FOURCC('c','t','i','m')
#define FOURCC_ctts GST_MAKE_FOURCC('c','t','t','s') #define FOURCC_ctts GST_MAKE_FOURCC('c','t','t','s')
#define FOURCC_cslg GST_MAKE_FOURCC('c','s','l','g') #define FOURCC_cslg GST_MAKE_FOURCC('c','s','l','g')
#define FOURCC_d263 GST_MAKE_FOURCC('d','2','6','3') #define FOURCC_d263 GST_MAKE_FOURCC('d','2','6','3')
@ -158,6 +159,7 @@ G_BEGIN_DECLS
#define FOURCC_hnti GST_MAKE_FOURCC('h','n','t','i') #define FOURCC_hnti GST_MAKE_FOURCC('h','n','t','i')
#define FOURCC_hvc1 GST_MAKE_FOURCC('h','v','c','1') #define FOURCC_hvc1 GST_MAKE_FOURCC('h','v','c','1')
#define FOURCC_hvcC GST_MAKE_FOURCC('h','v','c','C') #define FOURCC_hvcC GST_MAKE_FOURCC('h','v','c','C')
#define FOURCC_iden GST_MAKE_FOURCC('i','d','e','n')
#define FOURCC_ilst GST_MAKE_FOURCC('i','l','s','t') #define FOURCC_ilst GST_MAKE_FOURCC('i','l','s','t')
#define FOURCC_ima4 GST_MAKE_FOURCC('i','m','a','4') #define FOURCC_ima4 GST_MAKE_FOURCC('i','m','a','4')
#define FOURCC_imap GST_MAKE_FOURCC('i','m','a','p') #define FOURCC_imap GST_MAKE_FOURCC('i','m','a','p')
@ -201,6 +203,7 @@ G_BEGIN_DECLS
#define FOURCC_prof GST_MAKE_FOURCC('p','r','o','f') #define FOURCC_prof GST_MAKE_FOURCC('p','r','o','f')
#define FOURCC_enof GST_MAKE_FOURCC('e','n','o','f') #define FOURCC_enof GST_MAKE_FOURCC('e','n','o','f')
#define FOURCC_fiel GST_MAKE_FOURCC('f','i','e','l') #define FOURCC_fiel GST_MAKE_FOURCC('f','i','e','l')
#define FOURCC_payl GST_MAKE_FOURCC('p','a','y','l')
#define FOURCC_pcst GST_MAKE_FOURCC('p','c','s','t') #define FOURCC_pcst GST_MAKE_FOURCC('p','c','s','t')
#define FOURCC_pgap GST_MAKE_FOURCC('p','g','a','p') #define FOURCC_pgap GST_MAKE_FOURCC('p','g','a','p')
#define FOURCC_png GST_MAKE_FOURCC('p','n','g',' ') #define FOURCC_png GST_MAKE_FOURCC('p','n','g',' ')
@ -242,6 +245,7 @@ G_BEGIN_DECLS
#define FOURCC_stsd GST_MAKE_FOURCC('s','t','s','d') #define FOURCC_stsd GST_MAKE_FOURCC('s','t','s','d')
#define FOURCC_stss GST_MAKE_FOURCC('s','t','s','s') #define FOURCC_stss GST_MAKE_FOURCC('s','t','s','s')
#define FOURCC_stsz GST_MAKE_FOURCC('s','t','s','z') #define FOURCC_stsz GST_MAKE_FOURCC('s','t','s','z')
#define FOURCC_sttg GST_MAKE_FOURCC('s','t','t','g')
#define FOURCC_stts GST_MAKE_FOURCC('s','t','t','s') #define FOURCC_stts GST_MAKE_FOURCC('s','t','t','s')
#define FOURCC_styp GST_MAKE_FOURCC('s','t','y','p') #define FOURCC_styp GST_MAKE_FOURCC('s','t','y','p')
#define FOURCC_subp GST_MAKE_FOURCC('s','u','b','p') #define FOURCC_subp GST_MAKE_FOURCC('s','u','b','p')
@ -271,9 +275,14 @@ G_BEGIN_DECLS
#define FOURCC_vp08 GST_MAKE_FOURCC('v','p','0','8') #define FOURCC_vp08 GST_MAKE_FOURCC('v','p','0','8')
#define FOURCC_vp09 GST_MAKE_FOURCC('v','p','0','9') #define FOURCC_vp09 GST_MAKE_FOURCC('v','p','0','9')
#define FOURCC_vpcC GST_MAKE_FOURCC('v','p','c','C') #define FOURCC_vpcC GST_MAKE_FOURCC('v','p','c','C')
#define FOURCC_vtta GST_MAKE_FOURCC('v','t','t','a')
#define FOURCC_vttc GST_MAKE_FOURCC('v','t','t','c')
#define FOURCC_vttC GST_MAKE_FOURCC('v','t','t','C')
#define FOURCC_vtte GST_MAKE_FOURCC('v','t','t','e')
#define FOURCC_xvid GST_MAKE_FOURCC('x','v','i','d') #define FOURCC_xvid GST_MAKE_FOURCC('x','v','i','d')
#define FOURCC_wave GST_MAKE_FOURCC('w','a','v','e') #define FOURCC_wave GST_MAKE_FOURCC('w','a','v','e')
#define FOURCC_wide GST_MAKE_FOURCC('w','i','d','e') #define FOURCC_wide GST_MAKE_FOURCC('w','i','d','e')
#define FOURCC_wvtt GST_MAKE_FOURCC('w','v','t','t')
#define FOURCC_zlib GST_MAKE_FOURCC('z','l','i','b') #define FOURCC_zlib GST_MAKE_FOURCC('z','l','i','b')
#define FOURCC_lpcm GST_MAKE_FOURCC('l','p','c','m') #define FOURCC_lpcm GST_MAKE_FOURCC('l','p','c','m')
#define FOURCC_av01 GST_MAKE_FOURCC('a','v','0','1') #define FOURCC_av01 GST_MAKE_FOURCC('a','v','0','1')

View file

@ -8,6 +8,7 @@ mp4_sources = [
'qtdemux_lang.c', 'qtdemux_lang.c',
'qtdemux_tags.c', 'qtdemux_tags.c',
'qtdemux_tree.c', 'qtdemux_tree.c',
'qtdemux-webvtt.c',
'gstisoff.c', 'gstisoff.c',
'gstqtmux.c', 'gstqtmux.c',
'gstqtmoovrecover.c', 'gstqtmoovrecover.c',

View file

@ -0,0 +1,221 @@
/* GStreamer
* Copyright (C) 2008 Thijs Vermeir <thijsvermeir@gmail.com>
* Copyright (C) 2011 David Schleef <ds@schleef.org>
* Copyright (C) 2021 Jan Schmidt <jan@centricular.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301, USA.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "qtdemux-webvtt.h"
#include <gst/base/gstbytereader.h>
#include "fourcc.h"
#include "qtdemux.h"
#include "qtatomparser.h"
#include <stdlib.h>
#include <string.h>
GST_DEBUG_CATEGORY_EXTERN (qtdemux_debug);
#define GST_CAT_DEFAULT qtdemux_debug
/* Check whether a WebVTT sample begins with a 'vtte' box.
 *
 * Only the first box header in @data is inspected. Returns TRUE when that
 * header can be read and its type is 'vtte', FALSE otherwise (including when
 * fewer than 8 bytes are available). @demux is not used.
 */
gboolean
qtdemux_webvtt_is_empty (GstQTDemux * demux, guint8 * data, gsize size)
{
  GstByteReader reader;
  guint32 box_size;
  guint32 box_type;
  gboolean have_header;

  gst_byte_reader_init (&reader, data, size);

  /* A box header is a 32-bit big-endian size followed by a fourcc: 8 bytes.
   * The reads are only attempted when enough bytes remain. */
  have_header = gst_byte_reader_get_remaining (&reader) >= 8
      && gst_byte_reader_get_uint32_be (&reader, &box_size)
      && qt_atom_parser_get_fourcc (&reader, &box_type);

  return have_header && box_type == FOURCC_vtte;
}
/* Pieces of a single cue collected while walking the child boxes of a
 * 'vttc' box. Each pointer is obtained from gst_byte_reader_get_data() and
 * is paired with an explicit byte length; a NULL pointer means the
 * corresponding box was not seen. */
struct WebvttCue
{
/* Payload of the 'iden' box (cue identifier) */
const guint8 *cue_id;
guint32 cue_id_len;
/* Payload of the 'ctim' box; recorded but not written to the output */
const guint8 *cue_time;
guint32 cue_time_len;
/* Payload of the 'sttg' box (cue settings) */
const guint8 *settings;
guint32 settings_len;
/* Payload of the 'payl' box (cue text) */
const guint8 *cue_text;
guint32 cue_text_len;
};
/* Append @timestamp (in nanoseconds) to @str formatted as "hh:mm:ss.ttt".
 *
 * Hours are not wrapped, so they may use more than two digits. Anything
 * below a millisecond is truncated.
 */
static void
webvtt_append_timestamp_to_string (GstClockTime timestamp, GString * str)
{
  guint h, m, s, ms;

  /* Derive every field directly from the 64-bit timestamp. Subtracting
   * "h * 3600 * GST_SECOND" would evaluate h * 3600 in 32-bit arithmetic,
   * which wraps for very large timestamps. */
  h = timestamp / (3600 * GST_SECOND);
  m = (timestamp / (60 * GST_SECOND)) % 60;
  s = (timestamp / GST_SECOND) % 60;
  ms = (timestamp / GST_MSECOND) % 1000;

  /* The fields are unsigned, so use the matching conversion specifier */
  g_string_append_printf (str, "%02u:%02u:%02u.%03u", h, m, s, ms);
}
/* Parse the child boxes of a 'vttc' cue box and append the cue to @s as
 * WebVTT text.
 *
 * @br must be positioned on the first child box; boxes are consumed until
 * fewer than 8 bytes remain or a malformed box is found. The cue timing line
 * is built from @start and @duration; the 'ctim' payload is recorded but not
 * used for output.
 *
 * Returns TRUE if a 'payl' box was found and a cue was appended to @s,
 * FALSE otherwise (nothing is appended in that case).
 */
static gboolean
webvtt_decode_vttc (GstQTDemux * qtdemux, GstByteReader * br,
    GstClockTime start, GstClockTime duration, GString * s)
{
  struct WebvttCue cue = { 0, };
  gboolean have_data = FALSE;

  while (gst_byte_reader_get_remaining (br) >= 8) {
    guint32 atom_size;
    guint32 atom_type;
    guint32 payload_len;
    guint next_pos;

    if (!gst_byte_reader_get_uint32_be (br, &atom_size) ||
        !qt_atom_parser_get_fourcc (br, &atom_type))
      break;

    /* The size covers the 8-byte header; reject anything smaller explicitly
     * instead of relying on unsigned wraparound of "atom_size - 8" */
    if (atom_size < 8)
      break;
    payload_len = atom_size - 8;

    if (gst_byte_reader_get_remaining (br) < payload_len)
      break;

    next_pos = gst_byte_reader_get_pos (br) + payload_len;

    GST_LOG_OBJECT (qtdemux, "WebVTT cue atom %" GST_FOURCC_FORMAT " len %u",
        GST_FOURCC_ARGS (atom_type), atom_size);

    switch (atom_type) {
      case FOURCC_ctim:
        if (!gst_byte_reader_get_data (br, payload_len, &cue.cue_time))
          return FALSE;
        cue.cue_time_len = payload_len;
        break;
      case FOURCC_iden:
        if (!gst_byte_reader_get_data (br, payload_len, &cue.cue_id))
          return FALSE;
        cue.cue_id_len = payload_len;
        break;
      case FOURCC_sttg:
        if (!gst_byte_reader_get_data (br, payload_len, &cue.settings))
          return FALSE;
        cue.settings_len = payload_len;
        break;
      case FOURCC_payl:
        if (!gst_byte_reader_get_data (br, payload_len, &cue.cue_text))
          return FALSE;
        cue.cue_text_len = payload_len;
        have_data = TRUE;
        break;
      default:
        /* Other child boxes are skipped */
        break;
    }

    if (!gst_byte_reader_set_pos (br, next_pos))
      break;
  }

  if (have_data) {
    /* The precision argument of "%.*s" must be an int, hence the casts */
    if (cue.cue_id)
      g_string_append_printf (s, "%.*s\n", (gint) cue.cue_id_len, cue.cue_id);

    /* Write the cue time and optional settings */
    webvtt_append_timestamp_to_string (start, s);
    g_string_append (s, " --> ");
    webvtt_append_timestamp_to_string (start + duration, s);

    if (cue.settings)
      g_string_append_printf (s, " %.*s\n", (gint) cue.settings_len,
          cue.settings);
    else
      g_string_append (s, "\n");

    g_string_append_printf (s, "%.*s\n\n", (gint) cue.cue_text_len,
        cue.cue_text);
  }

  return have_data;
}
/* Convert one ISO-BMFF WebVTT sample into WebVTT cue text.
 *
 * @data / @size: the sample, a sequence of boxes
 * @start / @duration: timing used for the cue timing line of every cue
 *
 * Every 'vttc' box in the sample is decoded and appended to the output;
 * 'vtte', 'vtta' and unknown boxes are skipped. Parsing stops at the first
 * malformed or truncated box.
 *
 * Returns a new buffer wrapping the generated text, or NULL when the sample
 * contains no 'vttc' box. The buffer may be empty if no cue had a payload.
 */
GstBuffer *
qtdemux_webvtt_decode (GstQTDemux * qtdemux, GstClockTime start,
    GstClockTime duration, guint8 * data, gsize size)
{
  GstByteReader br;
  GString *str = NULL;
  GstBuffer *buf = NULL;

  gst_byte_reader_init (&br, data, size);

  while (gst_byte_reader_get_remaining (&br) >= 8) {
    guint32 atom_size;
    guint32 atom_type;
    guint32 payload_len;
    guint payload_pos;

    if (!gst_byte_reader_get_uint32_be (&br, &atom_size) ||
        !qt_atom_parser_get_fourcc (&br, &atom_type))
      break;

    /* The size covers the 8-byte header; reject anything smaller explicitly
     * instead of relying on unsigned wraparound of "atom_size - 8" */
    if (atom_size < 8)
      break;
    payload_len = atom_size - 8;

    if (gst_byte_reader_get_remaining (&br) < payload_len)
      break;

    payload_pos = gst_byte_reader_get_pos (&br);

    switch (atom_type) {
      case FOURCC_vttc:
      {
        GstByteReader cue_br;

        GST_LOG_OBJECT (qtdemux,
            "WebVTT cue atom %" GST_FOURCC_FORMAT " len %u",
            GST_FOURCC_ARGS (atom_type), atom_size);
        if (str == NULL)
          str = g_string_new (NULL);

        /* Give the cue parser a reader limited to this box's payload so it
         * cannot run past the cue and interpret sibling boxes as children */
        gst_byte_reader_init (&cue_br, data + payload_pos, payload_len);

        /* A cue without a payload appends nothing; keep going either way */
        webvtt_decode_vttc (qtdemux, &cue_br, start, duration, str);
        break;
      }
      case FOURCC_vtte:
        /* The empty segment case should be handled separately using
         * qtdemux_webvtt_is_empty(). Ignore it during decode */
        break;
      case FOURCC_vtta:
        /* extra attributes */
        break;
      default:
        GST_DEBUG_OBJECT (qtdemux,
            "Unknown WebVTT sample atom %" GST_FOURCC_FORMAT,
            GST_FOURCC_ARGS (atom_type));
        break;
    }

    if (!gst_byte_reader_set_pos (&br, payload_pos + payload_len))
      break;
  }

  if (str) {
    /* Read the length first: g_string_free() releases the GString itself
     * and hands back only the character data */
    gsize webvtt_len = str->len;
    gchar *webvtt_chunk = g_string_free (str, FALSE);

    buf = gst_buffer_new_wrapped (webvtt_chunk, webvtt_len);
  }

  return buf;
}

View file

@ -0,0 +1,32 @@
/* GStreamer
* Copyright (C) <2021> Jan Schmidt <jan@centricular.com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Library General Public License for more details.
*
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301, USA.
*/
#include <gst/gst.h>
#include "qtdemux.h"
#ifndef __QTDEMUX_WEBVTT_H__
#define __QTDEMUX_WEBVTT_H__
G_BEGIN_DECLS
/* Returns TRUE if the first box in the sample at @data (@size bytes) is a
 * 'vtte' box, FALSE otherwise. */
gboolean qtdemux_webvtt_is_empty(GstQTDemux *demux, guint8 *data, gsize size);
/* Converts the 'vttc' boxes of the sample at @data (@size bytes) into WebVTT
 * cue text, using @start and @duration for the cue timing line. Returns a
 * newly created buffer holding the text, or NULL if the sample contains no
 * 'vttc' box. */
GstBuffer *qtdemux_webvtt_decode (GstQTDemux * qtdemux, GstClockTime start, GstClockTime duration, guint8 *data, gsize size);
G_END_DECLS
#endif

View file

@ -70,6 +70,7 @@
#include "qtpalette.h" #include "qtpalette.h"
#include "qtdemux_tags.h" #include "qtdemux_tags.h"
#include "qtdemux_tree.h" #include "qtdemux_tree.h"
#include "qtdemux-webvtt.h"
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
@ -5775,6 +5776,41 @@ gst_qtdemux_process_buffer_text (GstQTDemux * qtdemux, QtDemuxStream * stream,
return buf; return buf;
} }
/* WebVTT sample handling according to 14496-30
 *
 * Takes ownership of @buf, which must already carry its PTS and duration.
 * A sample that starts with a 'vtte' box is turned into a gap event on the
 * stream pad and NULL is returned. Any other sample is converted to WebVTT
 * text and returned as a new buffer carrying the metadata of @buf. NULL is
 * also returned when the sample contains no cue box.
 */
static GstBuffer *
gst_qtdemux_process_buffer_wvtt (GstQTDemux * qtdemux, QtDemuxStream * stream,
    GstBuffer * buf)
{
  GstBuffer *outbuf = NULL;
  GstMapInfo map;

  if (!gst_buffer_map (buf, &map, GST_MAP_READ)) {
    g_assert_not_reached ();    /* The buffer must be mappable */
    /* Only reached when assertions are compiled out: never touch the
     * uninitialised map, just drop the sample */
    gst_buffer_unref (buf);
    return NULL;
  }

  if (qtdemux_webvtt_is_empty (qtdemux, map.data, map.size)) {
    GstEvent *gap = NULL;

    /* Push a gap event */
    stream->segment.position = GST_BUFFER_PTS (buf);
    gap =
        gst_event_new_gap (stream->segment.position, GST_BUFFER_DURATION (buf));
    gst_pad_push_event (stream->pad, gap);

    if (GST_BUFFER_DURATION_IS_VALID (buf))
      stream->segment.position += GST_BUFFER_DURATION (buf);
  } else {
    outbuf =
        qtdemux_webvtt_decode (qtdemux, GST_BUFFER_PTS (buf),
        GST_BUFFER_DURATION (buf), map.data, map.size);

    /* The decoder returns NULL when the sample holds no cue box; there is
     * nothing to copy the metadata onto in that case */
    if (outbuf != NULL)
      gst_buffer_copy_into (outbuf, buf, GST_BUFFER_COPY_METADATA, 0, -1);
  }

  gst_buffer_unmap (buf, &map);
  gst_buffer_unref (buf);

  return outbuf;
}
static GstFlowReturn static GstFlowReturn
gst_qtdemux_push_buffer (GstQTDemux * qtdemux, QtDemuxStream * stream, gst_qtdemux_push_buffer (GstQTDemux * qtdemux, QtDemuxStream * stream,
GstBuffer * buf) GstBuffer * buf)
@ -6071,6 +6107,12 @@ gst_qtdemux_decorate_and_push_buffer (GstQTDemux * qtdemux,
/* we're going to modify the metadata */ /* we're going to modify the metadata */
buf = gst_buffer_make_writable (buf); buf = gst_buffer_make_writable (buf);
GST_BUFFER_DTS (buf) = dts;
GST_BUFFER_PTS (buf) = pts;
GST_BUFFER_DURATION (buf) = duration;
GST_BUFFER_OFFSET (buf) = -1;
GST_BUFFER_OFFSET_END (buf) = -1;
if (G_UNLIKELY (stream->process_func)) if (G_UNLIKELY (stream->process_func))
buf = stream->process_func (qtdemux, stream, buf); buf = stream->process_func (qtdemux, stream, buf);
@ -6078,12 +6120,6 @@ gst_qtdemux_decorate_and_push_buffer (GstQTDemux * qtdemux,
goto exit; goto exit;
} }
GST_BUFFER_DTS (buf) = dts;
GST_BUFFER_PTS (buf) = pts;
GST_BUFFER_DURATION (buf) = duration;
GST_BUFFER_OFFSET (buf) = -1;
GST_BUFFER_OFFSET_END (buf) = -1;
if (!keyframe) { if (!keyframe) {
GST_BUFFER_FLAG_SET (buf, GST_BUFFER_FLAG_DELTA_UNIT); GST_BUFFER_FLAG_SET (buf, GST_BUFFER_FLAG_DELTA_UNIT);
stream->on_keyframe = FALSE; stream->on_keyframe = FALSE;
@ -6312,7 +6348,8 @@ gst_qtdemux_loop_state_movie (GstQTDemux * qtdemux)
/* Only send gap events on non-subtitle streams if lagging way behind. */ /* Only send gap events on non-subtitle streams if lagging way behind. */
if (stream->subtype == FOURCC_subp if (stream->subtype == FOURCC_subp
|| stream->subtype == FOURCC_text || stream->subtype == FOURCC_sbtl) || stream->subtype == FOURCC_text || stream->subtype == FOURCC_sbtl ||
stream->subtype == FOURCC_wvtt)
gap_threshold = 1 * GST_SECOND; gap_threshold = 1 * GST_SECOND;
else else
gap_threshold = 3 * GST_SECOND; gap_threshold = 3 * GST_SECOND;
@ -8854,7 +8891,7 @@ gst_qtdemux_add_stream (GstQTDemux * qtdemux,
GST_DEBUG_OBJECT (qtdemux, "stream type, not creating pad"); GST_DEBUG_OBJECT (qtdemux, "stream type, not creating pad");
} else if (stream->subtype == FOURCC_subp || stream->subtype == FOURCC_text } else if (stream->subtype == FOURCC_subp || stream->subtype == FOURCC_text
|| stream->subtype == FOURCC_sbtl || stream->subtype == FOURCC_subt || stream->subtype == FOURCC_sbtl || stream->subtype == FOURCC_subt
|| stream->subtype == FOURCC_clcp) { || stream->subtype == FOURCC_clcp || stream->subtype == FOURCC_wvtt) {
gchar *name = g_strdup_printf ("subtitle_%u", qtdemux->n_sub_streams); gchar *name = g_strdup_printf ("subtitle_%u", qtdemux->n_sub_streams);
stream->pad = stream->pad =
@ -12829,7 +12866,7 @@ qtdemux_parse_trak (GstQTDemux * qtdemux, GNode * trak)
entry->sampled = TRUE; entry->sampled = TRUE;
} else if (stream->subtype == FOURCC_subp || stream->subtype == FOURCC_text } else if (stream->subtype == FOURCC_subp || stream->subtype == FOURCC_text
|| stream->subtype == FOURCC_sbtl || stream->subtype == FOURCC_subt || stream->subtype == FOURCC_sbtl || stream->subtype == FOURCC_subt
|| stream->subtype == FOURCC_clcp) { || stream->subtype == FOURCC_clcp || stream->subtype == FOURCC_wvtt) {
entry->sampled = TRUE; entry->sampled = TRUE;
entry->sparse = TRUE; entry->sparse = TRUE;
@ -14991,6 +15028,22 @@ qtdemux_sub_caps (GstQTDemux * qtdemux, QtDemuxStream * stream,
_codec ("XML subtitles"); _codec ("XML subtitles");
caps = gst_caps_new_empty_simple ("application/ttml+xml"); caps = gst_caps_new_empty_simple ("application/ttml+xml");
break; break;
case FOURCC_wvtt:
{
GstBuffer *buffer;
const gchar *buf = "WEBVTT\n\n";
_codec ("WebVTT subtitles");
caps = gst_caps_new_empty_simple ("application/x-subtitle-vtt");
stream->process_func = gst_qtdemux_process_buffer_wvtt;
/* FIXME: Parse the vttC atom and get the entire WEBVTT header */
buffer = gst_buffer_new_and_alloc (8);
gst_buffer_fill (buffer, 0, buf, 8);
stream->buffers = g_slist_append (stream->buffers, buffer);
break;
}
case FOURCC_c608: case FOURCC_c608:
_codec ("CEA 608 Closed Caption"); _codec ("CEA 608 Closed Caption");
caps = caps =

View file

@ -218,6 +218,7 @@ static const QtNodeType qt_node_types[] = {
{FOURCC_pssh, "protection system specific header", 0}, {FOURCC_pssh, "protection system specific header", 0},
{FOURCC_tenc, "track encryption", 0}, {FOURCC_tenc, "track encryption", 0},
{FOURCC_stpp, "XML subtitle sample entry", 0}, {FOURCC_stpp, "XML subtitle sample entry", 0},
{FOURCC_wvtt, "WebVTT subtitle sample entry", 0},
{FOURCC_clcp, "Closed Caption", 0}, {FOURCC_clcp, "Closed Caption", 0},
{FOURCC_av01, "AV1 Sample Entry", 0}, {FOURCC_av01, "AV1 Sample Entry", 0},
{FOURCC_av1C, "AV1 Codec Configuration", 0}, {FOURCC_av1C, "AV1 Codec Configuration", 0},
@ -227,6 +228,7 @@ static const QtNodeType qt_node_types[] = {
{FOURCC_av1M, "AV1 Metadata sample group entry", 0}, {FOURCC_av1M, "AV1 Metadata sample group entry", 0},
{FOURCC_aavd, "AAX encrypted audio", 0}, {FOURCC_aavd, "AAX encrypted audio", 0},
{FOURCC_adrm, "AAX DRM key data", 0}, {FOURCC_adrm, "AAX DRM key data", 0},
{FOURCC_vttc, "VTTCueBox 14496-30", QT_FLAG_CONTAINER},
{0, "unknown", 0,}, {0, "unknown", 0,},
}; };