From 6cada5b0644c3e456b583872d0cbd6e549dadb87 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <jan@centricular.com>
Date: Sat, 5 Jun 2021 03:13:52 +1000
Subject: [PATCH] qtdemux: Add support for wvtt (WebVTT) subtitles.

WebVTT in ISO MP4 is specified in ISO 14496-30,
and is needed for DASH support. It's stored in an
MP4-specific format. To handle it compatibly,
the wvtt boxes are converted back into WebVTT text
and pushed as application/x-subtitle-vtt.

Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/1182>
---
 .../gst-plugins-good/gst/isomp4/fourcc.h      |   9 +
 .../gst-plugins-good/gst/isomp4/meson.build   |   1 +
 .../gst/isomp4/qtdemux-webvtt.c               | 221 ++++++++++++++++++
 .../gst/isomp4/qtdemux-webvtt.h               |  32 +++
 .../gst-plugins-good/gst/isomp4/qtdemux.c     |  71 +++++-
 .../gst/isomp4/qtdemux_types.c                |   2 +
 6 files changed, 327 insertions(+), 9 deletions(-)
 create mode 100644 subprojects/gst-plugins-good/gst/isomp4/qtdemux-webvtt.c
 create mode 100644 subprojects/gst-plugins-good/gst/isomp4/qtdemux-webvtt.h

diff --git a/subprojects/gst-plugins-good/gst/isomp4/fourcc.h b/subprojects/gst-plugins-good/gst/isomp4/fourcc.h
index 8872b4bda3..5be6921e7d 100644
--- a/subprojects/gst-plugins-good/gst/isomp4/fourcc.h
+++ b/subprojects/gst-plugins-good/gst/isomp4/fourcc.h
@@ -110,6 +110,7 @@ G_BEGIN_DECLS
 #define FOURCC_cprt     GST_MAKE_FOURCC('c','p','r','t')
 #define FOURCC_crgn     GST_MAKE_FOURCC('c','r','g','n')
 #define FOURCC_ctab     GST_MAKE_FOURCC('c','t','a','b')
+#define FOURCC_ctim     GST_MAKE_FOURCC('c','t','i','m')
 #define FOURCC_ctts     GST_MAKE_FOURCC('c','t','t','s')
 #define FOURCC_cslg     GST_MAKE_FOURCC('c','s','l','g')
 #define FOURCC_d263     GST_MAKE_FOURCC('d','2','6','3')
@@ -158,6 +159,7 @@ G_BEGIN_DECLS
 #define FOURCC_hnti     GST_MAKE_FOURCC('h','n','t','i')
 #define FOURCC_hvc1     GST_MAKE_FOURCC('h','v','c','1')
 #define FOURCC_hvcC     GST_MAKE_FOURCC('h','v','c','C')
+#define FOURCC_iden     GST_MAKE_FOURCC('i','d','e','n')
 #define FOURCC_ilst     GST_MAKE_FOURCC('i','l','s','t')
 #define FOURCC_ima4     GST_MAKE_FOURCC('i','m','a','4')
 #define FOURCC_imap     GST_MAKE_FOURCC('i','m','a','p')
@@ -201,6 +203,7 @@ G_BEGIN_DECLS
 #define FOURCC_prof     GST_MAKE_FOURCC('p','r','o','f')
 #define FOURCC_enof     GST_MAKE_FOURCC('e','n','o','f')
 #define FOURCC_fiel     GST_MAKE_FOURCC('f','i','e','l')
+#define FOURCC_payl     GST_MAKE_FOURCC('p','a','y','l')
 #define FOURCC_pcst     GST_MAKE_FOURCC('p','c','s','t')
 #define FOURCC_pgap     GST_MAKE_FOURCC('p','g','a','p')
 #define FOURCC_png      GST_MAKE_FOURCC('p','n','g',' ')
@@ -242,6 +245,7 @@ G_BEGIN_DECLS
 #define FOURCC_stsd     GST_MAKE_FOURCC('s','t','s','d')
 #define FOURCC_stss     GST_MAKE_FOURCC('s','t','s','s')
 #define FOURCC_stsz     GST_MAKE_FOURCC('s','t','s','z')
+#define FOURCC_sttg     GST_MAKE_FOURCC('s','t','t','g')
 #define FOURCC_stts     GST_MAKE_FOURCC('s','t','t','s')
 #define FOURCC_styp     GST_MAKE_FOURCC('s','t','y','p')
 #define FOURCC_subp     GST_MAKE_FOURCC('s','u','b','p')
@@ -271,9 +275,14 @@ G_BEGIN_DECLS
 #define FOURCC_vp08     GST_MAKE_FOURCC('v','p','0','8')
 #define FOURCC_vp09     GST_MAKE_FOURCC('v','p','0','9')
 #define FOURCC_vpcC     GST_MAKE_FOURCC('v','p','c','C')
+#define FOURCC_vtta     GST_MAKE_FOURCC('v','t','t','a')
+#define FOURCC_vttc     GST_MAKE_FOURCC('v','t','t','c')
+#define FOURCC_vttC     GST_MAKE_FOURCC('v','t','t','C')
+#define FOURCC_vtte     GST_MAKE_FOURCC('v','t','t','e')
 #define FOURCC_xvid     GST_MAKE_FOURCC('x','v','i','d')
 #define FOURCC_wave     GST_MAKE_FOURCC('w','a','v','e')
 #define FOURCC_wide     GST_MAKE_FOURCC('w','i','d','e')
+#define FOURCC_wvtt     GST_MAKE_FOURCC('w','v','t','t')
 #define FOURCC_zlib     GST_MAKE_FOURCC('z','l','i','b')
 #define FOURCC_lpcm     GST_MAKE_FOURCC('l','p','c','m')
 #define FOURCC_av01     GST_MAKE_FOURCC('a','v','0','1')
diff --git a/subprojects/gst-plugins-good/gst/isomp4/meson.build b/subprojects/gst-plugins-good/gst/isomp4/meson.build
index b510c0510e..b10c09c406 100644
--- a/subprojects/gst-plugins-good/gst/isomp4/meson.build
+++ b/subprojects/gst-plugins-good/gst/isomp4/meson.build
@@ -8,6 +8,7 @@ mp4_sources = [
   'qtdemux_lang.c',
   'qtdemux_tags.c',
   'qtdemux_tree.c',
+  'qtdemux-webvtt.c',
   'gstisoff.c',
   'gstqtmux.c',
   'gstqtmoovrecover.c',
diff --git a/subprojects/gst-plugins-good/gst/isomp4/qtdemux-webvtt.c b/subprojects/gst-plugins-good/gst/isomp4/qtdemux-webvtt.c
new file mode 100644
index 0000000000..ada3d4d177
--- /dev/null
+++ b/subprojects/gst-plugins-good/gst/isomp4/qtdemux-webvtt.c
@@ -0,0 +1,221 @@
+/* GStreamer
+ * Copyright (C) 2008 Thijs Vermeir <thijsvermeir@gmail.com>
+ * Copyright (C) 2011 David Schleef <ds@schleef.org>
+ * Copyright (C) 2021 Jan Schmidt <jan@centricular.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "qtdemux-webvtt.h"
+#include <gst/base/gstbytereader.h>
+
+#include "fourcc.h"
+#include "qtdemux.h"
+#include "qtatomparser.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+GST_DEBUG_CATEGORY_EXTERN (qtdemux_debug);
+#define GST_CAT_DEFAULT qtdemux_debug
+
+/* Returns TRUE if the first box of the sample at @data is a 'vtte' box,
+ * and FALSE otherwise or when no complete box header could be read. */
+gboolean
+qtdemux_webvtt_is_empty (GstQTDemux * demux, guint8 * data, gsize size)
+{
+  GstByteReader reader;
+  guint32 box_size, box_type;
+
+  gst_byte_reader_init (&reader, data, size);
+
+  /* A box header is a 32-bit big-endian size followed by a fourcc */
+  if (gst_byte_reader_get_remaining (&reader) < 8)
+    return FALSE;
+  if (!gst_byte_reader_get_uint32_be (&reader, &box_size))
+    return FALSE;
+  if (!qt_atom_parser_get_fourcc (&reader, &box_type))
+    return FALSE;
+
+  return box_type == FOURCC_vtte;
+}
+
+struct WebvttCue
+{
+  const guint8 *cue_id;         /* payload of the 'iden' box, if present */
+  guint32 cue_id_len;
+
+  const guint8 *cue_time;       /* payload of the 'ctim' box, if present */
+  guint32 cue_time_len;
+
+  const guint8 *settings;       /* payload of the 'sttg' box, if present */
+  guint32 settings_len;
+
+  const guint8 *cue_text;       /* payload of the 'payl' box, if present */
+  guint32 cue_text_len;
+};
+
+/* Appends @timestamp to @str formatted as a WebVTT "hh:mm:ss.ttt" timestamp.
+ * The hours field is at least two digits wide and grows for longer times. */
+static void
+webvtt_append_timestamp_to_string (GstClockTime timestamp, GString * str)
+{
+  guint h, m, s, ms;
+
+  /* Derive every field from the full 64-bit timestamp instead of subtracting
+   * guint products: h * 3600 would be evaluated in 32 bits before being
+   * widened and could wrap for out-of-range timestamps. */
+  h = timestamp / (3600 * GST_SECOND);
+  m = (timestamp / (60 * GST_SECOND)) % 60;
+  s = (timestamp / GST_SECOND) % 60;
+  ms = (timestamp / GST_MSECOND) % 1000;
+
+  /* The fields are unsigned, so print them with %u rather than %d */
+  g_string_append_printf (str, "%02u:%02u:%02u.%03u", h, m, s, ms);
+}
+
+/* Walks the boxes remaining in @br and collects the cue identifier, settings
+ * and text. If a 'payl' (cue text) box was found, the cue is appended to @s as
+ * WebVTT text timed from @start to @start + @duration and TRUE is returned. */
+static gboolean
+webvtt_decode_vttc (GstQTDemux * qtdemux, GstByteReader * br,
+    GstClockTime start, GstClockTime duration, GString * s)
+{
+  struct WebvttCue cue = { 0, };
+  gboolean have_data = FALSE;
+
+  while (gst_byte_reader_get_remaining (br) >= 8) {
+    guint32 atom_size, atom_type, payload_len;
+    guint next_pos;
+
+    if (!gst_byte_reader_get_uint32_be (br, &atom_size) ||
+        !qt_atom_parser_get_fourcc (br, &atom_type))
+      break;
+
+    /* The size includes the 8-byte header; don't let atom_size - 8 wrap */
+    if (atom_size < 8 || gst_byte_reader_get_remaining (br) < atom_size - 8)
+      break;
+    payload_len = atom_size - 8;
+    next_pos = gst_byte_reader_get_pos (br) + payload_len;
+    GST_LOG_OBJECT (qtdemux, "WebVTT cue atom %" GST_FOURCC_FORMAT " len %u",
+        GST_FOURCC_ARGS (atom_type), atom_size);
+
+    switch (atom_type) {
+      case FOURCC_ctim:        /* stored, but not used when writing the cue */
+        if (!gst_byte_reader_get_data (br, payload_len, &cue.cue_time))
+          return FALSE;
+        cue.cue_time_len = payload_len;
+        break;
+      case FOURCC_iden:
+        if (!gst_byte_reader_get_data (br, payload_len, &cue.cue_id))
+          return FALSE;
+        cue.cue_id_len = payload_len;
+        break;
+      case FOURCC_sttg:
+        if (!gst_byte_reader_get_data (br, payload_len, &cue.settings))
+          return FALSE;
+        cue.settings_len = payload_len;
+        break;
+      case FOURCC_payl:
+        if (!gst_byte_reader_get_data (br, payload_len, &cue.cue_text))
+          return FALSE;
+        cue.cue_text_len = payload_len;
+        have_data = TRUE;
+        break;
+      default:                 /* other boxes are skipped */
+        break;
+    }
+    if (!gst_byte_reader_set_pos (br, next_pos))
+      break;
+  }
+
+  if (!have_data)
+    return FALSE;
+
+  if (cue.cue_id)
+    g_string_append_printf (s, "%.*s\n", cue.cue_id_len, cue.cue_id);
+  /* Cue timing line, with the optional settings on the same line */
+  webvtt_append_timestamp_to_string (start, s);
+  g_string_append (s, " --> ");
+  webvtt_append_timestamp_to_string (start + duration, s);
+  if (cue.settings)
+    g_string_append_printf (s, " %.*s", cue.settings_len, cue.settings);
+  g_string_append (s, "\n");
+  g_string_append_printf (s, "%.*s\n\n", cue.cue_text_len, cue.cue_text);
+  return TRUE;
+}
+
+/* Converts one WebVTT sample (@size bytes at @data) into WebVTT text, emitting
+ * a cue timed from @start to @start + @duration for each 'vttc' box with text.
+ * Returns a new buffer with the text, or NULL if the sample had no 'vttc'. */
+GstBuffer *
+qtdemux_webvtt_decode (GstQTDemux * qtdemux, GstClockTime start,
+    GstClockTime duration, guint8 * data, gsize size)
+{
+  GstByteReader br, cue_br;
+  GString *str = NULL;
+  GstBuffer *buf = NULL;
+
+  gst_byte_reader_init (&br, data, size);
+  while (gst_byte_reader_get_remaining (&br) >= 8) {
+    guint32 atom_size, atom_type;
+    guint next_pos;
+
+    if (!gst_byte_reader_get_uint32_be (&br, &atom_size) ||
+        !qt_atom_parser_get_fourcc (&br, &atom_type))
+      break;
+
+    /* The size includes the 8-byte header; don't let atom_size - 8 wrap */
+    if (atom_size < 8 || gst_byte_reader_get_remaining (&br) < atom_size - 8)
+      break;
+    next_pos = gst_byte_reader_get_pos (&br) - 8 + atom_size;
+
+    switch (atom_type) {
+      case FOURCC_vttc:
+        GST_LOG_OBJECT (qtdemux,
+            "WebVTT cue atom %" GST_FOURCC_FORMAT " len %u",
+            GST_FOURCC_ARGS (atom_type), atom_size);
+        if (str == NULL)
+          str = g_string_new (NULL);
+        /* Hand the cue parser only this box's payload, not what follows it */
+        if (gst_byte_reader_peek_sub_reader (&br, &cue_br, atom_size - 8))
+          webvtt_decode_vttc (qtdemux, &cue_br, start, duration, str);
+        break;
+      case FOURCC_vtte:        /* empty cue, see qtdemux_webvtt_is_empty() */
+      case FOURCC_vtta:        /* extra attributes */
+        break;
+      default:
+        GST_DEBUG_OBJECT (qtdemux,
+            "Unknown WebVTT sample atom %" GST_FOURCC_FORMAT,
+            GST_FOURCC_ARGS (atom_type));
+        break;
+    }
+    if (!gst_byte_reader_set_pos (&br, next_pos))
+      break;
+  }
+
+  if (str) {
+    gsize webvtt_len = str->len;
+    gchar *webvtt_chunk = g_string_free (str, FALSE);
+    buf = gst_buffer_new_wrapped (webvtt_chunk, webvtt_len);
+  }
+
+  return buf;
+}
diff --git a/subprojects/gst-plugins-good/gst/isomp4/qtdemux-webvtt.h b/subprojects/gst-plugins-good/gst/isomp4/qtdemux-webvtt.h
new file mode 100644
index 0000000000..d411c95092
--- /dev/null
+++ b/subprojects/gst-plugins-good/gst/isomp4/qtdemux-webvtt.h
@@ -0,0 +1,32 @@
+/* GStreamer
+ * Copyright (C) <2021> Jan Schmidt <jan@centricular.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
+ * Boston, MA 02110-1301, USA.
+ */
+#ifndef __QTDEMUX_WEBVTT_H__
+#define __QTDEMUX_WEBVTT_H__
+
+#include <gst/gst.h>
+#include "qtdemux.h"
+
+G_BEGIN_DECLS
+
+gboolean qtdemux_webvtt_is_empty(GstQTDemux *demux, guint8 *data, gsize size);
+GstBuffer *qtdemux_webvtt_decode (GstQTDemux * qtdemux, GstClockTime start, GstClockTime duration, guint8 *data, gsize size);
+
+G_END_DECLS
+
+#endif /* __QTDEMUX_WEBVTT_H__ */
diff --git a/subprojects/gst-plugins-good/gst/isomp4/qtdemux.c b/subprojects/gst-plugins-good/gst/isomp4/qtdemux.c
index 49f8ae4581..cb20edf08f 100644
--- a/subprojects/gst-plugins-good/gst/isomp4/qtdemux.c
+++ b/subprojects/gst-plugins-good/gst/isomp4/qtdemux.c
@@ -70,6 +70,7 @@
 #include "qtpalette.h"
 #include "qtdemux_tags.h"
 #include "qtdemux_tree.h"
+#include "qtdemux-webvtt.h"
 
 #include <stdlib.h>
 #include <string.h>
@@ -5775,6 +5776,41 @@ gst_qtdemux_process_buffer_text (GstQTDemux * qtdemux, QtDemuxStream * stream,
   return buf;
 }
 
+/* WebVTT sample handling according to 14496-30. Takes ownership of @buf.
+ * Returns the decoded WebVTT text, or NULL when the sample was an empty cue
+ * (a gap event is pushed instead) or contained no cue box. */
+static GstBuffer *
+gst_qtdemux_process_buffer_wvtt (GstQTDemux * qtdemux, QtDemuxStream * stream,
+    GstBuffer * buf)
+{
+  GstBuffer *outbuf = NULL;
+  GstMapInfo map;
+
+  if (!gst_buffer_map (buf, &map, GST_MAP_READ)) {
+    g_assert_not_reached ();    /* The buffer must be mappable */
+  }
+
+  if (qtdemux_webvtt_is_empty (qtdemux, map.data, map.size)) {
+    /* Push a gap event covering the empty sample */
+    stream->segment.position = GST_BUFFER_PTS (buf);
+    gst_pad_push_event (stream->pad,
+        gst_event_new_gap (stream->segment.position,
+            GST_BUFFER_DURATION (buf)));
+    if (GST_BUFFER_DURATION_IS_VALID (buf))
+      stream->segment.position += GST_BUFFER_DURATION (buf);
+  } else {
+    outbuf = qtdemux_webvtt_decode (qtdemux, GST_BUFFER_PTS (buf),
+        GST_BUFFER_DURATION (buf), map.data, map.size);
+    /* The decoder returns NULL when the sample has no vttc box */
+    if (outbuf != NULL)
+      gst_buffer_copy_into (outbuf, buf, GST_BUFFER_COPY_METADATA, 0, -1);
+  }
+
+  gst_buffer_unmap (buf, &map);
+  gst_buffer_unref (buf);
+  return outbuf;
+}
+
 static GstFlowReturn
 gst_qtdemux_push_buffer (GstQTDemux * qtdemux, QtDemuxStream * stream,
     GstBuffer * buf)
@@ -6071,6 +6107,12 @@ gst_qtdemux_decorate_and_push_buffer (GstQTDemux * qtdemux,
   /* we're going to modify the metadata */
   buf = gst_buffer_make_writable (buf);
 
+  GST_BUFFER_DTS (buf) = dts;
+  GST_BUFFER_PTS (buf) = pts;
+  GST_BUFFER_DURATION (buf) = duration;
+  GST_BUFFER_OFFSET (buf) = -1;
+  GST_BUFFER_OFFSET_END (buf) = -1;
+
   if (G_UNLIKELY (stream->process_func))
     buf = stream->process_func (qtdemux, stream, buf);
 
@@ -6078,12 +6120,6 @@ gst_qtdemux_decorate_and_push_buffer (GstQTDemux * qtdemux,
     goto exit;
   }
 
-  GST_BUFFER_DTS (buf) = dts;
-  GST_BUFFER_PTS (buf) = pts;
-  GST_BUFFER_DURATION (buf) = duration;
-  GST_BUFFER_OFFSET (buf) = -1;
-  GST_BUFFER_OFFSET_END (buf) = -1;
-
   if (!keyframe) {
     GST_BUFFER_FLAG_SET (buf, GST_BUFFER_FLAG_DELTA_UNIT);
     stream->on_keyframe = FALSE;
@@ -6312,7 +6348,8 @@ gst_qtdemux_loop_state_movie (GstQTDemux * qtdemux)
 
       /* Only send gap events on non-subtitle streams if lagging way behind. */
       if (stream->subtype == FOURCC_subp
-          || stream->subtype == FOURCC_text || stream->subtype == FOURCC_sbtl)
+          || stream->subtype == FOURCC_text || stream->subtype == FOURCC_sbtl ||
+          stream->subtype == FOURCC_wvtt)
         gap_threshold = 1 * GST_SECOND;
       else
         gap_threshold = 3 * GST_SECOND;
@@ -8854,7 +8891,7 @@ gst_qtdemux_add_stream (GstQTDemux * qtdemux,
     GST_DEBUG_OBJECT (qtdemux, "stream type, not creating pad");
   } else if (stream->subtype == FOURCC_subp || stream->subtype == FOURCC_text
       || stream->subtype == FOURCC_sbtl || stream->subtype == FOURCC_subt
-      || stream->subtype == FOURCC_clcp) {
+      || stream->subtype == FOURCC_clcp || stream->subtype == FOURCC_wvtt) {
     gchar *name = g_strdup_printf ("subtitle_%u", qtdemux->n_sub_streams);
 
     stream->pad =
@@ -12829,7 +12866,7 @@ qtdemux_parse_trak (GstQTDemux * qtdemux, GNode * trak)
       entry->sampled = TRUE;
     } else if (stream->subtype == FOURCC_subp || stream->subtype == FOURCC_text
         || stream->subtype == FOURCC_sbtl || stream->subtype == FOURCC_subt
-        || stream->subtype == FOURCC_clcp) {
+        || stream->subtype == FOURCC_clcp || stream->subtype == FOURCC_wvtt) {
 
       entry->sampled = TRUE;
       entry->sparse = TRUE;
@@ -14991,6 +15028,22 @@ qtdemux_sub_caps (GstQTDemux * qtdemux, QtDemuxStream * stream,
       _codec ("XML subtitles");
       caps = gst_caps_new_empty_simple ("application/ttml+xml");
       break;
+    case FOURCC_wvtt:
+    {
+      GstBuffer *buffer;
+      const gchar *buf = "WEBVTT\n\n";
+
+      _codec ("WebVTT subtitles");
+      caps = gst_caps_new_empty_simple ("application/x-subtitle-vtt");
+      stream->process_func = gst_qtdemux_process_buffer_wvtt;
+
+      /* FIXME: Parse the vttC atom and get the entire WEBVTT header */
+      buffer = gst_buffer_new_and_alloc (8);
+      gst_buffer_fill (buffer, 0, buf, 8);
+      stream->buffers = g_slist_append (stream->buffers, buffer);
+
+      break;
+    }
     case FOURCC_c608:
       _codec ("CEA 608 Closed Caption");
       caps =
diff --git a/subprojects/gst-plugins-good/gst/isomp4/qtdemux_types.c b/subprojects/gst-plugins-good/gst/isomp4/qtdemux_types.c
index 15ad3e5e8b..3c2c18855c 100644
--- a/subprojects/gst-plugins-good/gst/isomp4/qtdemux_types.c
+++ b/subprojects/gst-plugins-good/gst/isomp4/qtdemux_types.c
@@ -218,6 +218,7 @@ static const QtNodeType qt_node_types[] = {
   {FOURCC_pssh, "protection system specific header", 0},
   {FOURCC_tenc, "track encryption", 0},
   {FOURCC_stpp, "XML subtitle sample entry", 0},
+  {FOURCC_wvtt, "WebVTT subtitle sample entry", 0},
   {FOURCC_clcp, "Closed Caption", 0},
   {FOURCC_av01, "AV1 Sample Entry", 0},
   {FOURCC_av1C, "AV1 Codec Configuration", 0},
@@ -227,6 +228,7 @@ static const QtNodeType qt_node_types[] = {
   {FOURCC_av1M, "AV1 Metadata sample group entry", 0},
   {FOURCC_aavd, "AAX encrypted audio", 0},
   {FOURCC_adrm, "AAX DRM key data", 0},
+  {FOURCC_vttc, "VTTCueBox 14496-30", QT_FLAG_CONTAINER},
   {0, "unknown", 0,},
 };