From 2f177a7616d080937752fe19f15d633fd9e32da3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tim-Philipp=20M=C3=BCller?= Date: Sun, 16 Dec 2012 12:05:02 +0000 Subject: [PATCH] typefinding: detect stand-alone SSA/ASS subtitle files https://bugzilla.gnome.org/show_bug.cgi?id=625113 --- gst/typefind/gsttypefindfunctions.c | 90 +++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/gst/typefind/gsttypefindfunctions.c b/gst/typefind/gsttypefindfunctions.c index 9203bccb90..258ec24f27 100644 --- a/gst/typefind/gsttypefindfunctions.c +++ b/gst/typefind/gsttypefindfunctions.c @@ -4699,6 +4699,93 @@ dvdiso_type_find (GstTypeFind * tf, gpointer private) "application/octet-stream", NULL); } +/* SSA/ASS subtitles + * + * http://en.wikipedia.org/wiki/SubStation_Alpha + * http://matroska.org/technical/specs/subtitles/ssa.html + */ +static void +ssa_type_find (GstTypeFind * tf, gpointer private) +{ + const gchar *start, *end, *ver_str, *media_type = NULL; + const guint8 *data; + gchar *str, *script_type, *p = NULL; + gint64 len; + + data = gst_type_find_peek (tf, 0, 32); + + if (data == NULL) + return; + + /* there might be a BOM at the beginning */ + if (memcmp (data, "[Script Info]", 13) != 0 && + memcmp (data + 2, "[Script Info]", 13) != 0 && + memcmp (data + 3, "[Script Info]", 13) != 0 && + memcmp (data + 4, "[Script Info]", 13) != 0) { + return; + } + + /* now check if we have SSA or ASS */ + len = gst_type_find_get_length (tf); + if (len > 8192) + len = 8192; + + data = gst_type_find_peek (tf, 0, len); + if (data == NULL) + return; + + /* skip BOM */ + start = (gchar *) memchr (data, '[', 5); + g_assert (start); + len -= (start - (gchar *) data); + + /* ignore anything non-UTF8 for now, in future we might at least allow + * other UTF variants that are clearly prefixed with the appropriate BOM */ + if (!g_utf8_validate (start, len, &end) && (len - (end - start)) > 6) { + GST_FIXME ("non-UTF8 SSA/ASS file"); + return; + } + + /* something at start, but not a UTF-8 BOM? */ + if (data[0] != '[' && (data[0] != 0xEF || data[1] != 0xBB || data[2] != 0xBF)) + return; + + /* ignore any partial UTF-8 characters at the end */ + len = end - start; + + /* create a NUL-terminated string so it's easier to process it safely */ + str = g_strndup (start, len - 1); + script_type = strstr (str, "ScriptType:"); + if (script_type != NULL) { + gdouble version; + + ver_str = script_type + 11; + while (*ver_str == ' ' || *ver_str == 'v' || *ver_str == 'V') + ++ver_str; + version = g_ascii_strtod (ver_str, &p); + if (version == 4.0 && p != NULL && *p == '+') + media_type = "application/x-ass"; + else if (version >= 1.0 && version <= 4.0) + media_type = "application/x-ssa"; + } + + if (media_type == NULL) { + if (strstr (str, "[v4+ Styles]") || strstr (str, "[V4+ Styles]")) + media_type = "application/x-ass"; + else if (strstr (str, "[v4 Styles]") || strstr (str, "[V4 Styles]")) + media_type = "application/x-ssa"; + } + + if (media_type != NULL) { + gst_type_find_suggest_simple (tf, GST_TYPE_FIND_MAXIMUM, + media_type, "parsed", G_TYPE_BOOLEAN, FALSE, NULL); + } else { + GST_WARNING ("could not detect SSA/ASS variant"); + } + + g_free (str); +} + /*** generic typefind for streams that have some data at a specific position***/ typedef struct { @@ -5073,6 +5160,9 @@ plugin_init (GstPlugin * plugin) TYPE_FIND_REGISTER (plugin, "application/octet-stream", GST_RANK_MARGINAL, dvdiso_type_find, NULL, NULL, NULL, NULL); + TYPE_FIND_REGISTER (plugin, "application/x-ssa", GST_RANK_SECONDARY, + ssa_type_find, "ssa,ass", NULL, NULL, NULL); + return TRUE; }