/* GStreamer SAMI subtitle parser
 * Copyright (c) 2006, 2013 Young-Ho Cha <ganadist at gmail com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 */

#include "samiparse.h"

#include <glib.h>
#include <errno.h>
#include <string.h>
#include <stdlib.h>

/* Single-character flags stored in GstSamiContext::state, one per tag that
 * has been opened and not yet closed. */
#define ITALIC_TAG 'i'
#define SPAN_TAG   's'
#define RUBY_TAG   'r'
#define RT_TAG     't'
/* Pseudo flag: popping it closes every tag that is still open. */
#define CLEAR_TAG  '0'

typedef struct _HtmlParser HtmlParser;
typedef struct _HtmlContext HtmlContext;
typedef struct _GstSamiContext GstSamiContext;

struct _GstSamiContext
{
  GString *buf;                 /* buffer to collect content */
  GString *rubybuf;             /* buffer to collect ruby content */
  GString *resultbuf;           /* when opening the next 'sync' tag, move
                                 * from 'buf' to avoid to append following
                                 * content */
  GString *state;               /* in many sami files there are tags that
                                 * are not closed, so for each open tag the
                                 * parser will append a tag flag here so
                                 * that tags can be closed properly on
                                 * 'sync' tags. See sami_context_push_state()
                                 * and sami_context_pop_state(). */
  HtmlContext *htmlctxt;        /* html parser context */
  gboolean has_result;          /* set when ready to push out result */
  gboolean in_sync;             /* flag to avoid appending anything except the
                                 * content of the sync elements to buf */
  guint64 time1;                /* previous start attribute in sync tag */
  guint64 time2;                /* current start attribute in sync tag */
};

/* Callback table invoked by html_context_parse() for every tag opening,
 * tag closing and run of text found in the input. */
struct _HtmlParser
{
  void (*start_element) (HtmlContext * ctx,
      const gchar * name, const gchar ** attr, gpointer user_data);
  void (*end_element) (HtmlContext * ctx,
      const gchar * name, gpointer user_data);
  void (*text) (HtmlContext * ctx,
      const gchar * text, gsize text_len, gpointer user_data);
};

/* State of the minimal tag tokenizer: the callbacks, the pointer handed back
 * to them, and the input that has been received but not yet consumed. */
struct _HtmlContext
{
  const HtmlParser *parser;
  gpointer user_data;
  GString *buf;
};

/* Allocates a tokenizer context bound to @parser and @user_data.
 * Release it with html_context_free(). */
static HtmlContext *
html_context_new (HtmlParser * parser, gpointer user_data)
{
  HtmlContext *ctxt = (HtmlContext *) g_new0 (HtmlContext, 1);

  ctxt->parser = parser;
  ctxt->user_data = user_data;
  ctxt->buf = g_string_new (NULL);
  return ctxt;
}

/* Frees @ctxt together with its pending input buffer. */
static void
html_context_free (HtmlContext * ctxt)
{
  g_string_free (ctxt->buf, TRUE);
  g_free (ctxt);
}

/* Maps an entity name (without the leading '&', with the trailing ';')
 * to the code point it stands for. Tables end with a {0, NULL} entry. */
struct EntityMap
{
  const gunichar unescaped;
  const gchar *escaped;
};

struct EntityMap XmlEntities[] = {
  {34, "quot;"},
  {38, "amp;"},
  {39, "apos;"},
  {60, "lt;"},
  {62, "gt;"},
  {0, NULL},
};

struct EntityMap HtmlEntities[] = {
/* nbsp will handle manually
{ 160,	"nbsp;" }, */
  {161, "iexcl;"}, {162, "cent;"}, {163, "pound;"}, {164, "curren;"},
  {165, "yen;"}, {166, "brvbar;"}, {167, "sect;"}, {168, "uml;"},
  {169, "copy;"}, {170, "ordf;"}, {171, "laquo;"}, {172, "not;"},
  {173, "shy;"}, {174, "reg;"}, {175, "macr;"}, {176, "deg;"},
  {177, "plusmn;"}, {178, "sup2;"}, {179, "sup3;"}, {180, "acute;"},
  {181, "micro;"}, {182, "para;"}, {183, "middot;"}, {184, "cedil;"},
  {185, "sup1;"}, {186, "ordm;"}, {187, "raquo;"}, {188, "frac14;"},
  {189, "frac12;"}, {190, "frac34;"}, {191, "iquest;"}, {192, "Agrave;"},
  {193, "Aacute;"}, {194, "Acirc;"}, {195, "Atilde;"}, {196, "Auml;"},
  {197, "Aring;"}, {198, "AElig;"}, {199, "Ccedil;"}, {200, "Egrave;"},
  {201, "Eacute;"}, {202, "Ecirc;"}, {203, "Euml;"}, {204, "Igrave;"},
  {205, "Iacute;"}, {206, "Icirc;"}, {207, "Iuml;"}, {208, "ETH;"},
  {209, "Ntilde;"}, {210, "Ograve;"}, {211, "Oacute;"}, {212, "Ocirc;"},
  {213, "Otilde;"}, {214, "Ouml;"}, {215, "times;"}, {216, "Oslash;"},
  {217, "Ugrave;"}, {218, "Uacute;"}, {219, "Ucirc;"}, {220, "Uuml;"},
  {221, "Yacute;"}, {222, "THORN;"}, {223, "szlig;"}, {224, "agrave;"},
  {225, "aacute;"}, {226, "acirc;"}, {227, "atilde;"}, {228, "auml;"},
  {229, "aring;"}, {230, "aelig;"}, {231, "ccedil;"}, {232, "egrave;"},
  {233, "eacute;"}, {234, "ecirc;"}, {235, "euml;"}, {236, "igrave;"},
  {237, "iacute;"}, {238, "icirc;"}, {239, "iuml;"}, {240, "eth;"},
  {241, "ntilde;"}, {242, "ograve;"}, {243, "oacute;"}, {244, "ocirc;"},
  {245, "otilde;"}, {246, "ouml;"}, {247, "divide;"}, {248, "oslash;"},
  {249, "ugrave;"}, {250, "uacute;"}, {251, "ucirc;"}, {252, "uuml;"},
  {253, "yacute;"}, {254, "thorn;"}, {255, "yuml;"}, {338, "OElig;"},
  {339, "oelig;"}, {352, "Scaron;"}, {353, "scaron;"}, {376, "Yuml;"},
  {402, "fnof;"}, {710, "circ;"}, {732, "tilde;"}, {913, "Alpha;"},
  {914, "Beta;"}, {915, "Gamma;"}, {916, "Delta;"}, {917, "Epsilon;"},
  {918, "Zeta;"}, {919, "Eta;"}, {920, "Theta;"}, {921, "Iota;"},
  {922, "Kappa;"}, {923, "Lambda;"}, {924, "Mu;"}, {925, "Nu;"},
  {926, "Xi;"}, {927, "Omicron;"}, {928, "Pi;"}, {929, "Rho;"},
  {931, "Sigma;"}, {932, "Tau;"}, {933, "Upsilon;"}, {934, "Phi;"},
  {935, "Chi;"}, {936, "Psi;"}, {937, "Omega;"}, {945, "alpha;"},
  {946, "beta;"}, {947, "gamma;"}, {948, "delta;"}, {949, "epsilon;"},
  {950, "zeta;"}, {951, "eta;"}, {952, "theta;"}, {953, "iota;"},
  {954, "kappa;"}, {955, "lambda;"}, {956, "mu;"}, {957, "nu;"},
  {958, "xi;"}, {959, "omicron;"}, {960, "pi;"}, {961, "rho;"},
  {962, "sigmaf;"}, {963, "sigma;"}, {964, "tau;"}, {965, "upsilon;"},
  {966, "phi;"}, {967, "chi;"}, {968, "psi;"}, {969, "omega;"},
  {977, "thetasym;"}, {978, "upsih;"}, {982, "piv;"}, {8194, "ensp;"},
  {8195, "emsp;"}, {8201, "thinsp;"}, {8204, "zwnj;"}, {8205, "zwj;"},
  {8206, "lrm;"}, {8207, "rlm;"}, {8211, "ndash;"}, {8212, "mdash;"},
  {8216, "lsquo;"}, {8217, "rsquo;"}, {8218, "sbquo;"}, {8220, "ldquo;"},
  {8221, "rdquo;"}, {8222, "bdquo;"}, {8224, "dagger;"}, {8225, "Dagger;"},
  {8226, "bull;"}, {8230, "hellip;"}, {8240, "permil;"}, {8242, "prime;"},
  {8243, "Prime;"}, {8249, "lsaquo;"}, {8250, "rsaquo;"}, {8254, "oline;"},
  {8260, "frasl;"}, {8364, "euro;"}, {8465, "image;"}, {8472, "weierp;"},
  {8476, "real;"}, {8482, "trade;"}, {8501, "alefsym;"}, {8592, "larr;"},
  {8593, "uarr;"}, {8594, "rarr;"}, {8595, "darr;"}, {8596, "harr;"},
  {8629, "crarr;"}, {8656, "lArr;"}, {8657, "uArr;"}, {8658, "rArr;"},
  {8659, "dArr;"}, {8660, "hArr;"}, {8704, "forall;"}, {8706, "part;"},
  {8707, "exist;"}, {8709, "empty;"}, {8711, "nabla;"}, {8712, "isin;"},
  {8713, "notin;"}, {8715, "ni;"}, {8719, "prod;"}, {8721, "sum;"},
  {8722, "minus;"}, {8727, "lowast;"}, {8730, "radic;"}, {8733, "prop;"},
  {8734, "infin;"}, {8736, "ang;"}, {8743, "and;"}, {8744, "or;"},
  {8745, "cap;"}, {8746, "cup;"}, {8747, "int;"}, {8756, "there4;"},
  {8764, "sim;"}, {8773, "cong;"}, {8776, "asymp;"}, {8800, "ne;"},
  {8801, "equiv;"}, {8804, "le;"}, {8805, "ge;"}, {8834, "sub;"},
  {8835, "sup;"}, {8836, "nsub;"}, {8838, "sube;"}, {8839, "supe;"},
  {8853, "oplus;"}, {8855, "otimes;"}, {8869, "perp;"}, {8901, "sdot;"},
  {8968, "lceil;"}, {8969, "rceil;"}, {8970, "lfloor;"}, {8971, "rfloor;"},
  {9001, "lang;"}, {9002, "rang;"}, {9674, "loz;"}, {9824, "spades;"},
  {9827, "clubs;"}, {9829, "hearts;"}, {9830, "diams;"},
  {0, NULL},
};

/* Returns a newly allocated copy of @text (free with g_free()) in which:
 *  - "&nbsp" / "&nbsp;" becomes U+00A0,
 *  - the five XML entities are kept escaped (written back as '&' + name),
 *  - the HTML entities listed in HtmlEntities become their UTF-8 character,
 *  - numeric references ("&#NNN;", "&#xHHH;") become their UTF-8 character,
 *  - any other '&' is copied through as a bare '&',
 *  - every run of ASCII whitespace collapses to one space.
 */
static gchar *
unescape_string (const gchar * text)
{
  gint i;
  GString *unescaped = g_string_new (NULL);

  while (*text) {
    if (*text == '&') {
      text++;

      /* unescape "&nbsp" and "&nbsp;" */
      if (!g_ascii_strncasecmp (text, "nbsp", 4)) {
        unescaped = g_string_append_unichar (unescaped, 160);
        text += 4;
        if (*text == ';') {
          text++;
        }
        goto next;
      }

      /* pass xml entities. these will be processed as pango markup */
      for (i = 0; XmlEntities[i].escaped; i++) {
        gssize len = strlen (XmlEntities[i].escaped);
        if (!g_ascii_strncasecmp (text, XmlEntities[i].escaped, len)) {
          unescaped = g_string_append_c (unescaped, '&');
          unescaped =
              g_string_append_len (unescaped, XmlEntities[i].escaped, len);
          text += len;
          goto next;
        }
      }

      /* convert html entities (names are matched case-sensitively) */
      for (i = 0; HtmlEntities[i].escaped; i++) {
        gssize len = strlen (HtmlEntities[i].escaped);
        if (!strncmp (text, HtmlEntities[i].escaped, len)) {
          unescaped =
              g_string_append_unichar (unescaped, HtmlEntities[i].unescaped);
          text += len;
          goto next;
        }
      }

      if (*text == '#') {
        gboolean is_hex = FALSE;
        gulong code;
        gchar *end = NULL;

        text++;
        if (*text == 'x') {
          is_hex = TRUE;
          text++;
        }
        errno = 0;
        code = strtoul (text, &end, is_hex ? 16 : 10);
        if (text == end || errno != 0) {
          /* error occurred. pass it */
          goto next;
        }
        text = end;
        if (*text == ';') {
          text++;
        }
        /* Only emit real characters. A zero would put a NUL byte into the
         * result and cut the returned C string short; values that are not
         * valid Unicode characters would yield invalid UTF-8. Such
         * references are consumed and dropped. */
        if (code != 0 && code <= 0x10FFFF
            && g_unichar_validate ((gunichar) code)) {
          unescaped = g_string_append_unichar (unescaped, (gunichar) code);
        }
        goto next;
      }

      /* escape & */
      unescaped = g_string_append (unescaped, "&");

    next:
      continue;

    } else if (g_ascii_isspace (*text)) {
      unescaped = g_string_append_c (unescaped, ' ');
      /* strip whitespace */
      do {
        text++;
      } while ((*text) && g_ascii_isspace (*text));
    } else {
      unescaped = g_string_append_c (unescaped, *text);
      text++;
    }
  }

  return g_string_free (unescaped, FALSE);
}

/* Splits @string at the first occurrence of @delimiter.
 * *@first receives a newly allocated copy of the part before the delimiter,
 * or of the whole string when the delimiter is absent; the caller frees it.
 * Returns a pointer to the delimiter inside @string, or NULL if not found. */
static const gchar *
string_token (const gchar * string, const gchar * delimiter, gchar ** first)
{
  gchar *next = strstr (string, delimiter);

  if (next) {
    *first = g_strndup (string, next - string);
  } else {
    *first = g_strdup (string);
  }
  return next;
}

/* Splits the inside of a tag (@string, without the surrounding '<' and '>')
 * into an element name and name=value attribute pairs, then calls the
 * start_element callback and, when @must_close is TRUE, the end_element
 * callback right after it.
 *
 * Attributes are handed to the callback as a NULL-terminated array laid out
 * as name0, value0, name1, value1, ...  One leading and one trailing quote
 * character (" or ') is removed from each value. Values are delimited by a
 * single space, so a quoted value containing spaces is cut at its first
 * space. */
static void
html_context_handle_element (HtmlContext * ctxt,
    const gchar * string, gboolean must_close)
{
  gchar *name = NULL;
  gint count = 0, i;
  gchar **attrs;
  const gchar *found, *next;

  /* split element name and attributes */
  next = string_token (string, " ", &name);

  if (next) {
    /* count attributes: every '=' can introduce at most one pair, so this
     * is an upper bound on the number of pairs */
    found = next + 1;
    while (TRUE) {
      found = strchr (found, '=');
      if (!found)
        break;
      found++;
      count++;
    }
  }

  /* two slots per pair, plus NULL termination for g_strfreev() */
  attrs = g_new0 (gchar *, (count + 1) * 2);

  /* 'i' indexes array slots and advances by two per pair, so the bound is
   * twice the number of pairs */
  for (i = 0; i < count * 2 && next != NULL; i += 2) {
    gchar *attr_name = NULL, *attr_value = NULL;
    gsize length;

    next = string_token (next + 1, "=", &attr_name);
    if (next == NULL) {
      /* no '=' left (e.g. an earlier value contained one): what remains is
       * not a name=value pair, and there is nothing to step over */
      g_free (attr_name);
      break;
    }
    /* drop blanks left around the name by repeated separators */
    attr_name = g_strstrip (attr_name);

    next = string_token (next + 1, " ", &attr_value);

    /* strip " or ' from attribute value */
    if (attr_value[0] == '"' || attr_value[0] == '\'') {
      gchar *tmp = g_strdup (attr_value + 1);
      g_free (attr_value);
      attr_value = tmp;
    }

    length = strlen (attr_value);
    if (length > 0 && (attr_value[length - 1] == '"'
            || attr_value[length - 1] == '\'')) {
      attr_value[length - 1] = '\0';
    }

    attrs[i] = attr_name;
    attrs[i + 1] = attr_value;
  }

  ctxt->parser->start_element (ctxt, name,
      (const gchar **) attrs, ctxt->user_data);
  if (must_close) {
    ctxt->parser->end_element (ctxt, name, ctxt->user_data);
  }
  g_strfreev (attrs);
  g_free (name);
}

/* Appends @text_len bytes of @text to the pending input and tokenizes as
 * much of it as possible, invoking the parser callbacks for every complete
 * tag and every run of text (text is stripped of surrounding whitespace
 * before being passed on).
 *
 * If the input ends inside a tag (a '<' without a matching '>'), only that
 * unfinished tail is kept for the next call; otherwise the pending buffer
 * is left empty. */
static void
html_context_parse (HtmlContext * ctxt, gchar * text, gsize text_len)
{
  const gchar *next = NULL;

  ctxt->buf = g_string_append_len (ctxt->buf, text, text_len);
  next = ctxt->buf->str;
  while (TRUE) {
    if (next[0] == '<') {
      gchar *element = NULL;
      gsize element_len;

      /* find <blahblah> */
      if (!strchr (next, '>')) {
        /* no tag end point. the rest of the buffer will be processed next
         * time; drop what was already consumed so that it is not parsed
         * a second time */
        g_string_erase (ctxt->buf, 0, next - ctxt->buf->str);
        return;
      }

      /* element holds the tag from '<' up to, not including, '>' */
      next = string_token (next, ">", &element);
      next++;
      element_len = strlen (element);
      if (element_len > 2 && element[element_len - 1] == '/') {
        /* handle <blah/> */
        element[element_len - 1] = '\0';
        html_context_handle_element (ctxt, element + 1, TRUE);
      } else if (element[1] == '/') {
        /* handle </blah> */
        ctxt->parser->end_element (ctxt, element + 2, ctxt->user_data);
      } else {
        /* handle <blah> */
        html_context_handle_element (ctxt, element + 1, FALSE);
      }
      g_free (element);
    } else if (strchr (next, '<')) {
      /* text followed by another tag */
      gchar *text = NULL;
      gsize length;

      next = string_token (next, "<", &text);
      text = g_strstrip (text);
      length = strlen (text);
      ctxt->parser->text (ctxt, text, length, ctxt->user_data);
      g_free (text);
    } else {
      /* trailing text: stripped in place, the buffer is emptied right
       * afterwards */
      gchar *text = (gchar *) next;
      gsize length;

      text = g_strstrip (text);
      length = strlen (text);
      ctxt->parser->text (ctxt, text, length, ctxt->user_data);
      ctxt->buf = g_string_assign (ctxt->buf, "");
      return;
    }
  }
}

/* Returns a pointer to the last occurrence of @tag in @str, or NULL when
 * the flag is not present. */
static gchar *
has_tag (GString * str, const gchar tag)
{
  return strrchr (str->str, tag);
}

/* Records that the tag identified by the flag @state has been opened. */
static void
sami_context_push_state (GstSamiContext * sctx, char state)
{
  GST_LOG ("state %c", state);
  g_string_append_c (sctx->state, state);
}

/* Closes open tags from the most recently opened one backwards until the
 * flag @state is reached (inclusive), appending the matching closing markup
 * to the content buffer and removing the closed flags from the state.
 *
 * If @state is not among the open tags nothing is changed in the content
 * buffer or the state, unless @state is CLEAR_TAG, in which case every open
 * tag is closed. Closing markup for RT_TAG goes to the ruby buffer as soon
 * as the flag is visited, independently of whether @state is found. */
static void
sami_context_pop_state (GstSamiContext * sctx, char state)
{
  GString *str = g_string_new ("");
  GString *context_state = sctx->state;
  int i;

  GST_LOG ("state %c", state);
  for (i = context_state->len - 1; i >= 0; i--) {
    switch (context_state->str[i]) {
      case ITALIC_TAG:         /* <i> */
      {
        g_string_append (str, "</i>");
        break;
      }
      case SPAN_TAG:           /* <span foreground= > */
      {
        g_string_append (str, "</span>");
        break;
      }
      case RUBY_TAG:           /* <span size= >  -- ruby */
      {
        break;
      }
      case RT_TAG:             /*  ruby */
      {
        /* FIXME: support for furigana/ruby once implemented in pango */
        g_string_append (sctx->rubybuf, "</span>");
        if (has_tag (context_state, ITALIC_TAG)) {
          g_string_append (sctx->rubybuf, "</i>");
        }
        break;
      }
      default:
        break;
    }
    if (context_state->str[i] == state) {
      g_string_append (sctx->buf, str->str);
      g_string_free (str, TRUE);
      g_string_truncate (context_state, i);
      return;
    }
  }
  if (state == CLEAR_TAG) {
    g_string_append (sctx->buf, str->str);
    g_string_truncate (context_state, 0);
  }
  g_string_free (str, TRUE);
}

/* Handles an opening <sync> tag: closes every tag still open, then, for a
 * "start" attribute, updates the time pair and moves the content collected
 * so far into the result buffer. */
static void
handle_start_sync (GstSamiContext * sctx, const gchar ** atts)
{
  int i;

  sami_context_pop_state (sctx, CLEAR_TAG);
  if (atts != NULL) {
    for (i = 0; (atts[i] != NULL); i += 2) {
      const gchar *key, *value;

      key = atts[i];
      value = atts[i + 1];

      if (!value)
        continue;
      if (!g_ascii_strcasecmp ("start", key)) {
        /* Only set a new start time if we don't have text pending */
        if (sctx->resultbuf->len == 0)
          sctx->time1 = sctx->time2;

        sctx->time2 = atoi ((const char *) value) * GST_MSECOND;
        /* never let the end time fall before the start time */
        sctx->time2 = MAX (sctx->time2, sctx->time1);
        g_string_append (sctx->resultbuf, sctx->buf->str);
        sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
        g_string_truncate (sctx->buf, 0);
      }
    }
  }
}

/* Handles an opening <font> tag: closes a span that is still open, then
 * emits a "<span ...>" with "foreground" taken from the "color" attribute
 * and "font_family" taken from the "face" attribute. */
static void
handle_start_font (GstSamiContext * sctx, const gchar ** atts)
{
  int i;

  sami_context_pop_state (sctx, SPAN_TAG);
  if (atts != NULL) {
    g_string_append (sctx->buf, "<span");
    for (i = 0; (atts[i] != NULL); i += 2) {
      const gchar *key, *value;

      key = atts[i];
      value = atts[i + 1];

      if (!value)
        continue;
      if (!g_ascii_strcasecmp ("color", key)) {
        /*
         * There are invalid color value in many
         * sami files.
         * It will fix hex color value that start without '#'
         */
        const gchar *sharp = "";
        int len = strlen (value);

        if (!(*value == '#' && len == 7)) {
          gchar *r;

          /* check if it looks like hex */
          if (strtol ((const char *) value, &r, 16) >= 0 &&
              ((gchar *) r == (value + 6) && len == 6)) {
            sharp = "#";
          }
        }
        /* some colours can be found in many sami files, but X RGB database
         * doesn't contain a colour by this name, so map explicitly */
        if (!g_ascii_strcasecmp ("aqua", value)) {
          value = "#00ffff";
        } else if (!g_ascii_strcasecmp ("crimson", value)) {
          value = "#dc143c";
        } else if (!g_ascii_strcasecmp ("fuchsia", value)) {
          value = "#ff00ff";
        } else if (!g_ascii_strcasecmp ("indigo", value)) {
          value = "#4b0082";
        } else if (!g_ascii_strcasecmp ("lime", value)) {
          value = "#00ff00";
        } else if (!g_ascii_strcasecmp ("olive", value)) {
          value = "#808000";
        } else if (!g_ascii_strcasecmp ("silver", value)) {
          value = "#c0c0c0";
        } else if (!g_ascii_strcasecmp ("teal", value)) {
          value = "#008080";
        }
        g_string_append_printf (sctx->buf, " foreground=\"%s%s\"", sharp,
            value);
      } else if (!g_ascii_strcasecmp ("face", key)) {
        g_string_append_printf (sctx->buf, " font_family=\"%s\"", value);
      }
    }
    g_string_append_c (sctx->buf, '>');
    sami_context_push_state (sctx, SPAN_TAG);
  }
}

/* start_element callback: dispatches on the (case-insensitive) tag name.
 * Tags other than sync, font, ruby, br, rt, i and p are ignored. */
static void
handle_start_element (HtmlContext * ctx, const gchar * name,
    const char **atts, gpointer user_data)
{
  GstSamiContext *sctx = (GstSamiContext *) user_data;

  GST_LOG ("name:%s", name);

  if (!g_ascii_strcasecmp ("sync", name)) {
    handle_start_sync (sctx, atts);
    sctx->in_sync = TRUE;
  } else if (!g_ascii_strcasecmp ("font", name)) {
    handle_start_font (sctx, atts);
  } else if (!g_ascii_strcasecmp ("ruby", name)) {
    sami_context_push_state (sctx, RUBY_TAG);
  } else if (!g_ascii_strcasecmp ("br", name)) {
    g_string_append_c (sctx->buf, '\n');
    /* FIXME: support for furigana/ruby once implemented in pango */
  } else if (!g_ascii_strcasecmp ("rt", name)) {
    if (has_tag (sctx->state, ITALIC_TAG)) {
      g_string_append (sctx->rubybuf, "<i>");
    }
    g_string_append (sctx->rubybuf, "<span size='xx-small' rise='-100'>");
    sami_context_push_state (sctx, RT_TAG);
  } else if (!g_ascii_strcasecmp ("i", name)) {
    g_string_append (sctx->buf, "<i>");
    sami_context_push_state (sctx, ITALIC_TAG);
  } else if (!g_ascii_strcasecmp ("p", name)) {
    /* nothing to do for paragraphs */
  }
}

/* end_element callback: leaves sync mode on </sync>, flushes the last
 * pending content on </body> or </sami>, and closes the matching open tag
 * for </font>, </ruby> and </i>. */
static void
handle_end_element (HtmlContext * ctx, const char *name, gpointer user_data)
{
  GstSamiContext *sctx = (GstSamiContext *) user_data;

  GST_LOG ("name:%s", name);

  if (!g_ascii_strcasecmp ("sync", name)) {
    sctx->in_sync = FALSE;
  } else if ((!g_ascii_strcasecmp ("body", name)) ||
      (!g_ascii_strcasecmp ("sami", name))) {
    /* We will usually have one buffer left when the body is closed
     * as we need the next sync to actually send it */
    if (sctx->buf->len != 0) {
      /* Only set a new start time if we don't have text pending */
      if (sctx->resultbuf->len == 0)
        sctx->time1 = sctx->time2;

      sctx->time2 = GST_CLOCK_TIME_NONE;
      g_string_append (sctx->resultbuf, sctx->buf->str);
      sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
      g_string_truncate (sctx->buf, 0);
    }
  } else if (!g_ascii_strcasecmp ("font", name)) {
    sami_context_pop_state (sctx, SPAN_TAG);
  } else if (!g_ascii_strcasecmp ("ruby", name)) {
    sami_context_pop_state (sctx, RUBY_TAG);
  } else if (!g_ascii_strcasecmp ("i", name)) {
    sami_context_pop_state (sctx, ITALIC_TAG);
  }
}

/* text callback: appends @text to the ruby buffer (padded with a space on
 * each side) while an <rt> tag is open, otherwise to the content buffer.
 * Text outside of sync elements is dropped. */
static void
handle_text (HtmlContext * ctx, const gchar * text, gsize text_len,
    gpointer user_data)
{
  GstSamiContext *sctx = (GstSamiContext *) user_data;

  /* Skip everything except content of the sync elements */
  if (!sctx->in_sync)
    return;

  if (has_tag (sctx->state, RT_TAG)) {
    g_string_append_c (sctx->rubybuf, ' ');
    g_string_append (sctx->rubybuf, text);
    g_string_append_c (sctx->rubybuf, ' ');
  } else {
    g_string_append (sctx->buf, text);
  }
}

static HtmlParser samiParser = {
  handle_start_element,         /* start_element */
  handle_end_element,           /* end_element   */
  handle_text,                  /* text          */
};

/* Creates the SAMI parsing context and stores it in @state->user_data,
 * which must be NULL on entry. Release it with sami_context_deinit(). */
void
sami_context_init (ParserState * state)
{
  GstSamiContext *context;

  g_assert (state->user_data == NULL);

  context = g_new0 (GstSamiContext, 1);

  context->htmlctxt = html_context_new (&samiParser, context);
  context->buf = g_string_new ("");
  context->rubybuf = g_string_new ("");
  context->resultbuf = g_string_new ("");
  context->state = g_string_new ("");

  state->user_data = context;
}

/* Frees the SAMI parsing context held in @state->user_data, if any, and
 * sets the pointer back to NULL. */
void
sami_context_deinit (ParserState * state)
{
  GstSamiContext *context = (GstSamiContext *) state->user_data;

  if (context) {
    html_context_free (context->htmlctxt);
    context->htmlctxt = NULL;
    g_string_free (context->buf, TRUE);
    g_string_free (context->rubybuf, TRUE);
    g_string_free (context->resultbuf, TRUE);
    g_string_free (context->state, TRUE);
    g_free (context);
    state->user_data = NULL;
  }
}

/* Returns the SAMI parsing context held in @state->user_data, if any, to
 * its initial state without freeing it: all collected content, the open-tag
 * state, the flags and both timestamps are cleared. */
void
sami_context_reset (ParserState * state)
{
  GstSamiContext *context = (GstSamiContext *) state->user_data;

  if (context) {
    g_string_truncate (context->buf, 0);
    g_string_truncate (context->rubybuf, 0);
    g_string_truncate (context->resultbuf, 0);
    g_string_truncate (context->state, 0);
    /* also forget an unfinished tag kept by the tokenizer, so that it is
     * not glued in front of the first data parsed after the reset */
    if (context->htmlctxt)
      g_string_truncate (context->htmlctxt->buf, 0);
    context->has_result = FALSE;
    context->in_sync = FALSE;
    context->time1 = 0;
    context->time2 = 0;
  }
}

/* Feeds one @line of SAMI input to the parser kept in @state->user_data.
 *
 * Returns NULL while no subtitle is complete. Once one is ready, returns
 * its text as a newly allocated string (free with g_free()), with pending
 * ruby text and a newline placed in front of it, and stores the start time
 * and the duration (current minus previous sync time) in @state. */
gchar *
parse_sami (ParserState * state, const gchar * line)
{
  gchar *ret = NULL;
  GstSamiContext *context = (GstSamiContext *) state->user_data;

  gchar *unescaped = unescape_string (line);
  html_context_parse (context->htmlctxt, (gchar *) unescaped,
      strlen (unescaped));
  g_free (unescaped);

  if (context->has_result) {
    if (context->rubybuf->len) {
      context->rubybuf = g_string_append_c (context->rubybuf, '\n');
      g_string_prepend (context->resultbuf, context->rubybuf->str);
      context->rubybuf = g_string_truncate (context->rubybuf, 0);
    }

    ret = g_string_free (context->resultbuf, FALSE);
    context->resultbuf = g_string_new ("");
    state->start_time = context->time1;
    state->duration = context->time2 - context->time1;
    context->has_result = FALSE;
  }
  return ret;
}