diff --git a/gst/subparse/Makefile.am b/gst/subparse/Makefile.am index 40c4be4db0..7f89447286 100644 --- a/gst/subparse/Makefile.am +++ b/gst/subparse/Makefile.am @@ -1,10 +1,6 @@ plugin_LTLIBRARIES = libgstsubparse.la -if USE_XML SAMIPARSE_SOURCES = samiparse.c samiparse.h -else -SAMIPARSE_SOURCES = -endif libgstsubparse_la_SOURCES = \ gstssaparse.c \ diff --git a/gst/subparse/gstsubparse.c b/gst/subparse/gstsubparse.c index c3f41c9db0..983bec65b0 100644 --- a/gst/subparse/gstsubparse.c +++ b/gst/subparse/gstsubparse.c @@ -55,7 +55,6 @@ gst_sub_parse_get_property (GObject * object, guint prop_id, GValue * value, GParamSpec * pspec); -#ifndef GST_DISABLE_XML static GstStaticPadTemplate sink_templ = GST_STATIC_PAD_TEMPLATE ("sink", GST_PAD_SINK, GST_PAD_ALWAYS, @@ -63,15 +62,6 @@ static GstStaticPadTemplate sink_templ = GST_STATIC_PAD_TEMPLATE ("sink", "application/x-subtitle-tmplayer; application/x-subtitle-mpl2; " "application/x-subtitle-dks; application/x-subtitle-qttext") ); -#else -static GstStaticPadTemplate sink_templ = GST_STATIC_PAD_TEMPLATE ("sink", - GST_PAD_SINK, - GST_PAD_ALWAYS, - GST_STATIC_CAPS ("application/x-subtitle; application/x-subtitle-dks; " - "application/x-subtitle-tmplayer; application/x-subtitle-mpl2; " - "application/x-subtitle-qttext") - ); -#endif static GstStaticPadTemplate src_templ = GST_STATIC_PAD_TEMPLATE ("src", GST_PAD_SRC, @@ -107,11 +97,9 @@ gst_sub_parse_dispose (GObject * object) case GST_SUB_PARSE_FORMAT_QTTEXT: qttext_context_deinit (&subparse->state); break; -#ifndef GST_DISABLE_XML case GST_SUB_PARSE_FORMAT_SAMI: sami_context_deinit (&subparse->state); break; -#endif default: break; } @@ -1173,11 +1161,9 @@ parser_state_dispose (GstSubParse * self, ParserState * state) } if (state->user_data) { switch (self->parser_type) { -#ifndef GST_DISABLE_XML case GST_SUB_PARSE_FORMAT_SAMI: sami_context_reset (state); break; -#endif default: break; } @@ -1283,13 +1269,11 @@ gst_sub_parse_data_format_autodetect (gchar * match_str) GST_LOG ("MPSub (time based) format detected"); return GST_SUB_PARSE_FORMAT_MPSUB; } -#ifndef GST_DISABLE_XML if (strstr (match_str, "") != NULL || strstr (match_str, "") != NULL) { GST_LOG ("SAMI (time based) format detected"); return GST_SUB_PARSE_FORMAT_SAMI; } -#endif /* we're boldly assuming the first subtitle appears within the first hour */ if (sscanf (match_str, "0:%02u:%02u:", &n1, &n2) == 2 || sscanf (match_str, "0:%02u:%02u=", &n1, &n2) == 2 || @@ -1348,13 +1332,11 @@ gst_sub_parse_format_autodetect (GstSubParse * self) self->parse_line = parse_mpsub; return gst_caps_new_simple ("text/x-raw", "format", G_TYPE_STRING, "utf8", NULL); -#ifndef GST_DISABLE_XML case GST_SUB_PARSE_FORMAT_SAMI: self->parse_line = parse_sami; sami_context_init (&self->state); return gst_caps_new_simple ("text/x-raw", "format", G_TYPE_STRING, "pango-markup", NULL); -#endif case GST_SUB_PARSE_FORMAT_TMPLAYER: self->parse_line = parse_tmplayer; self->state.max_duration = 5 * GST_SECOND; @@ -1409,10 +1391,8 @@ feed_textbuf (GstSubParse * self, GstBuffer * buf) parser_state_init (&self->state); g_string_truncate (self->textbuf, 0); gst_adapter_clear (self->adapter); -#ifndef GST_DISABLE_XML if (self->parser_type == GST_SUB_PARSE_FORMAT_SAMI) sami_context_reset (&self->state); -#endif /* we could set a flag to make sure that the next buffer we push out also * has the DISCONT flag set, but there's no point really given that it's * subtitles which are discontinuous by nature. */ @@ -1690,10 +1670,8 @@ GST_STATIC_CAPS ("application/x-subtitle-tmplayer"); static GstStaticCaps sub_caps = GST_STATIC_CAPS ("application/x-subtitle"); #define MPL2_CAPS (gst_static_caps_get (&mpl2_caps)) -#ifndef GST_DISABLE_XML static GstStaticCaps smi_caps = GST_STATIC_CAPS ("application/x-subtitle-sami"); #define SAMI_CAPS (gst_static_caps_get (&smi_caps)) -#endif static GstStaticCaps dks_caps = GST_STATIC_CAPS ("application/x-subtitle-dks"); #define DKS_CAPS (gst_static_caps_get (&dks_caps)) @@ -1774,12 +1752,10 @@ gst_subparse_type_find (GstTypeFind * tf, gpointer private) GST_DEBUG ("MPSub format detected"); caps = SUB_CAPS; break; -#ifndef GST_DISABLE_XML case GST_SUB_PARSE_FORMAT_SAMI: GST_DEBUG ("SAMI (time-based) format detected"); caps = SAMI_CAPS; break; -#endif case GST_SUB_PARSE_FORMAT_TMPLAYER: GST_DEBUG ("TMPlayer (time based) format detected"); caps = TMP_CAPS; diff --git a/gst/subparse/samiparse.c b/gst/subparse/samiparse.c index b61c80277d..afd58855f7 100644 --- a/gst/subparse/samiparse.c +++ b/gst/subparse/samiparse.c @@ -1,5 +1,5 @@ /* GStreamer SAMI subtitle parser - * Copyright (c) 2006 Young-Ho Cha + * Copyright (c) 2006, 2013 Young-Ho Cha * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public @@ -19,8 +19,9 @@ #include "samiparse.h" -#include +#include #include +#include #define ITALIC_TAG 'i' #define SPAN_TAG 's' @@ -28,6 +29,8 @@ #define RT_TAG 't' #define CLEAR_TAG '0' +typedef struct _HtmlParser HtmlParser; +typedef struct _HtmlContext HtmlContext; typedef struct _GstSamiContext GstSamiContext; struct _GstSamiContext @@ -43,7 +46,7 @@ struct _GstSamiContext * that tags can be closed properly on * 'sync' tags. See _context_push_state() * and _context_pop_state(). */ - htmlParserCtxtPtr htmlctxt; /* html parser context */ + HtmlContext *htmlctxt; /* html parser context */ gboolean has_result; /* set when ready to push out result */ gboolean in_sync; /* flag to avoid appending anything except the * content of the sync elements to buf */ @@ -51,6 +54,525 @@ struct _GstSamiContext guint64 time2; /* current start attribute in sync tag */ }; +struct _HtmlParser +{ + void (*start_element) (HtmlContext * ctx, + const gchar * name, const gchar ** attr, gpointer user_data); + void (*end_element) (HtmlContext * ctx, + const gchar * name, gpointer user_data); + void (*text) (HtmlContext * ctx, + const gchar * text, gsize text_len, gpointer user_data); +}; + +struct _HtmlContext +{ + const HtmlParser *parser; + gpointer user_data; + GString *buf; +}; + +static HtmlContext * +html_context_new (HtmlParser * parser, gpointer user_data) +{ + HtmlContext *ctxt = (HtmlContext *) g_new0 (HtmlContext, 1); + ctxt->parser = parser; + ctxt->user_data = user_data; + ctxt->buf = g_string_new (NULL); + return ctxt; +} + +static void +html_context_free (HtmlContext * ctxt) +{ + g_string_free (ctxt->buf, TRUE); + g_free (ctxt); +} + +struct EntityMap +{ + const gunichar unescaped; + const gchar *escaped; +}; + +struct EntityMap XmlEntities[] = { + {34, "quot;"}, + {38, "amp;"}, + {39, "apos;"}, + {60, "lt;"}, + {62, "gt;"}, + {0, NULL}, +}; + +struct EntityMap HtmlEntities[] = { +/* nbsp will handle manually +{ 160, "nbsp;" }, */ + {161, "iexcl;"}, + {162, "cent;"}, + {163, "pound;"}, + {164, "curren;"}, + {165, "yen;"}, + {166, "brvbar;"}, + {167, "sect;"}, + {168, "uml;"}, + {169, "copy;"}, + {170, "ordf;"}, + {171, "laquo;"}, + {172, "not;"}, + {173, "shy;"}, + {174, "reg;"}, + {175, "macr;"}, + {176, "deg;"}, + {177, "plusmn;"}, + {178, "sup2;"}, + {179, "sup3;"}, + {180, "acute;"}, + {181, "micro;"}, + {182, "para;"}, + {183, "middot;"}, + {184, "cedil;"}, + {185, "sup1;"}, + {186, "ordm;"}, + {187, "raquo;"}, + {188, "frac14;"}, + {189, "frac12;"}, + {190, "frac34;"}, + {191, "iquest;"}, + {192, "Agrave;"}, + {193, "Aacute;"}, + {194, "Acirc;"}, + {195, "Atilde;"}, + {196, "Auml;"}, + {197, "Aring;"}, + {198, "AElig;"}, + {199, "Ccedil;"}, + {200, "Egrave;"}, + {201, "Eacute;"}, + {202, "Ecirc;"}, + {203, "Euml;"}, + {204, "Igrave;"}, + {205, "Iacute;"}, + {206, "Icirc;"}, + {207, "Iuml;"}, + {208, "ETH;"}, + {209, "Ntilde;"}, + {210, "Ograve;"}, + {211, "Oacute;"}, + {212, "Ocirc;"}, + {213, "Otilde;"}, + {214, "Ouml;"}, + {215, "times;"}, + {216, "Oslash;"}, + {217, "Ugrave;"}, + {218, "Uacute;"}, + {219, "Ucirc;"}, + {220, "Uuml;"}, + {221, "Yacute;"}, + {222, "THORN;"}, + {223, "szlig;"}, + {224, "agrave;"}, + {225, "aacute;"}, + {226, "acirc;"}, + {227, "atilde;"}, + {228, "auml;"}, + {229, "aring;"}, + {230, "aelig;"}, + {231, "ccedil;"}, + {232, "egrave;"}, + {233, "eacute;"}, + {234, "ecirc;"}, + {235, "euml;"}, + {236, "igrave;"}, + {237, "iacute;"}, + {238, "icirc;"}, + {239, "iuml;"}, + {240, "eth;"}, + {241, "ntilde;"}, + {242, "ograve;"}, + {243, "oacute;"}, + {244, "ocirc;"}, + {245, "otilde;"}, + {246, "ouml;"}, + {247, "divide;"}, + {248, "oslash;"}, + {249, "ugrave;"}, + {250, "uacute;"}, + {251, "ucirc;"}, + {252, "uuml;"}, + {253, "yacute;"}, + {254, "thorn;"}, + {255, "yuml;"}, + {338, "OElig;"}, + {339, "oelig;"}, + {352, "Scaron;"}, + {353, "scaron;"}, + {376, "Yuml;"}, + {402, "fnof;"}, + {710, "circ;"}, + {732, "tilde;"}, + {913, "Alpha;"}, + {914, "Beta;"}, + {915, "Gamma;"}, + {916, "Delta;"}, + {917, "Epsilon;"}, + {918, "Zeta;"}, + {919, "Eta;"}, + {920, "Theta;"}, + {921, "Iota;"}, + {922, "Kappa;"}, + {923, "Lambda;"}, + {924, "Mu;"}, + {925, "Nu;"}, + {926, "Xi;"}, + {927, "Omicron;"}, + {928, "Pi;"}, + {929, "Rho;"}, + {931, "Sigma;"}, + {932, "Tau;"}, + {933, "Upsilon;"}, + {934, "Phi;"}, + {935, "Chi;"}, + {936, "Psi;"}, + {937, "Omega;"}, + {945, "alpha;"}, + {946, "beta;"}, + {947, "gamma;"}, + {948, "delta;"}, + {949, "epsilon;"}, + {950, "zeta;"}, + {951, "eta;"}, + {952, "theta;"}, + {953, "iota;"}, + {954, "kappa;"}, + {955, "lambda;"}, + {956, "mu;"}, + {957, "nu;"}, + {958, "xi;"}, + {959, "omicron;"}, + {960, "pi;"}, + {961, "rho;"}, + {962, "sigmaf;"}, + {963, "sigma;"}, + {964, "tau;"}, + {965, "upsilon;"}, + {966, "phi;"}, + {967, "chi;"}, + {968, "psi;"}, + {969, "omega;"}, + {977, "thetasym;"}, + {978, "upsih;"}, + {982, "piv;"}, + {8194, "ensp;"}, + {8195, "emsp;"}, + {8201, "thinsp;"}, + {8204, "zwnj;"}, + {8205, "zwj;"}, + {8206, "lrm;"}, + {8207, "rlm;"}, + {8211, "ndash;"}, + {8212, "mdash;"}, + {8216, "lsquo;"}, + {8217, "rsquo;"}, + {8218, "sbquo;"}, + {8220, "ldquo;"}, + {8221, "rdquo;"}, + {8222, "bdquo;"}, + {8224, "dagger;"}, + {8225, "Dagger;"}, + {8226, "bull;"}, + {8230, "hellip;"}, + {8240, "permil;"}, + {8242, "prime;"}, + {8243, "Prime;"}, + {8249, "lsaquo;"}, + {8250, "rsaquo;"}, + {8254, "oline;"}, + {8260, "frasl;"}, + {8364, "euro;"}, + {8465, "image;"}, + {8472, "weierp;"}, + {8476, "real;"}, + {8482, "trade;"}, + {8501, "alefsym;"}, + {8592, "larr;"}, + {8593, "uarr;"}, + {8594, "rarr;"}, + {8595, "darr;"}, + {8596, "harr;"}, + {8629, "crarr;"}, + {8656, "lArr;"}, + {8657, "uArr;"}, + {8658, "rArr;"}, + {8659, "dArr;"}, + {8660, "hArr;"}, + {8704, "forall;"}, + {8706, "part;"}, + {8707, "exist;"}, + {8709, "empty;"}, + {8711, "nabla;"}, + {8712, "isin;"}, + {8713, "notin;"}, + {8715, "ni;"}, + {8719, "prod;"}, + {8721, "sum;"}, + {8722, "minus;"}, + {8727, "lowast;"}, + {8730, "radic;"}, + {8733, "prop;"}, + {8734, "infin;"}, + {8736, "ang;"}, + {8743, "and;"}, + {8744, "or;"}, + {8745, "cap;"}, + {8746, "cup;"}, + {8747, "int;"}, + {8756, "there4;"}, + {8764, "sim;"}, + {8773, "cong;"}, + {8776, "asymp;"}, + {8800, "ne;"}, + {8801, "equiv;"}, + {8804, "le;"}, + {8805, "ge;"}, + {8834, "sub;"}, + {8835, "sup;"}, + {8836, "nsub;"}, + {8838, "sube;"}, + {8839, "supe;"}, + {8853, "oplus;"}, + {8855, "otimes;"}, + {8869, "perp;"}, + {8901, "sdot;"}, + {8968, "lceil;"}, + {8969, "rceil;"}, + {8970, "lfloor;"}, + {8971, "rfloor;"}, + {9001, "lang;"}, + {9002, "rang;"}, + {9674, "loz;"}, + {9824, "spades;"}, + {9827, "clubs;"}, + {9829, "hearts;"}, + {9830, "diams;"}, + {0, NULL}, +}; + +static gchar * +unescape_string (const gchar * text) +{ + gint i; + GString *unescaped = g_string_new (NULL); + + while (*text) { + if (*text == '&') { + text++; + + /* unescape   and   */ + if (!g_ascii_strncasecmp (text, "nbsp", 4)) { + unescaped = g_string_append_unichar (unescaped, 160); + text += 4; + if (*text == ';') { + text++; + } + goto next; + } + + /* pass xml entities. these will be processed as pango markup */ + for (i = 0; XmlEntities[i].escaped; i++) { + gssize len = strlen (XmlEntities[i].escaped); + if (!g_ascii_strncasecmp (text, XmlEntities[i].escaped, len)) { + unescaped = g_string_append_c (unescaped, '&'); + unescaped = + g_string_append_len (unescaped, XmlEntities[i].escaped, len); + text += len; + goto next; + } + } + + /* convert html entities */ + for (i = 0; HtmlEntities[i].escaped; i++) { + gssize len = strlen (HtmlEntities[i].escaped); + if (!strncmp (text, HtmlEntities[i].escaped, len)) { + unescaped = + g_string_append_unichar (unescaped, HtmlEntities[i].unescaped); + text += len; + goto next; + } + } + + if (*text == '#') { + gboolean is_hex = FALSE; + gunichar l; + gchar *end = NULL; + + text++; + if (*text == 'x') { + is_hex = TRUE; + text++; + } + errno = 0; + if (is_hex) { + l = strtoul (text, &end, 16); + } else { + l = strtoul (text, &end, 10); + } + + if (text == end || errno != 0) { + /* error occured. pass it */ + goto next; + } + unescaped = g_string_append_unichar (unescaped, l); + text = end; + + if (*text == ';') { + text++; + } + goto next; + } + + /* escape & */ + unescaped = g_string_append (unescaped, "&"); + + next: + continue; + + } else if (g_ascii_isspace (*text)) { + unescaped = g_string_append_c (unescaped, ' '); + /* strip whitespace */ + do { + text++; + } while ((*text) && g_ascii_isspace (*text)); + } else { + unescaped = g_string_append_c (unescaped, *text); + text++; + } + } + + return g_string_free (unescaped, FALSE); +} + +static const gchar * +string_token (const gchar * string, const gchar * delimiter, gchar ** first) +{ + gchar *next = strstr (string, delimiter); + if (next) { + *first = strndup (string, next - string); + } else { + *first = strdup (string); + } + return next; +} + +static void +html_context_handle_element (HtmlContext * ctxt, + const gchar * string, gboolean must_close) +{ + gchar *name = NULL; + gint count = 0, i; + gchar **attrs; + const gchar *found, *next; + + /* split element name and attributes */ + next = string_token (string, " ", &name); + + if (next) { + /* count attributes */ + found = next + 1; + while (TRUE) { + found = strchr (found, '='); + if (!found) + break; + found++; + count++; + } + } else { + count = 0; + } + + attrs = g_new0 (gchar *, (count + 1) * 2); + + for (i = 0; i < count; i += 2) { + gchar *attr_name = NULL, *attr_value = NULL; + gsize length; + next = string_token (next + 1, "=", &attr_name); + next = string_token (next + 1, " ", &attr_value); + + /* strip " or ' from attribute value */ + if (attr_value[0] == '"' || attr_value[0] == '\'') { + gchar *tmp = strdup (attr_value + 1); + g_free (attr_value); + attr_value = tmp; + } + + length = strlen (attr_value); + if (attr_value[length - 1] == '"' || attr_value[length - 1] == '\'') { + attr_value[length - 1] = '\0'; + } + + attrs[i] = attr_name; + attrs[i + 1] = attr_value; + } + + ctxt->parser->start_element (ctxt, name, + (const gchar **) attrs, ctxt->user_data); + if (must_close) { + ctxt->parser->end_element (ctxt, name, ctxt->user_data); + } + g_strfreev (attrs); + g_free (name); +} + +static void +html_context_parse (HtmlContext * ctxt, gchar * text, gsize text_len) +{ + const gchar *next = NULL; + ctxt->buf = g_string_append_len (ctxt->buf, text, text_len); + next = ctxt->buf->str; + while (TRUE) { + if (next[0] == '<') { + gchar *element = NULL; + /* find */ + if (!strchr (next, '>')) { + /* no tag end point. buffer will be process in next time */ + return; + } + + next = string_token (next, ">", &element); + next++; + if (g_str_has_suffix (next, "/")) { + /* handle */ + element[strlen (element) - 1] = '\0'; + html_context_handle_element (ctxt, element + 1, TRUE); + } else if (element[1] == '/') { + /* handle */ + ctxt->parser->end_element (ctxt, element + 2, ctxt->user_data); + } else { + /* handle */ + html_context_handle_element (ctxt, element + 1, FALSE); + } + g_free (element); + } else if (strchr (next, '<')) { + gchar *text = NULL; + gsize length; + next = string_token (next, "<", &text); + text = g_strstrip (text); + length = strlen (text); + ctxt->parser->text (ctxt, text, length, ctxt->user_data); + g_free (text); + + } else { + gchar *text = (gchar *) next; + gsize length; + text = g_strstrip (text); + length = strlen (text); + ctxt->parser->text (ctxt, text, length, ctxt->user_data); + ctxt->buf = g_string_assign (ctxt->buf, ""); + return; + } + } + + ctxt->buf = g_string_assign (ctxt->buf, next); +} + static gchar * has_tag (GString * str, const gchar tag) { @@ -116,26 +638,27 @@ sami_context_pop_state (GstSamiContext * sctx, char state) } static void -handle_start_sync (GstSamiContext * sctx, const xmlChar ** atts) +handle_start_sync (GstSamiContext * sctx, const gchar ** atts) { int i; sami_context_pop_state (sctx, CLEAR_TAG); if (atts != NULL) { for (i = 0; (atts[i] != NULL); i += 2) { - const xmlChar *key, *value; + const gchar *key, *value; key = atts[i]; value = atts[i + 1]; if (!value) continue; - if (!xmlStrncmp ((const xmlChar *) "start", key, 5)) { + if (!g_ascii_strcasecmp ("start", key)) { /* Only set a new start time if we don't have text pending */ if (sctx->resultbuf->len == 0) sctx->time1 = sctx->time2; sctx->time2 = atoi ((const char *) value) * GST_MSECOND; + sctx->time2 = MAX (sctx->time2, sctx->time1); g_string_append (sctx->resultbuf, sctx->buf->str); sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE; g_string_truncate (sctx->buf, 0); @@ -145,7 +668,7 @@ handle_start_sync (GstSamiContext * sctx, const xmlChar ** atts) } static void -handle_start_font (GstSamiContext * sctx, const xmlChar ** atts) +handle_start_font (GstSamiContext * sctx, const gchar ** atts) { int i; @@ -153,53 +676,53 @@ handle_start_font (GstSamiContext * sctx, const xmlChar ** atts) if (atts != NULL) { g_string_append (sctx->buf, "= 0 && - ((xmlChar *) r == (value + 6) && len == 6)) { + ((gchar *) r == (value + 6) && len == 6)) { sharp = "#"; } } /* some colours can be found in many sami files, but X RGB database * doesn't contain a colour by this name, so map explicitly */ - if (!xmlStrncasecmp (value, (const xmlChar *) "aqua", len)) { - value = (const xmlChar *) "#00ffff"; - } else if (!xmlStrncasecmp (value, (const xmlChar *) "crimson", len)) { - value = (const xmlChar *) "#dc143c"; - } else if (!xmlStrncasecmp (value, (const xmlChar *) "fuchsia", len)) { - value = (const xmlChar *) "#ff00ff"; - } else if (!xmlStrncasecmp (value, (const xmlChar *) "indigo", len)) { - value = (const xmlChar *) "#4b0082"; - } else if (!xmlStrncasecmp (value, (const xmlChar *) "lime", len)) { - value = (const xmlChar *) "#00ff00"; - } else if (!xmlStrncasecmp (value, (const xmlChar *) "olive", len)) { - value = (const xmlChar *) "#808000"; - } else if (!xmlStrncasecmp (value, (const xmlChar *) "silver", len)) { - value = (const xmlChar *) "#c0c0c0"; - } else if (!xmlStrncasecmp (value, (const xmlChar *) "teal", len)) { - value = (const xmlChar *) "#008080"; + if (!g_ascii_strcasecmp ("aqua", value)) { + value = "#00ffff"; + } else if (!g_ascii_strcasecmp ("crimson", value)) { + value = "#dc143c"; + } else if (!g_ascii_strcasecmp ("fuchsia", value)) { + value = "#ff00ff"; + } else if (!g_ascii_strcasecmp ("indigo", value)) { + value = "#4b0082"; + } else if (!g_ascii_strcasecmp ("lime", value)) { + value = "#00ff00"; + } else if (!g_ascii_strcasecmp ("olive", value)) { + value = "#808000"; + } else if (!g_ascii_strcasecmp ("silver", value)) { + value = "#c0c0c0"; + } else if (!g_ascii_strcasecmp ("teal", value)) { + value = "#008080"; } g_string_append_printf (sctx->buf, " foreground=\"%s%s\"", sharp, value); - } else if (!xmlStrncasecmp ((const xmlChar *) "face", key, 4)) { + } else if (!g_ascii_strcasecmp ("face", key)) { g_string_append_printf (sctx->buf, " font_family=\"%s\"", value); } } @@ -209,46 +732,47 @@ handle_start_font (GstSamiContext * sctx, const xmlChar ** atts) } static void -start_sami_element (void *ctx, const xmlChar * name, const xmlChar ** atts) +handle_start_element (HtmlContext * ctx, const gchar * name, + const char **atts, gpointer user_data) { - GstSamiContext *sctx = (GstSamiContext *) ctx; + GstSamiContext *sctx = (GstSamiContext *) user_data; GST_LOG ("name:%s", name); - if (!xmlStrncmp ((const xmlChar *) "sync", name, 4)) { + if (!g_ascii_strcasecmp ("sync", name)) { handle_start_sync (sctx, atts); sctx->in_sync = TRUE; - } else if (!xmlStrncmp ((const xmlChar *) "font", name, 4)) { + } else if (!g_ascii_strcasecmp ("font", name)) { handle_start_font (sctx, atts); - } else if (!xmlStrncmp ((const xmlChar *) "ruby", name, 4)) { + } else if (!g_ascii_strcasecmp ("ruby", name)) { sami_context_push_state (sctx, RUBY_TAG); - } else if (!xmlStrncmp ((const xmlChar *) "br", name, 2)) { + } else if (!g_ascii_strcasecmp ("br", name)) { g_string_append_c (sctx->buf, '\n'); /* FIXME: support for furigana/ruby once implemented in pango */ - } else if (!xmlStrncmp ((const xmlChar *) "rt", name, 2)) { + } else if (!g_ascii_strcasecmp ("rt", name)) { if (has_tag (sctx->state, ITALIC_TAG)) { g_string_append (sctx->rubybuf, ""); } g_string_append (sctx->rubybuf, ""); sami_context_push_state (sctx, RT_TAG); - } else if (!xmlStrncmp ((const xmlChar *) "p", name, 1)) { - } else if (!xmlStrncmp ((const xmlChar *) "i", name, 1)) { + } else if (!g_ascii_strcasecmp ("i", name)) { g_string_append (sctx->buf, ""); sami_context_push_state (sctx, ITALIC_TAG); + } else if (!g_ascii_strcasecmp ("p", name)) { } } static void -end_sami_element (void *ctx, const xmlChar * name) +handle_end_element (HtmlContext * ctx, const char *name, gpointer user_data) { - GstSamiContext *sctx = (GstSamiContext *) ctx; + GstSamiContext *sctx = (GstSamiContext *) user_data; GST_LOG ("name:%s", name); - if (!xmlStrncmp ((const xmlChar *) "sync", name, 4)) { + if (!g_ascii_strcasecmp ("sync", name)) { sctx->in_sync = FALSE; - } else if ((!xmlStrncmp ((const xmlChar *) "body", name, 4)) || - (!xmlStrncmp ((const xmlChar *) "sami", name, 4))) { + } else if ((!g_ascii_strcasecmp ("body", name)) || + (!g_ascii_strcasecmp ("sami", name))) { /* We will usually have one buffer left when the body is closed * as we need the next sync to actually send it */ if (sctx->buf->len != 0) { @@ -261,90 +785,40 @@ end_sami_element (void *ctx, const xmlChar * name) sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE; g_string_truncate (sctx->buf, 0); } - } else if (!xmlStrncmp ((const xmlChar *) "font", name, 4)) { + } else if (!g_ascii_strcasecmp ("font", name)) { sami_context_pop_state (sctx, SPAN_TAG); - } else if (!xmlStrncmp ((const xmlChar *) "ruby", name, 4)) { + } else if (!g_ascii_strcasecmp ("ruby", name)) { sami_context_pop_state (sctx, RUBY_TAG); - } else if (!xmlStrncmp ((const xmlChar *) "i", name, 1)) { + } else if (!g_ascii_strcasecmp ("i", name)) { sami_context_pop_state (sctx, ITALIC_TAG); } } static void -characters_sami (void *ctx, const xmlChar * ch, int len) +handle_text (HtmlContext * ctx, const gchar * text, gsize text_len, + gpointer user_data) { - GstSamiContext *sctx = (GstSamiContext *) ctx; - gchar *escaped; - gchar *tmp; - gint i; + GstSamiContext *sctx = (GstSamiContext *) user_data; /* Skip everything except content of the sync elements */ if (!sctx->in_sync) return; - escaped = g_markup_escape_text ((const gchar *) ch, len); - g_strstrip (escaped); - - /* Remove double spaces forom the string as those are - * usually added by newlines and indention */ - tmp = escaped; - for (i = 0; i <= strlen (escaped); i++) { - escaped[i] = *tmp; - if (*tmp != ' ') { - tmp++; - continue; - } - while (*tmp == ' ') - tmp++; - } - if (has_tag (sctx->state, RT_TAG)) { g_string_append_c (sctx->rubybuf, ' '); - g_string_append (sctx->rubybuf, escaped); + g_string_append (sctx->rubybuf, text); g_string_append_c (sctx->rubybuf, ' '); } else { - g_string_append (sctx->buf, escaped); + g_string_append (sctx->buf, text); } - g_free (escaped); } -static xmlSAXHandler samiSAXHandlerStruct = { - NULL, /* internalSubset */ - NULL, /* isStandalone */ - NULL, /* hasInternalSubset */ - NULL, /* hasExternalSubset */ - NULL, /* resolveEntity */ - NULL, /* getEntity */ - NULL, /* entityDecl */ - NULL, /* notationDecl */ - NULL, /* attributeDecl */ - NULL, /* elementDecl */ - NULL, /* unparsedEntityDecl */ - NULL, /* setDocumentLocator */ - NULL, /* startDocument */ - NULL, /* endDocument */ - start_sami_element, /* startElement */ - end_sami_element, /* endElement */ - NULL, /* reference */ - characters_sami, /* characters */ - NULL, /* ignorableWhitespace */ - NULL, /* processingInstruction */ - NULL, /* comment */ - NULL, /* xmlParserWarning */ - NULL, /* xmlParserError */ - NULL, /* xmlParserError */ - NULL, /* getParameterEntity */ - NULL, /* cdataBlock */ - NULL, /* externalSubset */ - 1, /* initialized */ - NULL, /* private */ - NULL, /* startElementNsSAX2Func */ - NULL, /* endElementNsSAX2Func */ - NULL /* xmlStructuredErrorFunc */ +static HtmlParser samiParser = { + handle_start_element, /* start_element */ + handle_end_element, /* end_element */ + handle_text, /* text */ }; -static xmlSAXHandlerPtr samiSAXHandler = &samiSAXHandlerStruct; - void sami_context_init (ParserState * state) { @@ -354,8 +828,7 @@ sami_context_init (ParserState * state) state->user_data = (gpointer) g_new0 (GstSamiContext, 1); context = (GstSamiContext *) state->user_data; - context->htmlctxt = htmlCreatePushParserCtxt (samiSAXHandler, context, - "", 0, NULL, XML_CHAR_ENCODING_UTF8); + context->htmlctxt = html_context_new (&samiParser, context); context->buf = g_string_new (""); context->rubybuf = g_string_new (""); context->resultbuf = g_string_new (""); @@ -368,17 +841,8 @@ sami_context_deinit (ParserState * state) GstSamiContext *context = (GstSamiContext *) state->user_data; if (context) { - htmlParserCtxtPtr htmlctxt = context->htmlctxt; - - /* destroy sax context */ - htmlDocPtr doc; - - htmlParseChunk (htmlctxt, "", 0, 1); - doc = htmlctxt->myDoc; - htmlFreeParserCtxt (htmlctxt); + html_context_free (context->htmlctxt); context->htmlctxt = NULL; - if (doc) - xmlFreeDoc (doc); g_string_free (context->buf, TRUE); g_string_free (context->rubybuf, TRUE); g_string_free (context->resultbuf, TRUE); @@ -405,70 +869,29 @@ sami_context_reset (ParserState * state) } } -static gchar * -fix_invalid_entities (const gchar * line) -{ - const gchar *cp, *pp; /* current pointer, previous pointer */ - gssize size; - GString *ret = g_string_new (NULL); - - pp = line; - cp = strchr (line, '&'); - while (cp) { - size = cp - pp; - ret = g_string_append_len (ret, pp, size); - cp++; - if (g_ascii_strncasecmp (cp, "nbsp;", 5) - && (!g_ascii_strncasecmp (cp, "nbsp", 4))) { - /* translate " " to " " */ - ret = g_string_append_len (ret, " ", 6); - cp += 4; - } else if (g_ascii_strncasecmp (cp, "quot;", 5) - && g_ascii_strncasecmp (cp, "amp;", 4) - && g_ascii_strncasecmp (cp, "apos;", 5) - && g_ascii_strncasecmp (cp, "lt;", 3) - && g_ascii_strncasecmp (cp, "gt;", 3) - && g_ascii_strncasecmp (cp, "nbsp;", 5) - && cp[0] != '#') { - /* translate "&" to "&" */ - ret = g_string_append_len (ret, "&", 5); - } else { - /* do not translate */ - ret = g_string_append_c (ret, '&'); - } - - pp = cp; - cp = strchr (pp, '&'); - } - ret = g_string_append (ret, pp); - return g_string_free (ret, FALSE); -} - gchar * parse_sami (ParserState * state, const gchar * line) { - gchar *fixed_line; + gchar *ret = NULL; GstSamiContext *context = (GstSamiContext *) state->user_data; - fixed_line = fix_invalid_entities (line); - htmlParseChunk (context->htmlctxt, fixed_line, strlen (fixed_line), 0); - g_free (fixed_line); + gchar *unescaped = unescape_string (line); + html_context_parse (context->htmlctxt, (gchar *) unescaped, + strlen (unescaped)); + g_free (unescaped); if (context->has_result) { - gchar *r; - if (context->rubybuf->len) { context->rubybuf = g_string_append_c (context->rubybuf, '\n'); g_string_prepend (context->resultbuf, context->rubybuf->str); context->rubybuf = g_string_truncate (context->rubybuf, 0); } - r = g_string_free (context->resultbuf, FALSE); + ret = g_string_free (context->resultbuf, FALSE); context->resultbuf = g_string_new (""); state->start_time = context->time1; state->duration = context->time2 - context->time1; context->has_result = FALSE; - return r; } - return NULL; + return ret; }