subparse: remove libxml dependency for sami parser and re-enable sami parser

To celebrate 2013.gnome.asia, updated sami parser for gstreamer 1.x. :D

Remove the conditional blocks that check for libxml usage and
implement a simple HTML markup parser for the SAMI
parser.

https://bugzilla.gnome.org/show_bug.cgi?id=693056
This commit is contained in:
Young-Ho Cha 2013-05-25 17:10:14 +09:00 committed by Tim-Philipp Müller
parent b0eb99baaa
commit f597efe24b
3 changed files with 587 additions and 192 deletions

View file

@ -1,10 +1,6 @@
plugin_LTLIBRARIES = libgstsubparse.la
if USE_XML
SAMIPARSE_SOURCES = samiparse.c samiparse.h
else
SAMIPARSE_SOURCES =
endif
libgstsubparse_la_SOURCES = \
gstssaparse.c \

View file

@ -55,7 +55,6 @@ gst_sub_parse_get_property (GObject * object, guint prop_id,
GValue * value, GParamSpec * pspec);
#ifndef GST_DISABLE_XML
static GstStaticPadTemplate sink_templ = GST_STATIC_PAD_TEMPLATE ("sink",
GST_PAD_SINK,
GST_PAD_ALWAYS,
@ -63,15 +62,6 @@ static GstStaticPadTemplate sink_templ = GST_STATIC_PAD_TEMPLATE ("sink",
"application/x-subtitle-tmplayer; application/x-subtitle-mpl2; "
"application/x-subtitle-dks; application/x-subtitle-qttext")
);
#else
static GstStaticPadTemplate sink_templ = GST_STATIC_PAD_TEMPLATE ("sink",
GST_PAD_SINK,
GST_PAD_ALWAYS,
GST_STATIC_CAPS ("application/x-subtitle; application/x-subtitle-dks; "
"application/x-subtitle-tmplayer; application/x-subtitle-mpl2; "
"application/x-subtitle-qttext")
);
#endif
static GstStaticPadTemplate src_templ = GST_STATIC_PAD_TEMPLATE ("src",
GST_PAD_SRC,
@ -107,11 +97,9 @@ gst_sub_parse_dispose (GObject * object)
case GST_SUB_PARSE_FORMAT_QTTEXT:
qttext_context_deinit (&subparse->state);
break;
#ifndef GST_DISABLE_XML
case GST_SUB_PARSE_FORMAT_SAMI:
sami_context_deinit (&subparse->state);
break;
#endif
default:
break;
}
@ -1173,11 +1161,9 @@ parser_state_dispose (GstSubParse * self, ParserState * state)
}
if (state->user_data) {
switch (self->parser_type) {
#ifndef GST_DISABLE_XML
case GST_SUB_PARSE_FORMAT_SAMI:
sami_context_reset (state);
break;
#endif
default:
break;
}
@ -1283,13 +1269,11 @@ gst_sub_parse_data_format_autodetect (gchar * match_str)
GST_LOG ("MPSub (time based) format detected");
return GST_SUB_PARSE_FORMAT_MPSUB;
}
#ifndef GST_DISABLE_XML
if (strstr (match_str, "<SAMI>") != NULL ||
strstr (match_str, "<sami>") != NULL) {
GST_LOG ("SAMI (time based) format detected");
return GST_SUB_PARSE_FORMAT_SAMI;
}
#endif
/* we're boldly assuming the first subtitle appears within the first hour */
if (sscanf (match_str, "0:%02u:%02u:", &n1, &n2) == 2 ||
sscanf (match_str, "0:%02u:%02u=", &n1, &n2) == 2 ||
@ -1348,13 +1332,11 @@ gst_sub_parse_format_autodetect (GstSubParse * self)
self->parse_line = parse_mpsub;
return gst_caps_new_simple ("text/x-raw",
"format", G_TYPE_STRING, "utf8", NULL);
#ifndef GST_DISABLE_XML
case GST_SUB_PARSE_FORMAT_SAMI:
self->parse_line = parse_sami;
sami_context_init (&self->state);
return gst_caps_new_simple ("text/x-raw",
"format", G_TYPE_STRING, "pango-markup", NULL);
#endif
case GST_SUB_PARSE_FORMAT_TMPLAYER:
self->parse_line = parse_tmplayer;
self->state.max_duration = 5 * GST_SECOND;
@ -1409,10 +1391,8 @@ feed_textbuf (GstSubParse * self, GstBuffer * buf)
parser_state_init (&self->state);
g_string_truncate (self->textbuf, 0);
gst_adapter_clear (self->adapter);
#ifndef GST_DISABLE_XML
if (self->parser_type == GST_SUB_PARSE_FORMAT_SAMI)
sami_context_reset (&self->state);
#endif
/* we could set a flag to make sure that the next buffer we push out also
* has the DISCONT flag set, but there's no point really given that it's
* subtitles which are discontinuous by nature. */
@ -1690,10 +1670,8 @@ GST_STATIC_CAPS ("application/x-subtitle-tmplayer");
static GstStaticCaps sub_caps = GST_STATIC_CAPS ("application/x-subtitle");
#define MPL2_CAPS (gst_static_caps_get (&mpl2_caps))
#ifndef GST_DISABLE_XML
static GstStaticCaps smi_caps = GST_STATIC_CAPS ("application/x-subtitle-sami");
#define SAMI_CAPS (gst_static_caps_get (&smi_caps))
#endif
static GstStaticCaps dks_caps = GST_STATIC_CAPS ("application/x-subtitle-dks");
#define DKS_CAPS (gst_static_caps_get (&dks_caps))
@ -1774,12 +1752,10 @@ gst_subparse_type_find (GstTypeFind * tf, gpointer private)
GST_DEBUG ("MPSub format detected");
caps = SUB_CAPS;
break;
#ifndef GST_DISABLE_XML
case GST_SUB_PARSE_FORMAT_SAMI:
GST_DEBUG ("SAMI (time-based) format detected");
caps = SAMI_CAPS;
break;
#endif
case GST_SUB_PARSE_FORMAT_TMPLAYER:
GST_DEBUG ("TMPlayer (time based) format detected");
caps = TMP_CAPS;

View file

@ -1,5 +1,5 @@
/* GStreamer SAMI subtitle parser
* Copyright (c) 2006 Young-Ho Cha <ganadist at chollian net>
* Copyright (c) 2006, 2013 Young-Ho Cha <ganadist at gmail com>
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
@ -19,8 +19,9 @@
#include "samiparse.h"
#include <libxml/HTMLparser.h>
#include <glib.h>
#include <string.h>
#include <stdlib.h>
#define ITALIC_TAG 'i'
#define SPAN_TAG 's'
@ -28,6 +29,8 @@
#define RT_TAG 't'
#define CLEAR_TAG '0'
typedef struct _HtmlParser HtmlParser;
typedef struct _HtmlContext HtmlContext;
typedef struct _GstSamiContext GstSamiContext;
struct _GstSamiContext
@ -43,7 +46,7 @@ struct _GstSamiContext
* that tags can be closed properly on
* 'sync' tags. See _context_push_state()
* and _context_pop_state(). */
htmlParserCtxtPtr htmlctxt; /* html parser context */
HtmlContext *htmlctxt; /* html parser context */
gboolean has_result; /* set when ready to push out result */
gboolean in_sync; /* flag to avoid appending anything except the
* content of the sync elements to buf */
@ -51,6 +54,525 @@ struct _GstSamiContext
guint64 time2; /* current start attribute in sync tag */
};
/* Table of callbacks invoked by the built-in HTML-like markup parser.
 * The user_data argument of every callback is the pointer that was stored
 * in the HtmlContext by html_context_new(). */
struct _HtmlParser
{
/* Called for an opening tag. attr is a NULL-terminated array holding
 * alternating attribute names and values. */
void (*start_element) (HtmlContext * ctx,
const gchar * name, const gchar ** attr, gpointer user_data);
/* Called for a closing tag, and directly after start_element for a tag
 * that the parser treats as self-closing. */
void (*end_element) (HtmlContext * ctx,
const gchar * name, gpointer user_data);
/* Called for character data found outside of tags; text_len is the length
 * of text in bytes. */
void (*text) (HtmlContext * ctx,
const gchar * text, gsize text_len, gpointer user_data);
};
/* State of one markup parser instance, created by html_context_new() and
 * released by html_context_free(). */
struct _HtmlContext
{
/* callbacks to invoke while parsing */
const HtmlParser *parser;
/* opaque pointer handed back to every callback */
gpointer user_data;
/* input accumulated by html_context_parse() */
GString *buf;
};
/* Allocates a parser context bound to the given callback table.
 *
 * parser:    callbacks to invoke while parsing (not copied)
 * user_data: opaque pointer passed back to every callback
 *
 * Returns a zero-initialised context with an empty input buffer; release it
 * with html_context_free(). */
static HtmlContext *
html_context_new (HtmlParser * parser, gpointer user_data)
{
  HtmlContext *context;

  context = g_new0 (HtmlContext, 1);
  context->buf = g_string_new (NULL);
  context->user_data = user_data;
  context->parser = parser;

  return context;
}
/* Releases a context created by html_context_new(), including any input
 * that is still buffered. Passing NULL is a no-op, matching g_free(). */
static void
html_context_free (HtmlContext * ctxt)
{
  if (ctxt == NULL)
    return;

  g_string_free (ctxt->buf, TRUE);
  g_free (ctxt);
}
/* One row of an entity lookup table. Tables of these are terminated by an
 * entry whose escaped member is NULL. */
struct EntityMap
{
/* Unicode code point the entity stands for */
const gunichar unescaped;
/* entity name as written after the '&', including the trailing ';' */
const gchar *escaped;
};
/* The five predefined XML entities. unescape_string() leaves these escaped
 * because they are also valid in the pango markup that is produced.
 * The table is private to this file and never modified, hence static const. */
static const struct EntityMap XmlEntities[] = {
  {34, "quot;"},
  {38, "amp;"},
  {39, "apos;"},
  {60, "lt;"},
  {62, "gt;"},
  {0, NULL},
};
/* Named HTML entities that unescape_string() replaces by the corresponding
 * Unicode character. Names are matched case-sensitively and include the
 * trailing ';'. The table is private to this file and never modified, hence
 * static const. */
static const struct EntityMap HtmlEntities[] = {
  /* nbsp (160) is intentionally absent: unescape_string() handles it
   * separately so that it is also recognised without the trailing ';' */
  {161, "iexcl;"},
  {162, "cent;"},
  {163, "pound;"},
  {164, "curren;"},
  {165, "yen;"},
  {166, "brvbar;"},
  {167, "sect;"},
  {168, "uml;"},
  {169, "copy;"},
  {170, "ordf;"},
  {171, "laquo;"},
  {172, "not;"},
  {173, "shy;"},
  {174, "reg;"},
  {175, "macr;"},
  {176, "deg;"},
  {177, "plusmn;"},
  {178, "sup2;"},
  {179, "sup3;"},
  {180, "acute;"},
  {181, "micro;"},
  {182, "para;"},
  {183, "middot;"},
  {184, "cedil;"},
  {185, "sup1;"},
  {186, "ordm;"},
  {187, "raquo;"},
  {188, "frac14;"},
  {189, "frac12;"},
  {190, "frac34;"},
  {191, "iquest;"},
  {192, "Agrave;"},
  {193, "Aacute;"},
  {194, "Acirc;"},
  {195, "Atilde;"},
  {196, "Auml;"},
  {197, "Aring;"},
  {198, "AElig;"},
  {199, "Ccedil;"},
  {200, "Egrave;"},
  {201, "Eacute;"},
  {202, "Ecirc;"},
  {203, "Euml;"},
  {204, "Igrave;"},
  {205, "Iacute;"},
  {206, "Icirc;"},
  {207, "Iuml;"},
  {208, "ETH;"},
  {209, "Ntilde;"},
  {210, "Ograve;"},
  {211, "Oacute;"},
  {212, "Ocirc;"},
  {213, "Otilde;"},
  {214, "Ouml;"},
  {215, "times;"},
  {216, "Oslash;"},
  {217, "Ugrave;"},
  {218, "Uacute;"},
  {219, "Ucirc;"},
  {220, "Uuml;"},
  {221, "Yacute;"},
  {222, "THORN;"},
  {223, "szlig;"},
  {224, "agrave;"},
  {225, "aacute;"},
  {226, "acirc;"},
  {227, "atilde;"},
  {228, "auml;"},
  {229, "aring;"},
  {230, "aelig;"},
  {231, "ccedil;"},
  {232, "egrave;"},
  {233, "eacute;"},
  {234, "ecirc;"},
  {235, "euml;"},
  {236, "igrave;"},
  {237, "iacute;"},
  {238, "icirc;"},
  {239, "iuml;"},
  {240, "eth;"},
  {241, "ntilde;"},
  {242, "ograve;"},
  {243, "oacute;"},
  {244, "ocirc;"},
  {245, "otilde;"},
  {246, "ouml;"},
  {247, "divide;"},
  {248, "oslash;"},
  {249, "ugrave;"},
  {250, "uacute;"},
  {251, "ucirc;"},
  {252, "uuml;"},
  {253, "yacute;"},
  {254, "thorn;"},
  {255, "yuml;"},
  {338, "OElig;"},
  {339, "oelig;"},
  {352, "Scaron;"},
  {353, "scaron;"},
  {376, "Yuml;"},
  {402, "fnof;"},
  {710, "circ;"},
  {732, "tilde;"},
  {913, "Alpha;"},
  {914, "Beta;"},
  {915, "Gamma;"},
  {916, "Delta;"},
  {917, "Epsilon;"},
  {918, "Zeta;"},
  {919, "Eta;"},
  {920, "Theta;"},
  {921, "Iota;"},
  {922, "Kappa;"},
  {923, "Lambda;"},
  {924, "Mu;"},
  {925, "Nu;"},
  {926, "Xi;"},
  {927, "Omicron;"},
  {928, "Pi;"},
  {929, "Rho;"},
  {931, "Sigma;"},
  {932, "Tau;"},
  {933, "Upsilon;"},
  {934, "Phi;"},
  {935, "Chi;"},
  {936, "Psi;"},
  {937, "Omega;"},
  {945, "alpha;"},
  {946, "beta;"},
  {947, "gamma;"},
  {948, "delta;"},
  {949, "epsilon;"},
  {950, "zeta;"},
  {951, "eta;"},
  {952, "theta;"},
  {953, "iota;"},
  {954, "kappa;"},
  {955, "lambda;"},
  {956, "mu;"},
  {957, "nu;"},
  {958, "xi;"},
  {959, "omicron;"},
  {960, "pi;"},
  {961, "rho;"},
  {962, "sigmaf;"},
  {963, "sigma;"},
  {964, "tau;"},
  {965, "upsilon;"},
  {966, "phi;"},
  {967, "chi;"},
  {968, "psi;"},
  {969, "omega;"},
  {977, "thetasym;"},
  {978, "upsih;"},
  {982, "piv;"},
  {8194, "ensp;"},
  {8195, "emsp;"},
  {8201, "thinsp;"},
  {8204, "zwnj;"},
  {8205, "zwj;"},
  {8206, "lrm;"},
  {8207, "rlm;"},
  {8211, "ndash;"},
  {8212, "mdash;"},
  {8216, "lsquo;"},
  {8217, "rsquo;"},
  {8218, "sbquo;"},
  {8220, "ldquo;"},
  {8221, "rdquo;"},
  {8222, "bdquo;"},
  {8224, "dagger;"},
  {8225, "Dagger;"},
  {8226, "bull;"},
  {8230, "hellip;"},
  {8240, "permil;"},
  {8242, "prime;"},
  {8243, "Prime;"},
  {8249, "lsaquo;"},
  {8250, "rsaquo;"},
  {8254, "oline;"},
  {8260, "frasl;"},
  {8364, "euro;"},
  {8465, "image;"},
  {8472, "weierp;"},
  {8476, "real;"},
  {8482, "trade;"},
  {8501, "alefsym;"},
  {8592, "larr;"},
  {8593, "uarr;"},
  {8594, "rarr;"},
  {8595, "darr;"},
  {8596, "harr;"},
  {8629, "crarr;"},
  {8656, "lArr;"},
  {8657, "uArr;"},
  {8658, "rArr;"},
  {8659, "dArr;"},
  {8660, "hArr;"},
  {8704, "forall;"},
  {8706, "part;"},
  {8707, "exist;"},
  {8709, "empty;"},
  {8711, "nabla;"},
  {8712, "isin;"},
  {8713, "notin;"},
  {8715, "ni;"},
  {8719, "prod;"},
  {8721, "sum;"},
  {8722, "minus;"},
  {8727, "lowast;"},
  {8730, "radic;"},
  {8733, "prop;"},
  {8734, "infin;"},
  {8736, "ang;"},
  {8743, "and;"},
  {8744, "or;"},
  {8745, "cap;"},
  {8746, "cup;"},
  {8747, "int;"},
  {8756, "there4;"},
  {8764, "sim;"},
  {8773, "cong;"},
  {8776, "asymp;"},
  {8800, "ne;"},
  {8801, "equiv;"},
  {8804, "le;"},
  {8805, "ge;"},
  {8834, "sub;"},
  {8835, "sup;"},
  {8836, "nsub;"},
  {8838, "sube;"},
  {8839, "supe;"},
  {8853, "oplus;"},
  {8855, "otimes;"},
  {8869, "perp;"},
  {8901, "sdot;"},
  {8968, "lceil;"},
  {8969, "rceil;"},
  {8970, "lfloor;"},
  {8971, "rfloor;"},
  {9001, "lang;"},
  {9002, "rang;"},
  {9674, "loz;"},
  {9824, "spades;"},
  {9827, "clubs;"},
  {9829, "hearts;"},
  {9830, "diams;"},
  {0, NULL},
};
static gchar *
unescape_string (const gchar * text)
{
gint i;
GString *unescaped = g_string_new (NULL);
while (*text) {
if (*text == '&') {
text++;
/* unescape &nbsp and &nbsp; */
if (!g_ascii_strncasecmp (text, "nbsp", 4)) {
unescaped = g_string_append_unichar (unescaped, 160);
text += 4;
if (*text == ';') {
text++;
}
goto next;
}
/* pass xml entities. these will be processed as pango markup */
for (i = 0; XmlEntities[i].escaped; i++) {
gssize len = strlen (XmlEntities[i].escaped);
if (!g_ascii_strncasecmp (text, XmlEntities[i].escaped, len)) {
unescaped = g_string_append_c (unescaped, '&');
unescaped =
g_string_append_len (unescaped, XmlEntities[i].escaped, len);
text += len;
goto next;
}
}
/* convert html entities */
for (i = 0; HtmlEntities[i].escaped; i++) {
gssize len = strlen (HtmlEntities[i].escaped);
if (!strncmp (text, HtmlEntities[i].escaped, len)) {
unescaped =
g_string_append_unichar (unescaped, HtmlEntities[i].unescaped);
text += len;
goto next;
}
}
if (*text == '#') {
gboolean is_hex = FALSE;
gunichar l;
gchar *end = NULL;
text++;
if (*text == 'x') {
is_hex = TRUE;
text++;
}
errno = 0;
if (is_hex) {
l = strtoul (text, &end, 16);
} else {
l = strtoul (text, &end, 10);
}
if (text == end || errno != 0) {
/* error occured. pass it */
goto next;
}
unescaped = g_string_append_unichar (unescaped, l);
text = end;
if (*text == ';') {
text++;
}
goto next;
}
/* escape & */
unescaped = g_string_append (unescaped, "&amp;");
next:
continue;
} else if (g_ascii_isspace (*text)) {
unescaped = g_string_append_c (unescaped, ' ');
/* strip whitespace */
do {
text++;
} while ((*text) && g_ascii_isspace (*text));
} else {
unescaped = g_string_append_c (unescaped, *text);
text++;
}
}
return g_string_free (unescaped, FALSE);
}
/* Splits off the part of string that precedes the first occurrence of
 * delimiter.
 *
 * string:    NUL-terminated input
 * delimiter: NUL-terminated separator to search for
 * first:     receives a newly allocated copy of the text before delimiter,
 *            or of the whole string if delimiter does not occur; free it
 *            with g_free()
 *
 * Returns a pointer to the delimiter inside string, or NULL if it was not
 * found.
 *
 * The copy is made with the GLib allocator because every caller releases it
 * with g_free() or g_strfreev(). */
static const gchar *
string_token (const gchar * string, const gchar * delimiter, gchar ** first)
{
  const gchar *next = strstr (string, delimiter);

  if (next) {
    *first = g_strndup (string, next - string);
  } else {
    *first = g_strdup (string);
  }

  return next;
}
/* Returns the first position at or after p that is not ASCII whitespace. */
static const gchar *
html_skip_whitespace (const gchar * p)
{
  while (*p && g_ascii_isspace (*p))
    p++;
  return p;
}

/* Parses the inside of an opening tag and reports it to the callbacks.
 *
 * ctxt:       parser context providing the callbacks and user data
 * string:     tag contents without the surrounding '<' and '>' (and without
 *             the trailing '/' of a self-closing tag), e.g.
 *             "font color=\"red\" face='Times New Roman'"
 * must_close: if TRUE, end_element is invoked right after start_element
 *
 * The element name is everything up to the first whitespace. Attributes are
 * collected as name/value pairs into a NULL-terminated array. Values may be
 * enclosed in single or double quotes (and may then contain whitespace and
 * '='), or be unquoted and end at the next whitespace. Attributes without
 * a value are ignored. */
static void
html_context_handle_element (HtmlContext * ctxt,
    const gchar * string, gboolean must_close)
{
  GPtrArray *collected = g_ptr_array_new ();
  const gchar *p = string;
  gchar *name;
  gchar **attrs;

  /* split element name and attributes */
  while (*p && !g_ascii_isspace (*p))
    p++;
  name = g_strndup (string, p - string);

  for (;;) {
    const gchar *key_start, *key_end;
    const gchar *value_start, *value_end;

    p = html_skip_whitespace (p);
    if (*p == '\0')
      break;

    key_start = p;
    while (*p && *p != '=' && !g_ascii_isspace (*p))
      p++;
    key_end = p;

    p = html_skip_whitespace (p);
    if (*p != '=') {
      /* attribute without value; at least one character was consumed above
       * so the loop always makes progress */
      continue;
    }
    p = html_skip_whitespace (p + 1);

    if (*p == '"' || *p == '\'') {
      /* quoted value: runs up to the matching quote (or end of tag) */
      const gchar quote = *p++;

      value_start = p;
      while (*p && *p != quote)
        p++;
      value_end = p;
      if (*p)
        p++;
    } else {
      /* unquoted value: runs up to the next whitespace */
      value_start = p;
      while (*p && !g_ascii_isspace (*p))
        p++;
      value_end = p;
      /* be lenient about a stray closing quote */
      if (value_end > value_start &&
          (value_end[-1] == '"' || value_end[-1] == '\''))
        value_end--;
    }

    if (key_end > key_start) {
      g_ptr_array_add (collected, g_strndup (key_start, key_end - key_start));
      g_ptr_array_add (collected,
          g_strndup (value_start, value_end - value_start));
    }
  }

  /* NULL-terminate and take ownership of the plain pointer array */
  g_ptr_array_add (collected, NULL);
  attrs = (gchar **) g_ptr_array_free (collected, FALSE);

  ctxt->parser->start_element (ctxt, name,
      (const gchar **) attrs, ctxt->user_data);
  if (must_close) {
    ctxt->parser->end_element (ctxt, name, ctxt->user_data);
  }

  g_strfreev (attrs);
  g_free (name);
}
/* Feeds a chunk of markup to the parser and dispatches the callbacks for
 * everything that can be recognised so far.
 *
 * ctxt:     parser context
 * text:     chunk of input to append to the pending buffer
 * text_len: length of text in bytes
 *
 * Complete tags and the text between them are reported immediately. If the
 * input ends in the middle of a tag (a '<' without a matching '>'), only
 * that unfinished tag is kept in the buffer and completed by a later call;
 * everything already reported is discarded so it is never reported twice.
 * Text that is not followed by a tag is reported right away. */
static void
html_context_parse (HtmlContext * ctxt, gchar * text, gsize text_len)
{
  const gchar *next = NULL;

  ctxt->buf = g_string_append_len (ctxt->buf, text, text_len);
  next = ctxt->buf->str;

  while (TRUE) {
    if (next[0] == '<') {
      gchar *element = NULL;
      gsize element_len;

      /* find <blahblah> */
      if (!strchr (next, '>')) {
        /* no tag end yet; the unfinished tag is processed on the next call */
        break;
      }

      next = string_token (next, ">", &element);
      next++;
      element_len = strlen (element);

      if (element[1] == '/') {
        /* handle </blah> */
        ctxt->parser->end_element (ctxt, element + 2, ctxt->user_data);
      } else if (element_len > 1 && element[element_len - 1] == '/') {
        /* handle <blah/>: drop the trailing '/' of the tag itself */
        element[element_len - 1] = '\0';
        html_context_handle_element (ctxt, element + 1, TRUE);
      } else {
        /* handle <blah> */
        html_context_handle_element (ctxt, element + 1, FALSE);
      }
      g_free (element);
    } else if (strchr (next, '<')) {
      gchar *chunk = NULL;
      gsize length;

      /* text up to the next tag */
      next = string_token (next, "<", &chunk);
      chunk = g_strstrip (chunk);
      length = strlen (chunk);
      ctxt->parser->text (ctxt, chunk, length, ctxt->user_data);
      g_free (chunk);
    } else {
      /* trailing text without any tag after it: report it in place and
       * start over with an empty buffer */
      gchar *chunk = (gchar *) next;
      gsize length;

      chunk = g_strstrip (chunk);
      length = strlen (chunk);
      ctxt->parser->text (ctxt, chunk, length, ctxt->user_data);
      g_string_truncate (ctxt->buf, 0);
      return;
    }
  }

  /* keep only the unfinished tag; g_string_erase() copes with the source
   * and destination overlapping inside the same buffer */
  g_string_erase (ctxt->buf, 0, next - ctxt->buf->str);
}
static gchar *
has_tag (GString * str, const gchar tag)
{
@ -116,26 +638,27 @@ sami_context_pop_state (GstSamiContext * sctx, char state)
}
static void
handle_start_sync (GstSamiContext * sctx, const xmlChar ** atts)
handle_start_sync (GstSamiContext * sctx, const gchar ** atts)
{
int i;
sami_context_pop_state (sctx, CLEAR_TAG);
if (atts != NULL) {
for (i = 0; (atts[i] != NULL); i += 2) {
const xmlChar *key, *value;
const gchar *key, *value;
key = atts[i];
value = atts[i + 1];
if (!value)
continue;
if (!xmlStrncmp ((const xmlChar *) "start", key, 5)) {
if (!g_ascii_strcasecmp ("start", key)) {
/* Only set a new start time if we don't have text pending */
if (sctx->resultbuf->len == 0)
sctx->time1 = sctx->time2;
sctx->time2 = atoi ((const char *) value) * GST_MSECOND;
sctx->time2 = MAX (sctx->time2, sctx->time1);
g_string_append (sctx->resultbuf, sctx->buf->str);
sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
g_string_truncate (sctx->buf, 0);
@ -145,7 +668,7 @@ handle_start_sync (GstSamiContext * sctx, const xmlChar ** atts)
}
static void
handle_start_font (GstSamiContext * sctx, const xmlChar ** atts)
handle_start_font (GstSamiContext * sctx, const gchar ** atts)
{
int i;
@ -153,53 +676,53 @@ handle_start_font (GstSamiContext * sctx, const xmlChar ** atts)
if (atts != NULL) {
g_string_append (sctx->buf, "<span");
for (i = 0; (atts[i] != NULL); i += 2) {
const xmlChar *key, *value;
const gchar *key, *value;
key = atts[i];
value = atts[i + 1];
if (!value)
continue;
if (!xmlStrncmp ((const xmlChar *) "color", key, 5)) {
if (!g_ascii_strcasecmp ("color", key)) {
/*
* Many SAMI files contain invalid color
* values.
* Fix up hex color values that are missing the leading '#'.
*/
const gchar *sharp = "";
int len = xmlStrlen (value);
int len = strlen (value);
if (!(*value == '#' && len == 7)) {
gchar *r;
/* check if it looks like hex */
if (strtol ((const char *) value, &r, 16) >= 0 &&
((xmlChar *) r == (value + 6) && len == 6)) {
((gchar *) r == (value + 6) && len == 6)) {
sharp = "#";
}
}
/* some colours can be found in many sami files, but X RGB database
* doesn't contain a colour by this name, so map explicitly */
if (!xmlStrncasecmp (value, (const xmlChar *) "aqua", len)) {
value = (const xmlChar *) "#00ffff";
} else if (!xmlStrncasecmp (value, (const xmlChar *) "crimson", len)) {
value = (const xmlChar *) "#dc143c";
} else if (!xmlStrncasecmp (value, (const xmlChar *) "fuchsia", len)) {
value = (const xmlChar *) "#ff00ff";
} else if (!xmlStrncasecmp (value, (const xmlChar *) "indigo", len)) {
value = (const xmlChar *) "#4b0082";
} else if (!xmlStrncasecmp (value, (const xmlChar *) "lime", len)) {
value = (const xmlChar *) "#00ff00";
} else if (!xmlStrncasecmp (value, (const xmlChar *) "olive", len)) {
value = (const xmlChar *) "#808000";
} else if (!xmlStrncasecmp (value, (const xmlChar *) "silver", len)) {
value = (const xmlChar *) "#c0c0c0";
} else if (!xmlStrncasecmp (value, (const xmlChar *) "teal", len)) {
value = (const xmlChar *) "#008080";
if (!g_ascii_strcasecmp ("aqua", value)) {
value = "#00ffff";
} else if (!g_ascii_strcasecmp ("crimson", value)) {
value = "#dc143c";
} else if (!g_ascii_strcasecmp ("fuchsia", value)) {
value = "#ff00ff";
} else if (!g_ascii_strcasecmp ("indigo", value)) {
value = "#4b0082";
} else if (!g_ascii_strcasecmp ("lime", value)) {
value = "#00ff00";
} else if (!g_ascii_strcasecmp ("olive", value)) {
value = "#808000";
} else if (!g_ascii_strcasecmp ("silver", value)) {
value = "#c0c0c0";
} else if (!g_ascii_strcasecmp ("teal", value)) {
value = "#008080";
}
g_string_append_printf (sctx->buf, " foreground=\"%s%s\"", sharp,
value);
} else if (!xmlStrncasecmp ((const xmlChar *) "face", key, 4)) {
} else if (!g_ascii_strcasecmp ("face", key)) {
g_string_append_printf (sctx->buf, " font_family=\"%s\"", value);
}
}
@ -209,46 +732,47 @@ handle_start_font (GstSamiContext * sctx, const xmlChar ** atts)
}
static void
start_sami_element (void *ctx, const xmlChar * name, const xmlChar ** atts)
handle_start_element (HtmlContext * ctx, const gchar * name,
const char **atts, gpointer user_data)
{
GstSamiContext *sctx = (GstSamiContext *) ctx;
GstSamiContext *sctx = (GstSamiContext *) user_data;
GST_LOG ("name:%s", name);
if (!xmlStrncmp ((const xmlChar *) "sync", name, 4)) {
if (!g_ascii_strcasecmp ("sync", name)) {
handle_start_sync (sctx, atts);
sctx->in_sync = TRUE;
} else if (!xmlStrncmp ((const xmlChar *) "font", name, 4)) {
} else if (!g_ascii_strcasecmp ("font", name)) {
handle_start_font (sctx, atts);
} else if (!xmlStrncmp ((const xmlChar *) "ruby", name, 4)) {
} else if (!g_ascii_strcasecmp ("ruby", name)) {
sami_context_push_state (sctx, RUBY_TAG);
} else if (!xmlStrncmp ((const xmlChar *) "br", name, 2)) {
} else if (!g_ascii_strcasecmp ("br", name)) {
g_string_append_c (sctx->buf, '\n');
/* FIXME: support for furigana/ruby once implemented in pango */
} else if (!xmlStrncmp ((const xmlChar *) "rt", name, 2)) {
} else if (!g_ascii_strcasecmp ("rt", name)) {
if (has_tag (sctx->state, ITALIC_TAG)) {
g_string_append (sctx->rubybuf, "<i>");
}
g_string_append (sctx->rubybuf, "<span size='xx-small' rise='-100'>");
sami_context_push_state (sctx, RT_TAG);
} else if (!xmlStrncmp ((const xmlChar *) "p", name, 1)) {
} else if (!xmlStrncmp ((const xmlChar *) "i", name, 1)) {
} else if (!g_ascii_strcasecmp ("i", name)) {
g_string_append (sctx->buf, "<i>");
sami_context_push_state (sctx, ITALIC_TAG);
} else if (!g_ascii_strcasecmp ("p", name)) {
}
}
static void
end_sami_element (void *ctx, const xmlChar * name)
handle_end_element (HtmlContext * ctx, const char *name, gpointer user_data)
{
GstSamiContext *sctx = (GstSamiContext *) ctx;
GstSamiContext *sctx = (GstSamiContext *) user_data;
GST_LOG ("name:%s", name);
if (!xmlStrncmp ((const xmlChar *) "sync", name, 4)) {
if (!g_ascii_strcasecmp ("sync", name)) {
sctx->in_sync = FALSE;
} else if ((!xmlStrncmp ((const xmlChar *) "body", name, 4)) ||
(!xmlStrncmp ((const xmlChar *) "sami", name, 4))) {
} else if ((!g_ascii_strcasecmp ("body", name)) ||
(!g_ascii_strcasecmp ("sami", name))) {
/* We will usually have one buffer left when the body is closed
* as we need the next sync to actually send it */
if (sctx->buf->len != 0) {
@ -261,90 +785,40 @@ end_sami_element (void *ctx, const xmlChar * name)
sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
g_string_truncate (sctx->buf, 0);
}
} else if (!xmlStrncmp ((const xmlChar *) "font", name, 4)) {
} else if (!g_ascii_strcasecmp ("font", name)) {
sami_context_pop_state (sctx, SPAN_TAG);
} else if (!xmlStrncmp ((const xmlChar *) "ruby", name, 4)) {
} else if (!g_ascii_strcasecmp ("ruby", name)) {
sami_context_pop_state (sctx, RUBY_TAG);
} else if (!xmlStrncmp ((const xmlChar *) "i", name, 1)) {
} else if (!g_ascii_strcasecmp ("i", name)) {
sami_context_pop_state (sctx, ITALIC_TAG);
}
}
static void
characters_sami (void *ctx, const xmlChar * ch, int len)
handle_text (HtmlContext * ctx, const gchar * text, gsize text_len,
gpointer user_data)
{
GstSamiContext *sctx = (GstSamiContext *) ctx;
gchar *escaped;
gchar *tmp;
gint i;
GstSamiContext *sctx = (GstSamiContext *) user_data;
/* Skip everything except content of the sync elements */
if (!sctx->in_sync)
return;
escaped = g_markup_escape_text ((const gchar *) ch, len);
g_strstrip (escaped);
/* Remove double spaces from the string as those are
* usually added by newlines and indentation */
tmp = escaped;
for (i = 0; i <= strlen (escaped); i++) {
escaped[i] = *tmp;
if (*tmp != ' ') {
tmp++;
continue;
}
while (*tmp == ' ')
tmp++;
}
if (has_tag (sctx->state, RT_TAG)) {
g_string_append_c (sctx->rubybuf, ' ');
g_string_append (sctx->rubybuf, escaped);
g_string_append (sctx->rubybuf, text);
g_string_append_c (sctx->rubybuf, ' ');
} else {
g_string_append (sctx->buf, escaped);
g_string_append (sctx->buf, text);
}
g_free (escaped);
}
static xmlSAXHandler samiSAXHandlerStruct = {
NULL, /* internalSubset */
NULL, /* isStandalone */
NULL, /* hasInternalSubset */
NULL, /* hasExternalSubset */
NULL, /* resolveEntity */
NULL, /* getEntity */
NULL, /* entityDecl */
NULL, /* notationDecl */
NULL, /* attributeDecl */
NULL, /* elementDecl */
NULL, /* unparsedEntityDecl */
NULL, /* setDocumentLocator */
NULL, /* startDocument */
NULL, /* endDocument */
start_sami_element, /* startElement */
end_sami_element, /* endElement */
NULL, /* reference */
characters_sami, /* characters */
NULL, /* ignorableWhitespace */
NULL, /* processingInstruction */
NULL, /* comment */
NULL, /* xmlParserWarning */
NULL, /* xmlParserError */
NULL, /* xmlParserError */
NULL, /* getParameterEntity */
NULL, /* cdataBlock */
NULL, /* externalSubset */
1, /* initialized */
NULL, /* private */
NULL, /* startElementNsSAX2Func */
NULL, /* endElementNsSAX2Func */
NULL /* xmlStructuredErrorFunc */
static HtmlParser samiParser = {
handle_start_element, /* start_element */
handle_end_element, /* end_element */
handle_text, /* text */
};
static xmlSAXHandlerPtr samiSAXHandler = &samiSAXHandlerStruct;
void
sami_context_init (ParserState * state)
{
@ -354,8 +828,7 @@ sami_context_init (ParserState * state)
state->user_data = (gpointer) g_new0 (GstSamiContext, 1);
context = (GstSamiContext *) state->user_data;
context->htmlctxt = htmlCreatePushParserCtxt (samiSAXHandler, context,
"", 0, NULL, XML_CHAR_ENCODING_UTF8);
context->htmlctxt = html_context_new (&samiParser, context);
context->buf = g_string_new ("");
context->rubybuf = g_string_new ("");
context->resultbuf = g_string_new ("");
@ -368,17 +841,8 @@ sami_context_deinit (ParserState * state)
GstSamiContext *context = (GstSamiContext *) state->user_data;
if (context) {
htmlParserCtxtPtr htmlctxt = context->htmlctxt;
/* destroy sax context */
htmlDocPtr doc;
htmlParseChunk (htmlctxt, "", 0, 1);
doc = htmlctxt->myDoc;
htmlFreeParserCtxt (htmlctxt);
html_context_free (context->htmlctxt);
context->htmlctxt = NULL;
if (doc)
xmlFreeDoc (doc);
g_string_free (context->buf, TRUE);
g_string_free (context->rubybuf, TRUE);
g_string_free (context->resultbuf, TRUE);
@ -405,70 +869,29 @@ sami_context_reset (ParserState * state)
}
}
static gchar *
fix_invalid_entities (const gchar * line)
{
const gchar *cp, *pp; /* current pointer, previous pointer */
gssize size;
GString *ret = g_string_new (NULL);
pp = line;
cp = strchr (line, '&');
while (cp) {
size = cp - pp;
ret = g_string_append_len (ret, pp, size);
cp++;
if (g_ascii_strncasecmp (cp, "nbsp;", 5)
&& (!g_ascii_strncasecmp (cp, "nbsp", 4))) {
/* translate "&nbsp" to "&nbsp;" */
ret = g_string_append_len (ret, "&nbsp;", 6);
cp += 4;
} else if (g_ascii_strncasecmp (cp, "quot;", 5)
&& g_ascii_strncasecmp (cp, "amp;", 4)
&& g_ascii_strncasecmp (cp, "apos;", 5)
&& g_ascii_strncasecmp (cp, "lt;", 3)
&& g_ascii_strncasecmp (cp, "gt;", 3)
&& g_ascii_strncasecmp (cp, "nbsp;", 5)
&& cp[0] != '#') {
/* translate "&" to "&amp;" */
ret = g_string_append_len (ret, "&amp;", 5);
} else {
/* do not translate */
ret = g_string_append_c (ret, '&');
}
pp = cp;
cp = strchr (pp, '&');
}
ret = g_string_append (ret, pp);
return g_string_free (ret, FALSE);
}
gchar *
parse_sami (ParserState * state, const gchar * line)
{
gchar *fixed_line;
gchar *ret = NULL;
GstSamiContext *context = (GstSamiContext *) state->user_data;
fixed_line = fix_invalid_entities (line);
htmlParseChunk (context->htmlctxt, fixed_line, strlen (fixed_line), 0);
g_free (fixed_line);
gchar *unescaped = unescape_string (line);
html_context_parse (context->htmlctxt, (gchar *) unescaped,
strlen (unescaped));
g_free (unescaped);
if (context->has_result) {
gchar *r;
if (context->rubybuf->len) {
context->rubybuf = g_string_append_c (context->rubybuf, '\n');
g_string_prepend (context->resultbuf, context->rubybuf->str);
context->rubybuf = g_string_truncate (context->rubybuf, 0);
}
r = g_string_free (context->resultbuf, FALSE);
ret = g_string_free (context->resultbuf, FALSE);
context->resultbuf = g_string_new ("");
state->start_time = context->time1;
state->duration = context->time2 - context->time1;
context->has_result = FALSE;
return r;
}
return NULL;
return ret;
}