/* GStreamer SAMI subtitle parser
 * Copyright (c) 2006, 2013 Young-Ho Cha <ganadist at gmail com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 */
|
|
|
|
|
|
|
|
#include "samiparse.h"

#include <glib.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
|
2006-05-15 09:14:35 +00:00
|
|
|
|
|
|
|
/* One-character flags recorded in GstSamiContext::state, one per tag that
 * is currently open. */
#define ITALIC_TAG 'i'          /* pushed for <i> */
#define SPAN_TAG   's'          /* pushed for <font>, emitted as <span> */
#define RUBY_TAG   'r'          /* pushed for <ruby> */
#define RT_TAG     't'          /* pushed for <rt> */
#define CLEAR_TAG  '0'          /* never pushed; popping it closes every open tag */

typedef struct _HtmlParser HtmlParser;
typedef struct _HtmlContext HtmlContext;
typedef struct _GstSamiContext GstSamiContext;
|
|
|
|
|
|
|
|
struct _GstSamiContext
|
|
|
|
{
|
|
|
|
GString *buf; /* buffer to collect content */
|
|
|
|
GString *rubybuf; /* buffer to collect ruby content */
|
|
|
|
GString *resultbuf; /* when opening the next 'sync' tag, move
|
|
|
|
* from 'buf' to avoid to append following
|
|
|
|
* content */
|
|
|
|
GString *state; /* in many sami files there are tags that
|
|
|
|
* are not closed, so for each open tag the
|
|
|
|
* parser will append a tag flag here so
|
|
|
|
* that tags can be closed properly on
|
|
|
|
* 'sync' tags. See _context_push_state()
|
|
|
|
* and _context_pop_state(). */
|
2013-05-25 08:10:14 +00:00
|
|
|
HtmlContext *htmlctxt; /* html parser context */
|
2006-05-15 09:14:35 +00:00
|
|
|
gboolean has_result; /* set when ready to push out result */
|
2008-05-05 11:14:48 +00:00
|
|
|
gboolean in_sync; /* flag to avoid appending anything except the
|
|
|
|
* content of the sync elements to buf */
|
2006-05-15 09:14:35 +00:00
|
|
|
guint64 time1; /* previous start attribute in sync tag */
|
|
|
|
guint64 time2; /* current start attribute in sync tag */
|
|
|
|
};
|
|
|
|
|
2013-05-25 08:10:14 +00:00
|
|
|
struct _HtmlParser
|
|
|
|
{
|
|
|
|
void (*start_element) (HtmlContext * ctx,
|
|
|
|
const gchar * name, const gchar ** attr, gpointer user_data);
|
|
|
|
void (*end_element) (HtmlContext * ctx,
|
|
|
|
const gchar * name, gpointer user_data);
|
|
|
|
void (*text) (HtmlContext * ctx,
|
|
|
|
const gchar * text, gsize text_len, gpointer user_data);
|
|
|
|
};
|
|
|
|
|
|
|
|
struct _HtmlContext
|
|
|
|
{
|
|
|
|
const HtmlParser *parser;
|
|
|
|
gpointer user_data;
|
|
|
|
GString *buf;
|
|
|
|
};
|
|
|
|
|
|
|
|
static HtmlContext *
|
|
|
|
html_context_new (HtmlParser * parser, gpointer user_data)
|
|
|
|
{
|
|
|
|
HtmlContext *ctxt = (HtmlContext *) g_new0 (HtmlContext, 1);
|
|
|
|
ctxt->parser = parser;
|
|
|
|
ctxt->user_data = user_data;
|
|
|
|
ctxt->buf = g_string_new (NULL);
|
|
|
|
return ctxt;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
html_context_free (HtmlContext * ctxt)
|
|
|
|
{
|
|
|
|
g_string_free (ctxt->buf, TRUE);
|
|
|
|
g_free (ctxt);
|
|
|
|
}
|
|
|
|
|
|
|
|
struct EntityMap
|
|
|
|
{
|
|
|
|
const gunichar unescaped;
|
|
|
|
const gchar *escaped;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct EntityMap XmlEntities[] = {
|
|
|
|
{34, "quot;"},
|
|
|
|
{38, "amp;"},
|
|
|
|
{39, "apos;"},
|
|
|
|
{60, "lt;"},
|
|
|
|
{62, "gt;"},
|
|
|
|
{0, NULL},
|
|
|
|
};
|
|
|
|
|
|
|
|
struct EntityMap HtmlEntities[] = {
|
|
|
|
/* nbsp will handle manually
|
|
|
|
{ 160, "nbsp;" }, */
|
|
|
|
{161, "iexcl;"},
|
|
|
|
{162, "cent;"},
|
|
|
|
{163, "pound;"},
|
|
|
|
{164, "curren;"},
|
|
|
|
{165, "yen;"},
|
|
|
|
{166, "brvbar;"},
|
|
|
|
{167, "sect;"},
|
|
|
|
{168, "uml;"},
|
|
|
|
{169, "copy;"},
|
|
|
|
{170, "ordf;"},
|
|
|
|
{171, "laquo;"},
|
|
|
|
{172, "not;"},
|
|
|
|
{173, "shy;"},
|
|
|
|
{174, "reg;"},
|
|
|
|
{175, "macr;"},
|
|
|
|
{176, "deg;"},
|
|
|
|
{177, "plusmn;"},
|
|
|
|
{178, "sup2;"},
|
|
|
|
{179, "sup3;"},
|
|
|
|
{180, "acute;"},
|
|
|
|
{181, "micro;"},
|
|
|
|
{182, "para;"},
|
|
|
|
{183, "middot;"},
|
|
|
|
{184, "cedil;"},
|
|
|
|
{185, "sup1;"},
|
|
|
|
{186, "ordm;"},
|
|
|
|
{187, "raquo;"},
|
|
|
|
{188, "frac14;"},
|
|
|
|
{189, "frac12;"},
|
|
|
|
{190, "frac34;"},
|
|
|
|
{191, "iquest;"},
|
|
|
|
{192, "Agrave;"},
|
|
|
|
{193, "Aacute;"},
|
|
|
|
{194, "Acirc;"},
|
|
|
|
{195, "Atilde;"},
|
|
|
|
{196, "Auml;"},
|
|
|
|
{197, "Aring;"},
|
|
|
|
{198, "AElig;"},
|
|
|
|
{199, "Ccedil;"},
|
|
|
|
{200, "Egrave;"},
|
|
|
|
{201, "Eacute;"},
|
|
|
|
{202, "Ecirc;"},
|
|
|
|
{203, "Euml;"},
|
|
|
|
{204, "Igrave;"},
|
|
|
|
{205, "Iacute;"},
|
|
|
|
{206, "Icirc;"},
|
|
|
|
{207, "Iuml;"},
|
|
|
|
{208, "ETH;"},
|
|
|
|
{209, "Ntilde;"},
|
|
|
|
{210, "Ograve;"},
|
|
|
|
{211, "Oacute;"},
|
|
|
|
{212, "Ocirc;"},
|
|
|
|
{213, "Otilde;"},
|
|
|
|
{214, "Ouml;"},
|
|
|
|
{215, "times;"},
|
|
|
|
{216, "Oslash;"},
|
|
|
|
{217, "Ugrave;"},
|
|
|
|
{218, "Uacute;"},
|
|
|
|
{219, "Ucirc;"},
|
|
|
|
{220, "Uuml;"},
|
|
|
|
{221, "Yacute;"},
|
|
|
|
{222, "THORN;"},
|
|
|
|
{223, "szlig;"},
|
|
|
|
{224, "agrave;"},
|
|
|
|
{225, "aacute;"},
|
|
|
|
{226, "acirc;"},
|
|
|
|
{227, "atilde;"},
|
|
|
|
{228, "auml;"},
|
|
|
|
{229, "aring;"},
|
|
|
|
{230, "aelig;"},
|
|
|
|
{231, "ccedil;"},
|
|
|
|
{232, "egrave;"},
|
|
|
|
{233, "eacute;"},
|
|
|
|
{234, "ecirc;"},
|
|
|
|
{235, "euml;"},
|
|
|
|
{236, "igrave;"},
|
|
|
|
{237, "iacute;"},
|
|
|
|
{238, "icirc;"},
|
|
|
|
{239, "iuml;"},
|
|
|
|
{240, "eth;"},
|
|
|
|
{241, "ntilde;"},
|
|
|
|
{242, "ograve;"},
|
|
|
|
{243, "oacute;"},
|
|
|
|
{244, "ocirc;"},
|
|
|
|
{245, "otilde;"},
|
|
|
|
{246, "ouml;"},
|
|
|
|
{247, "divide;"},
|
|
|
|
{248, "oslash;"},
|
|
|
|
{249, "ugrave;"},
|
|
|
|
{250, "uacute;"},
|
|
|
|
{251, "ucirc;"},
|
|
|
|
{252, "uuml;"},
|
|
|
|
{253, "yacute;"},
|
|
|
|
{254, "thorn;"},
|
|
|
|
{255, "yuml;"},
|
|
|
|
{338, "OElig;"},
|
|
|
|
{339, "oelig;"},
|
|
|
|
{352, "Scaron;"},
|
|
|
|
{353, "scaron;"},
|
|
|
|
{376, "Yuml;"},
|
|
|
|
{402, "fnof;"},
|
|
|
|
{710, "circ;"},
|
|
|
|
{732, "tilde;"},
|
|
|
|
{913, "Alpha;"},
|
|
|
|
{914, "Beta;"},
|
|
|
|
{915, "Gamma;"},
|
|
|
|
{916, "Delta;"},
|
|
|
|
{917, "Epsilon;"},
|
|
|
|
{918, "Zeta;"},
|
|
|
|
{919, "Eta;"},
|
|
|
|
{920, "Theta;"},
|
|
|
|
{921, "Iota;"},
|
|
|
|
{922, "Kappa;"},
|
|
|
|
{923, "Lambda;"},
|
|
|
|
{924, "Mu;"},
|
|
|
|
{925, "Nu;"},
|
|
|
|
{926, "Xi;"},
|
|
|
|
{927, "Omicron;"},
|
|
|
|
{928, "Pi;"},
|
|
|
|
{929, "Rho;"},
|
|
|
|
{931, "Sigma;"},
|
|
|
|
{932, "Tau;"},
|
|
|
|
{933, "Upsilon;"},
|
|
|
|
{934, "Phi;"},
|
|
|
|
{935, "Chi;"},
|
|
|
|
{936, "Psi;"},
|
|
|
|
{937, "Omega;"},
|
|
|
|
{945, "alpha;"},
|
|
|
|
{946, "beta;"},
|
|
|
|
{947, "gamma;"},
|
|
|
|
{948, "delta;"},
|
|
|
|
{949, "epsilon;"},
|
|
|
|
{950, "zeta;"},
|
|
|
|
{951, "eta;"},
|
|
|
|
{952, "theta;"},
|
|
|
|
{953, "iota;"},
|
|
|
|
{954, "kappa;"},
|
|
|
|
{955, "lambda;"},
|
|
|
|
{956, "mu;"},
|
|
|
|
{957, "nu;"},
|
|
|
|
{958, "xi;"},
|
|
|
|
{959, "omicron;"},
|
|
|
|
{960, "pi;"},
|
|
|
|
{961, "rho;"},
|
|
|
|
{962, "sigmaf;"},
|
|
|
|
{963, "sigma;"},
|
|
|
|
{964, "tau;"},
|
|
|
|
{965, "upsilon;"},
|
|
|
|
{966, "phi;"},
|
|
|
|
{967, "chi;"},
|
|
|
|
{968, "psi;"},
|
|
|
|
{969, "omega;"},
|
|
|
|
{977, "thetasym;"},
|
|
|
|
{978, "upsih;"},
|
|
|
|
{982, "piv;"},
|
|
|
|
{8194, "ensp;"},
|
|
|
|
{8195, "emsp;"},
|
|
|
|
{8201, "thinsp;"},
|
|
|
|
{8204, "zwnj;"},
|
|
|
|
{8205, "zwj;"},
|
|
|
|
{8206, "lrm;"},
|
|
|
|
{8207, "rlm;"},
|
|
|
|
{8211, "ndash;"},
|
|
|
|
{8212, "mdash;"},
|
|
|
|
{8216, "lsquo;"},
|
|
|
|
{8217, "rsquo;"},
|
|
|
|
{8218, "sbquo;"},
|
|
|
|
{8220, "ldquo;"},
|
|
|
|
{8221, "rdquo;"},
|
|
|
|
{8222, "bdquo;"},
|
|
|
|
{8224, "dagger;"},
|
|
|
|
{8225, "Dagger;"},
|
|
|
|
{8226, "bull;"},
|
|
|
|
{8230, "hellip;"},
|
|
|
|
{8240, "permil;"},
|
|
|
|
{8242, "prime;"},
|
|
|
|
{8243, "Prime;"},
|
|
|
|
{8249, "lsaquo;"},
|
|
|
|
{8250, "rsaquo;"},
|
|
|
|
{8254, "oline;"},
|
|
|
|
{8260, "frasl;"},
|
|
|
|
{8364, "euro;"},
|
|
|
|
{8465, "image;"},
|
|
|
|
{8472, "weierp;"},
|
|
|
|
{8476, "real;"},
|
|
|
|
{8482, "trade;"},
|
|
|
|
{8501, "alefsym;"},
|
|
|
|
{8592, "larr;"},
|
|
|
|
{8593, "uarr;"},
|
|
|
|
{8594, "rarr;"},
|
|
|
|
{8595, "darr;"},
|
|
|
|
{8596, "harr;"},
|
|
|
|
{8629, "crarr;"},
|
|
|
|
{8656, "lArr;"},
|
|
|
|
{8657, "uArr;"},
|
|
|
|
{8658, "rArr;"},
|
|
|
|
{8659, "dArr;"},
|
|
|
|
{8660, "hArr;"},
|
|
|
|
{8704, "forall;"},
|
|
|
|
{8706, "part;"},
|
|
|
|
{8707, "exist;"},
|
|
|
|
{8709, "empty;"},
|
|
|
|
{8711, "nabla;"},
|
|
|
|
{8712, "isin;"},
|
|
|
|
{8713, "notin;"},
|
|
|
|
{8715, "ni;"},
|
|
|
|
{8719, "prod;"},
|
|
|
|
{8721, "sum;"},
|
|
|
|
{8722, "minus;"},
|
|
|
|
{8727, "lowast;"},
|
|
|
|
{8730, "radic;"},
|
|
|
|
{8733, "prop;"},
|
|
|
|
{8734, "infin;"},
|
|
|
|
{8736, "ang;"},
|
|
|
|
{8743, "and;"},
|
|
|
|
{8744, "or;"},
|
|
|
|
{8745, "cap;"},
|
|
|
|
{8746, "cup;"},
|
|
|
|
{8747, "int;"},
|
|
|
|
{8756, "there4;"},
|
|
|
|
{8764, "sim;"},
|
|
|
|
{8773, "cong;"},
|
|
|
|
{8776, "asymp;"},
|
|
|
|
{8800, "ne;"},
|
|
|
|
{8801, "equiv;"},
|
|
|
|
{8804, "le;"},
|
|
|
|
{8805, "ge;"},
|
|
|
|
{8834, "sub;"},
|
|
|
|
{8835, "sup;"},
|
|
|
|
{8836, "nsub;"},
|
|
|
|
{8838, "sube;"},
|
|
|
|
{8839, "supe;"},
|
|
|
|
{8853, "oplus;"},
|
|
|
|
{8855, "otimes;"},
|
|
|
|
{8869, "perp;"},
|
|
|
|
{8901, "sdot;"},
|
|
|
|
{8968, "lceil;"},
|
|
|
|
{8969, "rceil;"},
|
|
|
|
{8970, "lfloor;"},
|
|
|
|
{8971, "rfloor;"},
|
|
|
|
{9001, "lang;"},
|
|
|
|
{9002, "rang;"},
|
|
|
|
{9674, "loz;"},
|
|
|
|
{9824, "spades;"},
|
|
|
|
{9827, "clubs;"},
|
|
|
|
{9829, "hearts;"},
|
|
|
|
{9830, "diams;"},
|
|
|
|
{0, NULL},
|
|
|
|
};
|
|
|
|
|
|
|
|
static gchar *
|
|
|
|
unescape_string (const gchar * text)
|
|
|
|
{
|
|
|
|
gint i;
|
|
|
|
GString *unescaped = g_string_new (NULL);
|
|
|
|
|
|
|
|
while (*text) {
|
|
|
|
if (*text == '&') {
|
|
|
|
text++;
|
|
|
|
|
|
|
|
/* unescape   and */
|
|
|
|
if (!g_ascii_strncasecmp (text, "nbsp", 4)) {
|
|
|
|
unescaped = g_string_append_unichar (unescaped, 160);
|
|
|
|
text += 4;
|
|
|
|
if (*text == ';') {
|
|
|
|
text++;
|
|
|
|
}
|
|
|
|
goto next;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* pass xml entities. these will be processed as pango markup */
|
|
|
|
for (i = 0; XmlEntities[i].escaped; i++) {
|
|
|
|
gssize len = strlen (XmlEntities[i].escaped);
|
|
|
|
if (!g_ascii_strncasecmp (text, XmlEntities[i].escaped, len)) {
|
|
|
|
unescaped = g_string_append_c (unescaped, '&');
|
|
|
|
unescaped =
|
|
|
|
g_string_append_len (unescaped, XmlEntities[i].escaped, len);
|
|
|
|
text += len;
|
|
|
|
goto next;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* convert html entities */
|
|
|
|
for (i = 0; HtmlEntities[i].escaped; i++) {
|
|
|
|
gssize len = strlen (HtmlEntities[i].escaped);
|
|
|
|
if (!strncmp (text, HtmlEntities[i].escaped, len)) {
|
|
|
|
unescaped =
|
|
|
|
g_string_append_unichar (unescaped, HtmlEntities[i].unescaped);
|
|
|
|
text += len;
|
|
|
|
goto next;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (*text == '#') {
|
|
|
|
gboolean is_hex = FALSE;
|
|
|
|
gunichar l;
|
|
|
|
gchar *end = NULL;
|
|
|
|
|
|
|
|
text++;
|
|
|
|
if (*text == 'x') {
|
|
|
|
is_hex = TRUE;
|
|
|
|
text++;
|
|
|
|
}
|
|
|
|
errno = 0;
|
|
|
|
if (is_hex) {
|
|
|
|
l = strtoul (text, &end, 16);
|
|
|
|
} else {
|
|
|
|
l = strtoul (text, &end, 10);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (text == end || errno != 0) {
|
2019-08-29 17:42:39 +00:00
|
|
|
/* error occurred. pass it */
|
2013-05-25 08:10:14 +00:00
|
|
|
goto next;
|
|
|
|
}
|
|
|
|
unescaped = g_string_append_unichar (unescaped, l);
|
|
|
|
text = end;
|
|
|
|
|
|
|
|
if (*text == ';') {
|
|
|
|
text++;
|
|
|
|
}
|
|
|
|
goto next;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* escape & */
|
|
|
|
unescaped = g_string_append (unescaped, "&");
|
|
|
|
|
|
|
|
next:
|
|
|
|
continue;
|
|
|
|
|
|
|
|
} else if (g_ascii_isspace (*text)) {
|
|
|
|
unescaped = g_string_append_c (unescaped, ' ');
|
|
|
|
/* strip whitespace */
|
|
|
|
do {
|
|
|
|
text++;
|
|
|
|
} while ((*text) && g_ascii_isspace (*text));
|
|
|
|
} else {
|
|
|
|
unescaped = g_string_append_c (unescaped, *text);
|
|
|
|
text++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return g_string_free (unescaped, FALSE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static const gchar *
|
|
|
|
string_token (const gchar * string, const gchar * delimiter, gchar ** first)
|
|
|
|
{
|
|
|
|
gchar *next = strstr (string, delimiter);
|
|
|
|
if (next) {
|
2013-07-17 08:04:47 +00:00
|
|
|
*first = g_strndup (string, next - string);
|
2013-05-25 08:10:14 +00:00
|
|
|
} else {
|
2013-07-17 08:04:47 +00:00
|
|
|
*first = g_strdup (string);
|
2013-05-25 08:10:14 +00:00
|
|
|
}
|
|
|
|
return next;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
html_context_handle_element (HtmlContext * ctxt,
|
|
|
|
const gchar * string, gboolean must_close)
|
|
|
|
{
|
|
|
|
gchar *name = NULL;
|
|
|
|
gint count = 0, i;
|
|
|
|
gchar **attrs;
|
|
|
|
const gchar *found, *next;
|
|
|
|
|
|
|
|
/* split element name and attributes */
|
|
|
|
next = string_token (string, " ", &name);
|
|
|
|
|
|
|
|
if (next) {
|
|
|
|
/* count attributes */
|
|
|
|
found = next + 1;
|
|
|
|
while (TRUE) {
|
|
|
|
found = strchr (found, '=');
|
|
|
|
if (!found)
|
|
|
|
break;
|
|
|
|
found++;
|
|
|
|
count++;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
count = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
attrs = g_new0 (gchar *, (count + 1) * 2);
|
|
|
|
|
2013-11-21 13:04:46 +00:00
|
|
|
for (i = 0; i < count && next != NULL; i += 2) {
|
2013-05-25 08:10:14 +00:00
|
|
|
gchar *attr_name = NULL, *attr_value = NULL;
|
|
|
|
gsize length;
|
|
|
|
next = string_token (next + 1, "=", &attr_name);
|
2022-04-28 05:32:27 +00:00
|
|
|
if (!next) {
|
|
|
|
g_free (attr_name);
|
|
|
|
break;
|
|
|
|
}
|
2013-05-25 08:10:14 +00:00
|
|
|
next = string_token (next + 1, " ", &attr_value);
|
|
|
|
|
|
|
|
/* strip " or ' from attribute value */
|
|
|
|
if (attr_value[0] == '"' || attr_value[0] == '\'') {
|
2013-07-17 08:04:47 +00:00
|
|
|
gchar *tmp = g_strdup (attr_value + 1);
|
2013-05-25 08:10:14 +00:00
|
|
|
g_free (attr_value);
|
|
|
|
attr_value = tmp;
|
|
|
|
}
|
|
|
|
|
|
|
|
length = strlen (attr_value);
|
2017-01-20 06:02:38 +00:00
|
|
|
if (length > 0 && (attr_value[length - 1] == '"'
|
|
|
|
|| attr_value[length - 1] == '\'')) {
|
2013-05-25 08:10:14 +00:00
|
|
|
attr_value[length - 1] = '\0';
|
|
|
|
}
|
|
|
|
|
|
|
|
attrs[i] = attr_name;
|
|
|
|
attrs[i + 1] = attr_value;
|
|
|
|
}
|
|
|
|
|
|
|
|
ctxt->parser->start_element (ctxt, name,
|
|
|
|
(const gchar **) attrs, ctxt->user_data);
|
|
|
|
if (must_close) {
|
|
|
|
ctxt->parser->end_element (ctxt, name, ctxt->user_data);
|
|
|
|
}
|
|
|
|
g_strfreev (attrs);
|
|
|
|
g_free (name);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
html_context_parse (HtmlContext * ctxt, gchar * text, gsize text_len)
|
|
|
|
{
|
|
|
|
const gchar *next = NULL;
|
|
|
|
ctxt->buf = g_string_append_len (ctxt->buf, text, text_len);
|
|
|
|
next = ctxt->buf->str;
|
|
|
|
while (TRUE) {
|
|
|
|
if (next[0] == '<') {
|
|
|
|
gchar *element = NULL;
|
|
|
|
/* find <blahblah> */
|
|
|
|
if (!strchr (next, '>')) {
|
|
|
|
/* no tag end point. buffer will be process in next time */
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
next = string_token (next, ">", &element);
|
|
|
|
next++;
|
2022-06-29 23:13:19 +00:00
|
|
|
if (g_str_has_suffix (element, "/")) {
|
2013-05-25 08:10:14 +00:00
|
|
|
/* handle <blah/> */
|
|
|
|
element[strlen (element) - 1] = '\0';
|
|
|
|
html_context_handle_element (ctxt, element + 1, TRUE);
|
|
|
|
} else if (element[1] == '/') {
|
|
|
|
/* handle </blah> */
|
|
|
|
ctxt->parser->end_element (ctxt, element + 2, ctxt->user_data);
|
|
|
|
} else {
|
|
|
|
/* handle <blah> */
|
|
|
|
html_context_handle_element (ctxt, element + 1, FALSE);
|
|
|
|
}
|
|
|
|
g_free (element);
|
|
|
|
} else if (strchr (next, '<')) {
|
|
|
|
gchar *text = NULL;
|
|
|
|
gsize length;
|
|
|
|
next = string_token (next, "<", &text);
|
|
|
|
text = g_strstrip (text);
|
|
|
|
length = strlen (text);
|
|
|
|
ctxt->parser->text (ctxt, text, length, ctxt->user_data);
|
|
|
|
g_free (text);
|
|
|
|
|
|
|
|
} else {
|
|
|
|
gchar *text = (gchar *) next;
|
|
|
|
gsize length;
|
|
|
|
text = g_strstrip (text);
|
|
|
|
length = strlen (text);
|
|
|
|
ctxt->parser->text (ctxt, text, length, ctxt->user_data);
|
|
|
|
ctxt->buf = g_string_assign (ctxt->buf, "");
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ctxt->buf = g_string_assign (ctxt->buf, next);
|
|
|
|
}
|
|
|
|
|
2006-05-15 09:14:35 +00:00
|
|
|
static gchar *
|
|
|
|
has_tag (GString * str, const gchar tag)
|
|
|
|
{
|
|
|
|
return strrchr (str->str, tag);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
sami_context_push_state (GstSamiContext * sctx, char state)
|
|
|
|
{
|
2008-12-11 15:49:12 +00:00
|
|
|
GST_LOG ("state %c", state);
|
2006-05-15 09:14:35 +00:00
|
|
|
g_string_append_c (sctx->state, state);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
|
|
|
sami_context_pop_state (GstSamiContext * sctx, char state)
|
|
|
|
{
|
|
|
|
GString *str = g_string_new ("");
|
|
|
|
GString *context_state = sctx->state;
|
|
|
|
int i;
|
|
|
|
|
2008-12-11 15:49:12 +00:00
|
|
|
GST_LOG ("state %c", state);
|
2006-05-15 09:14:35 +00:00
|
|
|
for (i = context_state->len - 1; i >= 0; i--) {
|
|
|
|
switch (context_state->str[i]) {
|
|
|
|
case ITALIC_TAG: /* <i> */
|
|
|
|
{
|
|
|
|
g_string_append (str, "</i>");
|
|
|
|
break;
|
|
|
|
}
|
2006-06-12 12:44:38 +00:00
|
|
|
case SPAN_TAG: /* <span foreground= > */
|
2006-05-15 09:14:35 +00:00
|
|
|
{
|
|
|
|
g_string_append (str, "</span>");
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case RUBY_TAG: /* <span size= > -- ruby */
|
|
|
|
{
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
case RT_TAG: /* ruby */
|
|
|
|
{
|
|
|
|
/* FIXME: support for furigana/ruby once implemented in pango */
|
|
|
|
g_string_append (sctx->rubybuf, "</span>");
|
|
|
|
if (has_tag (context_state, ITALIC_TAG)) {
|
|
|
|
g_string_append (sctx->rubybuf, "</i>");
|
|
|
|
}
|
|
|
|
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (context_state->str[i] == state) {
|
|
|
|
g_string_append (sctx->buf, str->str);
|
|
|
|
g_string_free (str, TRUE);
|
|
|
|
g_string_truncate (context_state, i);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (state == CLEAR_TAG) {
|
|
|
|
g_string_append (sctx->buf, str->str);
|
|
|
|
g_string_truncate (context_state, 0);
|
|
|
|
}
|
|
|
|
g_string_free (str, TRUE);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2013-05-25 08:10:14 +00:00
|
|
|
handle_start_sync (GstSamiContext * sctx, const gchar ** atts)
|
2006-05-15 09:14:35 +00:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
sami_context_pop_state (sctx, CLEAR_TAG);
|
|
|
|
if (atts != NULL) {
|
|
|
|
for (i = 0; (atts[i] != NULL); i += 2) {
|
2013-05-25 08:10:14 +00:00
|
|
|
const gchar *key, *value;
|
2006-05-15 09:14:35 +00:00
|
|
|
|
|
|
|
key = atts[i];
|
|
|
|
value = atts[i + 1];
|
|
|
|
|
|
|
|
if (!value)
|
|
|
|
continue;
|
2013-05-25 08:10:14 +00:00
|
|
|
if (!g_ascii_strcasecmp ("start", key)) {
|
2008-05-05 12:33:05 +00:00
|
|
|
/* Only set a new start time if we don't have text pending */
|
|
|
|
if (sctx->resultbuf->len == 0)
|
|
|
|
sctx->time1 = sctx->time2;
|
|
|
|
|
2006-05-15 09:14:35 +00:00
|
|
|
sctx->time2 = atoi ((const char *) value) * GST_MSECOND;
|
2013-05-25 08:10:14 +00:00
|
|
|
sctx->time2 = MAX (sctx->time2, sctx->time1);
|
2006-05-15 09:14:35 +00:00
|
|
|
g_string_append (sctx->resultbuf, sctx->buf->str);
|
2008-05-05 12:33:05 +00:00
|
|
|
sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
|
2006-05-15 09:14:35 +00:00
|
|
|
g_string_truncate (sctx->buf, 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2013-05-25 08:10:14 +00:00
|
|
|
handle_start_font (GstSamiContext * sctx, const gchar ** atts)
|
2006-05-15 09:14:35 +00:00
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
2006-06-12 12:44:38 +00:00
|
|
|
sami_context_pop_state (sctx, SPAN_TAG);
|
2006-05-15 09:14:35 +00:00
|
|
|
if (atts != NULL) {
|
2006-06-12 12:44:38 +00:00
|
|
|
g_string_append (sctx->buf, "<span");
|
2006-05-15 09:14:35 +00:00
|
|
|
for (i = 0; (atts[i] != NULL); i += 2) {
|
2013-05-25 08:10:14 +00:00
|
|
|
const gchar *key, *value;
|
2006-05-15 09:14:35 +00:00
|
|
|
|
|
|
|
key = atts[i];
|
|
|
|
value = atts[i + 1];
|
|
|
|
|
|
|
|
if (!value)
|
|
|
|
continue;
|
2013-05-25 08:10:14 +00:00
|
|
|
if (!g_ascii_strcasecmp ("color", key)) {
|
2006-05-15 09:14:35 +00:00
|
|
|
/*
|
|
|
|
* There are invalid color value in many
|
|
|
|
* sami files.
|
|
|
|
* It will fix hex color value that start without '#'
|
|
|
|
*/
|
2010-03-15 21:49:53 +00:00
|
|
|
const gchar *sharp = "";
|
2013-05-25 08:10:14 +00:00
|
|
|
int len = strlen (value);
|
2006-05-15 09:14:35 +00:00
|
|
|
|
|
|
|
if (!(*value == '#' && len == 7)) {
|
|
|
|
gchar *r;
|
|
|
|
|
2006-05-18 10:05:23 +00:00
|
|
|
/* check if it looks like hex */
|
|
|
|
if (strtol ((const char *) value, &r, 16) >= 0 &&
|
2013-05-25 08:10:14 +00:00
|
|
|
((gchar *) r == (value + 6) && len == 6)) {
|
2006-05-15 09:14:35 +00:00
|
|
|
sharp = "#";
|
|
|
|
}
|
|
|
|
}
|
2007-03-21 10:23:11 +00:00
|
|
|
/* some colours can be found in many sami files, but X RGB database
|
2006-05-15 09:41:03 +00:00
|
|
|
* doesn't contain a colour by this name, so map explicitly */
|
2013-05-25 08:10:14 +00:00
|
|
|
if (!g_ascii_strcasecmp ("aqua", value)) {
|
|
|
|
value = "#00ffff";
|
|
|
|
} else if (!g_ascii_strcasecmp ("crimson", value)) {
|
|
|
|
value = "#dc143c";
|
|
|
|
} else if (!g_ascii_strcasecmp ("fuchsia", value)) {
|
|
|
|
value = "#ff00ff";
|
|
|
|
} else if (!g_ascii_strcasecmp ("indigo", value)) {
|
|
|
|
value = "#4b0082";
|
|
|
|
} else if (!g_ascii_strcasecmp ("lime", value)) {
|
|
|
|
value = "#00ff00";
|
|
|
|
} else if (!g_ascii_strcasecmp ("olive", value)) {
|
|
|
|
value = "#808000";
|
|
|
|
} else if (!g_ascii_strcasecmp ("silver", value)) {
|
|
|
|
value = "#c0c0c0";
|
|
|
|
} else if (!g_ascii_strcasecmp ("teal", value)) {
|
|
|
|
value = "#008080";
|
2006-05-15 09:41:03 +00:00
|
|
|
}
|
2006-06-12 12:44:38 +00:00
|
|
|
g_string_append_printf (sctx->buf, " foreground=\"%s%s\"", sharp,
|
2006-05-15 09:14:35 +00:00
|
|
|
value);
|
2013-05-25 08:10:14 +00:00
|
|
|
} else if (!g_ascii_strcasecmp ("face", key)) {
|
2006-06-12 12:44:38 +00:00
|
|
|
g_string_append_printf (sctx->buf, " font_family=\"%s\"", value);
|
2006-05-15 09:14:35 +00:00
|
|
|
}
|
|
|
|
}
|
2006-06-12 12:44:38 +00:00
|
|
|
g_string_append_c (sctx->buf, '>');
|
|
|
|
sami_context_push_state (sctx, SPAN_TAG);
|
2006-05-15 09:14:35 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2013-05-25 08:10:14 +00:00
|
|
|
handle_start_element (HtmlContext * ctx, const gchar * name,
|
|
|
|
const char **atts, gpointer user_data)
|
2006-05-15 09:14:35 +00:00
|
|
|
{
|
2013-05-25 08:10:14 +00:00
|
|
|
GstSamiContext *sctx = (GstSamiContext *) user_data;
|
2006-05-15 09:14:35 +00:00
|
|
|
|
2008-12-11 15:49:12 +00:00
|
|
|
GST_LOG ("name:%s", name);
|
|
|
|
|
2013-05-25 08:10:14 +00:00
|
|
|
if (!g_ascii_strcasecmp ("sync", name)) {
|
2006-05-15 09:14:35 +00:00
|
|
|
handle_start_sync (sctx, atts);
|
2008-05-05 11:14:48 +00:00
|
|
|
sctx->in_sync = TRUE;
|
2013-05-25 08:10:14 +00:00
|
|
|
} else if (!g_ascii_strcasecmp ("font", name)) {
|
2006-05-15 09:14:35 +00:00
|
|
|
handle_start_font (sctx, atts);
|
2013-05-25 08:10:14 +00:00
|
|
|
} else if (!g_ascii_strcasecmp ("ruby", name)) {
|
2006-05-15 09:14:35 +00:00
|
|
|
sami_context_push_state (sctx, RUBY_TAG);
|
2013-05-25 08:10:14 +00:00
|
|
|
} else if (!g_ascii_strcasecmp ("br", name)) {
|
2006-05-15 09:14:35 +00:00
|
|
|
g_string_append_c (sctx->buf, '\n');
|
|
|
|
/* FIXME: support for furigana/ruby once implemented in pango */
|
2013-05-25 08:10:14 +00:00
|
|
|
} else if (!g_ascii_strcasecmp ("rt", name)) {
|
2006-05-15 09:14:35 +00:00
|
|
|
if (has_tag (sctx->state, ITALIC_TAG)) {
|
|
|
|
g_string_append (sctx->rubybuf, "<i>");
|
|
|
|
}
|
|
|
|
g_string_append (sctx->rubybuf, "<span size='xx-small' rise='-100'>");
|
|
|
|
sami_context_push_state (sctx, RT_TAG);
|
2013-05-25 08:10:14 +00:00
|
|
|
} else if (!g_ascii_strcasecmp ("i", name)) {
|
2006-05-15 09:14:35 +00:00
|
|
|
g_string_append (sctx->buf, "<i>");
|
|
|
|
sami_context_push_state (sctx, ITALIC_TAG);
|
2013-05-25 08:10:14 +00:00
|
|
|
} else if (!g_ascii_strcasecmp ("p", name)) {
|
2006-05-15 09:14:35 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2013-05-25 08:10:14 +00:00
|
|
|
handle_end_element (HtmlContext * ctx, const char *name, gpointer user_data)
|
2006-05-15 09:14:35 +00:00
|
|
|
{
|
2013-05-25 08:10:14 +00:00
|
|
|
GstSamiContext *sctx = (GstSamiContext *) user_data;
|
2006-05-15 09:14:35 +00:00
|
|
|
|
2008-12-11 15:49:12 +00:00
|
|
|
GST_LOG ("name:%s", name);
|
|
|
|
|
2013-05-25 08:10:14 +00:00
|
|
|
if (!g_ascii_strcasecmp ("sync", name)) {
|
2008-05-05 11:14:48 +00:00
|
|
|
sctx->in_sync = FALSE;
|
2013-05-25 08:10:14 +00:00
|
|
|
} else if ((!g_ascii_strcasecmp ("body", name)) ||
|
|
|
|
(!g_ascii_strcasecmp ("sami", name))) {
|
2008-05-05 12:33:05 +00:00
|
|
|
/* We will usually have one buffer left when the body is closed
|
|
|
|
* as we need the next sync to actually send it */
|
|
|
|
if (sctx->buf->len != 0) {
|
|
|
|
/* Only set a new start time if we don't have text pending */
|
|
|
|
if (sctx->resultbuf->len == 0)
|
|
|
|
sctx->time1 = sctx->time2;
|
|
|
|
|
|
|
|
sctx->time2 = GST_CLOCK_TIME_NONE;
|
|
|
|
g_string_append (sctx->resultbuf, sctx->buf->str);
|
|
|
|
sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
|
|
|
|
g_string_truncate (sctx->buf, 0);
|
|
|
|
}
|
2013-05-25 08:10:14 +00:00
|
|
|
} else if (!g_ascii_strcasecmp ("font", name)) {
|
2006-06-12 12:44:38 +00:00
|
|
|
sami_context_pop_state (sctx, SPAN_TAG);
|
2013-05-25 08:10:14 +00:00
|
|
|
} else if (!g_ascii_strcasecmp ("ruby", name)) {
|
2006-05-15 09:14:35 +00:00
|
|
|
sami_context_pop_state (sctx, RUBY_TAG);
|
2013-05-25 08:10:14 +00:00
|
|
|
} else if (!g_ascii_strcasecmp ("i", name)) {
|
2006-05-15 09:14:35 +00:00
|
|
|
sami_context_pop_state (sctx, ITALIC_TAG);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void
|
2013-05-25 08:10:14 +00:00
|
|
|
handle_text (HtmlContext * ctx, const gchar * text, gsize text_len,
|
|
|
|
gpointer user_data)
|
2006-05-15 09:14:35 +00:00
|
|
|
{
|
2013-05-25 08:10:14 +00:00
|
|
|
GstSamiContext *sctx = (GstSamiContext *) user_data;
|
2006-05-15 09:14:35 +00:00
|
|
|
|
2008-05-05 11:14:48 +00:00
|
|
|
/* Skip everything except content of the sync elements */
|
|
|
|
if (!sctx->in_sync)
|
2006-05-15 09:14:35 +00:00
|
|
|
return;
|
|
|
|
|
|
|
|
if (has_tag (sctx->state, RT_TAG)) {
|
|
|
|
g_string_append_c (sctx->rubybuf, ' ');
|
2013-05-25 08:10:14 +00:00
|
|
|
g_string_append (sctx->rubybuf, text);
|
2006-05-15 09:14:35 +00:00
|
|
|
g_string_append_c (sctx->rubybuf, ' ');
|
|
|
|
} else {
|
2013-05-25 08:10:14 +00:00
|
|
|
g_string_append (sctx->buf, text);
|
2006-05-15 09:14:35 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-05-25 08:10:14 +00:00
|
|
|
static HtmlParser samiParser = {
|
|
|
|
handle_start_element, /* start_element */
|
|
|
|
handle_end_element, /* end_element */
|
|
|
|
handle_text, /* text */
|
2006-05-15 09:14:35 +00:00
|
|
|
};
|
2009-08-04 12:29:41 +00:00
|
|
|
|
2006-05-15 09:14:35 +00:00
|
|
|
void
|
|
|
|
sami_context_init (ParserState * state)
|
|
|
|
{
|
|
|
|
GstSamiContext *context;
|
|
|
|
|
|
|
|
g_assert (state->user_data == NULL);
|
2013-07-16 17:30:50 +00:00
|
|
|
|
|
|
|
context = g_new0 (GstSamiContext, 1);
|
2006-05-15 09:14:35 +00:00
|
|
|
|
2013-05-25 08:10:14 +00:00
|
|
|
context->htmlctxt = html_context_new (&samiParser, context);
|
2006-05-15 09:14:35 +00:00
|
|
|
context->buf = g_string_new ("");
|
|
|
|
context->rubybuf = g_string_new ("");
|
|
|
|
context->resultbuf = g_string_new ("");
|
|
|
|
context->state = g_string_new ("");
|
2013-07-16 17:30:50 +00:00
|
|
|
|
|
|
|
state->user_data = context;
|
2006-05-15 09:14:35 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
sami_context_deinit (ParserState * state)
|
|
|
|
{
|
|
|
|
GstSamiContext *context = (GstSamiContext *) state->user_data;
|
|
|
|
|
|
|
|
if (context) {
|
2013-05-25 08:10:14 +00:00
|
|
|
html_context_free (context->htmlctxt);
|
2006-05-15 09:14:35 +00:00
|
|
|
context->htmlctxt = NULL;
|
|
|
|
g_string_free (context->buf, TRUE);
|
|
|
|
g_string_free (context->rubybuf, TRUE);
|
|
|
|
g_string_free (context->resultbuf, TRUE);
|
|
|
|
g_string_free (context->state, TRUE);
|
|
|
|
g_free (context);
|
|
|
|
state->user_data = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void
|
|
|
|
sami_context_reset (ParserState * state)
|
|
|
|
{
|
|
|
|
GstSamiContext *context = (GstSamiContext *) state->user_data;
|
|
|
|
|
|
|
|
if (context) {
|
|
|
|
g_string_truncate (context->buf, 0);
|
|
|
|
g_string_truncate (context->rubybuf, 0);
|
|
|
|
g_string_truncate (context->resultbuf, 0);
|
|
|
|
g_string_truncate (context->state, 0);
|
|
|
|
context->has_result = FALSE;
|
2008-05-05 11:14:48 +00:00
|
|
|
context->in_sync = FALSE;
|
2006-05-15 09:14:35 +00:00
|
|
|
context->time1 = 0;
|
|
|
|
context->time2 = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
gchar *
|
|
|
|
parse_sami (ParserState * state, const gchar * line)
|
|
|
|
{
|
2013-05-25 08:10:14 +00:00
|
|
|
gchar *ret = NULL;
|
2006-05-15 09:14:35 +00:00
|
|
|
GstSamiContext *context = (GstSamiContext *) state->user_data;
|
|
|
|
|
2013-05-25 08:10:14 +00:00
|
|
|
gchar *unescaped = unescape_string (line);
|
|
|
|
html_context_parse (context->htmlctxt, (gchar *) unescaped,
|
|
|
|
strlen (unescaped));
|
|
|
|
g_free (unescaped);
|
2006-06-10 18:25:07 +00:00
|
|
|
|
2006-05-15 09:14:35 +00:00
|
|
|
if (context->has_result) {
|
|
|
|
if (context->rubybuf->len) {
|
|
|
|
context->rubybuf = g_string_append_c (context->rubybuf, '\n');
|
|
|
|
g_string_prepend (context->resultbuf, context->rubybuf->str);
|
|
|
|
context->rubybuf = g_string_truncate (context->rubybuf, 0);
|
|
|
|
}
|
|
|
|
|
2013-05-25 08:10:14 +00:00
|
|
|
ret = g_string_free (context->resultbuf, FALSE);
|
2006-05-15 09:14:35 +00:00
|
|
|
context->resultbuf = g_string_new ("");
|
|
|
|
state->start_time = context->time1;
|
|
|
|
state->duration = context->time2 - context->time1;
|
|
|
|
context->has_result = FALSE;
|
|
|
|
}
|
2013-05-25 08:10:14 +00:00
|
|
|
return ret;
|
2006-05-15 09:14:35 +00:00
|
|
|
}
|