mirror of
https://gitlab.freedesktop.org/gstreamer/gstreamer.git
synced 2024-11-05 09:00:54 +00:00
270d23c8e0
Avoid relocations and hard-code entity string length in the struct, since we basically get it for free here. Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/2685>
911 lines
25 KiB
C
911 lines
25 KiB
C
/* GStreamer SAMI subtitle parser
|
|
* Copyright (c) 2006, 2013 Young-Ho Cha <ganadist at gmail com>
|
|
*
|
|
* This library is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Library General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2 of the License, or (at your option) any later version.
|
|
*
|
|
* This library is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Library General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Library General Public
|
|
* License along with this library; if not, write to the
|
|
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
|
|
* Boston, MA 02110-1301, USA.
|
|
*/
|
|
|
|
#include "samiparse.h"
|
|
|
|
#include <glib.h>
|
|
#include <string.h>
|
|
#include <stdlib.h>
|
|
|
|
/* One-character flags recorded in GstSamiContext::state, one per tag that
 * is currently open. */
#define ITALIC_TAG 'i'          /* <i> */
#define SPAN_TAG   's'          /* <font>, written out as <span> */
#define RUBY_TAG   'r'          /* <ruby> */
#define RT_TAG     't'          /* <rt> */
#define CLEAR_TAG  '0'          /* never pushed; popping it closes all tags */
|
|
|
|
/* Forward declarations for the types defined further down in this file. */
typedef struct _GstSamiContext GstSamiContext;
typedef struct _HtmlContext HtmlContext;
typedef struct _HtmlParser HtmlParser;
|
|
|
|
struct _GstSamiContext
|
|
{
|
|
GString *buf; /* buffer to collect content */
|
|
GString *rubybuf; /* buffer to collect ruby content */
|
|
GString *resultbuf; /* when opening the next 'sync' tag, move
|
|
* from 'buf' to avoid to append following
|
|
* content */
|
|
GString *state; /* in many sami files there are tags that
|
|
* are not closed, so for each open tag the
|
|
* parser will append a tag flag here so
|
|
* that tags can be closed properly on
|
|
* 'sync' tags. See _context_push_state()
|
|
* and _context_pop_state(). */
|
|
HtmlContext *htmlctxt; /* html parser context */
|
|
gboolean has_result; /* set when ready to push out result */
|
|
gboolean in_sync; /* flag to avoid appending anything except the
|
|
* content of the sync elements to buf */
|
|
guint64 time1; /* previous start attribute in sync tag */
|
|
guint64 time2; /* current start attribute in sync tag */
|
|
};
|
|
|
|
struct _HtmlParser
|
|
{
|
|
void (*start_element) (HtmlContext * ctx,
|
|
const gchar * name, const gchar ** attr, gpointer user_data);
|
|
void (*end_element) (HtmlContext * ctx,
|
|
const gchar * name, gpointer user_data);
|
|
void (*text) (HtmlContext * ctx,
|
|
const gchar * text, gsize text_len, gpointer user_data);
|
|
};
|
|
|
|
struct _HtmlContext
|
|
{
|
|
const HtmlParser *parser;
|
|
gpointer user_data;
|
|
GString *buf;
|
|
};
|
|
|
|
static HtmlContext *
|
|
html_context_new (HtmlParser * parser, gpointer user_data)
|
|
{
|
|
HtmlContext *ctxt = (HtmlContext *) g_new0 (HtmlContext, 1);
|
|
ctxt->parser = parser;
|
|
ctxt->user_data = user_data;
|
|
ctxt->buf = g_string_new (NULL);
|
|
return ctxt;
|
|
}
|
|
|
|
static void
|
|
html_context_free (HtmlContext * ctxt)
|
|
{
|
|
g_string_free (ctxt->buf, TRUE);
|
|
g_free (ctxt);
|
|
}
|
|
|
|
typedef struct
|
|
{
|
|
gunichar unescaped:24;
|
|
guint8 escaped_len;
|
|
gchar escaped[8];
|
|
} EntityMap;
|
|
|
|
/* Expands to the three EntityMap initializers for one entity: the code
 * point, the length of the name (sizeof minus the terminating NUL) and the
 * name itself. */
#define ENTITY(unicode,ent) unicode,sizeof(ent)-1,ent
|
|
|
|
static const EntityMap XmlEntities[] = {
|
|
{ENTITY (34, "quot")},
|
|
{ENTITY (38, "amp")},
|
|
{ENTITY (39, "apos")},
|
|
{ENTITY (60, "lt")},
|
|
{ENTITY (62, "gt")},
|
|
};
|
|
|
|
static const EntityMap HtmlEntities[] = {
|
|
/* nbsp we'll handle manually
|
|
{ 160, "nbsp;" }, */
|
|
{ENTITY (161, "iexcl")},
|
|
{ENTITY (162, "cent")},
|
|
{ENTITY (163, "pound")},
|
|
{ENTITY (164, "curren")},
|
|
{ENTITY (165, "yen")},
|
|
{ENTITY (166, "brvbar")},
|
|
{ENTITY (167, "sect")},
|
|
{ENTITY (168, "uml")},
|
|
{ENTITY (169, "copy")},
|
|
{ENTITY (170, "ordf")},
|
|
{ENTITY (171, "laquo")},
|
|
{ENTITY (172, "not")},
|
|
{ENTITY (173, "shy")},
|
|
{ENTITY (174, "reg")},
|
|
{ENTITY (175, "macr")},
|
|
{ENTITY (176, "deg")},
|
|
{ENTITY (177, "plusmn")},
|
|
{ENTITY (178, "sup2")},
|
|
{ENTITY (179, "sup3")},
|
|
{ENTITY (180, "acute")},
|
|
{ENTITY (181, "micro")},
|
|
{ENTITY (182, "para")},
|
|
{ENTITY (183, "middot")},
|
|
{ENTITY (184, "cedil")},
|
|
{ENTITY (185, "sup1")},
|
|
{ENTITY (186, "ordm")},
|
|
{ENTITY (187, "raquo")},
|
|
{ENTITY (188, "frac14")},
|
|
{ENTITY (189, "frac12")},
|
|
{ENTITY (190, "frac34")},
|
|
{ENTITY (191, "iquest")},
|
|
{ENTITY (192, "Agrave")},
|
|
{ENTITY (193, "Aacute")},
|
|
{ENTITY (194, "Acirc")},
|
|
{ENTITY (195, "Atilde")},
|
|
{ENTITY (196, "Auml")},
|
|
{ENTITY (197, "Aring")},
|
|
{ENTITY (198, "AElig")},
|
|
{ENTITY (199, "Ccedil")},
|
|
{ENTITY (200, "Egrave")},
|
|
{ENTITY (201, "Eacute")},
|
|
{ENTITY (202, "Ecirc")},
|
|
{ENTITY (203, "Euml")},
|
|
{ENTITY (204, "Igrave")},
|
|
{ENTITY (205, "Iacute")},
|
|
{ENTITY (206, "Icirc")},
|
|
{ENTITY (207, "Iuml")},
|
|
{ENTITY (208, "ETH")},
|
|
{ENTITY (209, "Ntilde")},
|
|
{ENTITY (210, "Ograve")},
|
|
{ENTITY (211, "Oacute")},
|
|
{ENTITY (212, "Ocirc")},
|
|
{ENTITY (213, "Otilde")},
|
|
{ENTITY (214, "Ouml")},
|
|
{ENTITY (215, "times")},
|
|
{ENTITY (216, "Oslash")},
|
|
{ENTITY (217, "Ugrave")},
|
|
{ENTITY (218, "Uacute")},
|
|
{ENTITY (219, "Ucirc")},
|
|
{ENTITY (220, "Uuml")},
|
|
{ENTITY (221, "Yacute")},
|
|
{ENTITY (222, "THORN")},
|
|
{ENTITY (223, "szlig")},
|
|
{ENTITY (224, "agrave")},
|
|
{ENTITY (225, "aacute")},
|
|
{ENTITY (226, "acirc")},
|
|
{ENTITY (227, "atilde")},
|
|
{ENTITY (228, "auml")},
|
|
{ENTITY (229, "aring")},
|
|
{ENTITY (230, "aelig")},
|
|
{ENTITY (231, "ccedil")},
|
|
{ENTITY (232, "egrave")},
|
|
{ENTITY (233, "eacute")},
|
|
{ENTITY (234, "ecirc")},
|
|
{ENTITY (235, "euml")},
|
|
{ENTITY (236, "igrave")},
|
|
{ENTITY (237, "iacute")},
|
|
{ENTITY (238, "icirc")},
|
|
{ENTITY (239, "iuml")},
|
|
{ENTITY (240, "eth")},
|
|
{ENTITY (241, "ntilde")},
|
|
{ENTITY (242, "ograve")},
|
|
{ENTITY (243, "oacute")},
|
|
{ENTITY (244, "ocirc")},
|
|
{ENTITY (245, "otilde")},
|
|
{ENTITY (246, "ouml")},
|
|
{ENTITY (247, "divide")},
|
|
{ENTITY (248, "oslash")},
|
|
{ENTITY (249, "ugrave")},
|
|
{ENTITY (250, "uacute")},
|
|
{ENTITY (251, "ucirc")},
|
|
{ENTITY (252, "uuml")},
|
|
{ENTITY (253, "yacute")},
|
|
{ENTITY (254, "thorn")},
|
|
{ENTITY (255, "yuml")},
|
|
{ENTITY (338, "OElig")},
|
|
{ENTITY (339, "oelig")},
|
|
{ENTITY (352, "Scaron")},
|
|
{ENTITY (353, "scaron")},
|
|
{ENTITY (376, "Yuml")},
|
|
{ENTITY (402, "fnof")},
|
|
{ENTITY (710, "circ")},
|
|
{ENTITY (732, "tilde")},
|
|
{ENTITY (913, "Alpha")},
|
|
{ENTITY (914, "Beta")},
|
|
{ENTITY (915, "Gamma")},
|
|
{ENTITY (916, "Delta")},
|
|
{ENTITY (917, "Epsilon")},
|
|
{ENTITY (918, "Zeta")},
|
|
{ENTITY (919, "Eta")},
|
|
{ENTITY (920, "Theta")},
|
|
{ENTITY (921, "Iota")},
|
|
{ENTITY (922, "Kappa")},
|
|
{ENTITY (923, "Lambda")},
|
|
{ENTITY (924, "Mu")},
|
|
{ENTITY (925, "Nu")},
|
|
{ENTITY (926, "Xi")},
|
|
{ENTITY (927, "Omicron")},
|
|
{ENTITY (928, "Pi")},
|
|
{ENTITY (929, "Rho")},
|
|
{ENTITY (931, "Sigma")},
|
|
{ENTITY (932, "Tau")},
|
|
{ENTITY (933, "Upsilon")},
|
|
{ENTITY (934, "Phi")},
|
|
{ENTITY (935, "Chi")},
|
|
{ENTITY (936, "Psi")},
|
|
{ENTITY (937, "Omega")},
|
|
{ENTITY (945, "alpha")},
|
|
{ENTITY (946, "beta")},
|
|
{ENTITY (947, "gamma")},
|
|
{ENTITY (948, "delta")},
|
|
{ENTITY (949, "epsilon")},
|
|
{ENTITY (950, "zeta")},
|
|
{ENTITY (951, "eta")},
|
|
{ENTITY (952, "theta")},
|
|
{ENTITY (953, "iota")},
|
|
{ENTITY (954, "kappa")},
|
|
{ENTITY (955, "lambda")},
|
|
{ENTITY (956, "mu")},
|
|
{ENTITY (957, "nu")},
|
|
{ENTITY (958, "xi")},
|
|
{ENTITY (959, "omicron")},
|
|
{ENTITY (960, "pi")},
|
|
{ENTITY (961, "rho")},
|
|
{ENTITY (962, "sigmaf")},
|
|
{ENTITY (963, "sigma")},
|
|
{ENTITY (964, "tau")},
|
|
{ENTITY (965, "upsilon")},
|
|
{ENTITY (966, "phi")},
|
|
{ENTITY (967, "chi")},
|
|
{ENTITY (968, "psi")},
|
|
{ENTITY (969, "omega")},
|
|
{ENTITY (977, "thetasym")},
|
|
{ENTITY (978, "upsih")},
|
|
{ENTITY (982, "piv")},
|
|
{ENTITY (8194, "ensp")},
|
|
{ENTITY (8195, "emsp")},
|
|
{ENTITY (8201, "thinsp")},
|
|
{ENTITY (8204, "zwnj")},
|
|
{ENTITY (8205, "zwj")},
|
|
{ENTITY (8206, "lrm")},
|
|
{ENTITY (8207, "rlm")},
|
|
{ENTITY (8211, "ndash")},
|
|
{ENTITY (8212, "mdash")},
|
|
{ENTITY (8216, "lsquo")},
|
|
{ENTITY (8217, "rsquo")},
|
|
{ENTITY (8218, "sbquo")},
|
|
{ENTITY (8220, "ldquo")},
|
|
{ENTITY (8221, "rdquo")},
|
|
{ENTITY (8222, "bdquo")},
|
|
{ENTITY (8224, "dagger")},
|
|
{ENTITY (8225, "Dagger")},
|
|
{ENTITY (8226, "bull")},
|
|
{ENTITY (8230, "hellip")},
|
|
{ENTITY (8240, "permil")},
|
|
{ENTITY (8242, "prime")},
|
|
{ENTITY (8243, "Prime")},
|
|
{ENTITY (8249, "lsaquo")},
|
|
{ENTITY (8250, "rsaquo")},
|
|
{ENTITY (8254, "oline")},
|
|
{ENTITY (8260, "frasl")},
|
|
{ENTITY (8364, "euro")},
|
|
{ENTITY (8465, "image")},
|
|
{ENTITY (8472, "weierp")},
|
|
{ENTITY (8476, "real")},
|
|
{ENTITY (8482, "trade")},
|
|
{ENTITY (8501, "alefsym")},
|
|
{ENTITY (8592, "larr")},
|
|
{ENTITY (8593, "uarr")},
|
|
{ENTITY (8594, "rarr")},
|
|
{ENTITY (8595, "darr")},
|
|
{ENTITY (8596, "harr")},
|
|
{ENTITY (8629, "crarr")},
|
|
{ENTITY (8656, "lArr")},
|
|
{ENTITY (8657, "uArr")},
|
|
{ENTITY (8658, "rArr")},
|
|
{ENTITY (8659, "dArr")},
|
|
{ENTITY (8660, "hArr")},
|
|
{ENTITY (8704, "forall")},
|
|
{ENTITY (8706, "part")},
|
|
{ENTITY (8707, "exist")},
|
|
{ENTITY (8709, "empty")},
|
|
{ENTITY (8711, "nabla")},
|
|
{ENTITY (8712, "isin")},
|
|
{ENTITY (8713, "notin")},
|
|
{ENTITY (8715, "ni")},
|
|
{ENTITY (8719, "prod")},
|
|
{ENTITY (8721, "sum")},
|
|
{ENTITY (8722, "minus")},
|
|
{ENTITY (8727, "lowast")},
|
|
{ENTITY (8730, "radic")},
|
|
{ENTITY (8733, "prop")},
|
|
{ENTITY (8734, "infin")},
|
|
{ENTITY (8736, "ang")},
|
|
{ENTITY (8743, "and")},
|
|
{ENTITY (8744, "or")},
|
|
{ENTITY (8745, "cap")},
|
|
{ENTITY (8746, "cup")},
|
|
{ENTITY (8747, "int")},
|
|
{ENTITY (8756, "there4")},
|
|
{ENTITY (8764, "sim")},
|
|
{ENTITY (8773, "cong")},
|
|
{ENTITY (8776, "asymp")},
|
|
{ENTITY (8800, "ne")},
|
|
{ENTITY (8801, "equiv")},
|
|
{ENTITY (8804, "le")},
|
|
{ENTITY (8805, "ge")},
|
|
{ENTITY (8834, "sub")},
|
|
{ENTITY (8835, "sup")},
|
|
{ENTITY (8836, "nsub")},
|
|
{ENTITY (8838, "sube")},
|
|
{ENTITY (8839, "supe")},
|
|
{ENTITY (8853, "oplus")},
|
|
{ENTITY (8855, "otimes")},
|
|
{ENTITY (8869, "perp")},
|
|
{ENTITY (8901, "sdot")},
|
|
{ENTITY (8968, "lceil")},
|
|
{ENTITY (8969, "rceil")},
|
|
{ENTITY (8970, "lfloor")},
|
|
{ENTITY (8971, "rfloor")},
|
|
{ENTITY (9001, "lang")},
|
|
{ENTITY (9002, "rang")},
|
|
{ENTITY (9674, "loz")},
|
|
{ENTITY (9824, "spades")},
|
|
{ENTITY (9827, "clubs")},
|
|
{ENTITY (9829, "hearts")},
|
|
{ENTITY (9830, "diams")},
|
|
};
|
|
|
|
static gchar *
|
|
unescape_string (const gchar * text)
|
|
{
|
|
gint i;
|
|
GString *unescaped = g_string_new (NULL);
|
|
|
|
while (*text) {
|
|
if (*text == '&') {
|
|
text++;
|
|
|
|
/* unescape   and */
|
|
if (!g_ascii_strncasecmp (text, "nbsp", 4)) {
|
|
unescaped = g_string_append_unichar (unescaped, 160);
|
|
text += 4;
|
|
if (*text == ';') {
|
|
text++;
|
|
}
|
|
goto next;
|
|
}
|
|
|
|
/* pass xml entities. these will be processed as pango markup */
|
|
for (i = 0; i < G_N_ELEMENTS (XmlEntities); i++) {
|
|
const EntityMap *entity = &XmlEntities[i];
|
|
guint8 escaped_len = entity->escaped_len;
|
|
|
|
if (!g_ascii_strncasecmp (text, entity->escaped, escaped_len)
|
|
&& text[escaped_len] == ';') {
|
|
unescaped = g_string_append_c (unescaped, '&');
|
|
unescaped =
|
|
g_string_append_len (unescaped, entity->escaped, escaped_len);
|
|
unescaped = g_string_append_c (unescaped, ';');
|
|
text += escaped_len + 1;
|
|
goto next;
|
|
}
|
|
}
|
|
|
|
/* convert html entities */
|
|
for (i = 0; i < G_N_ELEMENTS (HtmlEntities); i++) {
|
|
const EntityMap *entity = &HtmlEntities[i];
|
|
guint8 escaped_len = entity->escaped_len;
|
|
|
|
if (!strncmp (text, entity->escaped, escaped_len)
|
|
&& text[escaped_len] == ';') {
|
|
unescaped = g_string_append_unichar (unescaped, entity->unescaped);
|
|
text += escaped_len + 1;
|
|
goto next;
|
|
}
|
|
}
|
|
|
|
if (*text == '#') {
|
|
gboolean is_hex = FALSE;
|
|
gunichar l;
|
|
gchar *end = NULL;
|
|
|
|
text++;
|
|
if (*text == 'x') {
|
|
is_hex = TRUE;
|
|
text++;
|
|
}
|
|
errno = 0;
|
|
if (is_hex) {
|
|
l = strtoul (text, &end, 16);
|
|
} else {
|
|
l = strtoul (text, &end, 10);
|
|
}
|
|
|
|
if (text == end || errno != 0) {
|
|
/* error occurred. pass it */
|
|
goto next;
|
|
}
|
|
unescaped = g_string_append_unichar (unescaped, l);
|
|
text = end;
|
|
|
|
if (*text == ';') {
|
|
text++;
|
|
}
|
|
goto next;
|
|
}
|
|
|
|
/* escape & */
|
|
unescaped = g_string_append (unescaped, "&");
|
|
|
|
next:
|
|
continue;
|
|
|
|
} else if (g_ascii_isspace (*text)) {
|
|
unescaped = g_string_append_c (unescaped, ' ');
|
|
/* strip whitespace */
|
|
do {
|
|
text++;
|
|
} while ((*text) && g_ascii_isspace (*text));
|
|
} else {
|
|
unescaped = g_string_append_c (unescaped, *text);
|
|
text++;
|
|
}
|
|
}
|
|
|
|
return g_string_free (unescaped, FALSE);
|
|
}
|
|
|
|
static const gchar *
|
|
string_token (const gchar * string, const gchar * delimiter, gchar ** first)
|
|
{
|
|
gchar *next = strstr (string, delimiter);
|
|
if (next) {
|
|
*first = g_strndup (string, next - string);
|
|
} else {
|
|
*first = g_strdup (string);
|
|
}
|
|
return next;
|
|
}
|
|
|
|
static void
|
|
html_context_handle_element (HtmlContext * ctxt,
|
|
const gchar * string, gboolean must_close)
|
|
{
|
|
gchar *name = NULL;
|
|
gint count = 0, i;
|
|
gchar **attrs;
|
|
const gchar *found, *next;
|
|
|
|
/* split element name and attributes */
|
|
next = string_token (string, " ", &name);
|
|
|
|
if (next) {
|
|
/* count attributes */
|
|
found = next + 1;
|
|
while (TRUE) {
|
|
found = strchr (found, '=');
|
|
if (!found)
|
|
break;
|
|
found++;
|
|
count++;
|
|
}
|
|
} else {
|
|
count = 0;
|
|
}
|
|
|
|
attrs = g_new0 (gchar *, (count + 1) * 2);
|
|
|
|
for (i = 0; i < count && next != NULL; i += 2) {
|
|
gchar *attr_name = NULL, *attr_value = NULL;
|
|
gsize length;
|
|
next = string_token (next + 1, "=", &attr_name);
|
|
if (!next) {
|
|
g_free (attr_name);
|
|
break;
|
|
}
|
|
next = string_token (next + 1, " ", &attr_value);
|
|
|
|
/* strip " or ' from attribute value */
|
|
if (attr_value[0] == '"' || attr_value[0] == '\'') {
|
|
gchar *tmp = g_strdup (attr_value + 1);
|
|
g_free (attr_value);
|
|
attr_value = tmp;
|
|
}
|
|
|
|
length = strlen (attr_value);
|
|
if (length > 0 && (attr_value[length - 1] == '"'
|
|
|| attr_value[length - 1] == '\'')) {
|
|
attr_value[length - 1] = '\0';
|
|
}
|
|
|
|
attrs[i] = attr_name;
|
|
attrs[i + 1] = attr_value;
|
|
}
|
|
|
|
ctxt->parser->start_element (ctxt, name,
|
|
(const gchar **) attrs, ctxt->user_data);
|
|
if (must_close) {
|
|
ctxt->parser->end_element (ctxt, name, ctxt->user_data);
|
|
}
|
|
g_strfreev (attrs);
|
|
g_free (name);
|
|
}
|
|
|
|
static void
|
|
html_context_parse (HtmlContext * ctxt, gchar * text, gsize text_len)
|
|
{
|
|
const gchar *next = NULL;
|
|
ctxt->buf = g_string_append_len (ctxt->buf, text, text_len);
|
|
next = ctxt->buf->str;
|
|
while (TRUE) {
|
|
if (next[0] == '<') {
|
|
gchar *element = NULL;
|
|
/* find <blahblah> */
|
|
if (!strchr (next, '>')) {
|
|
/* no tag end point. buffer will be process in next time */
|
|
return;
|
|
}
|
|
|
|
next = string_token (next, ">", &element);
|
|
next++;
|
|
if (g_str_has_suffix (element, "/")) {
|
|
/* handle <blah/> */
|
|
element[strlen (element) - 1] = '\0';
|
|
html_context_handle_element (ctxt, element + 1, TRUE);
|
|
} else if (element[1] == '/') {
|
|
/* handle </blah> */
|
|
ctxt->parser->end_element (ctxt, element + 2, ctxt->user_data);
|
|
} else {
|
|
/* handle <blah> */
|
|
html_context_handle_element (ctxt, element + 1, FALSE);
|
|
}
|
|
g_free (element);
|
|
} else if (strchr (next, '<')) {
|
|
gchar *text = NULL;
|
|
gsize length;
|
|
next = string_token (next, "<", &text);
|
|
text = g_strstrip (text);
|
|
length = strlen (text);
|
|
ctxt->parser->text (ctxt, text, length, ctxt->user_data);
|
|
g_free (text);
|
|
|
|
} else {
|
|
gchar *text = (gchar *) next;
|
|
gsize length;
|
|
text = g_strstrip (text);
|
|
length = strlen (text);
|
|
ctxt->parser->text (ctxt, text, length, ctxt->user_data);
|
|
ctxt->buf = g_string_assign (ctxt->buf, "");
|
|
return;
|
|
}
|
|
}
|
|
|
|
ctxt->buf = g_string_assign (ctxt->buf, next);
|
|
}
|
|
|
|
static gchar *
|
|
has_tag (GString * str, const gchar tag)
|
|
{
|
|
return strrchr (str->str, tag);
|
|
}
|
|
|
|
static void
|
|
sami_context_push_state (GstSamiContext * sctx, char state)
|
|
{
|
|
GST_LOG ("state %c", state);
|
|
g_string_append_c (sctx->state, state);
|
|
}
|
|
|
|
static void
|
|
sami_context_pop_state (GstSamiContext * sctx, char state)
|
|
{
|
|
GString *str = g_string_new ("");
|
|
GString *context_state = sctx->state;
|
|
int i;
|
|
|
|
GST_LOG ("state %c", state);
|
|
for (i = context_state->len - 1; i >= 0; i--) {
|
|
switch (context_state->str[i]) {
|
|
case ITALIC_TAG: /* <i> */
|
|
{
|
|
g_string_append (str, "</i>");
|
|
break;
|
|
}
|
|
case SPAN_TAG: /* <span foreground= > */
|
|
{
|
|
g_string_append (str, "</span>");
|
|
break;
|
|
}
|
|
case RUBY_TAG: /* <span size= > -- ruby */
|
|
{
|
|
break;
|
|
}
|
|
case RT_TAG: /* ruby */
|
|
{
|
|
/* FIXME: support for furigana/ruby once implemented in pango */
|
|
g_string_append (sctx->rubybuf, "</span>");
|
|
if (has_tag (context_state, ITALIC_TAG)) {
|
|
g_string_append (sctx->rubybuf, "</i>");
|
|
}
|
|
|
|
break;
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
if (context_state->str[i] == state) {
|
|
g_string_append (sctx->buf, str->str);
|
|
g_string_free (str, TRUE);
|
|
g_string_truncate (context_state, i);
|
|
return;
|
|
}
|
|
}
|
|
if (state == CLEAR_TAG) {
|
|
g_string_append (sctx->buf, str->str);
|
|
g_string_truncate (context_state, 0);
|
|
}
|
|
g_string_free (str, TRUE);
|
|
}
|
|
|
|
static void
|
|
handle_start_sync (GstSamiContext * sctx, const gchar ** atts)
|
|
{
|
|
int i;
|
|
|
|
sami_context_pop_state (sctx, CLEAR_TAG);
|
|
if (atts != NULL) {
|
|
for (i = 0; (atts[i] != NULL); i += 2) {
|
|
const gchar *key, *value;
|
|
|
|
key = atts[i];
|
|
value = atts[i + 1];
|
|
|
|
if (!value)
|
|
continue;
|
|
if (!g_ascii_strcasecmp ("start", key)) {
|
|
/* Only set a new start time if we don't have text pending */
|
|
if (sctx->resultbuf->len == 0)
|
|
sctx->time1 = sctx->time2;
|
|
|
|
sctx->time2 = atoi ((const char *) value) * GST_MSECOND;
|
|
sctx->time2 = MAX (sctx->time2, sctx->time1);
|
|
g_string_append (sctx->resultbuf, sctx->buf->str);
|
|
sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
|
|
g_string_truncate (sctx->buf, 0);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static void
|
|
handle_start_font (GstSamiContext * sctx, const gchar ** atts)
|
|
{
|
|
int i;
|
|
|
|
sami_context_pop_state (sctx, SPAN_TAG);
|
|
if (atts != NULL) {
|
|
g_string_append (sctx->buf, "<span");
|
|
for (i = 0; (atts[i] != NULL); i += 2) {
|
|
const gchar *key, *value;
|
|
|
|
key = atts[i];
|
|
value = atts[i + 1];
|
|
|
|
if (!value)
|
|
continue;
|
|
if (!g_ascii_strcasecmp ("color", key)) {
|
|
/*
|
|
* There are invalid color value in many
|
|
* sami files.
|
|
* It will fix hex color value that start without '#'
|
|
*/
|
|
const gchar *sharp = "";
|
|
int len = strlen (value);
|
|
|
|
if (!(*value == '#' && len == 7)) {
|
|
gchar *r;
|
|
|
|
/* check if it looks like hex */
|
|
if (strtol ((const char *) value, &r, 16) >= 0 &&
|
|
((gchar *) r == (value + 6) && len == 6)) {
|
|
sharp = "#";
|
|
}
|
|
}
|
|
/* some colours can be found in many sami files, but X RGB database
|
|
* doesn't contain a colour by this name, so map explicitly */
|
|
if (!g_ascii_strcasecmp ("aqua", value)) {
|
|
value = "#00ffff";
|
|
} else if (!g_ascii_strcasecmp ("crimson", value)) {
|
|
value = "#dc143c";
|
|
} else if (!g_ascii_strcasecmp ("fuchsia", value)) {
|
|
value = "#ff00ff";
|
|
} else if (!g_ascii_strcasecmp ("indigo", value)) {
|
|
value = "#4b0082";
|
|
} else if (!g_ascii_strcasecmp ("lime", value)) {
|
|
value = "#00ff00";
|
|
} else if (!g_ascii_strcasecmp ("olive", value)) {
|
|
value = "#808000";
|
|
} else if (!g_ascii_strcasecmp ("silver", value)) {
|
|
value = "#c0c0c0";
|
|
} else if (!g_ascii_strcasecmp ("teal", value)) {
|
|
value = "#008080";
|
|
}
|
|
g_string_append_printf (sctx->buf, " foreground=\"%s%s\"", sharp,
|
|
value);
|
|
} else if (!g_ascii_strcasecmp ("face", key)) {
|
|
g_string_append_printf (sctx->buf, " font_family=\"%s\"", value);
|
|
}
|
|
}
|
|
g_string_append_c (sctx->buf, '>');
|
|
sami_context_push_state (sctx, SPAN_TAG);
|
|
}
|
|
}
|
|
|
|
static void
|
|
handle_start_element (HtmlContext * ctx, const gchar * name,
|
|
const char **atts, gpointer user_data)
|
|
{
|
|
GstSamiContext *sctx = (GstSamiContext *) user_data;
|
|
|
|
GST_LOG ("name:%s", name);
|
|
|
|
if (!g_ascii_strcasecmp ("sync", name)) {
|
|
handle_start_sync (sctx, atts);
|
|
sctx->in_sync = TRUE;
|
|
} else if (!g_ascii_strcasecmp ("font", name)) {
|
|
handle_start_font (sctx, atts);
|
|
} else if (!g_ascii_strcasecmp ("ruby", name)) {
|
|
sami_context_push_state (sctx, RUBY_TAG);
|
|
} else if (!g_ascii_strcasecmp ("br", name)) {
|
|
g_string_append_c (sctx->buf, '\n');
|
|
/* FIXME: support for furigana/ruby once implemented in pango */
|
|
} else if (!g_ascii_strcasecmp ("rt", name)) {
|
|
if (has_tag (sctx->state, ITALIC_TAG)) {
|
|
g_string_append (sctx->rubybuf, "<i>");
|
|
}
|
|
g_string_append (sctx->rubybuf, "<span size='xx-small' rise='-100'>");
|
|
sami_context_push_state (sctx, RT_TAG);
|
|
} else if (!g_ascii_strcasecmp ("i", name)) {
|
|
g_string_append (sctx->buf, "<i>");
|
|
sami_context_push_state (sctx, ITALIC_TAG);
|
|
} else if (!g_ascii_strcasecmp ("p", name)) {
|
|
}
|
|
}
|
|
|
|
static void
|
|
handle_end_element (HtmlContext * ctx, const char *name, gpointer user_data)
|
|
{
|
|
GstSamiContext *sctx = (GstSamiContext *) user_data;
|
|
|
|
GST_LOG ("name:%s", name);
|
|
|
|
if (!g_ascii_strcasecmp ("sync", name)) {
|
|
sctx->in_sync = FALSE;
|
|
} else if ((!g_ascii_strcasecmp ("body", name)) ||
|
|
(!g_ascii_strcasecmp ("sami", name))) {
|
|
/* We will usually have one buffer left when the body is closed
|
|
* as we need the next sync to actually send it */
|
|
if (sctx->buf->len != 0) {
|
|
/* Only set a new start time if we don't have text pending */
|
|
if (sctx->resultbuf->len == 0)
|
|
sctx->time1 = sctx->time2;
|
|
|
|
sctx->time2 = GST_CLOCK_TIME_NONE;
|
|
g_string_append (sctx->resultbuf, sctx->buf->str);
|
|
sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
|
|
g_string_truncate (sctx->buf, 0);
|
|
}
|
|
} else if (!g_ascii_strcasecmp ("font", name)) {
|
|
sami_context_pop_state (sctx, SPAN_TAG);
|
|
} else if (!g_ascii_strcasecmp ("ruby", name)) {
|
|
sami_context_pop_state (sctx, RUBY_TAG);
|
|
} else if (!g_ascii_strcasecmp ("i", name)) {
|
|
sami_context_pop_state (sctx, ITALIC_TAG);
|
|
}
|
|
}
|
|
|
|
static void
|
|
handle_text (HtmlContext * ctx, const gchar * text, gsize text_len,
|
|
gpointer user_data)
|
|
{
|
|
GstSamiContext *sctx = (GstSamiContext *) user_data;
|
|
|
|
/* Skip everything except content of the sync elements */
|
|
if (!sctx->in_sync)
|
|
return;
|
|
|
|
if (has_tag (sctx->state, RT_TAG)) {
|
|
g_string_append_c (sctx->rubybuf, ' ');
|
|
g_string_append (sctx->rubybuf, text);
|
|
g_string_append_c (sctx->rubybuf, ' ');
|
|
} else {
|
|
g_string_append (sctx->buf, text);
|
|
}
|
|
}
|
|
|
|
static HtmlParser samiParser = {
|
|
handle_start_element, /* start_element */
|
|
handle_end_element, /* end_element */
|
|
handle_text, /* text */
|
|
};
|
|
|
|
void
|
|
sami_context_init (ParserState * state)
|
|
{
|
|
GstSamiContext *context;
|
|
|
|
g_assert (state->user_data == NULL);
|
|
|
|
context = g_new0 (GstSamiContext, 1);
|
|
|
|
context->htmlctxt = html_context_new (&samiParser, context);
|
|
context->buf = g_string_new ("");
|
|
context->rubybuf = g_string_new ("");
|
|
context->resultbuf = g_string_new ("");
|
|
context->state = g_string_new ("");
|
|
|
|
state->user_data = context;
|
|
}
|
|
|
|
void
|
|
sami_context_deinit (ParserState * state)
|
|
{
|
|
GstSamiContext *context = (GstSamiContext *) state->user_data;
|
|
|
|
if (context) {
|
|
html_context_free (context->htmlctxt);
|
|
context->htmlctxt = NULL;
|
|
g_string_free (context->buf, TRUE);
|
|
g_string_free (context->rubybuf, TRUE);
|
|
g_string_free (context->resultbuf, TRUE);
|
|
g_string_free (context->state, TRUE);
|
|
g_free (context);
|
|
state->user_data = NULL;
|
|
}
|
|
}
|
|
|
|
void
|
|
sami_context_reset (ParserState * state)
|
|
{
|
|
GstSamiContext *context = (GstSamiContext *) state->user_data;
|
|
|
|
if (context) {
|
|
g_string_truncate (context->buf, 0);
|
|
g_string_truncate (context->rubybuf, 0);
|
|
g_string_truncate (context->resultbuf, 0);
|
|
g_string_truncate (context->state, 0);
|
|
context->has_result = FALSE;
|
|
context->in_sync = FALSE;
|
|
context->time1 = 0;
|
|
context->time2 = 0;
|
|
}
|
|
}
|
|
|
|
gchar *
|
|
parse_sami (ParserState * state, const gchar * line)
|
|
{
|
|
gchar *ret = NULL;
|
|
GstSamiContext *context = (GstSamiContext *) state->user_data;
|
|
|
|
gchar *unescaped = unescape_string (line);
|
|
html_context_parse (context->htmlctxt, (gchar *) unescaped,
|
|
strlen (unescaped));
|
|
g_free (unescaped);
|
|
|
|
if (context->has_result) {
|
|
if (context->rubybuf->len) {
|
|
context->rubybuf = g_string_append_c (context->rubybuf, '\n');
|
|
g_string_prepend (context->resultbuf, context->rubybuf->str);
|
|
context->rubybuf = g_string_truncate (context->rubybuf, 0);
|
|
}
|
|
|
|
ret = g_string_free (context->resultbuf, FALSE);
|
|
context->resultbuf = g_string_new ("");
|
|
state->start_time = context->time1;
|
|
state->duration = context->time2 - context->time1;
|
|
context->has_result = FALSE;
|
|
}
|
|
return ret;
|
|
}
|