mirror of
https://gitlab.freedesktop.org/gstreamer/gstreamer.git
synced 2025-01-14 19:35:39 +00:00
7a83664099
Original commit message from CVS: * gst/subparse/samiparse.c: (sami_context_push_state), (sami_context_pop_state), (start_sami_element), (end_sami_element): Some versions of libxml seem to be very picky as to strict formatting of the input and never 'close' the final </body> tag. In order to fix that bad behaviour, we trigger the flushing of remaining data on both </body> and </sami>. Fixes #557365
473 lines
15 KiB
C
473 lines
15 KiB
C
/* GStreamer SAMI subtitle parser
|
|
* Copyright (c) 2006 Young-Ho Cha <ganadist at chollian net>
|
|
*
|
|
* This library is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU Library General Public
|
|
* License as published by the Free Software Foundation; either
|
|
* version 2 of the License, or (at your option) any later version.
|
|
*
|
|
* This library is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
* Library General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Library General Public
|
|
* License along with this library; if not, write to the
|
|
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
|
|
* Boston, MA 02111-1307, USA.
|
|
*/
|
|
|
|
#include "samiparse.h"
|
|
|
|
#include <libxml/HTMLparser.h>
|
|
#include <string.h>
|
|
|
|
/* One-character flags kept in GstSamiContext::state, one per element that
 * is currently open. */
#define ITALIC_TAG 'i'          /* pushed when an <i> element starts */
#define SPAN_TAG 's'            /* pushed when a <font> is emitted as <span> */
#define RUBY_TAG 'r'            /* pushed when a <ruby> element starts */
#define RT_TAG 't'              /* pushed when an <rt> element starts */
#define CLEAR_TAG '0'           /* not pushed in this file; popping it closes
                                 * every tag that is still open */
|
|
|
|
typedef struct _GstSamiContext GstSamiContext;
|
|
|
|
struct _GstSamiContext
|
|
{
|
|
GString *buf; /* buffer to collect content */
|
|
GString *rubybuf; /* buffer to collect ruby content */
|
|
GString *resultbuf; /* when opening the next 'sync' tag, move
|
|
* from 'buf' to avoid to append following
|
|
* content */
|
|
GString *state; /* in many sami files there are tags that
|
|
* are not closed, so for each open tag the
|
|
* parser will append a tag flag here so
|
|
* that tags can be closed properly on
|
|
* 'sync' tags. See _context_push_state()
|
|
* and _context_pop_state(). */
|
|
htmlParserCtxtPtr htmlctxt; /* html parser context */
|
|
gboolean has_result; /* set when ready to push out result */
|
|
gboolean in_sync; /* flag to avoid appending anything except the
|
|
* content of the sync elements to buf */
|
|
guint64 time1; /* previous start attribute in sync tag */
|
|
guint64 time2; /* current start attribute in sync tag */
|
|
};
|
|
|
|
static gchar *
|
|
has_tag (GString * str, const gchar tag)
|
|
{
|
|
return strrchr (str->str, tag);
|
|
}
|
|
|
|
static void
|
|
sami_context_push_state (GstSamiContext * sctx, char state)
|
|
{
|
|
GST_LOG ("state %c", state);
|
|
g_string_append_c (sctx->state, state);
|
|
}
|
|
|
|
static void
|
|
sami_context_pop_state (GstSamiContext * sctx, char state)
|
|
{
|
|
GString *str = g_string_new ("");
|
|
GString *context_state = sctx->state;
|
|
int i;
|
|
|
|
GST_LOG ("state %c", state);
|
|
for (i = context_state->len - 1; i >= 0; i--) {
|
|
switch (context_state->str[i]) {
|
|
case ITALIC_TAG: /* <i> */
|
|
{
|
|
g_string_append (str, "</i>");
|
|
break;
|
|
}
|
|
case SPAN_TAG: /* <span foreground= > */
|
|
{
|
|
g_string_append (str, "</span>");
|
|
break;
|
|
}
|
|
case RUBY_TAG: /* <span size= > -- ruby */
|
|
{
|
|
break;
|
|
}
|
|
case RT_TAG: /* ruby */
|
|
{
|
|
/* FIXME: support for furigana/ruby once implemented in pango */
|
|
g_string_append (sctx->rubybuf, "</span>");
|
|
if (has_tag (context_state, ITALIC_TAG)) {
|
|
g_string_append (sctx->rubybuf, "</i>");
|
|
}
|
|
|
|
break;
|
|
}
|
|
default:
|
|
break;
|
|
}
|
|
if (context_state->str[i] == state) {
|
|
g_string_append (sctx->buf, str->str);
|
|
g_string_free (str, TRUE);
|
|
g_string_truncate (context_state, i);
|
|
return;
|
|
}
|
|
}
|
|
if (state == CLEAR_TAG) {
|
|
g_string_append (sctx->buf, str->str);
|
|
g_string_truncate (context_state, 0);
|
|
}
|
|
g_string_free (str, TRUE);
|
|
}
|
|
|
|
static void
|
|
handle_start_sync (GstSamiContext * sctx, const xmlChar ** atts)
|
|
{
|
|
int i;
|
|
|
|
sami_context_pop_state (sctx, CLEAR_TAG);
|
|
if (atts != NULL) {
|
|
for (i = 0; (atts[i] != NULL); i += 2) {
|
|
const xmlChar *key, *value;
|
|
|
|
key = atts[i];
|
|
value = atts[i + 1];
|
|
|
|
if (!value)
|
|
continue;
|
|
if (!xmlStrncmp ((const xmlChar *) "start", key, 5)) {
|
|
/* Only set a new start time if we don't have text pending */
|
|
if (sctx->resultbuf->len == 0)
|
|
sctx->time1 = sctx->time2;
|
|
|
|
sctx->time2 = atoi ((const char *) value) * GST_MSECOND;
|
|
g_string_append (sctx->resultbuf, sctx->buf->str);
|
|
sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
|
|
g_string_truncate (sctx->buf, 0);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
static void
|
|
handle_start_font (GstSamiContext * sctx, const xmlChar ** atts)
|
|
{
|
|
int i;
|
|
|
|
sami_context_pop_state (sctx, SPAN_TAG);
|
|
if (atts != NULL) {
|
|
g_string_append (sctx->buf, "<span");
|
|
for (i = 0; (atts[i] != NULL); i += 2) {
|
|
const xmlChar *key, *value;
|
|
|
|
key = atts[i];
|
|
value = atts[i + 1];
|
|
|
|
if (!value)
|
|
continue;
|
|
if (!xmlStrncmp ((const xmlChar *) "color", key, 5)) {
|
|
/*
|
|
* There are invalid color value in many
|
|
* sami files.
|
|
* It will fix hex color value that start without '#'
|
|
*/
|
|
gchar *sharp = "";
|
|
int len = xmlStrlen (value);
|
|
|
|
if (!(*value == '#' && len == 7)) {
|
|
gchar *r;
|
|
|
|
/* check if it looks like hex */
|
|
if (strtol ((const char *) value, &r, 16) >= 0 &&
|
|
((xmlChar *) r == (value + 6) && len == 6)) {
|
|
sharp = "#";
|
|
}
|
|
}
|
|
/* some colours can be found in many sami files, but X RGB database
|
|
* doesn't contain a colour by this name, so map explicitly */
|
|
if (!xmlStrncasecmp (value, (const xmlChar *) "aqua", len)) {
|
|
value = (const xmlChar *) "#00ffff";
|
|
} else if (!xmlStrncasecmp (value, (const xmlChar *) "crimson", len)) {
|
|
value = (const xmlChar *) "#dc143c";
|
|
} else if (!xmlStrncasecmp (value, (const xmlChar *) "fuchsia", len)) {
|
|
value = (const xmlChar *) "#ff00ff";
|
|
} else if (!xmlStrncasecmp (value, (const xmlChar *) "indigo", len)) {
|
|
value = (const xmlChar *) "#4b0082";
|
|
} else if (!xmlStrncasecmp (value, (const xmlChar *) "lime", len)) {
|
|
value = (const xmlChar *) "#00ff00";
|
|
} else if (!xmlStrncasecmp (value, (const xmlChar *) "olive", len)) {
|
|
value = (const xmlChar *) "#808000";
|
|
} else if (!xmlStrncasecmp (value, (const xmlChar *) "silver", len)) {
|
|
value = (const xmlChar *) "#c0c0c0";
|
|
} else if (!xmlStrncasecmp (value, (const xmlChar *) "teal", len)) {
|
|
value = (const xmlChar *) "#008080";
|
|
}
|
|
g_string_append_printf (sctx->buf, " foreground=\"%s%s\"", sharp,
|
|
value);
|
|
} else if (!xmlStrncasecmp ((const xmlChar *) "face", key, 4)) {
|
|
g_string_append_printf (sctx->buf, " font_family=\"%s\"", value);
|
|
}
|
|
}
|
|
g_string_append_c (sctx->buf, '>');
|
|
sami_context_push_state (sctx, SPAN_TAG);
|
|
}
|
|
}
|
|
|
|
static void
|
|
start_sami_element (void *ctx, const xmlChar * name, const xmlChar ** atts)
|
|
{
|
|
GstSamiContext *sctx = (GstSamiContext *) ctx;
|
|
|
|
GST_LOG ("name:%s", name);
|
|
|
|
if (!xmlStrncmp ((const xmlChar *) "sync", name, 4)) {
|
|
handle_start_sync (sctx, atts);
|
|
sctx->in_sync = TRUE;
|
|
} else if (!xmlStrncmp ((const xmlChar *) "font", name, 4)) {
|
|
handle_start_font (sctx, atts);
|
|
} else if (!xmlStrncmp ((const xmlChar *) "ruby", name, 4)) {
|
|
sami_context_push_state (sctx, RUBY_TAG);
|
|
} else if (!xmlStrncmp ((const xmlChar *) "br", name, 2)) {
|
|
g_string_append_c (sctx->buf, '\n');
|
|
/* FIXME: support for furigana/ruby once implemented in pango */
|
|
} else if (!xmlStrncmp ((const xmlChar *) "rt", name, 2)) {
|
|
if (has_tag (sctx->state, ITALIC_TAG)) {
|
|
g_string_append (sctx->rubybuf, "<i>");
|
|
}
|
|
g_string_append (sctx->rubybuf, "<span size='xx-small' rise='-100'>");
|
|
sami_context_push_state (sctx, RT_TAG);
|
|
} else if (!xmlStrncmp ((const xmlChar *) "p", name, 1)) {
|
|
} else if (!xmlStrncmp ((const xmlChar *) "i", name, 1)) {
|
|
g_string_append (sctx->buf, "<i>");
|
|
sami_context_push_state (sctx, ITALIC_TAG);
|
|
}
|
|
}
|
|
|
|
static void
|
|
end_sami_element (void *ctx, const xmlChar * name)
|
|
{
|
|
GstSamiContext *sctx = (GstSamiContext *) ctx;
|
|
|
|
GST_LOG ("name:%s", name);
|
|
|
|
if (!xmlStrncmp ((const xmlChar *) "sync", name, 4)) {
|
|
sctx->in_sync = FALSE;
|
|
} else if ((!xmlStrncmp ((const xmlChar *) "body", name, 4)) ||
|
|
(!xmlStrncmp ((const xmlChar *) "sami", name, 4))) {
|
|
/* We will usually have one buffer left when the body is closed
|
|
* as we need the next sync to actually send it */
|
|
if (sctx->buf->len != 0) {
|
|
/* Only set a new start time if we don't have text pending */
|
|
if (sctx->resultbuf->len == 0)
|
|
sctx->time1 = sctx->time2;
|
|
|
|
sctx->time2 = GST_CLOCK_TIME_NONE;
|
|
g_string_append (sctx->resultbuf, sctx->buf->str);
|
|
sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
|
|
g_string_truncate (sctx->buf, 0);
|
|
}
|
|
} else if (!xmlStrncmp ((const xmlChar *) "font", name, 4)) {
|
|
sami_context_pop_state (sctx, SPAN_TAG);
|
|
} else if (!xmlStrncmp ((const xmlChar *) "ruby", name, 4)) {
|
|
sami_context_pop_state (sctx, RUBY_TAG);
|
|
} else if (!xmlStrncmp ((const xmlChar *) "i", name, 1)) {
|
|
sami_context_pop_state (sctx, ITALIC_TAG);
|
|
}
|
|
}
|
|
|
|
static void
|
|
characters_sami (void *ctx, const xmlChar * ch, int len)
|
|
{
|
|
GstSamiContext *sctx = (GstSamiContext *) ctx;
|
|
gchar *escaped;
|
|
gchar *tmp;
|
|
gint i;
|
|
|
|
/* Skip everything except content of the sync elements */
|
|
if (!sctx->in_sync)
|
|
return;
|
|
|
|
escaped = g_markup_escape_text ((const gchar *) ch, len);
|
|
g_strstrip (escaped);
|
|
|
|
/* Remove double spaces forom the string as those are
|
|
* usually added by newlines and indention */
|
|
tmp = escaped;
|
|
for (i = 0; i <= strlen (escaped); i++) {
|
|
escaped[i] = *tmp;
|
|
if (*tmp != ' ') {
|
|
tmp++;
|
|
continue;
|
|
}
|
|
while (*tmp == ' ')
|
|
tmp++;
|
|
}
|
|
|
|
if (has_tag (sctx->state, RT_TAG)) {
|
|
g_string_append_c (sctx->rubybuf, ' ');
|
|
g_string_append (sctx->rubybuf, escaped);
|
|
g_string_append_c (sctx->rubybuf, ' ');
|
|
} else {
|
|
g_string_append (sctx->buf, escaped);
|
|
}
|
|
g_free (escaped);
|
|
}
|
|
|
|
static xmlSAXHandler samiSAXHandlerStruct = {
|
|
NULL, /* internalSubset */
|
|
NULL, /* isStandalone */
|
|
NULL, /* hasInternalSubset */
|
|
NULL, /* hasExternalSubset */
|
|
NULL, /* resolveEntity */
|
|
NULL, /* getEntity */
|
|
NULL, /* entityDecl */
|
|
NULL, /* notationDecl */
|
|
NULL, /* attributeDecl */
|
|
NULL, /* elementDecl */
|
|
NULL, /* unparsedEntityDecl */
|
|
NULL, /* setDocumentLocator */
|
|
NULL, /* startDocument */
|
|
NULL, /* endDocument */
|
|
start_sami_element, /* startElement */
|
|
end_sami_element, /* endElement */
|
|
NULL, /* reference */
|
|
characters_sami, /* characters */
|
|
NULL, /* ignorableWhitespace */
|
|
NULL, /* processingInstruction */
|
|
NULL, /* comment */
|
|
NULL, /* xmlParserWarning */
|
|
NULL, /* xmlParserError */
|
|
NULL, /* xmlParserError */
|
|
NULL, /* getParameterEntity */
|
|
NULL, /* cdataBlock */
|
|
NULL, /* externalSubset */
|
|
1, /* initialized */
|
|
NULL, /* private */
|
|
NULL, /* startElementNsSAX2Func */
|
|
NULL, /* endElementNsSAX2Func */
|
|
NULL /* xmlStructuredErrorFunc */
|
|
};
|
|
static xmlSAXHandlerPtr samiSAXHandler = &samiSAXHandlerStruct;
|
|
|
|
void
|
|
sami_context_init (ParserState * state)
|
|
{
|
|
GstSamiContext *context;
|
|
|
|
g_assert (state->user_data == NULL);
|
|
state->user_data = (gpointer) g_new0 (GstSamiContext, 1);
|
|
context = (GstSamiContext *) state->user_data;
|
|
|
|
context->htmlctxt = htmlCreatePushParserCtxt (samiSAXHandler, context,
|
|
"", 0, NULL, XML_CHAR_ENCODING_UTF8);
|
|
context->buf = g_string_new ("");
|
|
context->rubybuf = g_string_new ("");
|
|
context->resultbuf = g_string_new ("");
|
|
context->state = g_string_new ("");
|
|
}
|
|
|
|
void
|
|
sami_context_deinit (ParserState * state)
|
|
{
|
|
GstSamiContext *context = (GstSamiContext *) state->user_data;
|
|
|
|
if (context) {
|
|
htmlParserCtxtPtr htmlctxt = context->htmlctxt;
|
|
|
|
/* destroy sax context */
|
|
htmlDocPtr doc;
|
|
|
|
htmlParseChunk (htmlctxt, "", 0, 1);
|
|
doc = htmlctxt->myDoc;
|
|
htmlFreeParserCtxt (htmlctxt);
|
|
context->htmlctxt = NULL;
|
|
if (doc)
|
|
xmlFreeDoc (doc);
|
|
g_string_free (context->buf, TRUE);
|
|
g_string_free (context->rubybuf, TRUE);
|
|
g_string_free (context->resultbuf, TRUE);
|
|
g_string_free (context->state, TRUE);
|
|
g_free (context);
|
|
state->user_data = NULL;
|
|
}
|
|
}
|
|
|
|
void
|
|
sami_context_reset (ParserState * state)
|
|
{
|
|
GstSamiContext *context = (GstSamiContext *) state->user_data;
|
|
|
|
if (context) {
|
|
g_string_truncate (context->buf, 0);
|
|
g_string_truncate (context->rubybuf, 0);
|
|
g_string_truncate (context->resultbuf, 0);
|
|
g_string_truncate (context->state, 0);
|
|
context->has_result = FALSE;
|
|
context->in_sync = FALSE;
|
|
context->time1 = 0;
|
|
context->time2 = 0;
|
|
}
|
|
}
|
|
|
|
static gchar *
|
|
fix_invalid_entities (const gchar * line)
|
|
{
|
|
const gchar *cp, *pp; /* current pointer, previous pointer */
|
|
gssize size;
|
|
GString *ret = g_string_new (NULL);
|
|
|
|
pp = line;
|
|
cp = strchr (line, '&');
|
|
while (cp) {
|
|
size = cp - pp;
|
|
ret = g_string_append_len (ret, pp, size);
|
|
cp++;
|
|
if (g_ascii_strncasecmp (cp, "nbsp;", 5)
|
|
&& (!g_ascii_strncasecmp (cp, "nbsp", 4))) {
|
|
/* translate " " to " " */
|
|
ret = g_string_append_len (ret, " ", 6);
|
|
cp += 4;
|
|
} else if (g_ascii_strncasecmp (cp, "quot;", 5)
|
|
&& g_ascii_strncasecmp (cp, "amp;", 4)
|
|
&& g_ascii_strncasecmp (cp, "apos;", 5)
|
|
&& g_ascii_strncasecmp (cp, "lt;", 3)
|
|
&& g_ascii_strncasecmp (cp, "gt;", 3)
|
|
&& g_ascii_strncasecmp (cp, "nbsp;", 5)
|
|
&& cp[0] != '#') {
|
|
/* translate "&" to "&" */
|
|
ret = g_string_append_len (ret, "&", 5);
|
|
} else {
|
|
/* do not translate */
|
|
ret = g_string_append_c (ret, '&');
|
|
}
|
|
|
|
pp = cp;
|
|
cp = strchr (pp, '&');
|
|
}
|
|
ret = g_string_append (ret, pp);
|
|
return g_string_free (ret, FALSE);
|
|
}
|
|
|
|
gchar *
|
|
parse_sami (ParserState * state, const gchar * line)
|
|
{
|
|
gchar *fixed_line;
|
|
GstSamiContext *context = (GstSamiContext *) state->user_data;
|
|
|
|
fixed_line = fix_invalid_entities (line);
|
|
htmlParseChunk (context->htmlctxt, fixed_line, strlen (fixed_line), 0);
|
|
g_free (fixed_line);
|
|
|
|
if (context->has_result) {
|
|
gchar *r;
|
|
|
|
if (context->rubybuf->len) {
|
|
context->rubybuf = g_string_append_c (context->rubybuf, '\n');
|
|
g_string_prepend (context->resultbuf, context->rubybuf->str);
|
|
context->rubybuf = g_string_truncate (context->rubybuf, 0);
|
|
}
|
|
|
|
r = g_string_free (context->resultbuf, FALSE);
|
|
context->resultbuf = g_string_new ("");
|
|
state->start_time = context->time1;
|
|
state->duration = context->time2 - context->time1;
|
|
context->has_result = FALSE;
|
|
return r;
|
|
}
|
|
return NULL;
|
|
}
|