/* GStreamer SAMI subtitle parser
 * Copyright (c) 2006 Young-Ho Cha <ganadist at chollian net>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include "samiparse.h"

#include <libxml/HTMLparser.h>
#include <string.h>

/* One-character flags recorded in GstSamiContext::state for each open tag */
#define ITALIC_TAG 'i'
#define SPAN_TAG   's'
#define RUBY_TAG   'r'
#define RT_TAG     't'
#define CLEAR_TAG  '0'

/* Replacement entities emitted by fix_invalid_entities().  They are spelled
 * as concatenated literals so that tools which decode HTML entities found in
 * source text cannot silently turn them into different (shorter) strings. */
#define ENTITY_NBSP "&" "nbsp;"
#define ENTITY_AMP  "&" "amp;"

typedef struct _GstSamiContext GstSamiContext;

struct _GstSamiContext
{
  GString *buf;                 /* buffer to collect content */
  GString *rubybuf;             /* buffer to collect ruby content */
  GString *resultbuf;           /* when opening the next 'sync' tag, move
                                 * from 'buf' to avoid to append following
                                 * content */
  GString *state;               /* in many sami files there are tags that
                                 * are not closed, so for each open tag the
                                 * parser will append a tag flag here so
                                 * that tags can be closed properly on
                                 * 'sync' tags. See sami_context_push_state()
                                 * and sami_context_pop_state(). */
  htmlParserCtxtPtr htmlctxt;   /* html parser context */
  gboolean has_result;          /* set when ready to push out result */
  gboolean in_sync;             /* flag to avoid appending anything except the
                                 * content of the sync elements to buf */
  guint64 time1;                /* previous start attribute in sync tag */
  guint64 time2;                /* current start attribute in sync tag */
};

/* Colour names that show up in sami files but are mapped explicitly to an
 * RGB value instead of being passed through by name. */
static const struct
{
  const gchar *name;
  const gchar *rgb;
} sami_extra_colours[] = {
  {"aqua", "#00ffff"},
  {"crimson", "#dc143c"},
  {"fuchsia", "#ff00ff"},
  {"indigo", "#4b0082"},
  {"lime", "#00ff00"},
  {"olive", "#808000"},
  {"silver", "#c0c0c0"},
  {"teal", "#008080"}
};

/* Returns a pointer to the last occurrence of the flag 'tag' in 'str', or
 * NULL if the flag is not present. */
static gchar *
has_tag (GString * str, const gchar tag)
{
  return strrchr (str->str, tag);
}

/* Records that a tag identified by the flag 'state' has been opened. */
static void
sami_context_push_state (GstSamiContext * sctx, char state)
{
  GST_LOG ("state %c", state);
  g_string_append_c (sctx->state, state);
}

/* Closes open tags, walking the state stack from the most recently opened
 * flag backwards, until the flag 'state' is found.  The closing markup for
 * everything up to and including that flag is appended to 'buf' and the
 * stack is truncated at that position.
 *
 * If 'state' is not on the stack nothing is appended to 'buf' and the stack
 * is left alone, except when 'state' is CLEAR_TAG: then every open tag is
 * closed and the stack is emptied.  Note that closing markup for RT_TAG
 * entries goes to 'rubybuf' as the stack is walked, whether or not 'state'
 * is eventually found. */
static void
sami_context_pop_state (GstSamiContext * sctx, char state)
{
  GString *str = g_string_new ("");
  GString *context_state = sctx->state;
  int i;

  GST_LOG ("state %c", state);
  for (i = context_state->len - 1; i >= 0; i--) {
    switch (context_state->str[i]) {
      case ITALIC_TAG:         /* <i> */
        g_string_append (str, "</i>");
        break;
      case SPAN_TAG:           /* <span foreground= > */
        g_string_append (str, "</span>");
        break;
      case RUBY_TAG:           /* <span size= >  -- ruby */
        break;
      case RT_TAG:             /*  ruby */
        /* FIXME: support for furigana/ruby once implemented in pango */
        g_string_append (sctx->rubybuf, "</span>");
        if (has_tag (context_state, ITALIC_TAG)) {
          g_string_append (sctx->rubybuf, "</i>");
        }
        break;
      default:
        break;
    }
    if (context_state->str[i] == state) {
      g_string_append (sctx->buf, str->str);
      g_string_free (str, TRUE);
      g_string_truncate (context_state, i);
      return;
    }
  }
  if (state == CLEAR_TAG) {
    g_string_append (sctx->buf, str->str);
    g_string_truncate (context_state, 0);
  }
  g_string_free (str, TRUE);
}

/* Handles an opening 'sync' tag: closes every tag that is still open and,
 * for a 'start' attribute, records the new timestamp and moves the text
 * collected so far from 'buf' to 'resultbuf'. */
static void
handle_start_sync (GstSamiContext * sctx, const xmlChar ** atts)
{
  int i;

  sami_context_pop_state (sctx, CLEAR_TAG);
  if (atts == NULL)
    return;

  for (i = 0; (atts[i] != NULL); i += 2) {
    const xmlChar *key, *value;

    key = atts[i];
    value = atts[i + 1];

    if (!value)
      continue;
    if (!xmlStrncmp ((const xmlChar *) "start", key, 5)) {
      /* Only set a new start time if we don't have text pending */
      if (sctx->resultbuf->len == 0)
        sctx->time1 = sctx->time2;

      /* The attribute is multiplied by GST_MSECOND, i.e. taken as a count
       * of milliseconds.  Parse it as a 64-bit value: atoi() has undefined
       * behaviour for out-of-range input and is limited to the int range. */
      sctx->time2 =
          g_ascii_strtoull ((const char *) value, NULL, 10) * GST_MSECOND;
      g_string_append (sctx->resultbuf, sctx->buf->str);
      sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
      g_string_truncate (sctx->buf, 0);
    }
  }
}

/* Handles an opening 'font' tag: closes a previously opened span and, if
 * the tag has attributes, opens a new <span> carrying the 'color' and
 * 'face' attributes as 'foreground' and 'font_family'. */
static void
handle_start_font (GstSamiContext * sctx, const xmlChar ** atts)
{
  int i;

  sami_context_pop_state (sctx, SPAN_TAG);
  if (atts == NULL)
    return;

  g_string_append (sctx->buf, "<span");
  for (i = 0; (atts[i] != NULL); i += 2) {
    const xmlChar *key, *value;

    key = atts[i];
    value = atts[i + 1];

    if (!value)
      continue;
    if (!xmlStrncmp ((const xmlChar *) "color", key, 5)) {
      /*
       * There are invalid color value in many
       * sami files.
       * It will fix hex color value that start without '#'
       */
      const gchar *sharp = "";
      gchar *escaped;
      int len = xmlStrlen (value);
      guint c;

      /* An empty colour carries no information; emitting foreground=""
       * would only produce an invalid attribute. */
      if (len == 0)
        continue;

      if (!(*value == '#' && len == 7)) {
        gchar *r;

        /* check if it looks like hex */
        if (strtol ((const char *) value, &r, 16) >= 0 &&
            ((xmlChar *) r == (value + 6) && len == 6)) {
          sharp = "#";
        }
      }
      /* some colours can be found in many sami files, but X RGB database
       * doesn't contain a colour by this name, so map explicitly.  The whole
       * value must match: a length-limited comparison bounded by the length
       * of the value would also accept any prefix of a colour name. */
      for (c = 0; c < G_N_ELEMENTS (sami_extra_colours); c++) {
        if (!xmlStrcasecmp (value,
                (const xmlChar *) sami_extra_colours[c].name)) {
          value = (const xmlChar *) sami_extra_colours[c].rgb;
          break;
        }
      }
      /* The value comes straight from the subtitle file: escape it so that
       * quotes or angle brackets cannot break the generated markup. */
      escaped = g_markup_escape_text ((const gchar *) value, -1);
      g_string_append_printf (sctx->buf, " foreground=\"%s%s\"", sharp,
          escaped);
      g_free (escaped);
    } else if (!xmlStrncasecmp ((const xmlChar *) "face", key, 4)) {
      gchar *escaped = g_markup_escape_text ((const gchar *) value, -1);

      g_string_append_printf (sctx->buf, " font_family=\"%s\"", escaped);
      g_free (escaped);
    }
  }
  g_string_append_c (sctx->buf, '>');
  sami_context_push_state (sctx, SPAN_TAG);
}

/* SAX startElement callback.  Element names are matched by prefix, so the
 * order of the comparisons below is significant. */
static void
start_sami_element (void *ctx, const xmlChar * name, const xmlChar ** atts)
{
  GstSamiContext *sctx = (GstSamiContext *) ctx;

  GST_LOG ("name:%s", name);

  if (!xmlStrncmp ((const xmlChar *) "sync", name, 4)) {
    handle_start_sync (sctx, atts);
    sctx->in_sync = TRUE;
  } else if (!xmlStrncmp ((const xmlChar *) "font", name, 4)) {
    handle_start_font (sctx, atts);
  } else if (!xmlStrncmp ((const xmlChar *) "ruby", name, 4)) {
    sami_context_push_state (sctx, RUBY_TAG);
  } else if (!xmlStrncmp ((const xmlChar *) "br", name, 2)) {
    g_string_append_c (sctx->buf, '\n');
    /* FIXME: support for furigana/ruby once implemented in pango */
  } else if (!xmlStrncmp ((const xmlChar *) "rt", name, 2)) {
    if (has_tag (sctx->state, ITALIC_TAG)) {
      g_string_append (sctx->rubybuf, "<i>");
    }
    g_string_append (sctx->rubybuf, "<span size='xx-small' rise='-100'>");
    sami_context_push_state (sctx, RT_TAG);
  } else if (!xmlStrncmp ((const xmlChar *) "p", name, 1)) {
    /* nothing to do */
  } else if (!xmlStrncmp ((const xmlChar *) "i", name, 1)) {
    g_string_append (sctx->buf, "<i>");
    sami_context_push_state (sctx, ITALIC_TAG);
  }
}

/* SAX endElement callback. */
static void
end_sami_element (void *ctx, const xmlChar * name)
{
  GstSamiContext *sctx = (GstSamiContext *) ctx;

  GST_LOG ("name:%s", name);

  if (!xmlStrncmp ((const xmlChar *) "sync", name, 4)) {
    sctx->in_sync = FALSE;
  } else if ((!xmlStrncmp ((const xmlChar *) "body", name, 4)) ||
      (!xmlStrncmp ((const xmlChar *) "sami", name, 4))) {
    /* We will usually have one buffer left when the body is closed
     * as we need the next sync to actually send it */
    if (sctx->buf->len != 0) {
      /* Only set a new start time if we don't have text pending */
      if (sctx->resultbuf->len == 0)
        sctx->time1 = sctx->time2;

      sctx->time2 = GST_CLOCK_TIME_NONE;
      g_string_append (sctx->resultbuf, sctx->buf->str);
      sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
      g_string_truncate (sctx->buf, 0);
    }
  } else if (!xmlStrncmp ((const xmlChar *) "font", name, 4)) {
    sami_context_pop_state (sctx, SPAN_TAG);
  } else if (!xmlStrncmp ((const xmlChar *) "ruby", name, 4)) {
    sami_context_pop_state (sctx, RUBY_TAG);
  } else if (!xmlStrncmp ((const xmlChar *) "i", name, 1)) {
    sami_context_pop_state (sctx, ITALIC_TAG);
  }
}

/* SAX characters callback: escapes the text for markup, strips leading and
 * trailing whitespace, collapses runs of spaces and appends the result to
 * 'rubybuf' (inside an 'rt' element) or 'buf'. */
static void
characters_sami (void *ctx, const xmlChar * ch, int len)
{
  GstSamiContext *sctx = (GstSamiContext *) ctx;
  gchar *escaped;
  gchar *src, *dst;

  /* Skip everything except content of the sync elements */
  if (!sctx->in_sync)
    return;

  escaped = g_markup_escape_text ((const gchar *) ch, len);
  g_strstrip (escaped);

  /* Remove double spaces from the string as those are
   * usually added by newlines and indentation.  Done in place in a single
   * pass: every run of spaces is reduced to its first space. */
  src = dst = escaped;
  while (*src != '\0') {
    *dst++ = *src;
    if (*src == ' ') {
      while (*src == ' ')
        src++;
    } else {
      src++;
    }
  }
  *dst = '\0';

  if (has_tag (sctx->state, RT_TAG)) {
    g_string_append_c (sctx->rubybuf, ' ');
    g_string_append (sctx->rubybuf, escaped);
    g_string_append_c (sctx->rubybuf, ' ');
  } else {
    g_string_append (sctx->buf, escaped);
  }
  g_free (escaped);
}

/* Only the element and character callbacks are used */
static xmlSAXHandler samiSAXHandlerStruct = {
  NULL,                         /* internalSubset */
  NULL,                         /* isStandalone */
  NULL,                         /* hasInternalSubset */
  NULL,                         /* hasExternalSubset */
  NULL,                         /* resolveEntity */
  NULL,                         /* getEntity */
  NULL,                         /* entityDecl */
  NULL,                         /* notationDecl */
  NULL,                         /* attributeDecl */
  NULL,                         /* elementDecl */
  NULL,                         /* unparsedEntityDecl */
  NULL,                         /* setDocumentLocator */
  NULL,                         /* startDocument */
  NULL,                         /* endDocument */
  start_sami_element,           /* startElement */
  end_sami_element,             /* endElement */
  NULL,                         /* reference */
  characters_sami,              /* characters */
  NULL,                         /* ignorableWhitespace */
  NULL,                         /* processingInstruction */
  NULL,                         /* comment */
  NULL,                         /* warning */
  NULL,                         /* error */
  NULL,                         /* fatalError */
  NULL,                         /* getParameterEntity */
  NULL,                         /* cdataBlock */
  NULL,                         /* externalSubset */
  1,                            /* initialized */
  NULL,                         /* private */
  NULL,                         /* startElementNsSAX2Func */
  NULL,                         /* endElementNsSAX2Func */
  NULL                          /* xmlStructuredErrorFunc */
};

static xmlSAXHandlerPtr samiSAXHandler = &samiSAXHandlerStruct;

/* Allocates the SAMI parser context and stores it in state->user_data,
 * which must be NULL on entry. */
void
sami_context_init (ParserState * state)
{
  GstSamiContext *context;

  g_assert (state->user_data == NULL);

  context = g_new0 (GstSamiContext, 1);
  state->user_data = (gpointer) context;

  context->htmlctxt = htmlCreatePushParserCtxt (samiSAXHandler, context,
      "", 0, NULL, XML_CHAR_ENCODING_UTF8);
  context->buf = g_string_new ("");
  context->rubybuf = g_string_new ("");
  context->resultbuf = g_string_new ("");
  context->state = g_string_new ("");
}

/* Frees the SAMI parser context stored in state->user_data, if any, and
 * resets the pointer to NULL. */
void
sami_context_deinit (ParserState * state)
{
  GstSamiContext *context = (GstSamiContext *) state->user_data;

  if (context) {
    htmlParserCtxtPtr htmlctxt = context->htmlctxt;

    /* destroy sax context */
    htmlDocPtr doc;

    /* terminate the push parser before tearing it down */
    htmlParseChunk (htmlctxt, "", 0, 1);
    doc = htmlctxt->myDoc;
    htmlFreeParserCtxt (htmlctxt);
    context->htmlctxt = NULL;
    if (doc)
      xmlFreeDoc (doc);
    g_string_free (context->buf, TRUE);
    g_string_free (context->rubybuf, TRUE);
    g_string_free (context->resultbuf, TRUE);
    g_string_free (context->state, TRUE);
    g_free (context);
    state->user_data = NULL;
  }
}

/* Drops all pending text, open-tag state and timestamps.  The html parser
 * context itself is left untouched. */
void
sami_context_reset (ParserState * state)
{
  GstSamiContext *context = (GstSamiContext *) state->user_data;

  if (context) {
    g_string_truncate (context->buf, 0);
    g_string_truncate (context->rubybuf, 0);
    g_string_truncate (context->resultbuf, 0);
    g_string_truncate (context->state, 0);
    context->has_result = FALSE;
    context->in_sync = FALSE;
    context->time1 = 0;
    context->time2 = 0;
  }
}

/* Returns a newly allocated copy of 'line' in which every ampersand that
 * does not start a recognised entity is repaired:
 *  - a non-breaking-space entity that lacks its terminating semicolon gets
 *    the semicolon added;
 *  - an ampersand not followed by quot; amp; apos; lt; gt; nbsp; or '#'
 *    is replaced by the ampersand entity;
 *  - everything else is copied unchanged.
 * The caller owns the returned string and frees it with g_free(). */
static gchar *
fix_invalid_entities (const gchar * line)
{
  const gchar *cp, *pp;         /* current pointer, previous pointer */
  gssize size;
  GString *ret = g_string_new (NULL);

  pp = line;
  cp = strchr (line, '&');
  while (cp) {
    size = cp - pp;
    ret = g_string_append_len (ret, pp, size);
    cp++;
    if (g_ascii_strncasecmp (cp, "nbsp;", 5)
        && (!g_ascii_strncasecmp (cp, "nbsp", 4))) {
      /* non-breaking-space entity without semicolon: emit the complete
       * entity and skip the four name characters already consumed */
      ret = g_string_append (ret, ENTITY_NBSP);
      cp += 4;
    } else if (g_ascii_strncasecmp (cp, "quot;", 5)
        && g_ascii_strncasecmp (cp, "amp;", 4)
        && g_ascii_strncasecmp (cp, "apos;", 5)
        && g_ascii_strncasecmp (cp, "lt;", 3)
        && g_ascii_strncasecmp (cp, "gt;", 3)
        && g_ascii_strncasecmp (cp, "nbsp;", 5)
        && cp[0] != '#') {
      /* stray ampersand: replace it by the ampersand entity */
      ret = g_string_append (ret, ENTITY_AMP);
    } else {
      /* do not translate */
      ret = g_string_append_c (ret, '&');
    }

    pp = cp;
    cp = strchr (pp, '&');
  }
  ret = g_string_append (ret, pp);
  return g_string_free (ret, FALSE);
}

/* Feeds one line of SAMI input to the html parser.  Returns a newly
 * allocated markup string when a complete subtitle is available (also
 * setting state->start_time and state->duration), or NULL otherwise. */
gchar *
parse_sami (ParserState * state, const gchar * line)
{
  gchar *fixed_line;
  GstSamiContext *context = (GstSamiContext *) state->user_data;

  fixed_line = fix_invalid_entities (line);
  htmlParseChunk (context->htmlctxt, fixed_line, strlen (fixed_line), 0);
  g_free (fixed_line);

  if (context->has_result) {
    gchar *r;

    /* ruby text, if any, goes on its own line in front of the result */
    if (context->rubybuf->len) {
      context->rubybuf = g_string_append_c (context->rubybuf, '\n');
      g_string_prepend (context->resultbuf, context->rubybuf->str);
      context->rubybuf = g_string_truncate (context->rubybuf, 0);
    }

    /* hand the character data over to the caller and start a fresh buffer */
    r = g_string_free (context->resultbuf, FALSE);
    context->resultbuf = g_string_new ("");
    state->start_time = context->time1;
    state->duration = context->time2 - context->time1;
    context->has_result = FALSE;
    return r;
  }
  return NULL;
}