gstreamer/subprojects/gst-plugins-base/gst/subparse/samiparse.c

/* GStreamer SAMI subtitle parser
 * Copyright (c) 2006, 2013 Young-Ho Cha <ganadist at gmail com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 */

#include "samiparse.h"

#include <glib.h>
#include <string.h>
#include <stdlib.h>

#define ITALIC_TAG 'i'
#define SPAN_TAG   's'
#define RUBY_TAG   'r'
#define RT_TAG     't'
#define CLEAR_TAG  '0'

typedef struct _HtmlParser HtmlParser;
typedef struct _HtmlContext HtmlContext;
typedef struct _GstSamiContext GstSamiContext;

struct _GstSamiContext
{
  GString *buf;                 /* buffer to collect content */
  GString *rubybuf;             /* buffer to collect ruby content */
  GString *resultbuf;           /* when opening the next 'sync' tag, move
                                 * from 'buf' to avoid to append following
                                 * content */
  GString *state;               /* in many sami files there are tags that
                                 * are not closed, so for each open tag the
                                 * parser will append a tag flag here so
                                 * that tags can be closed properly on
                                 * 'sync' tags. See _context_push_state()
                                 * and _context_pop_state(). */
  HtmlContext *htmlctxt;        /* html parser context */
  gboolean has_result;          /* set when ready to push out result */
  gboolean in_sync;             /* flag to avoid appending anything except the
                                 * content of the sync elements to buf */
  guint64 time1;                /* previous start attribute in sync tag */
  guint64 time2;                /* current start attribute in sync tag  */
};

struct _HtmlParser
{
  void (*start_element) (HtmlContext * ctx,
      const gchar * name, const gchar ** attr, gpointer user_data);
  void (*end_element) (HtmlContext * ctx,
      const gchar * name, gpointer user_data);
  void (*text) (HtmlContext * ctx,
      const gchar * text, gsize text_len, gpointer user_data);
};

struct _HtmlContext
{
  const HtmlParser *parser;
  gpointer user_data;
  GString *buf;
};

static HtmlContext *
html_context_new (HtmlParser * parser, gpointer user_data)
{
  HtmlContext *ctxt = (HtmlContext *) g_new0 (HtmlContext, 1);
  ctxt->parser = parser;
  ctxt->user_data = user_data;
  ctxt->buf = g_string_new (NULL);
  return ctxt;
}

static void
html_context_free (HtmlContext * ctxt)
{
  g_string_free (ctxt->buf, TRUE);
  g_free (ctxt);
}

typedef struct
{
  gunichar unescaped:24;
  guint8 escaped_len;
  gchar escaped[8];
} EntityMap;

#define ENTITY(unicode,ent) unicode,sizeof(ent)-1,ent

static const EntityMap XmlEntities[] = {
  {ENTITY (34, "quot")},
  {ENTITY (38, "amp")},
  {ENTITY (39, "apos")},
  {ENTITY (60, "lt")},
  {ENTITY (62, "gt")},
};

static const EntityMap HtmlEntities[] = {
/* nbsp we'll handle manually
{ 160,	"nbsp;" }, */
  {ENTITY (161, "iexcl")},
  {ENTITY (162, "cent")},
  {ENTITY (163, "pound")},
  {ENTITY (164, "curren")},
  {ENTITY (165, "yen")},
  {ENTITY (166, "brvbar")},
  {ENTITY (167, "sect")},
  {ENTITY (168, "uml")},
  {ENTITY (169, "copy")},
  {ENTITY (170, "ordf")},
  {ENTITY (171, "laquo")},
  {ENTITY (172, "not")},
  {ENTITY (173, "shy")},
  {ENTITY (174, "reg")},
  {ENTITY (175, "macr")},
  {ENTITY (176, "deg")},
  {ENTITY (177, "plusmn")},
  {ENTITY (178, "sup2")},
  {ENTITY (179, "sup3")},
  {ENTITY (180, "acute")},
  {ENTITY (181, "micro")},
  {ENTITY (182, "para")},
  {ENTITY (183, "middot")},
  {ENTITY (184, "cedil")},
  {ENTITY (185, "sup1")},
  {ENTITY (186, "ordm")},
  {ENTITY (187, "raquo")},
  {ENTITY (188, "frac14")},
  {ENTITY (189, "frac12")},
  {ENTITY (190, "frac34")},
  {ENTITY (191, "iquest")},
  {ENTITY (192, "Agrave")},
  {ENTITY (193, "Aacute")},
  {ENTITY (194, "Acirc")},
  {ENTITY (195, "Atilde")},
  {ENTITY (196, "Auml")},
  {ENTITY (197, "Aring")},
  {ENTITY (198, "AElig")},
  {ENTITY (199, "Ccedil")},
  {ENTITY (200, "Egrave")},
  {ENTITY (201, "Eacute")},
  {ENTITY (202, "Ecirc")},
  {ENTITY (203, "Euml")},
  {ENTITY (204, "Igrave")},
  {ENTITY (205, "Iacute")},
  {ENTITY (206, "Icirc")},
  {ENTITY (207, "Iuml")},
  {ENTITY (208, "ETH")},
  {ENTITY (209, "Ntilde")},
  {ENTITY (210, "Ograve")},
  {ENTITY (211, "Oacute")},
  {ENTITY (212, "Ocirc")},
  {ENTITY (213, "Otilde")},
  {ENTITY (214, "Ouml")},
  {ENTITY (215, "times")},
  {ENTITY (216, "Oslash")},
  {ENTITY (217, "Ugrave")},
  {ENTITY (218, "Uacute")},
  {ENTITY (219, "Ucirc")},
  {ENTITY (220, "Uuml")},
  {ENTITY (221, "Yacute")},
  {ENTITY (222, "THORN")},
  {ENTITY (223, "szlig")},
  {ENTITY (224, "agrave")},
  {ENTITY (225, "aacute")},
  {ENTITY (226, "acirc")},
  {ENTITY (227, "atilde")},
  {ENTITY (228, "auml")},
  {ENTITY (229, "aring")},
  {ENTITY (230, "aelig")},
  {ENTITY (231, "ccedil")},
  {ENTITY (232, "egrave")},
  {ENTITY (233, "eacute")},
  {ENTITY (234, "ecirc")},
  {ENTITY (235, "euml")},
  {ENTITY (236, "igrave")},
  {ENTITY (237, "iacute")},
  {ENTITY (238, "icirc")},
  {ENTITY (239, "iuml")},
  {ENTITY (240, "eth")},
  {ENTITY (241, "ntilde")},
  {ENTITY (242, "ograve")},
  {ENTITY (243, "oacute")},
  {ENTITY (244, "ocirc")},
  {ENTITY (245, "otilde")},
  {ENTITY (246, "ouml")},
  {ENTITY (247, "divide")},
  {ENTITY (248, "oslash")},
  {ENTITY (249, "ugrave")},
  {ENTITY (250, "uacute")},
  {ENTITY (251, "ucirc")},
  {ENTITY (252, "uuml")},
  {ENTITY (253, "yacute")},
  {ENTITY (254, "thorn")},
  {ENTITY (255, "yuml")},
  {ENTITY (338, "OElig")},
  {ENTITY (339, "oelig")},
  {ENTITY (352, "Scaron")},
  {ENTITY (353, "scaron")},
  {ENTITY (376, "Yuml")},
  {ENTITY (402, "fnof")},
  {ENTITY (710, "circ")},
  {ENTITY (732, "tilde")},
  {ENTITY (913, "Alpha")},
  {ENTITY (914, "Beta")},
  {ENTITY (915, "Gamma")},
  {ENTITY (916, "Delta")},
  {ENTITY (917, "Epsilon")},
  {ENTITY (918, "Zeta")},
  {ENTITY (919, "Eta")},
  {ENTITY (920, "Theta")},
  {ENTITY (921, "Iota")},
  {ENTITY (922, "Kappa")},
  {ENTITY (923, "Lambda")},
  {ENTITY (924, "Mu")},
  {ENTITY (925, "Nu")},
  {ENTITY (926, "Xi")},
  {ENTITY (927, "Omicron")},
  {ENTITY (928, "Pi")},
  {ENTITY (929, "Rho")},
  {ENTITY (931, "Sigma")},
  {ENTITY (932, "Tau")},
  {ENTITY (933, "Upsilon")},
  {ENTITY (934, "Phi")},
  {ENTITY (935, "Chi")},
  {ENTITY (936, "Psi")},
  {ENTITY (937, "Omega")},
  {ENTITY (945, "alpha")},
  {ENTITY (946, "beta")},
  {ENTITY (947, "gamma")},
  {ENTITY (948, "delta")},
  {ENTITY (949, "epsilon")},
  {ENTITY (950, "zeta")},
  {ENTITY (951, "eta")},
  {ENTITY (952, "theta")},
  {ENTITY (953, "iota")},
  {ENTITY (954, "kappa")},
  {ENTITY (955, "lambda")},
  {ENTITY (956, "mu")},
  {ENTITY (957, "nu")},
  {ENTITY (958, "xi")},
  {ENTITY (959, "omicron")},
  {ENTITY (960, "pi")},
  {ENTITY (961, "rho")},
  {ENTITY (962, "sigmaf")},
  {ENTITY (963, "sigma")},
  {ENTITY (964, "tau")},
  {ENTITY (965, "upsilon")},
  {ENTITY (966, "phi")},
  {ENTITY (967, "chi")},
  {ENTITY (968, "psi")},
  {ENTITY (969, "omega")},
  {ENTITY (977, "thetasym")},
  {ENTITY (978, "upsih")},
  {ENTITY (982, "piv")},
  {ENTITY (8194, "ensp")},
  {ENTITY (8195, "emsp")},
  {ENTITY (8201, "thinsp")},
  {ENTITY (8204, "zwnj")},
  {ENTITY (8205, "zwj")},
  {ENTITY (8206, "lrm")},
  {ENTITY (8207, "rlm")},
  {ENTITY (8211, "ndash")},
  {ENTITY (8212, "mdash")},
  {ENTITY (8216, "lsquo")},
  {ENTITY (8217, "rsquo")},
  {ENTITY (8218, "sbquo")},
  {ENTITY (8220, "ldquo")},
  {ENTITY (8221, "rdquo")},
  {ENTITY (8222, "bdquo")},
  {ENTITY (8224, "dagger")},
  {ENTITY (8225, "Dagger")},
  {ENTITY (8226, "bull")},
  {ENTITY (8230, "hellip")},
  {ENTITY (8240, "permil")},
  {ENTITY (8242, "prime")},
  {ENTITY (8243, "Prime")},
  {ENTITY (8249, "lsaquo")},
  {ENTITY (8250, "rsaquo")},
  {ENTITY (8254, "oline")},
  {ENTITY (8260, "frasl")},
  {ENTITY (8364, "euro")},
  {ENTITY (8465, "image")},
  {ENTITY (8472, "weierp")},
  {ENTITY (8476, "real")},
  {ENTITY (8482, "trade")},
  {ENTITY (8501, "alefsym")},
  {ENTITY (8592, "larr")},
  {ENTITY (8593, "uarr")},
  {ENTITY (8594, "rarr")},
  {ENTITY (8595, "darr")},
  {ENTITY (8596, "harr")},
  {ENTITY (8629, "crarr")},
  {ENTITY (8656, "lArr")},
  {ENTITY (8657, "uArr")},
  {ENTITY (8658, "rArr")},
  {ENTITY (8659, "dArr")},
  {ENTITY (8660, "hArr")},
  {ENTITY (8704, "forall")},
  {ENTITY (8706, "part")},
  {ENTITY (8707, "exist")},
  {ENTITY (8709, "empty")},
  {ENTITY (8711, "nabla")},
  {ENTITY (8712, "isin")},
  {ENTITY (8713, "notin")},
  {ENTITY (8715, "ni")},
  {ENTITY (8719, "prod")},
  {ENTITY (8721, "sum")},
  {ENTITY (8722, "minus")},
  {ENTITY (8727, "lowast")},
  {ENTITY (8730, "radic")},
  {ENTITY (8733, "prop")},
  {ENTITY (8734, "infin")},
  {ENTITY (8736, "ang")},
  {ENTITY (8743, "and")},
  {ENTITY (8744, "or")},
  {ENTITY (8745, "cap")},
  {ENTITY (8746, "cup")},
  {ENTITY (8747, "int")},
  {ENTITY (8756, "there4")},
  {ENTITY (8764, "sim")},
  {ENTITY (8773, "cong")},
  {ENTITY (8776, "asymp")},
  {ENTITY (8800, "ne")},
  {ENTITY (8801, "equiv")},
  {ENTITY (8804, "le")},
  {ENTITY (8805, "ge")},
  {ENTITY (8834, "sub")},
  {ENTITY (8835, "sup")},
  {ENTITY (8836, "nsub")},
  {ENTITY (8838, "sube")},
  {ENTITY (8839, "supe")},
  {ENTITY (8853, "oplus")},
  {ENTITY (8855, "otimes")},
  {ENTITY (8869, "perp")},
  {ENTITY (8901, "sdot")},
  {ENTITY (8968, "lceil")},
  {ENTITY (8969, "rceil")},
  {ENTITY (8970, "lfloor")},
  {ENTITY (8971, "rfloor")},
  {ENTITY (9001, "lang")},
  {ENTITY (9002, "rang")},
  {ENTITY (9674, "loz")},
  {ENTITY (9824, "spades")},
  {ENTITY (9827, "clubs")},
  {ENTITY (9829, "hearts")},
  {ENTITY (9830, "diams")},
};

static gchar *
unescape_string (const gchar * text)
{
  gint i;
  GString *unescaped = g_string_new (NULL);

  while (*text) {
    if (*text == '&') {
      text++;

      /* unescape &nbsp and &nbsp; */
      if (!g_ascii_strncasecmp (text, "nbsp", 4)) {
        g_string_append_unichar (unescaped, 160);
        text += 4;
        if (*text == ';') {
          text++;
        }
        goto next;
      }

      /* pass xml entities. these will be processed as pango markup */
      for (i = 0; i < G_N_ELEMENTS (XmlEntities); i++) {
        const EntityMap *entity = &XmlEntities[i];
        guint8 escaped_len = entity->escaped_len;

        if (!g_ascii_strncasecmp (text, entity->escaped, escaped_len)
            && text[escaped_len] == ';') {
          g_string_append_c (unescaped, '&');
          g_string_append_len (unescaped, entity->escaped, escaped_len);
          g_string_append_c (unescaped, ';');
          text += escaped_len + 1;
          goto next;
        }
      }

      /* convert html entities */
      for (i = 0; i < G_N_ELEMENTS (HtmlEntities); i++) {
        const EntityMap *entity = &HtmlEntities[i];
        guint8 escaped_len = entity->escaped_len;

        if (!strncmp (text, entity->escaped, escaped_len)
            && text[escaped_len] == ';') {
          g_string_append_unichar (unescaped, entity->unescaped);
          text += escaped_len + 1;
          goto next;
        }
      }

      if (*text == '#') {
        gboolean is_hex = FALSE;
        gunichar l;
        gchar *end = NULL;

        text++;
        if (*text == 'x') {
          is_hex = TRUE;
          text++;
        }
        errno = 0;
        if (is_hex) {
          l = strtoul (text, &end, 16);
        } else {
          l = strtoul (text, &end, 10);
        }

        if (text == end || errno != 0) {
          /* error occurred. pass it */
          goto next;
        }
        g_string_append_unichar (unescaped, l);
        text = end;

        if (*text == ';') {
          text++;
        }
        goto next;
      }

      /* escape & */
      g_string_append (unescaped, "&amp;");

    next:
      continue;

    } else if (g_ascii_isspace (*text)) {
      g_string_append_c (unescaped, ' ');
      /* strip whitespace */
      do {
        text++;
      } while ((*text) && g_ascii_isspace (*text));
    } else {
      g_string_append_c (unescaped, *text);
      text++;
    }
  }

  return g_string_free (unescaped, FALSE);
}

static const gchar *
string_token (const gchar * string, const gchar * delimiter, gchar ** first)
{
  gchar *next = strstr (string, delimiter);
  if (next) {
    *first = g_strndup (string, next - string);
  } else {
    *first = g_strdup (string);
  }
  return next;
}

static void
html_context_handle_element (HtmlContext * ctxt,
    const gchar * string, gboolean must_close)
{
  gchar *name = NULL;
  gint count = 0, i;
  gchar **attrs;
  const gchar *found, *next;

  /* split element name and attributes */
  next = string_token (string, " ", &name);

  if (next) {
    /* count attributes */
    found = next + 1;
    while (TRUE) {
      found = strchr (found, '=');
      if (!found)
        break;
      found++;
      count++;
    }
  } else {
    count = 0;
  }

  attrs = g_new0 (gchar *, (count + 1) * 2);

  for (i = 0; i < count && next != NULL; i += 2) {
    gchar *attr_name = NULL, *attr_value = NULL;
    gsize length;
    next = string_token (next + 1, "=", &attr_name);
    if (!next) {
      g_free (attr_name);
      break;
    }
    next = string_token (next + 1, " ", &attr_value);

    /* strip " or ' from attribute value */
    if (attr_value[0] == '"' || attr_value[0] == '\'') {
      gchar *tmp = g_strdup (attr_value + 1);
      g_free (attr_value);
      attr_value = tmp;
    }

    length = strlen (attr_value);
    if (length > 0 && (attr_value[length - 1] == '"'
            || attr_value[length - 1] == '\'')) {
      attr_value[length - 1] = '\0';
    }

    attrs[i] = attr_name;
    attrs[i + 1] = attr_value;
  }

  ctxt->parser->start_element (ctxt, name,
      (const gchar **) attrs, ctxt->user_data);
  if (must_close) {
    ctxt->parser->end_element (ctxt, name, ctxt->user_data);
  }
  g_strfreev (attrs);
  g_free (name);
}

static void
html_context_parse (HtmlContext * ctxt, gchar * text, gsize text_len)
{
  const gchar *next = NULL;

  g_string_append_len (ctxt->buf, text, text_len);
  next = ctxt->buf->str;
  while (TRUE) {
    if (next[0] == '<') {
      gchar *element = NULL;
      /* find <blahblah> */
      if (!strchr (next, '>')) {
        /* no tag end point. buffer will be process in next time */
        return;
      }

      next = string_token (next, ">", &element);
      next++;
      if (g_str_has_suffix (element, "/")) {
        /* handle <blah/> */
        element[strlen (element) - 1] = '\0';
        html_context_handle_element (ctxt, element + 1, TRUE);
      } else if (element[1] == '/') {
        /* handle </blah> */
        ctxt->parser->end_element (ctxt, element + 2, ctxt->user_data);
      } else {
        /* handle <blah> */
        html_context_handle_element (ctxt, element + 1, FALSE);
      }
      g_free (element);
    } else if (strchr (next, '<')) {
      gchar *text = NULL;
      gsize length;
      next = string_token (next, "<", &text);
      text = g_strstrip (text);
      length = strlen (text);
      ctxt->parser->text (ctxt, text, length, ctxt->user_data);
      g_free (text);

    } else {
      gchar *text = (gchar *) next;
      gsize length;
      text = g_strstrip (text);
      length = strlen (text);
      ctxt->parser->text (ctxt, text, length, ctxt->user_data);
      ctxt->buf = g_string_assign (ctxt->buf, "");
      return;
    }
  }

  ctxt->buf = g_string_assign (ctxt->buf, next);
}

static gchar *
has_tag (GString * str, const gchar tag)
{
  return strrchr (str->str, tag);
}

static void
sami_context_push_state (GstSamiContext * sctx, char state)
{
  GST_LOG ("state %c", state);
  g_string_append_c (sctx->state, state);
}

static void
sami_context_pop_state (GstSamiContext * sctx, char state)
{
  GString *str = g_string_new ("");
  GString *context_state = sctx->state;
  int i;

  GST_LOG ("state %c", state);
  for (i = context_state->len - 1; i >= 0; i--) {
    switch (context_state->str[i]) {
      case ITALIC_TAG:         /* <i> */
      {
        g_string_append (str, "</i>");
        break;
      }
      case SPAN_TAG:           /* <span foreground= > */
      {
        g_string_append (str, "</span>");
        break;
      }
      case RUBY_TAG:           /* <span size= >  -- ruby */
      {
        break;
      }
      case RT_TAG:             /*  ruby */
      {
        /* FIXME: support for furigana/ruby once implemented in pango */
        g_string_append (sctx->rubybuf, "</span>");
        if (has_tag (context_state, ITALIC_TAG)) {
          g_string_append (sctx->rubybuf, "</i>");
        }

        break;
      }
      default:
        break;
    }
    if (context_state->str[i] == state) {
      g_string_append (sctx->buf, str->str);
      g_string_free (str, TRUE);
      g_string_truncate (context_state, i);
      return;
    }
  }
  if (state == CLEAR_TAG) {
    g_string_append (sctx->buf, str->str);
    g_string_truncate (context_state, 0);
  }
  g_string_free (str, TRUE);
}

static void
handle_start_sync (GstSamiContext * sctx, const gchar ** atts)
{
  int i;

  sami_context_pop_state (sctx, CLEAR_TAG);
  if (atts != NULL) {
    for (i = 0; (atts[i] != NULL); i += 2) {
      const gchar *key, *value;

      key = atts[i];
      value = atts[i + 1];

      if (!value)
        continue;
      if (!g_ascii_strcasecmp ("start", key)) {
        /* Only set a new start time if we don't have text pending */
        if (sctx->resultbuf->len == 0)
          sctx->time1 = sctx->time2;

        sctx->time2 = atoi ((const char *) value) * GST_MSECOND;
        sctx->time2 = MAX (sctx->time2, sctx->time1);
        g_string_append (sctx->resultbuf, sctx->buf->str);
        sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
        g_string_truncate (sctx->buf, 0);
      }
    }
  }
}

static void
handle_start_font (GstSamiContext * sctx, const gchar ** atts)
{
  int i;

  sami_context_pop_state (sctx, SPAN_TAG);
  if (atts != NULL) {
    g_string_append (sctx->buf, "<span");
    for (i = 0; (atts[i] != NULL); i += 2) {
      const gchar *key, *value;

      key = atts[i];
      value = atts[i + 1];

      if (!value)
        continue;
      if (!g_ascii_strcasecmp ("color", key)) {
        /*
         * There are invalid color value in many
         * sami files.
         * It will fix hex color value that start without '#'
         */
        const gchar *sharp = "";
        int len = strlen (value);

        if (!(*value == '#' && len == 7)) {
          gchar *r;

          /* check if it looks like hex */
          if (strtol ((const char *) value, &r, 16) >= 0 &&
              ((gchar *) r == (value + 6) && len == 6)) {
            sharp = "#";
          }
        }
        /* some colours can be found in many sami files, but X RGB database
         * doesn't contain a colour by this name, so map explicitly */
        if (!g_ascii_strcasecmp ("aqua", value)) {
          value = "#00ffff";
        } else if (!g_ascii_strcasecmp ("crimson", value)) {
          value = "#dc143c";
        } else if (!g_ascii_strcasecmp ("fuchsia", value)) {
          value = "#ff00ff";
        } else if (!g_ascii_strcasecmp ("indigo", value)) {
          value = "#4b0082";
        } else if (!g_ascii_strcasecmp ("lime", value)) {
          value = "#00ff00";
        } else if (!g_ascii_strcasecmp ("olive", value)) {
          value = "#808000";
        } else if (!g_ascii_strcasecmp ("silver", value)) {
          value = "#c0c0c0";
        } else if (!g_ascii_strcasecmp ("teal", value)) {
          value = "#008080";
        }
        g_string_append_printf (sctx->buf, " foreground=\"%s%s\"", sharp,
            value);
      } else if (!g_ascii_strcasecmp ("face", key)) {
        g_string_append_printf (sctx->buf, " font_family=\"%s\"", value);
      }
    }
    g_string_append_c (sctx->buf, '>');
    sami_context_push_state (sctx, SPAN_TAG);
  }
}

static void
handle_start_element (HtmlContext * ctx, const gchar * name,
    const char **atts, gpointer user_data)
{
  GstSamiContext *sctx = (GstSamiContext *) user_data;

  GST_LOG ("name:%s", name);

  if (!g_ascii_strcasecmp ("sync", name)) {
    handle_start_sync (sctx, atts);
    sctx->in_sync = TRUE;
  } else if (!g_ascii_strcasecmp ("font", name)) {
    handle_start_font (sctx, atts);
  } else if (!g_ascii_strcasecmp ("ruby", name)) {
    sami_context_push_state (sctx, RUBY_TAG);
  } else if (!g_ascii_strcasecmp ("br", name)) {
    g_string_append_c (sctx->buf, '\n');
    /* FIXME: support for furigana/ruby once implemented in pango */
  } else if (!g_ascii_strcasecmp ("rt", name)) {
    if (has_tag (sctx->state, ITALIC_TAG)) {
      g_string_append (sctx->rubybuf, "<i>");
    }
    g_string_append (sctx->rubybuf, "<span size='xx-small' rise='-100'>");
    sami_context_push_state (sctx, RT_TAG);
  } else if (!g_ascii_strcasecmp ("i", name)) {
    g_string_append (sctx->buf, "<i>");
    sami_context_push_state (sctx, ITALIC_TAG);
  } else if (!g_ascii_strcasecmp ("p", name)) {
  }
}

static void
handle_end_element (HtmlContext * ctx, const char *name, gpointer user_data)
{
  GstSamiContext *sctx = (GstSamiContext *) user_data;

  GST_LOG ("name:%s", name);

  if (!g_ascii_strcasecmp ("sync", name)) {
    sctx->in_sync = FALSE;
  } else if ((!g_ascii_strcasecmp ("body", name)) ||
      (!g_ascii_strcasecmp ("sami", name))) {
    /* We will usually have one buffer left when the body is closed
     * as we need the next sync to actually send it */
    if (sctx->buf->len != 0) {
      /* Only set a new start time if we don't have text pending */
      if (sctx->resultbuf->len == 0)
        sctx->time1 = sctx->time2;

      sctx->time2 = GST_CLOCK_TIME_NONE;
      g_string_append (sctx->resultbuf, sctx->buf->str);
      sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
      g_string_truncate (sctx->buf, 0);
    }
  } else if (!g_ascii_strcasecmp ("font", name)) {
    sami_context_pop_state (sctx, SPAN_TAG);
  } else if (!g_ascii_strcasecmp ("ruby", name)) {
    sami_context_pop_state (sctx, RUBY_TAG);
  } else if (!g_ascii_strcasecmp ("i", name)) {
    sami_context_pop_state (sctx, ITALIC_TAG);
  }
}

static void
handle_text (HtmlContext * ctx, const gchar * text, gsize text_len,
    gpointer user_data)
{
  GstSamiContext *sctx = (GstSamiContext *) user_data;

  /* Skip everything except content of the sync elements */
  if (!sctx->in_sync)
    return;

  if (has_tag (sctx->state, RT_TAG)) {
    g_string_append_c (sctx->rubybuf, ' ');
    g_string_append (sctx->rubybuf, text);
    g_string_append_c (sctx->rubybuf, ' ');
  } else {
    g_string_append (sctx->buf, text);
  }
}

static HtmlParser samiParser = {
  handle_start_element,         /* start_element */
  handle_end_element,           /* end_element */
  handle_text,                  /* text */
};

void
sami_context_init (ParserState * state)
{
  GstSamiContext *context;

  g_assert (state->user_data == NULL);

  context = g_new0 (GstSamiContext, 1);

  context->htmlctxt = html_context_new (&samiParser, context);
  context->buf = g_string_new ("");
  context->rubybuf = g_string_new ("");
  context->resultbuf = g_string_new ("");
  context->state = g_string_new ("");

  state->user_data = context;
}

void
sami_context_deinit (ParserState * state)
{
  GstSamiContext *context = (GstSamiContext *) state->user_data;

  if (context) {
    html_context_free (context->htmlctxt);
    context->htmlctxt = NULL;
    g_string_free (context->buf, TRUE);
    g_string_free (context->rubybuf, TRUE);
    g_string_free (context->resultbuf, TRUE);
    g_string_free (context->state, TRUE);
    g_free (context);
    state->user_data = NULL;
  }
}

void
sami_context_reset (ParserState * state)
{
  GstSamiContext *context = (GstSamiContext *) state->user_data;

  if (context) {
    g_string_truncate (context->buf, 0);
    g_string_truncate (context->rubybuf, 0);
    g_string_truncate (context->resultbuf, 0);
    g_string_truncate (context->state, 0);
    context->has_result = FALSE;
    context->in_sync = FALSE;
    context->time1 = 0;
    context->time2 = 0;
  }
}

gchar *
parse_sami (ParserState * state, const gchar * line)
{
  gchar *ret = NULL;
  GstSamiContext *context = (GstSamiContext *) state->user_data;

  gchar *unescaped = unescape_string (line);
  html_context_parse (context->htmlctxt, (gchar *) unescaped,
      strlen (unescaped));
  g_free (unescaped);

  if (context->has_result) {
    if (context->rubybuf->len) {
      g_string_append_c (context->rubybuf, '\n');
      g_string_prepend (context->resultbuf, context->rubybuf->str);
      g_string_truncate (context->rubybuf, 0);
    }

    ret = g_string_free (context->resultbuf, FALSE);
    context->resultbuf = g_string_new ("");
    state->start_time = context->time1;
    state->duration = context->time2 - context->time1;
    context->has_result = FALSE;
  }
  return ret;
}