/* GStreamer SAMI subtitle parser
* Copyright (c) 2006, 2013 Young-Ho Cha <ganadist at gmail com>
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Library General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
* Library General Public License for more details.
* You should have received a copy of the GNU Library General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
* Boston, MA 02110-1301, USA.
*/
#include "samiparse.h"
#include <glib.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
/* One-character flags appended to GstSamiContext.state for every tag that
 * is opened, so that tags left unclosed in the input can still be closed
 * (see sami_context_push_state() and sami_context_pop_state()). */
/* pushed when an <i> element starts */
#define ITALIC_TAG 'i'
/* pushed when a <font> element is turned into a <span> */
#define SPAN_TAG 's'
/* pushed when a <ruby> element starts */
#define RUBY_TAG 'r'
/* pushed when an <rt> element starts */
#define RT_TAG 't'
/* never pushed; passed to sami_context_pop_state() to close every open tag */
#define CLEAR_TAG '0'
/* Forward declarations; the structures are defined below. */
typedef struct _HtmlParser HtmlParser;
typedef struct _HtmlContext HtmlContext;
typedef struct _GstSamiContext GstSamiContext;
/* Per-stream SAMI parsing state; an instance is stored in
 * ParserState.user_data by sami_context_init(). */
struct _GstSamiContext
{
  /* Markup collected for the sync block currently being read. */
  GString *buf;
  /* Markup collected from <rt> (ruby text) elements. */
  GString *rubybuf;
  /* Finished markup: the content of 'buf' is moved here when the next
   * 'sync' tag opens, so that following content is not appended to it. */
  GString *resultbuf;
  /* Stack of tag flags (ITALIC_TAG, SPAN_TAG, ...). Many SAMI files leave
   * tags unclosed, so every opened tag is recorded here and closed
   * properly on 'sync' tags. See sami_context_push_state() and
   * sami_context_pop_state(). */
  GString *state;
  /* Tokenizer that feeds the handle_*() callbacks. */
  HtmlContext *htmlctxt;
  /* TRUE once 'resultbuf' holds text that is ready to be pushed out. */
  gboolean has_result;
  /* TRUE while inside a sync element; text outside of sync elements is
   * not appended to 'buf'. */
  gboolean in_sync;
  /* Previous 'start' attribute seen in a sync tag. */
  guint64 time1;
  /* Current 'start' attribute seen in a sync tag. */
  guint64 time2;
};
/* Callback table used by the HTML tokenizer. Every callback receives the
 * tokenizer context and the user_data given to html_context_new(). */
struct _HtmlParser
{
  /* Called for an opening tag; 'attr' is a NULL-terminated array of
   * alternating attribute names and values. */
  void (*start_element) (HtmlContext * ctx,
      const gchar * name, const gchar ** attr, gpointer user_data);

  /* Called for a closing tag, and after start_element for <tag/>. */
  void (*end_element) (HtmlContext * ctx,
      const gchar * name, gpointer user_data);

  /* Called for character data found between tags. */
  void (*text) (HtmlContext * ctx,
      const gchar * text, gsize text_len, gpointer user_data);
};
struct _HtmlContext
const HtmlParser *parser;
gpointer user_data;
GString *buf;
/* Allocates a tokenizer context bound to 'parser' and 'user_data', with an
 * empty input buffer. Release it with html_context_free(). */
static HtmlContext *
html_context_new (HtmlParser * parser, gpointer user_data)
{
  HtmlContext *fresh = g_new0 (HtmlContext, 1);

  fresh->buf = g_string_new (NULL);
  fresh->user_data = user_data;
  fresh->parser = parser;

  return fresh;
}
/* Releases a context created by html_context_new(), including any input
 * still held in its buffer. */
static void
html_context_free (HtmlContext * ctxt)
{
  GString *pending = ctxt->buf;

  g_string_free (pending, TRUE);
  g_free (ctxt);
}
struct EntityMap
const gunichar unescaped;
const gchar *escaped;
static gchar *
unescape_string (const gchar * text)
gint i;
GString *unescaped = g_string_new (NULL);
while (*text) {
if (*text == '&') {
/* unescape &nbsp and &nbsp; */
if (!g_ascii_strncasecmp (text, "nbsp", 4)) {
unescaped = g_string_append_unichar (unescaped, 160);
text += 4;
if (*text == ';') {
goto next;
/* pass xml entities. these will be processed as pango markup */
for (i = 0; XmlEntities[i].escaped; i++) {
gssize len = strlen (XmlEntities[i].escaped);
if (!g_ascii_strncasecmp (text, XmlEntities[i].escaped, len)) {
unescaped = g_string_append_c (unescaped, '&');
unescaped =
g_string_append_len (unescaped, XmlEntities[i].escaped, len);
text += len;
goto next;
/* convert html entities */
for (i = 0; HtmlEntities[i].escaped; i++) {
gssize len = strlen (HtmlEntities[i].escaped);
if (!strncmp (text, HtmlEntities[i].escaped, len)) {
unescaped =
g_string_append_unichar (unescaped, HtmlEntities[i].unescaped);
text += len;
goto next;
if (*text == '#') {
gboolean is_hex = FALSE;
gunichar l;
gchar *end = NULL;
if (*text == 'x') {
is_hex = TRUE;
errno = 0;
if (is_hex) {
l = strtoul (text, &end, 16);
} else {
l = strtoul (text, &end, 10);
if (text == end || errno != 0) {
/* error occurred. pass it */
goto next;
unescaped = g_string_append_unichar (unescaped, l);
text = end;
if (*text == ';') {
goto next;
/* escape & */
unescaped = g_string_append (unescaped, "&amp;");
} else if (g_ascii_isspace (*text)) {
unescaped = g_string_append_c (unescaped, ' ');
/* strip whitespace */
do {
} while ((*text) && g_ascii_isspace (*text));
} else {
unescaped = g_string_append_c (unescaped, *text);
return g_string_free (unescaped, FALSE);
/*
 * Splits 'string' at the first occurrence of 'delimiter'.
 *
 * '*first' receives a newly allocated copy of the text before the
 * delimiter, or of the whole string when the delimiter does not occur;
 * the caller releases it with g_free().
 *
 * Returns: a pointer to the delimiter inside 'string', or NULL when it was
 * not found.
 */
static const gchar *
string_token (const gchar * string, const gchar * delimiter, gchar ** first)
{
  const gchar *hit = strstr (string, delimiter);

  *first = (hit != NULL) ? g_strndup (string, hit - string)
      : g_strdup (string);

  return hit;
}
/*
 * Parses one element and dispatches it to the parser callbacks.
 *
 * 'string' is the element text without the surrounding '<' and '>': the
 * element name, optionally followed by space separated name=value
 * attributes. One pair of enclosing quote characters is removed from each
 * value. start_element is called with a NULL-terminated array of
 * alternating attribute names and values; when 'must_close' is TRUE
 * end_element is called right afterwards.
 */
static void
html_context_handle_element (HtmlContext * ctxt,
    const gchar * string, gboolean must_close)
{
  gchar *name = NULL;
  gint count = 0, i;
  gchar **attrs;
  const gchar *found, *next;

  /* split element name and attributes */
  next = string_token (string, " ", &name);

  if (next) {
    /* count attributes */
    found = next + 1;
    while (TRUE) {
      found = strchr (found, '=');
      if (!found)
        break;
      found++;
      count++;
    }
  } else {
    count = 0;
  }

  /* two slots per attribute plus a terminating NULL pair */
  attrs = g_new0 (gchar *, (count + 1) * 2);

  /* 'i' indexes slots, not attributes, so 'count' attributes need
   * 2 * count slots; bounding the loop by 'count' alone would drop
   * about half of the attributes. */
  for (i = 0; i < count * 2 && next != NULL; i += 2) {
    gchar *attr_name = NULL, *attr_value = NULL;
    gsize length;

    next = string_token (next + 1, "=", &attr_name);
    if (!next) {
      g_free (attr_name);
      break;
    }
    next = string_token (next + 1, " ", &attr_value);

    /* strip " or ' from attribute value */
    if (attr_value[0] == '"' || attr_value[0] == '\'') {
      gchar *tmp = g_strdup (attr_value + 1);

      g_free (attr_value);
      attr_value = tmp;
    }

    length = strlen (attr_value);
    if (length > 0 && (attr_value[length - 1] == '"'
            || attr_value[length - 1] == '\'')) {
      attr_value[length - 1] = '\0';
    }

    attrs[i] = attr_name;
    attrs[i + 1] = attr_value;
  }

  ctxt->parser->start_element (ctxt, name,
      (const gchar **) attrs, ctxt->user_data);
  if (must_close) {
    ctxt->parser->end_element (ctxt, name, ctxt->user_data);
  }
  g_strfreev (attrs);
  g_free (name);
}
/*
 * Appends 'text' to the context buffer and tokenizes as much of the
 * buffer as possible, calling the parser callbacks for every element and
 * every run of text (text is stripped of surrounding whitespace first).
 *
 * When the buffer ends inside a tag (a '<' without a matching '>'), only
 * that unfinished tail is kept for the next call, so that elements which
 * were already dispatched are not parsed a second time. Trailing text
 * that is not followed by a tag is dispatched immediately and the buffer
 * is emptied.
 */
static void
html_context_parse (HtmlContext * ctxt, gchar * text, gsize text_len)
{
  const gchar *next = NULL;

  ctxt->buf = g_string_append_len (ctxt->buf, text, text_len);
  next = ctxt->buf->str;
  while (TRUE) {
    if (next[0] == '<') {
      gchar *element = NULL;

      /* find <blahblah> */
      if (!strchr (next, '>')) {
        /* no tag end point. the tail is processed on the next call */
        break;
      }

      /* 'element' keeps the leading '<'; step over the closing '>' */
      next = string_token (next, ">", &element);
      next++;
      if (g_str_has_suffix (element, "/")) {
        /* handle <blah/> */
        element[strlen (element) - 1] = '\0';
        html_context_handle_element (ctxt, element + 1, TRUE);
      } else if (element[1] == '/') {
        /* handle </blah> */
        ctxt->parser->end_element (ctxt, element + 2, ctxt->user_data);
      } else {
        /* handle <blah> */
        html_context_handle_element (ctxt, element + 1, FALSE);
      }
      g_free (element);
    } else if (strchr (next, '<')) {
      gchar *chunk = NULL;
      gsize length;

      /* text up to the next tag; 'next' is left pointing at its '<' */
      next = string_token (next, "<", &chunk);
      chunk = g_strstrip (chunk);
      length = strlen (chunk);
      ctxt->parser->text (ctxt, chunk, length, ctxt->user_data);
      g_free (chunk);
    } else {
      /* only text is left: strip it in place, hand it out and start
       * over with an empty buffer */
      gchar *rest = (gchar *) next;
      gsize length;

      rest = g_strstrip (rest);
      length = strlen (rest);
      ctxt->parser->text (ctxt, rest, length, ctxt->user_data);
      ctxt->buf = g_string_assign (ctxt->buf, "");
      return;
    }
  }

  /* 'next' points into the buffer itself, so drop the consumed prefix in
   * place rather than re-assigning the buffer from its own contents. */
  g_string_erase (ctxt->buf, 0, next - ctxt->buf->str);
}
/* Looks for 'tag' in the state string 'str'.
 * Returns: a pointer to its last occurrence, or NULL when it is absent. */
static gchar *
has_tag (GString * str, const gchar tag)
{
  gchar *last = strrchr (str->str, tag);

  return last;
}
/* Records that a tag identified by the flag 'state' has been opened by
 * appending the flag to the state string. */
static void
sami_context_push_state (GstSamiContext * sctx, char state)
{
  GString *open_tags = sctx->state;

  GST_LOG ("state %c", state);
  g_string_append_c (open_tags, state);
}
/*
 * Closes open tags, innermost first, up to and including the most recent
 * tag whose flag equals 'state'.
 *
 * Closing markup for <i> and <span> goes to 'buf'; closing markup for ruby
 * text goes to 'rubybuf' as soon as it is encountered. When a matching
 * flag is found, the state string is cut back to that position. When none
 * is found nothing is written to 'buf' and the state string is kept,
 * unless 'state' is CLEAR_TAG, in which case every open tag is closed and
 * the state string is emptied.
 */
static void
sami_context_pop_state (GstSamiContext * sctx, char state)
{
  GString *closing = g_string_new ("");
  GString *open_tags = sctx->state;
  int idx;

  GST_LOG ("state %c", state);
  for (idx = open_tags->len - 1; idx >= 0; idx--) {
    const char flag = open_tags->str[idx];

    if (flag == ITALIC_TAG) {
      /* <i> */
      g_string_append (closing, "</i>");
    } else if (flag == SPAN_TAG) {
      /* <span foreground= > */
      g_string_append (closing, "</span>");
    } else if (flag == RT_TAG) {
      /* FIXME: support for furigana/ruby once implemented in pango */
      g_string_append (sctx->rubybuf, "</span>");
      if (has_tag (open_tags, ITALIC_TAG)) {
        g_string_append (sctx->rubybuf, "</i>");
      }
    }
    /* RUBY_TAG needs no closing markup: opening <ruby> emits none. */

    if (flag == state) {
      g_string_append (sctx->buf, closing->str);
      g_string_truncate (open_tags, idx);
      g_string_free (closing, TRUE);
      return;
    }
  }

  if (state == CLEAR_TAG) {
    g_string_append (sctx->buf, closing->str);
    g_string_truncate (open_tags, 0);
  }
  g_string_free (closing, TRUE);
}
/*
 * Handles an opening <sync> tag.
 *
 * All tags still open are closed first. For a "start" attribute the text
 * collected so far is moved from 'buf' to 'resultbuf', and the time pair
 * is advanced: time1 takes the previous start (only when no text is
 * pending in 'resultbuf') and time2 takes the new start, which is read as
 * a decimal count scaled by GST_MSECOND and never lies before time1.
 *
 * The start value is parsed with g_ascii_strtoll() and clamped, so
 * negative or out-of-range input cannot wrap around into a bogus time.
 */
static void
handle_start_sync (GstSamiContext * sctx, const gchar ** atts)
{
  int i;

  sami_context_pop_state (sctx, CLEAR_TAG);
  if (atts != NULL) {
    for (i = 0; (atts[i] != NULL); i += 2) {
      const gchar *key, *value;

      key = atts[i];
      value = atts[i + 1];

      if (!value)
        continue;
      if (!g_ascii_strcasecmp ("start", key)) {
        gint64 start_ms;

        /* Only set a new start time if we don't have text pending */
        if (sctx->resultbuf->len == 0)
          sctx->time1 = sctx->time2;

        start_ms = g_ascii_strtoll (value, NULL, 10);
        /* keep the scaled value representable and non-negative */
        start_ms = CLAMP (start_ms, 0, G_MAXINT64 / GST_MSECOND);
        sctx->time2 = (guint64) start_ms * GST_MSECOND;
        sctx->time2 = MAX (sctx->time2, sctx->time1);
        g_string_append (sctx->resultbuf, sctx->buf->str);
        sctx->has_result = (sctx->resultbuf->len != 0) ? TRUE : FALSE;
        g_string_truncate (sctx->buf, 0);
      }
    }
  }
}
/*
 * Handles an opening <font> tag by emitting a pango <span>.
 *
 * A span that is still open is closed first. When attributes are present,
 * "color" is written as foreground="..." and "face" as font_family="...";
 * other attributes are ignored. SPAN_TAG is then pushed on the state
 * stack. Without attributes nothing is emitted or pushed.
 *
 * Colour handling:
 *  - a value consisting of exactly six hexadecimal digits gets the
 *    missing '#' prepended (many SAMI files omit it). Only plain hex
 *    digits qualify, so values carrying a sign or a "0x" prefix are not
 *    mistaken for a colour code;
 *  - a few colour names that are common in SAMI files but missing from
 *    the X RGB database are mapped to explicit RGB values.
 */
static void
handle_start_font (GstSamiContext * sctx, const gchar ** atts)
{
  static const struct
  {
    const gchar *name;
    const gchar *rgb;
  } extra_colours[] = {
    {"aqua", "#00ffff"},
    {"crimson", "#dc143c"},
    {"fuchsia", "#ff00ff"},
    {"indigo", "#4b0082"},
    {"lime", "#00ff00"},
    {"olive", "#808000"},
    {"silver", "#c0c0c0"},
    {"teal", "#008080"},
  };
  int i;

  sami_context_pop_state (sctx, SPAN_TAG);
  if (atts == NULL)
    return;

  g_string_append (sctx->buf, "<span");
  for (i = 0; (atts[i] != NULL); i += 2) {
    const gchar *key = atts[i];
    const gchar *value = atts[i + 1];

    if (!value)
      continue;

    if (!g_ascii_strcasecmp ("color", key)) {
      const gchar *sharp = "";
      gsize len = strlen (value);
      guint n;

      /* fix hex colour values that are written without '#' */
      if (len == 6) {
        gsize k = 0;

        while (k < len && g_ascii_isxdigit (value[k]))
          k++;
        if (k == len)
          sharp = "#";
      }

      /* some colours can be found in many sami files, but X RGB database
       * doesn't contain a colour by this name, so map explicitly */
      for (n = 0; n < G_N_ELEMENTS (extra_colours); n++) {
        if (!g_ascii_strcasecmp (extra_colours[n].name, value)) {
          value = extra_colours[n].rgb;
          break;
        }
      }

      g_string_append_printf (sctx->buf, " foreground=\"%s%s\"", sharp,
          value);
    } else if (!g_ascii_strcasecmp ("face", key)) {
      g_string_append_printf (sctx->buf, " font_family=\"%s\"", value);
    }
  }
  g_string_append_c (sctx->buf, '>');
  sami_context_push_state (sctx, SPAN_TAG);
}
/*
 * start_element callback of the SAMI parser; 'user_data' is the
 * GstSamiContext. Element names are matched case-insensitively:
 *
 *   sync  - see handle_start_sync(); also marks the start of sync content
 *   font  - see handle_start_font()
 *   ruby  - pushes RUBY_TAG
 *   br    - appends a newline to 'buf'
 *   rt    - opens a small span (inside <i> when italics are active) in
 *           'rubybuf' and pushes RT_TAG
 *   i     - appends "<i>" to 'buf' and pushes ITALIC_TAG
 *
 * Every other element, including <p>, is ignored.
 */
static void
handle_start_element (HtmlContext * ctx, const gchar * name,
    const char **atts, gpointer user_data)
{
  GstSamiContext *sami = (GstSamiContext *) user_data;

  GST_LOG ("name:%s", name);

  if (g_ascii_strcasecmp ("sync", name) == 0) {
    handle_start_sync (sami, atts);
    sami->in_sync = TRUE;
  } else if (g_ascii_strcasecmp ("font", name) == 0) {
    handle_start_font (sami, atts);
  } else if (g_ascii_strcasecmp ("ruby", name) == 0) {
    sami_context_push_state (sami, RUBY_TAG);
  } else if (g_ascii_strcasecmp ("br", name) == 0) {
    g_string_append_c (sami->buf, '\n');
  } else if (g_ascii_strcasecmp ("rt", name) == 0) {
    /* FIXME: support for furigana/ruby once implemented in pango */
    if (has_tag (sami->state, ITALIC_TAG)) {
      g_string_append (sami->rubybuf, "<i>");
    }
    g_string_append (sami->rubybuf, "<span size='xx-small' rise='-100'>");
    sami_context_push_state (sami, RT_TAG);
  } else if (g_ascii_strcasecmp ("i", name) == 0) {
    g_string_append (sami->buf, "<i>");
    sami_context_push_state (sami, ITALIC_TAG);
  } else if (g_ascii_strcasecmp ("p", name) == 0) {
    /* paragraphs need no markup */
  }
}
/*
 * end_element callback of the SAMI parser; 'user_data' is the
 * GstSamiContext. Element names are matched case-insensitively:
 *
 *   sync        - marks the end of sync content
 *   body, sami  - flushes text still waiting in 'buf' into 'resultbuf'
 *                 with an open-ended time (time2 = GST_CLOCK_TIME_NONE)
 *   font, ruby, i - close the corresponding tag via
 *                 sami_context_pop_state()
 */
static void
handle_end_element (HtmlContext * ctx, const char *name, gpointer user_data)
{
  GstSamiContext *sami = (GstSamiContext *) user_data;

  GST_LOG ("name:%s", name);

  if (g_ascii_strcasecmp ("sync", name) == 0) {
    sami->in_sync = FALSE;
  } else if (g_ascii_strcasecmp ("body", name) == 0 ||
      g_ascii_strcasecmp ("sami", name) == 0) {
    /* We will usually have one buffer left when the body is closed
     * as we need the next sync to actually send it */
    if (sami->buf->len != 0) {
      /* Only set a new start time if we don't have text pending */
      if (sami->resultbuf->len == 0) {
        sami->time1 = sami->time2;
      }

      sami->time2 = GST_CLOCK_TIME_NONE;
      g_string_append (sami->resultbuf, sami->buf->str);
      sami->has_result = (sami->resultbuf->len != 0) ? TRUE : FALSE;
      g_string_truncate (sami->buf, 0);
    }
  } else if (g_ascii_strcasecmp ("font", name) == 0) {
    sami_context_pop_state (sami, SPAN_TAG);
  } else if (g_ascii_strcasecmp ("ruby", name) == 0) {
    sami_context_pop_state (sami, RUBY_TAG);
  } else if (g_ascii_strcasecmp ("i", name) == 0) {
    sami_context_pop_state (sami, ITALIC_TAG);
  }
}
/*
 * text callback of the SAMI parser; 'user_data' is the GstSamiContext.
 *
 * Text outside of sync elements is dropped. While an <rt> tag is open the
 * text goes to 'rubybuf', surrounded by single spaces; otherwise it is
 * appended to 'buf'.
 */
static void
handle_text (HtmlContext * ctx, const gchar * text, gsize text_len,
    gpointer user_data)
{
  GstSamiContext *sami = (GstSamiContext *) user_data;

  /* Skip everything except content of the sync elements */
  if (!sami->in_sync) {
    return;
  }

  if (!has_tag (sami->state, RT_TAG)) {
    g_string_append (sami->buf, text);
    return;
  }

  g_string_append_c (sami->rubybuf, ' ');
  g_string_append (sami->rubybuf, text);
  g_string_append_c (sami->rubybuf, ' ');
}
static HtmlParser samiParser = {
handle_start_element, /* start_element */
handle_end_element, /* end_element */
handle_text, /* text */
/*
 * Creates the SAMI context for 'state' and stores it in state->user_data,
 * which must be NULL on entry. All buffers start out empty. Release the
 * context with sami_context_deinit().
 */
void
sami_context_init (ParserState * state)
{
  GstSamiContext *sami;

  g_assert (state->user_data == NULL);

  sami = g_new0 (GstSamiContext, 1);
  sami->htmlctxt = html_context_new (&samiParser, sami);
  sami->buf = g_string_new ("");
  sami->rubybuf = g_string_new ("");
  sami->resultbuf = g_string_new ("");
  sami->state = g_string_new ("");

  state->user_data = sami;
}
/*
 * Frees the SAMI context stored in state->user_data, together with its
 * tokenizer and buffers, and resets the pointer to NULL. Does nothing
 * when no context is set.
 */
void
sami_context_deinit (ParserState * state)
{
  GstSamiContext *sami = (GstSamiContext *) state->user_data;

  if (sami == NULL) {
    return;
  }

  html_context_free (sami->htmlctxt);
  sami->htmlctxt = NULL;

  g_string_free (sami->buf, TRUE);
  g_string_free (sami->rubybuf, TRUE);
  g_string_free (sami->resultbuf, TRUE);
  g_string_free (sami->state, TRUE);
  g_free (sami);

  state->user_data = NULL;
}
/*
 * Returns the SAMI context stored in state->user_data to its initial
 * state: text buffers and tag stack emptied, flags cleared and both times
 * set to 0. The tokenizer's own input buffer is left untouched. Does
 * nothing when no context is set.
 */
void
sami_context_reset (ParserState * state)
{
  GstSamiContext *sami = (GstSamiContext *) state->user_data;

  if (sami == NULL) {
    return;
  }

  sami->time1 = 0;
  sami->time2 = 0;
  sami->in_sync = FALSE;
  sami->has_result = FALSE;

  g_string_truncate (sami->state, 0);
  g_string_truncate (sami->resultbuf, 0);
  g_string_truncate (sami->rubybuf, 0);
  g_string_truncate (sami->buf, 0);
}
/*
 * Feeds one line of SAMI input to the parser.
 *
 * The line is unescaped and tokenized. When that completes a subtitle,
 * pending ruby text (followed by a newline) is placed in front of it,
 * state->start_time is set to the previous sync time and state->duration
 * to the distance between the previous and the current sync time.
 *
 * Returns: the finished subtitle markup as a newly allocated string that
 * the caller releases with g_free(), or NULL when no subtitle is ready.
 */
gchar *
parse_sami (ParserState * state, const gchar * line)
{
  GstSamiContext *sami = (GstSamiContext *) state->user_data;
  gchar *clean = unescape_string (line);
  gchar *subtitle;

  html_context_parse (sami->htmlctxt, clean, strlen (clean));
  g_free (clean);

  if (!sami->has_result) {
    return NULL;
  }

  if (sami->rubybuf->len) {
    sami->rubybuf = g_string_append_c (sami->rubybuf, '\n');
    g_string_prepend (sami->resultbuf, sami->rubybuf->str);
    sami->rubybuf = g_string_truncate (sami->rubybuf, 0);
  }

  /* hand the finished text to the caller and start a new result buffer */
  subtitle = g_string_free (sami->resultbuf, FALSE);
  sami->resultbuf = g_string_new ("");

  state->start_time = sami->time1;
  state->duration = sami->time2 - sami->time1;
  sami->has_result = FALSE;

  return subtitle;
}