gst/subparse/gstsubparse.*: Text subtitle files may or may not be UTF-8. If it's not, we don't really want to see '?'...

Original commit message from CVS:
* gst/subparse/gstsubparse.c: (convert_encoding),
(gst_sub_parse_change_state):
* gst/subparse/gstsubparse.h:
Text subtitle files may or may not be UTF-8. If a file is not, we
don't really want to see '?' characters in place of non-ASCII
characters such as accented letters. So let's assume the input
is UTF-8 until we come across text that is clearly not. If it's
not UTF-8, we don't really know what it is, so try the following:
(a) use the encoding named by the GST_SUBTITLE_ENCODING environment
variable if it is set; otherwise (b) use the current locale
encoding if that is not UTF-8; otherwise (c) assume ISO-8859-15,
i.e. when the current locale encoding is UTF-8 and the environment
variable was not set to any particular encoding. Not perfect, but
better than nothing (and better than before, I think) (fixes #172848).
This commit is contained in:
Tim-Philipp Müller 2006-03-24 17:57:39 +00:00
parent e7acd7aac6
commit 2ecb455728
3 changed files with 54 additions and 28 deletions

View file

@ -1,3 +1,20 @@
2006-03-24 Tim-Philipp Müller <tim at centricular dot net>
* gst/subparse/gstsubparse.c: (convert_encoding),
(gst_sub_parse_change_state):
* gst/subparse/gstsubparse.h:
Text subtitle files may or may not be UTF-8. If a file is not, we
don't really want to see '?' characters in place of non-ASCII
characters such as accented letters. So let's assume the input
is UTF-8 until we come across text that is clearly not. If it's
not UTF-8, we don't really know what it is, so try the following:
(a) use the encoding named by the GST_SUBTITLE_ENCODING environment
variable if it is set; otherwise (b) use the current locale
encoding if that is not UTF-8; otherwise (c) assume ISO-8859-15,
i.e. when the current locale encoding is UTF-8 and the environment
variable was not set to any particular encoding. Not perfect, but
better than nothing (and better than before, I think) (fixes #172848).
2006-03-24 Thomas Vander Stichele <thomas at apestaart dot org>
* configure.ac:

View file

@ -230,38 +230,45 @@ beach:
/* convert_encoding:
 * Produces a UTF-8 version of the @len bytes starting at @str.
 *
 * NOTE(review): this hunk is rendered without +/- markers, so the lines of
 * the removed g_locale_to_utf8() loop and the lines of the added
 * g_utf8_validate() / g_convert_with_fallback() path appear interleaved
 * below; the text is a diff view and is not compilable as shown.
 *
 * Every return path visible here hands back a string produced by a GLib
 * allocating call (g_strndup, GString contents, g_convert_with_fallback);
 * presumably the caller releases it with g_free() -- confirm at call sites.
 */
static gchar *
convert_encoding (GstSubParse * self, const gchar * str, gsize len)
{
gsize bytes_read, bytes_written;
gchar *rv;
GString *converted;
const gchar *encoding;
GError *err = NULL;
gchar *ret;
converted = g_string_new (NULL);
while (len) {
#ifndef GST_DISABLE_GST_DEBUG
gchar *dbg = g_strndup (str, len);
GST_DEBUG ("Trying to convert '%s'", dbg);
g_free (dbg);
#endif
rv = g_locale_to_utf8 (str, len, &bytes_read, &bytes_written, NULL);
if (rv) {
g_string_append_len (converted, rv, bytes_written);
g_free (rv);
len -= bytes_read;
str += bytes_read;
/* While valid_utf8 is still set, input that validates as UTF-8 is
 * returned as a plain copy without any conversion. */
if (self->valid_utf8) {
if (g_utf8_validate (str, len, NULL)) {
GST_LOG_OBJECT (self, "valid UTF-8, no conversion needed");
return g_strndup (str, len);
}
if (len) {
/* conversion error ocurred => skip one char */
len--;
str++;
g_string_append_c (converted, '?');
/* The first input that fails validation clears valid_utf8, so the
 * g_utf8_validate() check above is not attempted again until the flag
 * is reset (gst_sub_parse_change_state sets it back to TRUE). */
GST_INFO_OBJECT (self, "invalid UTF-8!");
self->valid_utf8 = FALSE;
}
/* Choose the source encoding: a non-empty GST_SUBTITLE_ENCODING wins. */
encoding = g_getenv ("GST_SUBTITLE_ENCODING");
if (encoding == NULL || *encoding == '\0') {
/* if local encoding is UTF-8 and no encoding specified
* via the environment variable, assume ISO-8859-15 */
if (g_get_charset (&encoding)) {
encoding = "ISO-8859-15";
}
}
rv = converted->str;
g_string_free (converted, FALSE);
GST_DEBUG ("Converted to '%s'", rv);
return rv;
/* "*" is passed as the fallback argument of g_convert_with_fallback(),
 * i.e. the replacement for characters that cannot be converted. */
ret = g_convert_with_fallback (str, len, "UTF-8", encoding, "*", NULL,
NULL, &err);
if (err) {
GST_WARNING_OBJECT (self, "could not convert string from '%s' to UTF-8: %s",
encoding, err->message);
/* NOTE(review): err is freed here but its (now dangling) value is still
 * tested in the log call below. It is only compared against NULL there,
 * never dereferenced, but a separate gboolean would be cleaner. */
g_error_free (err);
/* invalid input encoding, fall back to ISO-8859-15 (always succeeds) */
ret = g_convert_with_fallback (str, len, "UTF-8", "ISO-8859-15", "*",
NULL, NULL, NULL);
}
/* NOTE(review): len is declared gsize but is formatted with %d;
 * G_GSIZE_FORMAT would match the argument type. The message also names
 * the originally chosen encoding even when the fallback was used. */
GST_LOG_OBJECT (self, "successfully converted %d characters from %s to UTF-8"
"%s", len, encoding, (err) ? " , using ISO-8859-15 as fallback" : "");
return ret;
}
static gchar *
@ -833,6 +840,7 @@ gst_sub_parse_change_state (GstElement * element, GstStateChange transition)
/* format detection will init the parser state */
self->offset = self->next_offset = 0;
self->parser_type = GST_SUB_PARSE_FORMAT_UNKNOWN;
self->valid_utf8 = TRUE;
break;
default:
break;

View file

@ -81,6 +81,7 @@ struct _GstSubParse {
gboolean need_segment;
gboolean flushing;
gboolean valid_utf8;
};
struct _GstSubParseClass {