mirror of
https://gitlab.freedesktop.org/gstreamer/gstreamer.git
synced 2025-02-17 03:35:21 +00:00
gst/subparse/gstsubparse.*: Text subtitle files may or may not be UTF-8. If it's not, we don't really want to see '?'...
Original commit message from CVS: * gst/subparse/gstsubparse.c: (convert_encoding), (gst_sub_parse_change_state): * gst/subparse/gstsubparse.h: Text subtitle files may or may not be UTF-8. If a file is not, we don't really want to see '?' characters in place of non-ASCII characters such as accented letters. So let's assume the input is UTF-8 until we come across text that clearly is not. If the input is not UTF-8, we don't really know what it is, so try the following: (a) use the encoding named by the GST_SUBTITLE_ENCODING environment variable if it is set; otherwise (b) use the current locale encoding if it is not UTF-8; or (c) assume ISO-8859-15 if the current locale encoding is UTF-8 and the environment variable was not set to any particular encoding. Not perfect, but better than nothing (and better than before, I think) (fixes #172848).
This commit is contained in:
parent
e7acd7aac6
commit
2ecb455728
3 changed files with 54 additions and 28 deletions
17
ChangeLog
17
ChangeLog
|
@ -1,3 +1,20 @@
|
|||
2006-03-24 Tim-Philipp Müller <tim at centricular dot net>
|
||||
|
||||
* gst/subparse/gstsubparse.c: (convert_encoding),
|
||||
(gst_sub_parse_change_state):
|
||||
* gst/subparse/gstsubparse.h:
|
||||
Text subtitle files may or may not be UTF-8. If it's not, we
|
||||
don't really want to see '?' characters in place of non-ASCII
|
||||
characters like accented characters. So let's assume the input
|
||||
is UTF-8 until we come across text that is clearly not. If it's
|
||||
not UTF-8, we don't really know what it is, so try the following:
|
||||
(a) see whether the GST_SUBTITLE_ENCODING environment variable
|
||||
is set; if not, check (b) if the current locale encoding is
|
||||
non-UTF-8 and use that if it is, or (c) assume ISO-8859-15 if
|
||||
the current locale encoding is UTF-8 and the environment variable
|
||||
was not set to any particular encoding. Not perfect, but better
|
||||
than nothing (and better than before, I think) (fixes #172848).
|
||||
|
||||
2006-03-24 Thomas Vander Stichele <thomas at apestaart dot org>
|
||||
|
||||
* configure.ac:
|
||||
|
|
|
@ -230,38 +230,45 @@ beach:
|
|||
static gchar *
|
||||
convert_encoding (GstSubParse * self, const gchar * str, gsize len)
|
||||
{
|
||||
gsize bytes_read, bytes_written;
|
||||
gchar *rv;
|
||||
GString *converted;
|
||||
const gchar *encoding;
|
||||
GError *err = NULL;
|
||||
gchar *ret;
|
||||
|
||||
converted = g_string_new (NULL);
|
||||
while (len) {
|
||||
#ifndef GST_DISABLE_GST_DEBUG
|
||||
gchar *dbg = g_strndup (str, len);
|
||||
|
||||
GST_DEBUG ("Trying to convert '%s'", dbg);
|
||||
g_free (dbg);
|
||||
#endif
|
||||
|
||||
rv = g_locale_to_utf8 (str, len, &bytes_read, &bytes_written, NULL);
|
||||
if (rv) {
|
||||
g_string_append_len (converted, rv, bytes_written);
|
||||
g_free (rv);
|
||||
|
||||
len -= bytes_read;
|
||||
str += bytes_read;
|
||||
if (self->valid_utf8) {
|
||||
if (g_utf8_validate (str, len, NULL)) {
|
||||
GST_LOG_OBJECT (self, "valid UTF-8, no conversion needed");
|
||||
return g_strndup (str, len);
|
||||
}
|
||||
if (len) {
|
||||
/* conversion error occurred => skip one char */
|
||||
len--;
|
||||
str++;
|
||||
g_string_append_c (converted, '?');
|
||||
GST_INFO_OBJECT (self, "invalid UTF-8!");
|
||||
self->valid_utf8 = FALSE;
|
||||
}
|
||||
|
||||
encoding = g_getenv ("GST_SUBTITLE_ENCODING");
|
||||
if (encoding == NULL || *encoding == '\0') {
|
||||
/* if local encoding is UTF-8 and no encoding specified
|
||||
* via the environment variable, assume ISO-8859-15 */
|
||||
if (g_get_charset (&encoding)) {
|
||||
encoding = "ISO-8859-15";
|
||||
}
|
||||
}
|
||||
rv = converted->str;
|
||||
g_string_free (converted, FALSE);
|
||||
GST_DEBUG ("Converted to '%s'", rv);
|
||||
return rv;
|
||||
|
||||
ret = g_convert_with_fallback (str, len, "UTF-8", encoding, "*", NULL,
|
||||
NULL, &err);
|
||||
|
||||
if (err) {
|
||||
GST_WARNING_OBJECT (self, "could not convert string from '%s' to UTF-8: %s",
|
||||
encoding, err->message);
|
||||
g_error_free (err);
|
||||
|
||||
/* invalid input encoding, fall back to ISO-8859-15 (always succeeds) */
|
||||
ret = g_convert_with_fallback (str, len, "UTF-8", "ISO-8859-15", "*",
|
||||
NULL, NULL, NULL);
|
||||
}
|
||||
|
||||
GST_LOG_OBJECT (self, "successfully converted %d characters from %s to UTF-8"
|
||||
"%s", len, encoding, (err) ? " , using ISO-8859-15 as fallback" : "");
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static gchar *
|
||||
|
@ -833,6 +840,7 @@ gst_sub_parse_change_state (GstElement * element, GstStateChange transition)
|
|||
/* format detection will init the parser state */
|
||||
self->offset = self->next_offset = 0;
|
||||
self->parser_type = GST_SUB_PARSE_FORMAT_UNKNOWN;
|
||||
self->valid_utf8 = TRUE;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
|
|
|
@ -81,6 +81,7 @@ struct _GstSubParse {
|
|||
gboolean need_segment;
|
||||
|
||||
gboolean flushing;
|
||||
gboolean valid_utf8;
|
||||
};
|
||||
|
||||
struct _GstSubParseClass {
|
||||
|
|
Loading…
Reference in a new issue