mirror of
https://gitlab.freedesktop.org/gstreamer/gstreamer.git
synced 2025-01-14 03:15:47 +00:00
tag: id3v2: Rework string parsing to always walk over BOM markers in UTF16 strings, using the endianness indicated by the innermost one ...
Original commit message from CVS:

* gst/autodetect/gstautoaudiosink.c: (gst_auto_audio_sink_find_best):
* gst/autodetect/gstautovideosink.c: (gst_auto_video_sink_find_best):
Make the name of the child element be based on the name of the parent,
so that debug output is more useful.

* gst-libs/gst/tag/id3v2frames.c: (find_utf16_bom),
(parse_insert_string_field), (parse_split_strings):
Rework string parsing to always walk over BOM markers in UTF16 strings,
using the endianness indicated by the innermost one, then trying the
opposite endianness if that fails to convert to valid UTF-8.
Fixes #341774
This commit is contained in:
parent
2fd7d6c3eb
commit
5b67108b7d
1 changed file with 69 additions and 43 deletions
|
@ -667,39 +667,21 @@ id3v2_genre_fields_to_taglist (ID3TagsWorking * work, const gchar * tag_name,
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static const gchar utf16enc[] = "UTF-16";
|
||||||
parse_insert_string_field (const gchar * encoding, gchar * data, gint data_size,
|
static const gchar utf16leenc[] = "UTF-16LE";
|
||||||
GArray * fields)
|
static const gchar utf16beenc[] = "UTF-16BE";
|
||||||
{
|
|
||||||
gchar *field = NULL;
|
|
||||||
|
|
||||||
if (strcmp (encoding, "UTF-8") != 0) {
|
|
||||||
field = g_convert (data, data_size, "UTF-8", encoding, NULL, NULL, NULL);
|
|
||||||
if (field == NULL) {
|
|
||||||
GST_WARNING ("could not convert string from %s to UTF-8. Ignoring",
|
|
||||||
encoding);
|
|
||||||
}
|
|
||||||
} else if (g_utf8_validate (data, data_size, NULL)) {
|
|
||||||
field = g_strndup (data, data_size);
|
|
||||||
} else {
|
|
||||||
GST_WARNING ("alleged UTF-8 string is not valid UTF-8. Ignoring");
|
|
||||||
}
|
|
||||||
|
|
||||||
if (field)
|
|
||||||
g_array_append_val (fields, field);
|
|
||||||
}
|
|
||||||
|
|
||||||
static gboolean
|
static gboolean
|
||||||
has_utf16_bom (gchar * data, const gchar ** p_in_encoding)
|
find_utf16_bom (gchar * data, const gchar ** p_in_encoding)
|
||||||
{
|
{
|
||||||
guint16 marker = (GST_READ_UINT8 (data) << 8) | GST_READ_UINT8 (data + 1);
|
guint16 marker = (GST_READ_UINT8 (data) << 8) | GST_READ_UINT8 (data + 1);
|
||||||
|
|
||||||
switch (marker) {
|
switch (marker) {
|
||||||
case 0xFFFE:
|
case 0xFFFE:
|
||||||
*p_in_encoding = "UTF16LE";
|
*p_in_encoding = utf16leenc;
|
||||||
return TRUE;
|
return TRUE;
|
||||||
case 0xFEFF:
|
case 0xFEFF:
|
||||||
*p_in_encoding = "UTF16BE";
|
*p_in_encoding = utf16beenc;
|
||||||
return TRUE;
|
return TRUE;
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
|
@ -707,6 +689,63 @@ has_utf16_bom (gchar * data, const gchar ** p_in_encoding)
|
||||||
return FALSE;
|
return FALSE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
parse_insert_string_field (guint8 encoding, gchar * data, gint data_size,
|
||||||
|
GArray * fields)
|
||||||
|
{
|
||||||
|
gchar *field = NULL;
|
||||||
|
|
||||||
|
switch (encoding) {
|
||||||
|
case ID3V2_ENCODING_UTF16:
|
||||||
|
case ID3V2_ENCODING_UTF16BE:
|
||||||
|
{
|
||||||
|
const gchar *in_encode;
|
||||||
|
|
||||||
|
if (encoding == ID3V2_ENCODING_UTF16)
|
||||||
|
in_encode = utf16enc;
|
||||||
|
else
|
||||||
|
in_encode = utf16beenc;
|
||||||
|
|
||||||
|
/* Sometimes we see strings with multiple BOM markers at the start.
|
||||||
|
* In that case, we assume the innermost one is correct. If that fails
|
||||||
|
* to produce valid UTF-8, we try the other endianness anyway */
|
||||||
|
while (data_size > 2 && find_utf16_bom (data, &in_encode)) {
|
||||||
|
data += 2; /* skip BOM */
|
||||||
|
data_size -= 2;
|
||||||
|
}
|
||||||
|
|
||||||
|
field = g_convert (data, data_size, "UTF-8", in_encode, NULL, NULL, NULL);
|
||||||
|
|
||||||
|
if (field == NULL || g_utf8_validate (field, -1, NULL) == FALSE) {
|
||||||
|
/* As a fallback, try interpreting UTF-16 in the other endianness */
|
||||||
|
if (in_encode == utf16beenc)
|
||||||
|
field = g_convert (data, data_size, "UTF-8", utf16leenc,
|
||||||
|
NULL, NULL, NULL);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
|
case ID3V2_ENCODING_ISO8859:
|
||||||
|
field = g_convert (data, data_size, "UTF-8", "ISO-8859-1",
|
||||||
|
NULL, NULL, NULL);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
field = g_strndup (data, data_size);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (field) {
|
||||||
|
if (g_utf8_validate (field, -1, NULL)) {
|
||||||
|
g_array_append_val (fields, field);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
GST_DEBUG ("%s was bad UTF-8 after conversion from encoding %d. Ignoring",
|
||||||
|
field, encoding);
|
||||||
|
g_free (field);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
parse_split_strings (guint8 encoding, gchar * data, gint data_size,
|
parse_split_strings (guint8 encoding, gchar * data, gint data_size,
|
||||||
GArray ** out_fields)
|
GArray ** out_fields)
|
||||||
|
@ -721,13 +760,13 @@ parse_split_strings (guint8 encoding, gchar * data, gint data_size,
|
||||||
case ID3V2_ENCODING_ISO8859:
|
case ID3V2_ENCODING_ISO8859:
|
||||||
for (text_pos = 0; text_pos < data_size; text_pos++) {
|
for (text_pos = 0; text_pos < data_size; text_pos++) {
|
||||||
if (data[text_pos] == 0) {
|
if (data[text_pos] == 0) {
|
||||||
parse_insert_string_field ("ISO-8859-1", data + prev,
|
parse_insert_string_field (encoding, data + prev,
|
||||||
text_pos - prev + 1, fields);
|
text_pos - prev + 1, fields);
|
||||||
prev = text_pos + 1;
|
prev = text_pos + 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (data_size - prev > 0 && data[prev] != 0x00) {
|
if (data_size - prev > 0 && data[prev] != 0x00) {
|
||||||
parse_insert_string_field ("ISO-8859-1", data + prev,
|
parse_insert_string_field (encoding, data + prev,
|
||||||
data_size - prev, fields);
|
data_size - prev, fields);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -735,34 +774,24 @@ parse_split_strings (guint8 encoding, gchar * data, gint data_size,
|
||||||
case ID3V2_ENCODING_UTF8:
|
case ID3V2_ENCODING_UTF8:
|
||||||
for (prev = 0, text_pos = 0; text_pos < data_size; text_pos++) {
|
for (prev = 0, text_pos = 0; text_pos < data_size; text_pos++) {
|
||||||
if (data[text_pos] == '\0') {
|
if (data[text_pos] == '\0') {
|
||||||
parse_insert_string_field ("UTF-8", data + prev,
|
parse_insert_string_field (encoding, data + prev,
|
||||||
text_pos - prev + 1, fields);
|
text_pos - prev + 1, fields);
|
||||||
prev = text_pos + 1;
|
prev = text_pos + 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (data_size - prev > 0 && data[prev] != 0x00) {
|
if (data_size - prev > 0 && data[prev] != 0x00) {
|
||||||
parse_insert_string_field ("UTF-8", data + prev,
|
parse_insert_string_field (encoding, data + prev,
|
||||||
data_size - prev, fields);
|
data_size - prev, fields);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case ID3V2_ENCODING_UTF16:
|
case ID3V2_ENCODING_UTF16:
|
||||||
case ID3V2_ENCODING_UTF16BE:
|
case ID3V2_ENCODING_UTF16BE:
|
||||||
{
|
{
|
||||||
const gchar *in_encode;
|
|
||||||
|
|
||||||
if (encoding == ID3V2_ENCODING_UTF16)
|
|
||||||
in_encode = "UTF-16";
|
|
||||||
else
|
|
||||||
in_encode = "UTF-16BE";
|
|
||||||
|
|
||||||
/* Find '\0\0' terminator */
|
/* Find '\0\0' terminator */
|
||||||
for (text_pos = 0; text_pos < data_size - 1; text_pos += 2) {
|
for (text_pos = 0; text_pos < data_size - 1; text_pos += 2) {
|
||||||
if (data[text_pos] == '\0' && data[text_pos + 1] == '\0') {
|
if (data[text_pos] == '\0' && data[text_pos + 1] == '\0') {
|
||||||
if (has_utf16_bom (data + prev, &in_encode)) {
|
|
||||||
prev += 2; /* skip BOM */
|
|
||||||
}
|
|
||||||
/* found a delimiter */
|
/* found a delimiter */
|
||||||
parse_insert_string_field (in_encode, data + prev,
|
parse_insert_string_field (encoding, data + prev,
|
||||||
text_pos - prev + 2, fields);
|
text_pos - prev + 2, fields);
|
||||||
text_pos++; /* Advance to the 2nd NULL terminator */
|
text_pos++; /* Advance to the 2nd NULL terminator */
|
||||||
prev = text_pos + 1;
|
prev = text_pos + 1;
|
||||||
|
@ -771,11 +800,8 @@ parse_split_strings (guint8 encoding, gchar * data, gint data_size,
|
||||||
}
|
}
|
||||||
if (data_size - prev > 1 &&
|
if (data_size - prev > 1 &&
|
||||||
(data[prev] != 0x00 || data[prev + 1] != 0x00)) {
|
(data[prev] != 0x00 || data[prev + 1] != 0x00)) {
|
||||||
if (has_utf16_bom (data + prev, &in_encode)) {
|
|
||||||
prev += 2; /* skip BOM */
|
|
||||||
}
|
|
||||||
/* There were 2 or more non-null chars left, convert those too */
|
/* There were 2 or more non-null chars left, convert those too */
|
||||||
parse_insert_string_field (in_encode, data + prev,
|
parse_insert_string_field (encoding, data + prev,
|
||||||
data_size - prev, fields);
|
data_size - prev, fields);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
Loading…
Reference in a new issue