tag: id3v2: Rewrite parsing of text tags to handle multiple NUL-terminated strings. Parse numeric genre strings and ID3v2-style "(3)(6)Alternative" genre strings.

Original commit message from CVS:
* gst-libs/gst/tag/id3v2.c: (id3demux_read_id3v2_tag):
* gst-libs/gst/tag/id3v2.h:
* gst-libs/gst/tag/id3v2frames.c: (id3demux_id3v2_parse_frame),
(parse_comment_frame), (parse_text_identification_frame),
(id3v2_tag_to_taglist), (id3v2_are_digits),
(id3v2_genre_string_to_taglist), (id3v2_genre_fields_to_taglist),
(parse_split_strings), (free_tag_strings):
Rewrite parsing of text tags to handle multiple NUL-terminated
strings. Parse numeric genre strings and ID3v2-style
"(3)(6)Alternative" genre strings.
Parse dates that are given only in YYYY or YYYY-mm format.
This commit is contained in:
Jan Schmidt 2006-01-23 09:22:17 +00:00 committed by Tim-Philipp Müller
parent a9c6822e3f
commit a6f7ebffa2
3 changed files with 236 additions and 84 deletions

View file

@ -198,6 +198,9 @@ id3demux_read_id3v2_tag (GstBuffer * buffer, guint * id3v2_size,
*tags = work.tags;
}
if (work.prev_genre)
g_free (work.prev_genre);
return result;
}

View file

@ -72,6 +72,9 @@ typedef struct {
guint8 *parse_data;
guint parse_size;
/* Previous genre string, for simple duplicate removal */
gchar *prev_genre;
} ID3TagsWorking;
enum {

View file

@ -23,6 +23,7 @@
#endif
#include <string.h>
#include <stdlib.h>
#include <gst/tag/tag.h>
#ifdef HAVE_ZLIB
@ -35,11 +36,16 @@ GST_DEBUG_CATEGORY_EXTERN (id3demux_debug);
#define GST_CAT_DEFAULT (id3demux_debug)
static gchar *parse_comment_frame (ID3TagsWorking * work);
static gchar *parse_text_identification_frame (ID3TagsWorking * work);
static GArray *parse_text_identification_frame (ID3TagsWorking * work);
static gboolean id3v2_tag_to_taglist (ID3TagsWorking * work,
const gchar * tag_name, gchar * tag_str);
static void parse_split_strings (ID3TagsWorking * work, guint8 encoding,
gchar ** field1, gchar ** field2);
const gchar * tag_name, const gchar * tag_str);
/* Parse a single string into an array of gchar* */
static void parse_split_strings (guint8 encoding, gchar * data, gint data_size,
GArray ** out_fields);
static void free_tag_strings (GArray * fields);
static gboolean
id3v2_genre_fields_to_taglist (ID3TagsWorking * work, const gchar * tag_name,
GArray * tag_fields);
#define ID3V2_ENCODING_ISO8859 0x00
#define ID3V2_ENCODING_UTF16 0x01
@ -57,6 +63,7 @@ id3demux_id3v2_parse_frame (ID3TagsWorking * work)
guint8 *frame_data = work->hdr.frame_data;
guint frame_data_size = work->cur_frame_size;
gchar *tag_str = NULL;
GArray *tag_fields = NULL;
/* Check that the frame id is valid */
for (i = 0; i < 5 && work->frame_id[i] != '\0'; i++) {
@ -118,7 +125,7 @@ id3demux_id3v2_parse_frame (ID3TagsWorking * work)
if (work->frame_id[0] == 'T') {
if (strcmp (work->frame_id, "TXXX") != 0) {
/* Text identification frame */
tag_str = parse_text_identification_frame (work);
tag_fields = parse_text_identification_frame (work);
} else {
/* Handle user text frame */
}
@ -142,6 +149,16 @@ id3demux_id3v2_parse_frame (ID3TagsWorking * work)
result = id3v2_tag_to_taglist (work, tag_name, tag_str);
g_free (tag_str);
}
if (tag_fields != NULL) {
if (strcmp (work->frame_id, "TCON") == 0) {
/* Genre strings need special treatment */
result |= id3v2_genre_fields_to_taglist (work, tag_name, tag_fields);
} else {
tag_str = g_array_index (tag_fields, gchar *, 0);
result |= id3v2_tag_to_taglist (work, tag_name, tag_str);
}
free_tag_strings (tag_fields);
}
return result;
}
@ -151,9 +168,9 @@ parse_comment_frame (ID3TagsWorking * work)
{
guint8 encoding;
gchar language[4];
gchar *description = NULL;
gchar *text = NULL;
GArray *fields = NULL;
gchar *out_str = NULL;
gchar *description, *text;
if (work->parse_size < 6)
return NULL;
@ -164,12 +181,15 @@ parse_comment_frame (ID3TagsWorking * work)
language[2] = work->parse_data[3];
language[3] = 0;
parse_split_strings (work, encoding, &description, &text);
parse_split_strings (encoding, (gchar *) work->parse_data + 4,
work->parse_size - 4, &fields);
if (text == NULL || description == NULL) {
if (fields == NULL || fields->len < 2) {
GST_WARNING ("Failed to decode comment frame");
goto fail;
}
description = g_array_index (fields, gchar *, 0);
text = g_array_index (fields, gchar *, 1);
if (!g_utf8_validate (text, -1, NULL)) {
GST_WARNING ("Converted string is not valid utf-8");
@ -184,53 +204,30 @@ parse_comment_frame (ID3TagsWorking * work)
}
fail:
g_free (description);
g_free (text);
free_tag_strings (fields);
return out_str;
}
static gchar *
static GArray *
parse_text_identification_frame (ID3TagsWorking * work)
{
guchar encoding;
gchar *text = NULL;
GArray *fields = NULL;
if (work->parse_size < 2)
return NULL;
encoding = work->parse_data[0];
parse_split_strings (encoding, (gchar *) work->parse_data + 1,
work->parse_size - 1, &fields);
switch (encoding) {
case ID3V2_ENCODING_ISO8859:
text = g_convert ((gchar *) (work->parse_data + 1),
work->parse_size - 1, "UTF-8", "ISO-8859-1", NULL, NULL, NULL);
break;
case ID3V2_ENCODING_UTF8:
text = g_strndup ((gchar *) (work->parse_data + 1), work->parse_size - 1);
break;
case ID3V2_ENCODING_UTF16:
text = g_convert ((gchar *) (work->parse_data + 1),
work->parse_size - 1, "UTF-8", "UTF-16", NULL, NULL, NULL);
break;
case ID3V2_ENCODING_UTF16BE:
text = g_convert ((gchar *) (work->parse_data + 1),
work->parse_size - 1, "UTF-8", "UTF-16BE", NULL, NULL, NULL);
break;
}
if (text != NULL && !g_utf8_validate (text, -1, NULL)) {
GST_WARNING ("Converted string is not valid utf-8");
g_free (text);
text = NULL;
}
return text;
return fields;
}
static gboolean
id3v2_tag_to_taglist (ID3TagsWorking * work, const gchar * tag_name,
gchar * tag_str)
const gchar * tag_str)
{
GType tag_type = gst_tag_get_type (tag_name);
GstTagList *tag_list = work->tags;
@ -243,17 +240,7 @@ id3v2_tag_to_taglist (ID3TagsWorking * work, const gchar * tag_name,
tmp = strtoul ((char *) tag_str, &check, 10);
if (strcmp (tag_name, GST_TAG_DATE) == 0) {
GDate *d;
if (*check != '\0')
break;
if (tmp == 0)
break;
d = g_date_new_dmy (1, 1, tmp);
tmp = g_date_get_julian (d);
g_date_free (d);
} else if (strcmp (tag_name, GST_TAG_TRACK_NUMBER) == 0) {
if (strcmp (tag_name, GST_TAG_TRACK_NUMBER) == 0) {
if (*check == '/') {
guint total;
@ -290,7 +277,7 @@ id3v2_tag_to_taglist (ID3TagsWorking * work, const gchar * tag_name,
guint64 tmp;
g_assert (strcmp (tag_name, GST_TAG_DURATION) == 0);
tmp = strtoul ((char *) tag_str, NULL, 10);
tmp = strtoul (tag_str, NULL, 10);
if (tmp == 0) {
break;
}
@ -299,19 +286,41 @@ id3v2_tag_to_taglist (ID3TagsWorking * work, const gchar * tag_name,
break;
}
case G_TYPE_STRING:{
if (!strcmp (tag_name, GST_TAG_GENRE)) {
if (work->prev_genre && !strcmp (tag_str, work->prev_genre))
break; /* Same as the last genre */
g_free (work->prev_genre);
work->prev_genre = g_strdup (tag_str);
}
gst_tag_list_add (tag_list, GST_TAG_MERGE_APPEND,
tag_name, (const gchar *) tag_str, NULL);
tag_name, tag_str, NULL);
break;
}
/* handles GST_TYPE_DATE and anything else */
default:{
gchar *tmp = NULL;
if (tag_type == GST_TYPE_DATE) {
guint year = 1901, month = 1, day = 1;
/* Dates can be yyyy-MM-dd, yyyy-MM or yyyy, but we need
* the first type */
if (sscanf (tag_str, "%04u-%02u-%02u", &year, &month, &day) == 0)
break;
tmp = g_strdup_printf ("%04u-%02u-%02u", year, month, day);
tag_str = tmp;
break;
}
/* handles anything else */
GValue src = { 0, };
GValue dest = { 0, };
g_value_init (&src, G_TYPE_STRING);
g_value_set_string (&src, (const gchar *) tag_str);
g_value_init (&dest, tag_type);
if (g_value_transform (&src, &dest)) {
gst_tag_list_add_values (tag_list, GST_TAG_MERGE_APPEND,
tag_name, &dest, NULL);
@ -319,8 +328,10 @@ id3v2_tag_to_taglist (ID3TagsWorking * work, const gchar * tag_name,
GST_WARNING ("Failed to transform tag from string to type '%s'",
g_type_name (tag_type));
}
g_value_unset (&src);
g_value_unset (&dest);
g_free (tmp);
break;
}
}
@ -328,61 +339,196 @@ id3v2_tag_to_taglist (ID3TagsWorking * work, const gchar * tag_name,
return TRUE;
}
static void
parse_split_strings (ID3TagsWorking * work, guint8 encoding,
gchar ** field1, gchar ** field2)
/* Check that the first @size characters of @chars are all ASCII digits.
 *
 * Returns: TRUE when every inspected character is a digit (which includes
 * the case where @size is zero or negative and nothing is inspected),
 * FALSE as soon as a non-digit is found.
 */
static gboolean
id3v2_are_digits (const gchar * chars, gint size)
{
  gint i;

  for (i = 0; i < size; i++) {
    if (!g_ascii_isdigit (chars[i]))
      return FALSE;
  }

  return TRUE;
}
/* Add a single genre string to the tag list.
 *
 * @tag_str must be NUL-terminated and @len is its length. The string is
 * interpreted, in order, as:
 *   - a numeric genre id, when it consists only of digits and the id is
 *     known to gst_tag_id3_genre_get();
 *   - the special references "RX" (Remix) or "CR" (Cover), compared
 *     case-insensitively;
 *   - a free-form genre name otherwise (this includes digit strings whose
 *     id is not a known genre).
 *
 * Returns: the result of id3v2_tag_to_taglist(), or FALSE when @tag_str is
 * NULL or empty.
 */
static gboolean
id3v2_genre_string_to_taglist (ID3TagsWorking * work, const gchar * tag_name,
    const gchar * tag_str, gint len)
{
  g_return_val_if_fail (tag_str != NULL, FALSE);

  /* An empty string (e.g. from "()") carries no genre. Without this guard
   * it would pass the digit check and be read as genre id 0. */
  if (len <= 0)
    return FALSE;

  /* If it's a number, it might be a defined genre */
  if (id3v2_are_digits (tag_str, len)) {
    /* Keep the lookup result separate so that an unknown id does not
     * replace tag_str with NULL for the checks below. */
    const gchar *genre_name =
        gst_tag_id3_genre_get (strtol (tag_str, NULL, 10));

    if (genre_name != NULL)
      return id3v2_tag_to_taglist (work, tag_name, genre_name);
  }

  /* Otherwise it might be "RX" or "CR" */
  if (len == 2) {
    if (g_ascii_strncasecmp ("rx", tag_str, len) == 0)
      return id3v2_tag_to_taglist (work, tag_name, "Remix");

    if (g_ascii_strncasecmp ("cr", tag_str, len) == 0)
      return id3v2_tag_to_taglist (work, tag_name, "Cover");
  }

  /* Otherwise it's a string */
  return id3v2_tag_to_taglist (work, tag_name, tag_str);
}
/* Add every genre found in the strings of a genre frame to the tag list.
 *
 * Each element of @tag_fields (an array of gchar*) is processed in turn.
 * For tag versions up to 2.3.0 a string may start with any number of
 * parenthesised references such as "(3)(6)" or "(RX)", optionally followed
 * by free text; each reference and the trailing text are passed separately
 * to id3v2_genre_string_to_taglist(). A leading "((" stands for a literal
 * '(' at the start of the free text and ends the list of references.
 *
 * Returns: TRUE if at least one call to id3v2_genre_string_to_taglist()
 * returned TRUE, FALSE otherwise.
 */
static gboolean
id3v2_genre_fields_to_taglist (ID3TagsWorking * work, const gchar * tag_name,
    GArray * tag_fields)
{
  gboolean result = FALSE;
  guint i;

  for (i = 0; i < tag_fields->len; i++) {
    /* Index with i: reading element 0 every time would process the first
     * string repeatedly and never look at the others. */
    const gchar *tag_str = g_array_index (tag_fields, gchar *, i);
    gint len;

    if (tag_str == NULL)
      continue;

    len = strlen (tag_str);
    if (work->hdr.version <= 0x300) {   /* <= 2.3.0 */
      /* Check for genre numbers wrapped in parentheses, possibly
       * followed by a string */
      while (len >= 2) {
        gint pos;
        gboolean found = FALSE;

        /* Only a string that starts with '(' can hold a reference;
         * anything else is free text, even if it contains ')' later. */
        if (tag_str[0] != '(')
          break;

        /* Double parenthesis ends the references. Drop the first one so
         * the free text is emitted with a single leading '('. */
        if (tag_str[1] == '(') {
          tag_str++;
          len--;
          break;
        }

        for (pos = 1; pos < len; pos++) {
          if (tag_str[pos] == ')') {
            /* Copy the text between the parentheses so that the callee
             * receives a NUL-terminated string. */
            gchar *tmp_str = g_strndup (tag_str + 1, pos - 1);

            result |=
                id3v2_genre_string_to_taglist (work, tag_name, tmp_str,
                pos - 1);
            g_free (tmp_str);

            /* Continue after the closing parenthesis */
            tag_str += pos + 1;
            len -= pos + 1;
            found = TRUE;
            break;
          }
        }

        if (!found)
          break;                /* There was no closing parenthesis */
      }
    }

    /* Whatever is left is a plain genre string */
    if (len > 0)
      result |= id3v2_genre_string_to_taglist (work, tag_name, tag_str, len);
  }

  return result;
}
static void
parse_split_strings (guint8 encoding, gchar * data, gint data_size,
GArray ** out_fields)
{
GArray *fields = g_array_new (FALSE, TRUE, sizeof (gchar *));
gchar *field;
gint text_pos;
gint prev = 0;
g_return_if_fail (out_fields != NULL);
switch (encoding) {
case ID3V2_ENCODING_ISO8859:
for (text_pos = 4; text_pos < work->parse_size - 5; text_pos++) {
if (work->parse_data[text_pos] == 0) {
*field1 = g_convert ((gchar *) (work->parse_data + 4),
text_pos - 4, "UTF-8", "ISO-8859-1", NULL, NULL, NULL);
*field2 = g_convert ((gchar *) (work->parse_data + text_pos + 5),
work->parse_size - text_pos - 5,
for (text_pos = 0; text_pos < data_size; text_pos++) {
if (data[text_pos] == 0) {
field = g_convert (data + prev, text_pos - prev + 1,
"UTF-8", "ISO-8859-1", NULL, NULL, NULL);
break;
if (field)
g_array_append_val (fields, field);
prev = text_pos + 1;
}
}
if (data_size - prev > 0 && data[prev] != 0x00) {
field = g_convert (data + prev, data_size - prev,
"UTF-8", "ISO-8859-1", NULL, NULL, NULL);
if (field)
g_array_append_val (fields, field);
}
break;
case ID3V2_ENCODING_UTF8:
*field1 = g_strndup ((gchar *) (work->parse_data + 4),
work->parse_size - 4);
text_pos = 4 + strlen (*field1) + 1; /* Offset by one more for the null */
if (text_pos < work->parse_size) {
*field2 = g_strndup ((gchar *) (work->parse_data + text_pos),
work->parse_size - text_pos);
for (prev = 0, text_pos = 0; text_pos < data_size; text_pos++) {
if (data[text_pos]) {
field = g_strndup (data + prev, text_pos - prev + 1);
if (field)
g_array_append_val (fields, field);
prev = text_pos + 1;
}
}
if (data_size - prev > 0 && data[prev] != 0x00) {
field = g_strndup (data + prev, data_size - prev);
if (field)
g_array_append_val (fields, field);
}
break;
case ID3V2_ENCODING_UTF16:
case ID3V2_ENCODING_UTF16BE:
{
/* Find '\0\0' terminator */
for (text_pos = 4; text_pos < work->parse_size - 6; text_pos++) {
if (work->parse_data[text_pos] == 0 &&
work->parse_data[text_pos + 1] == 0) {
/* found our delimiter */
for (text_pos = 0; text_pos < data_size - 1; text_pos += 2) {
if (data[text_pos] == 0 && data[text_pos + 1] == 0) {
/* found a delimiter */
if (encoding == ID3V2_ENCODING_UTF16) {
*field1 = g_convert ((gchar *) (work->parse_data + 4),
text_pos - 4, "UTF-8", "UTF-16", NULL, NULL, NULL);
*field2 = g_convert ((gchar *) (work->parse_data + text_pos + 6),
work->parse_size - text_pos - 6,
field = g_convert (data + prev, text_pos - prev + 2,
"UTF-8", "UTF-16", NULL, NULL, NULL);
} else {
*field1 = g_convert ((gchar *) (work->parse_data + 4),
text_pos - 4, "UTF-8", "UTF-16BE", NULL, NULL, NULL);
*field2 = g_convert ((gchar *) (work->parse_data + text_pos + 6),
work->parse_size - text_pos - 6,
field = g_convert (data + prev, text_pos - prev + 2,
"UTF-8", "UTF-16BE", NULL, NULL, NULL);
}
if (field)
g_array_append_val (fields, field);
text_pos++; /* Advance to the 2nd NULL terminator */
prev = text_pos + 1;
break;
}
}
if (data_size - prev > 1 &&
(data[prev] != 0x00 || data[prev + 1] != 0x00)) {
/* There were 2 or more non-null chars left, convert those too */
if (encoding == ID3V2_ENCODING_UTF16) {
field = g_convert (data + prev, data_size - prev,
"UTF-8", "UTF-16", NULL, NULL, NULL);
} else {
field = g_convert (data + prev, data_size - prev,
"UTF-8", "UTF-16BE", NULL, NULL, NULL);
}
if (field)
g_array_append_val (fields, field);
}
break;
}
}
if (fields->len > 0)
*out_fields = fields;
else
g_array_free (fields, TRUE);
}
/* Release an array of strings: every gchar* element is freed with g_free()
 * and then the array itself is freed. Passing NULL is a no-op. */
static void
free_tag_strings (GArray * fields)
{
  guint idx;

  if (fields == NULL)
    return;

  for (idx = 0; idx < fields->len; idx++)
    g_free (g_array_index (fields, gchar *, idx));

  g_array_free (fields, TRUE);
}