typefind: typefind UTF-16 and UTF-32

This avoids the MP3 typefinder from getting the highest score
every time it thinks there's something it might possibly be
able to parse.

https://bugzilla.gnome.org/show_bug.cgi?id=607619
This commit is contained in:
Vincent Penquerc'h 2011-09-30 20:00:50 +01:00
parent c554463025
commit e67aa28de9
2 changed files with 158 additions and 1 deletions

View file

@ -7,7 +7,7 @@ libgsttypefindfunctions_la_CFLAGS = \
libgsttypefindfunctions_la_LDFLAGS = $(GST_PLUGIN_LDFLAGS)
libgsttypefindfunctions_la_LIBADD = \
$(top_builddir)/gst-libs/gst/pbutils/libgstpbutils-@GST_MAJORMINOR@.la \
$(GST_LIBS) $(GIO_LIBS)
$(GST_BASE_LIBS) $(GST_LIBS) $(GIO_LIBS)
libgsttypefindfunctions_la_LIBTOOLFLAGS = --tag=disable-static

View file

@ -41,6 +41,7 @@
#include <ctype.h>
#include <gst/pbutils/pbutils.h>
#include <gst/base/gstbytereader.h>
GST_DEBUG_CATEGORY_STATIC (type_find_debug);
#define GST_CAT_DEFAULT type_find_debug
@ -206,6 +207,157 @@ utf8_type_find (GstTypeFind * tf, gpointer unused)
gst_type_find_suggest (tf, (start_prob + mid_prob) / 2, UTF8_CAPS);
}
/*** text/utf-16 and text/utf-32} ***/
/* While UTF-8 is unicode too, using text/plain for UTF-16 and UTF-32
is going to break stuff. */
typedef struct
{
size_t bomlen;
const char *const bom;
gboolean (*checker) (const guint8 *, gint, gint);
int boost;
int endianness;
} GstUnicodeTester;
static gboolean
check_utf16 (const guint8 * data, gint len, gint endianness)
{
GstByteReader br;
guint16 high, low;
if (len & 1)
return FALSE;
gst_byte_reader_init (&br, data, len);
while (len >= 2) {
/* test first for a single 16 bit value in the BMP */
if (endianness == G_BIG_ENDIAN)
gst_byte_reader_get_uint16_be (&br, &high);
else
gst_byte_reader_get_uint16_le (&br, &high);
if (high >= 0xD800 && high <= 0xDBFF) {
/* start of a surrogate pair */
if (len < 4)
return FALSE;
len -= 2;
if (endianness == G_BIG_ENDIAN)
gst_byte_reader_get_uint16_be (&br, &low);
else
gst_byte_reader_get_uint16_le (&br, &low);
if (low >= 0xDC00 && low <= 0xDFFF) {
/* second half of the surrogate pair */
} else
return FALSE;
} else {
if (high >= 0xDC00 && high <= 0xDFFF)
return FALSE;
}
len -= 2;
}
return TRUE;
}
static gboolean
check_utf32 (const guint8 * data, gint len, gint endianness)
{
if (len & 3)
return FALSE;
while (len > 3) {
guint32 v;
if (endianness == G_BIG_ENDIAN)
v = (data[0] << 24) | (data[1] << 16) | (data[2] << 8) | data[3];
else
v = (data[3] << 24) | (data[2] << 16) | (data[1] << 8) | data[0];
if (v >= 0x10FFFF)
return FALSE;
data += 4;
len -= 4;
}
return TRUE;
}
static void
unicode_type_find (GstTypeFind * tf, const GstUnicodeTester * tester,
guint n_tester, const char *media_type)
{
size_t n;
gint len = 4;
const guint8 *data = gst_type_find_peek (tf, 0, len);
int prob = -1;
const gint max_scan_size = 256 * 1024;
int endianness;
if (!data) {
len = 2;
data = gst_type_find_peek (tf, 0, len);
if (!data)
return;
}
/* find a large enough size that works */
while (len < max_scan_size) {
size_t newlen = len << 1;
const guint8 *newdata = gst_type_find_peek (tf, 0, newlen);
if (!newdata)
break;
len = newlen;
data = newdata;
}
for (n = 0; n < n_tester; ++n) {
int bom_boost = 0, tmpprob;
if (len >= tester[n].bomlen) {
if (!memcmp (data, tester[n].bom, tester[n].bomlen))
bom_boost = tester[n].boost;
}
if (!(*tester[n].checker) (data, len, tester[n].endianness))
continue;
tmpprob = GST_TYPE_FIND_POSSIBLE - 20 + bom_boost;
if (tmpprob > prob) {
prob = tmpprob;
endianness = tester[n].endianness;
}
}
if (prob > 0) {
GST_DEBUG ("This is valid %s %s", media_type,
endianness == G_BIG_ENDIAN ? "be" : "le");
gst_type_find_suggest_simple (tf, prob, media_type,
"endianness", G_TYPE_INT, endianness, NULL);
}
}
static GstStaticCaps utf16_caps = GST_STATIC_CAPS ("text/utf-16");
#define UTF16_CAPS gst_static_caps_get(&utf16_caps)
static void
utf16_type_find (GstTypeFind * tf, gpointer unused)
{
static const GstUnicodeTester utf16tester[2] = {
{2, "\xff\xfe", check_utf16, 10, G_LITTLE_ENDIAN},
{2, "\xfe\xff", check_utf16, 20, G_BIG_ENDIAN},
};
unicode_type_find (tf, utf16tester, G_N_ELEMENTS (utf16tester),
"text/utf-16");
}
static GstStaticCaps utf32_caps = GST_STATIC_CAPS ("text/utf-32");
#define UTF32_CAPS gst_static_caps_get(&utf32_caps)
static void
utf32_type_find (GstTypeFind * tf, gpointer unused)
{
static const GstUnicodeTester utf32tester[2] = {
{4, "\xff\xfe\x00\x00", check_utf32, 10, G_LITTLE_ENDIAN},
{4, "\x00\x00\xfe\xff", check_utf32, 20, G_BIG_ENDIAN}
};
unicode_type_find (tf, utf32tester, G_N_ELEMENTS (utf32tester),
"text/utf-32");
}
/*** text/uri-list ***/
static GstStaticCaps uri_caps = GST_STATIC_CAPS ("text/uri-list");
@ -4262,6 +4414,7 @@ plugin_init (GstPlugin * plugin)
static const gchar *rm_exts[] = { "ra", "ram", "rm", "rmvb", NULL };
static const gchar *swf_exts[] = { "swf", "swfl", NULL };
static const gchar *utf8_exts[] = { "txt", NULL };
static const gchar *unicode_exts[] = { "txt", NULL };
static const gchar *wav_exts[] = { "wav", NULL };
static const gchar *aiff_exts[] = { "aiff", "aif", "aifc", NULL };
static const gchar *svx_exts[] = { "iff", "svx", NULL };
@ -4436,6 +4589,10 @@ plugin_init (GstPlugin * plugin)
flv_exts, "FLV", 3, GST_TYPE_FIND_MAXIMUM);
TYPE_FIND_REGISTER (plugin, "text/plain", GST_RANK_MARGINAL, utf8_type_find,
utf8_exts, UTF8_CAPS, NULL, NULL);
TYPE_FIND_REGISTER (plugin, "text/utf-16", GST_RANK_MARGINAL, utf16_type_find,
unicode_exts, UTF16_CAPS, NULL, NULL);
TYPE_FIND_REGISTER (plugin, "text/utf-32", GST_RANK_MARGINAL, utf32_type_find,
unicode_exts, UTF32_CAPS, NULL, NULL);
TYPE_FIND_REGISTER (plugin, "text/uri-list", GST_RANK_MARGINAL, uri_type_find,
uri_exts, URI_CAPS, NULL, NULL);
TYPE_FIND_REGISTER (plugin, "application/x-hls", GST_RANK_MARGINAL,