gst/typefind/gsttypefindfunctions.c: Make plain/text typefinder more conservative: firstly, check for embedded zeroes...

Original commit message from CVS:
* gst/typefind/gsttypefindfunctions.c:
(utf8_type_find_count_embedded_zeroes),
(utf8_type_find_have_valid_utf8_at_offset), (utf8_type_find):
Make plain/text typefinder more conservative: firstly, check
for embedded zeroes, which are perfectly valid UTF-8 characters,
but also a fairly good sign that something is not a plain text
file; secondly, probe into the middle of the file if possible.
If we can't probe into the middle, limit the probability value
to be returned to TYPE_FIND_POSSIBLE (see #333900).
This commit is contained in:
Tim-Philipp Müller 2006-03-08 17:11:29 +00:00
parent 1d629c12d7
commit ca6e20ae1a
2 changed files with 82 additions and 11 deletions

View file

@ -1,3 +1,15 @@
2006-03-08 Tim-Philipp Müller <tim at centricular dot net>
* gst/typefind/gsttypefindfunctions.c:
(utf8_type_find_count_embedded_zeroes),
(utf8_type_find_have_valid_utf8_at_offset), (utf8_type_find):
Make plain/text typefinder more conservative: firstly, check
for embedded zeroes, which are perfectly valid UTF-8 characters,
but also a fairly good sign that something is not a plain text
file; secondly, probe into the middle of the file if possible.
If we can't probe into the middle, limit the probability value
to be returned to TYPE_FIND_POSSIBLE (see #333900).
2006-03-08 Michael Smith <msmith@fluendo.com>
* gst/typefind/gsttypefindfunctions.c: (plugin_init):

View file

@ -45,35 +45,94 @@ static gboolean xml_check_first_element (GstTypeFind * tf,
static GstStaticCaps utf8_caps = GST_STATIC_CAPS ("text/plain");
#define UTF8_CAPS gst_static_caps_get(&utf8_caps)
static void
utf8_type_find (GstTypeFind * tf, gpointer unused)
/* Returns how many of the first @size bytes starting at @data are zero
 * bytes. The buffer is not modified. */
static guint
utf8_type_find_count_embedded_zeroes (const gchar * data, guint size)
{
  guint zeroes = 0;
  guint i;

  for (i = 0; i < size; i++) {
    if (data[i] == '\0')
      zeroes++;
  }

  return zeroes;
}
/*
 * utf8_type_find_have_valid_utf8_at_offset:
 * @tf: typefind handle the data is peeked from
 * @offset: byte offset in the stream at which to probe
 * @prob: set to the resulting probability on success, and to 0 on failure
 *
 * Peeks at a chunk of data at @offset and checks whether it looks like
 * plain UTF-8 text. The first peek asks for 32 kB; each time a peek fails
 * the requested size is halved and the probability is lowered by one step,
 * until the probability or the size drops to its lower limit.
 *
 * Only the first chunk that can actually be peeked is examined: if it does
 * not pass the checks the function fails right away instead of retrying
 * with a smaller chunk.
 *
 * Returns: TRUE if the examined chunk validates as UTF-8 (a character cut
 * off at the very end of the chunk is tolerated) and contains at most two
 * zero bytes; FALSE otherwise, or if no data could be peeked at all.
 */
static gboolean
utf8_type_find_have_valid_utf8_at_offset (GstTypeFind * tf, guint64 offset,
    GstTypeFindProbability * prob)
{
  guint8 *data;

  /* randomly decided values */
  guint min_size = 16;          /* minimum size */
  guint size = 32 * 1024;       /* starting size */
  guint probability = 95;       /* starting probability */
  guint step = 10;              /* how much we reduce probability in each
                                 * iteration */

  while (probability > step && size > min_size) {
    data = gst_type_find_peek (tf, offset, size);
    if (data) {
      gchar *end;
      gchar *start = (gchar *) data;

      /* allow last char to be cut off: a validation failure within the
       * last few bytes of the chunk is treated as a truncated character */
      if (g_utf8_validate (start, size, (const gchar **) &end) ||
          (end - start + 4 > size)) {
        /* more than a couple of embedded zeroes is taken as a sign that
         * this isn't a plain text file */
        if (utf8_type_find_count_embedded_zeroes (start, size) <= 2) {
          *prob = probability;
          return TRUE;
        }
      }
      *prob = 0;
      return FALSE;
    }
    /* nothing to peek at this size: retry with less data, less confidence */
    size /= 2;
    probability -= step;
  }
  *prob = 0;
  return FALSE;
}
/*
 * utf8_type_find:
 * @tf: typefind handle
 * @unused: not used
 *
 * Typefinder for text/plain. Suggests UTF8_CAPS when the start of the
 * stream and, for streams of known length of at least 64 kB, also the
 * middle of the stream look like UTF-8 text. Makes no suggestion if the
 * XML check matches or if any probe fails.
 */
static void
utf8_type_find (GstTypeFind * tf, gpointer unused)
{
  GstTypeFindProbability head_prob, centre_prob;
  guint64 stream_len;

  /* XML is left to the dedicated xml typefinders */
  if (xml_check_first_element (tf, "", 0))
    return;

  /* the beginning of the stream has to pass, or we give up right here */
  if (!utf8_type_find_have_valid_utf8_at_offset (tf, 0, &head_prob))
    return;

  GST_LOG ("start is plain text with probability of %u", head_prob);

  stream_len = gst_type_find_get_length (tf);

  if (stream_len == 0 || stream_len == (guint64) - 1) {
    /* length unknown, so the middle of the file cannot be probed: never
     * claim more than POSSIBLE in that case */
    gst_type_find_suggest (tf, MIN (head_prob, GST_TYPE_FIND_POSSIBLE),
        UTF8_CAPS);
  } else if (stream_len < 64 * 1024) {
    /* short stream: go with the result for the start alone */
    gst_type_find_suggest (tf, head_prob, UTF8_CAPS);
  } else if (utf8_type_find_have_valid_utf8_at_offset (tf, stream_len / 2,
          &centre_prob)) {
    /* start and middle both passed: suggest the mean of the two */
    GST_LOG ("middle is plain text with probability of %u", centre_prob);
    gst_type_find_suggest (tf, (head_prob + centre_prob) / 2, UTF8_CAPS);
  }
}
/*** text/uri-list ***/