mirror of
https://gitlab.freedesktop.org/gstreamer/gstreamer.git
synced 2024-11-18 15:51:11 +00:00
gst/typefind/gsttypefindfunctions.c: Make plain/text typefinder more conservative: firstly, check for embedded zeroes...
Original commit message from CVS: * gst/typefind/gsttypefindfunctions.c: (utf8_type_find_count_embedded_zeroes), (utf8_type_find_have_valid_utf8_at_offset), (utf8_type_find): Make plain/text typefinder more conservative: firstly, check for embedded zeroes, which are perfectly valid UTF-8 characters, but also a fairly good sign that something is not a plain text file; secondly, probe into the middle of the file if possible. If we can't probe into the middle, limit the probability value to be returned to TYPE_FIND_POSSIBLE (see #333900).
This commit is contained in:
parent
1d629c12d7
commit
ca6e20ae1a
2 changed files with 82 additions and 11 deletions
12
ChangeLog
12
ChangeLog
|
@ -1,3 +1,15 @@
|
|||
2006-03-08 Tim-Philipp Müller <tim at centricular dot net>
|
||||
|
||||
* gst/typefind/gsttypefindfunctions.c:
|
||||
(utf8_type_find_count_embedded_zeroes),
|
||||
(utf8_type_find_have_valid_utf8_at_offset), (utf8_type_find):
|
||||
Make plain/text typefinder more conservative: firstly, check
|
||||
for embedded zeroes, which are perfectly valid UTF-8 characters,
|
||||
but also a fairly good sign that something is not a plain text
|
||||
file; secondly, probe into the middle of the file if possible.
|
||||
If we can't probe into the middle, limit the probability value
|
||||
to be returned to TYPE_FIND_POSSIBLE (see #333900).
|
||||
|
||||
2006-03-08 Michael Smith <msmith@fluendo.com>
|
||||
|
||||
* gst/typefind/gsttypefindfunctions.c: (plugin_init):
|
||||
|
|
|
@ -45,35 +45,94 @@ static gboolean xml_check_first_element (GstTypeFind * tf,
|
|||
static GstStaticCaps utf8_caps = GST_STATIC_CAPS ("text/plain");
|
||||
|
||||
#define UTF8_CAPS gst_static_caps_get(&utf8_caps)
|
||||
static void
|
||||
utf8_type_find (GstTypeFind * tf, gpointer unused)
|
||||
|
||||
/* Count the zero bytes in the first @size bytes of @data.
 *
 * data: buffer to scan; indices size - 1 down to 0 are read
 * size: number of bytes to scan; when 0 nothing is read and 0 is returned
 *
 * Returns: how many of the scanned bytes are equal to 0. */
static guint
|
||||
utf8_type_find_count_embedded_zeroes (const gchar * data, guint size)
|
||||
{
|
||||
guint num = 0;
|
||||
|
||||
/* walk backwards from the last byte; size doubles as the loop counter */
while (size > 0) {
|
||||
if (data[size - 1] == 0)
|
||||
++num;
|
||||
--size;
|
||||
}
|
||||
|
||||
return num;
|
||||
}
|
||||
|
||||
/* Check whether the data at @offset looks like plain UTF-8 text.
 *
 * NOTE(review): this span is a rendered diff hunk without +/- markers, so
 * lines of the replaced implementation appear to be interleaved with the new
 * ones: the `guint size = 1024;` declaration, the xml check with a bare
 * `return;`, the `while (probability > step)` loop that peeks at offset 0,
 * the gst_type_find_suggest() call and the bare `return;` that follows it.
 * Confirm against the repository before treating this text as compilable.
 *
 * Reading only the lines that fit the gboolean signature: `size` bytes are
 * peeked at @offset. While the peek yields no data, `size` is halved and
 * `probability` is lowered by `step`, until `probability` is no longer
 * above `step` or `size` is no longer above `min_size`. The first peek that
 * yields data decides the result.
 *
 * tf:     typefind handle passed through to gst_type_find_peek()
 * offset: position at which to peek
 * prob:   out parameter; receives the current `probability` on success and
 *         0 on every FALSE return
 *
 * Returns: TRUE if the peeked data validated (or only its tail failed) and
 * held at most two zero bytes, FALSE otherwise. */
static gboolean
|
||||
utf8_type_find_have_valid_utf8_at_offset (GstTypeFind * tf, guint64 offset,
|
||||
GstTypeFindProbability * prob)
|
||||
{
|
||||
guint8 *data;
|
||||
|
||||
/* randomly decided values */
|
||||
guint size = 1024; /* starting size */
|
||||
guint min_size = 16; /* minimum size */
|
||||
guint size = 32 * 1024; /* starting size */
|
||||
guint probability = 95; /* starting probability */
|
||||
guint step = 10; /* how much we reduce probability in each
|
||||
* iteration */
|
||||
|
||||
/* leave xml to the xml typefinders */
|
||||
if (xml_check_first_element (tf, "", 0))
|
||||
return;
|
||||
|
||||
while (probability > step) {
|
||||
data = gst_type_find_peek (tf, 0, size);
|
||||
while (probability > step && size > min_size) {
|
||||
data = gst_type_find_peek (tf, offset, size);
|
||||
if (data) {
|
||||
gchar *end;
|
||||
gchar *start = (gchar *) data;
|
||||
|
||||
/* accept either a fully valid buffer or one whose first invalid byte lies
 * within the last few bytes of the peeked range */
if (g_utf8_validate (start, size, (const gchar **) &end) || (end - start + 4 > size)) { /* allow last char to be cut off */
|
||||
gst_type_find_suggest (tf, probability, UTF8_CAPS);
|
||||
/* embedded zeroes are a sure sign that this isn't a plain text file */
|
||||
if (utf8_type_find_count_embedded_zeroes (start, size) <= 2) {
|
||||
*prob = probability;
|
||||
return TRUE;
|
||||
}
|
||||
}
|
||||
return;
|
||||
/* data was available but failed validation or held more than two zeroes */
*prob = 0;
|
||||
return FALSE;
|
||||
}
|
||||
/* peek yielded no data: retry with half the size at a lower probability */
size /= 2;
|
||||
probability -= step;
|
||||
}
|
||||
/* no peek yielded data before the loop limits were reached */
*prob = 0;
|
||||
return FALSE;
|
||||
}
|
||||
|
||||
/* Typefind function suggesting UTF8_CAPS for streams that look like text.
 *
 * Steps, in order:
 *  - return without a suggestion when xml_check_first_element() reports a
 *    match;
 *  - return without a suggestion when the data at offset 0 fails
 *    utf8_type_find_have_valid_utf8_at_offset();
 *  - when the stream length is 0 or (guint64) -1, suggest the start
 *    probability capped at GST_TYPE_FIND_POSSIBLE;
 *  - when the length is below 64 * 1024, suggest the start probability
 *    as-is, without probing the middle;
 *  - otherwise probe at length / 2 and, if that also passes, suggest the
 *    average of the start and middle probabilities.
 *
 * tf:     typefind handle used for peeking, length queries and suggestions
 * unused: not read by this function */
static void
|
||||
utf8_type_find (GstTypeFind * tf, gpointer unused)
|
||||
{
|
||||
GstTypeFindProbability start_prob, mid_prob;
|
||||
guint64 length;
|
||||
|
||||
/* leave xml to the xml typefinders */
|
||||
if (xml_check_first_element (tf, "", 0))
|
||||
return;
|
||||
|
||||
/* check beginning of stream */
|
||||
if (!utf8_type_find_have_valid_utf8_at_offset (tf, 0, &start_prob))
|
||||
return;
|
||||
|
||||
GST_LOG ("start is plain text with probability of %u", start_prob);
|
||||
|
||||
/* POSSIBLE is the highest probability we ever return if we can't
|
||||
* probe into the middle of the file and don't know its length */
|
||||
|
||||
length = gst_type_find_get_length (tf);
|
||||
/* presumably 0 and (guint64) -1 are how an unknown length is reported;
 * verify against gst_type_find_get_length() */
if (length == 0 || length == (guint64) - 1) {
|
||||
gst_type_find_suggest (tf, MIN (start_prob, GST_TYPE_FIND_POSSIBLE),
|
||||
UTF8_CAPS);
|
||||
return;
|
||||
}
|
||||
|
||||
/* short streams: the start probe alone decides, no middle probe is made */
if (length < 64 * 1024) {
|
||||
gst_type_find_suggest (tf, start_prob, UTF8_CAPS);
|
||||
return;
|
||||
}
|
||||
|
||||
/* check middle of stream */
|
||||
/* NOTE(review): length / 2 is not aligned to a UTF-8 character boundary; if
 * it lands inside a multi-byte sequence the helper may reject otherwise
 * valid text -- confirm */
if (!utf8_type_find_have_valid_utf8_at_offset (tf, length / 2, &mid_prob))
|
||||
return;
|
||||
|
||||
GST_LOG ("middle is plain text with probability of %u", mid_prob);
|
||||
gst_type_find_suggest (tf, (start_prob + mid_prob) / 2, UTF8_CAPS);
|
||||
}
|
||||
|
||||
/*** text/uri-list ***/
|
||||
|
|
Loading…
Reference in a new issue