subparse: fix typefind with small srt files

The typefind code was rejecting content smaller than 128 bytes, making it
impossible to play files with very small srt files.
However, such files can actually be detected properly, so fix typefind to
accept smaller content and do its best with it.

Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/6937>
This commit is contained in:
Guillaume Desmottes 2024-05-28 09:24:51 +02:00
parent f7c8f4bb26
commit 81de6b7738
3 changed files with 44 additions and 10 deletions

View file

@ -291,25 +291,36 @@ gst_sub_parse_type_find (GstTypeFind * tf, gpointer private)
{
GstSubParseFormat format;
const guint8 *data;
guint64 data_len = 128, checked_len;
GstCaps *caps;
gchar *str;
gchar *encoding = NULL;
const gchar *end;
if (!(data = gst_type_find_peek (tf, 0, 129)))
return;
/* use the first 128 bytes for detection, if available */
data = gst_type_find_peek (tf, 0, data_len);
if (!data) {
/* less than 128 bytes are available, try to detect using whatever is available */
data_len = gst_type_find_get_length (tf);
if (data_len == 0)
return;
data = gst_type_find_peek (tf, 0, data_len);
if (!data)
return;
}
/* make sure string passed to _autodetect() is NUL-terminated */
str = g_malloc0 (129);
memcpy (str, data, 128);
str = g_malloc0 (data_len + 1);
memcpy (str, data, data_len);
if ((encoding = gst_sub_parse_detect_encoding (str, 128)) != NULL) {
if ((encoding = gst_sub_parse_detect_encoding (str, data_len)) != NULL) {
gchar *converted_str;
GError *err = NULL;
gsize tmp;
converted_str =
gst_sub_parse_gst_convert_to_utf8 (str, 128, encoding, &tmp, &err);
gst_sub_parse_gst_convert_to_utf8 (str, data_len, encoding, &tmp, &err);
if (converted_str == NULL) {
GST_DEBUG ("Encoding '%s' detected but conversion failed: %s", encoding,
err->message);
@ -321,9 +332,15 @@ gst_sub_parse_type_find (GstTypeFind * tf, gpointer private)
g_free (encoding);
}
/* Check if at least the first 120 chars are valid UTF8,
* otherwise convert as always */
if (!g_utf8_validate (str, 128, &end) && (end - str) < 120) {
/* Check that the content is valid UTF-8, but tolerate invalid data in the
 * last 8 bytes, as the buffer may end with an incomplete Unicode sequence. */
if (data_len > 8)
checked_len = data_len - 8;
else
checked_len = data_len;
if (!g_utf8_validate (str, data_len, &end) && (end - str) < checked_len) {
/* Invalid UTF-8, try converting */
gchar *converted_str;
gsize tmp;
const gchar *enc;
@ -337,7 +354,7 @@ gst_sub_parse_type_find (GstTypeFind * tf, gpointer private)
}
}
converted_str =
gst_sub_parse_gst_convert_to_utf8 (str, 128, enc, &tmp, NULL);
gst_sub_parse_gst_convert_to_utf8 (str, data_len, enc, &tmp, NULL);
if (converted_str != NULL) {
g_free (str);
str = converted_str;

View file

@ -512,11 +512,25 @@ GST_START_TEST (test_subparse)
{
/* Exercises typefinding of subtitle content: two SRT fixture files must be
 * reported as "application/x-subtitle", and a one-byte buffer must yield no
 * caps at all. */
const gchar *type;
GstCaps *caps = NULL;
GstTypeFindProbability prob;
/* smallest non-empty input: a single byte that is not a valid subtitle */
guint8 one_byte[] = {
'A',
};
/* regular SRT fixture */
caps = typefind_test_file ("subrip.srt");
type = gst_structure_get_name (gst_caps_get_structure (caps, 0));
fail_unless_equals_string (type, "application/x-subtitle");
gst_caps_unref (caps);
/* short SRT fixture (presumably smaller than the 128 bytes the typefinder
 * used to require -- confirm against the fixture file) */
caps = typefind_test_file ("subrip-short.srt");
type = gst_structure_get_name (gst_caps_get_structure (caps, 0));
fail_unless_equals_string (type, "application/x-subtitle");
gst_caps_unref (caps);
/* check that one byte content does not crash subparse typefinder;
 * no caps are expected to be returned for it */
prob = 0;
caps = typefind_data (one_byte, sizeof (one_byte), &prob);
fail_unless (caps == NULL);
}
GST_END_TEST;

View file

@ -0,0 +1,3 @@
1
00:00:01,000 --> 00:00:02,000
One