From 81de6b7738b283b4694bfeaa1fedd71e59f2639f Mon Sep 17 00:00:00 2001 From: Guillaume Desmottes Date: Tue, 28 May 2024 09:24:51 +0200 Subject: [PATCH] subparse: fix typefind with small srt files The typefind code was rejecting content smaller than 128 bytes making it impossible to play files with very small srt files. But those can actually be properly detected so fix typefind to allow smaller content and try its best with it. Part-of: --- .../gst/subparse/gstsubparseelement.c | 37 ++++++++++++++----- .../tests/check/gst/typefindfunctions.c | 14 +++++++ .../tests/files/subrip-short.srt | 3 ++ 3 files changed, 44 insertions(+), 10 deletions(-) create mode 100644 subprojects/gst-plugins-base/tests/files/subrip-short.srt diff --git a/subprojects/gst-plugins-base/gst/subparse/gstsubparseelement.c b/subprojects/gst-plugins-base/gst/subparse/gstsubparseelement.c index 9c03ee10b8..97fb83d8ad 100644 --- a/subprojects/gst-plugins-base/gst/subparse/gstsubparseelement.c +++ b/subprojects/gst-plugins-base/gst/subparse/gstsubparseelement.c @@ -291,25 +291,36 @@ gst_sub_parse_type_find (GstTypeFind * tf, gpointer private) { GstSubParseFormat format; const guint8 *data; + guint64 data_len = 128, checked_len; GstCaps *caps; gchar *str; gchar *encoding = NULL; const gchar *end; - if (!(data = gst_type_find_peek (tf, 0, 129))) - return; + /* use the first 128 bytes for detection, if available */ + data = gst_type_find_peek (tf, 0, data_len); + if (!data) { + /* less than 128 bytes are available, try to detect using whatever is available */ + data_len = gst_type_find_get_length (tf); + if (data_len == 0) + return; + + data = gst_type_find_peek (tf, 0, data_len); + if (!data) + return; + } /* make sure string passed to _autodetect() is NUL-terminated */ - str = g_malloc0 (129); - memcpy (str, data, 128); + str = g_malloc0 (data_len + 1); + memcpy (str, data, data_len); - if ((encoding = gst_sub_parse_detect_encoding (str, 128)) != NULL) { + if ((encoding =
gst_sub_parse_detect_encoding (str, data_len)) != NULL) { gchar *converted_str; GError *err = NULL; gsize tmp; converted_str = - gst_sub_parse_gst_convert_to_utf8 (str, 128, encoding, &tmp, &err); + gst_sub_parse_gst_convert_to_utf8 (str, data_len, encoding, &tmp, &err); if (converted_str == NULL) { GST_DEBUG ("Encoding '%s' detected but conversion failed: %s", encoding, err->message); @@ -321,9 +332,15 @@ gst_sub_parse_type_find (GstTypeFind * tf, gpointer private) g_free (encoding); } - /* Check if at least the first 120 chars are valid UTF8, - * otherwise convert as always */ - if (!g_utf8_validate (str, 128, &end) && (end - str) < 120) { + /* Check if content is valid UTF-8, but allow the last 8 bytes to be invalid in + * case of an incomplete Unicode sequence. */ + if (data_len > 8) + checked_len = data_len - 8; + else + checked_len = data_len; + + if (!g_utf8_validate (str, data_len, &end) && (end - str) < checked_len) { + /* Invalid UTF-8, try converting */ gchar *converted_str; gsize tmp; const gchar *enc; @@ -337,7 +354,7 @@ gst_sub_parse_type_find (GstTypeFind * tf, gpointer private) } } converted_str = - gst_sub_parse_gst_convert_to_utf8 (str, 128, enc, &tmp, NULL); + gst_sub_parse_gst_convert_to_utf8 (str, data_len, enc, &tmp, NULL); if (converted_str != NULL) { g_free (str); str = converted_str; diff --git a/subprojects/gst-plugins-base/tests/check/gst/typefindfunctions.c b/subprojects/gst-plugins-base/tests/check/gst/typefindfunctions.c index 9c47bb4436..f5539ad2ab 100644 --- a/subprojects/gst-plugins-base/tests/check/gst/typefindfunctions.c +++ b/subprojects/gst-plugins-base/tests/check/gst/typefindfunctions.c @@ -512,11 +512,25 @@ GST_START_TEST (test_subparse) { const gchar *type; GstCaps *caps = NULL; + GstTypeFindProbability prob; + guint8 one_byte[] = { + 'A', + }; caps = typefind_test_file ("subrip.srt"); type = gst_structure_get_name (gst_caps_get_structure (caps, 0)); fail_unless_equals_string (type, "application/x-subtitle"); gst_caps_unref
(caps); + + caps = typefind_test_file ("subrip-short.srt"); + type = gst_structure_get_name (gst_caps_get_structure (caps, 0)); + fail_unless_equals_string (type, "application/x-subtitle"); + gst_caps_unref (caps); + + /* check that one byte content does not crash subparse typefinder */ + prob = 0; + caps = typefind_data (one_byte, sizeof (one_byte), &prob); + fail_unless (caps == NULL); } GST_END_TEST; diff --git a/subprojects/gst-plugins-base/tests/files/subrip-short.srt b/subprojects/gst-plugins-base/tests/files/subrip-short.srt new file mode 100644 index 0000000000..42ee5ffba3 --- /dev/null +++ b/subprojects/gst-plugins-base/tests/files/subrip-short.srt @@ -0,0 +1,3 @@ +1 +00:00:01,000 --> 00:00:02,000 +One