subparse: fix typefind with small srt files

The typefind code rejected content smaller than 128 bytes, making it impossible to play files with very small srt subtitles. Such content can actually be detected properly, so fix typefind to accept smaller content and do its best with whatever is available. Part-of: <https://gitlab.freedesktop.org/gstreamer/gstreamer/-/merge_requests/6937>
2025-03-30 12:49:40 +00:00 · 2024-05-28 09:24:51 +02:00 · 2024-05-28 09:24:51 +02:00 · 81de6b7738
commit 81de6b7738
parent f7c8f4bb26
3 changed files with 44 additions and 10 deletions
--- a/subprojects/gst-plugins-base/gst/subparse/gstsubparseelement.c
+++ b/subprojects/gst-plugins-base/gst/subparse/gstsubparseelement.c
@ -291,25 +291,36 @@ gst_sub_parse_type_find (GstTypeFind * tf, gpointer private)
 {
  GstSubParseFormat format;
  const guint8 *data;
+  guint64 data_len = 128, checked_len;
  GstCaps *caps;
  gchar *str;
  gchar *encoding = NULL;
  const gchar *end;

-  if (!(data = gst_type_find_peek (tf, 0, 129)))
-    return;
+  /* use the first 128 bytes for detection, if available */
+  data = gst_type_find_peek (tf, 0, data_len);
+  if (!data) {
+    /* fewer than 128 bytes are available, try to detect using whatever is available */
+    data_len = gst_type_find_get_length (tf);
+    if (data_len == 0)
+      return;
+
+    data = gst_type_find_peek (tf, 0, data_len);
+    if (!data)
+      return;
+  }

  /* make sure string passed to _autodetect() is NUL-terminated */
-  str = g_malloc0 (129);
-  memcpy (str, data, 128);
+  str = g_malloc0 (data_len + 1);
+  memcpy (str, data, data_len);

-  if ((encoding = gst_sub_parse_detect_encoding (str, 128)) != NULL) {
+  if ((encoding = gst_sub_parse_detect_encoding (str, data_len)) != NULL) {
    gchar *converted_str;
    GError *err = NULL;
    gsize tmp;

    converted_str =
-        gst_sub_parse_gst_convert_to_utf8 (str, 128, encoding, &tmp, &err);
+        gst_sub_parse_gst_convert_to_utf8 (str, data_len, encoding, &tmp, &err);
    if (converted_str == NULL) {
      GST_DEBUG ("Encoding '%s' detected but conversion failed: %s", encoding,
          err->message);
@ -321,9 +332,15 @@ gst_sub_parse_type_find (GstTypeFind * tf, gpointer private)
    g_free (encoding);
  }

-  /* Check if at least the first 120 chars are valid UTF8,
-   * otherwise convert as always */
-  if (!g_utf8_validate (str, 128, &end) && (end - str) < 120) {
+  /* Check if content is valid UTF-8. When more than 8 bytes are available,
+   * tolerate invalid data starting within the last 8 bytes, in case the
+   * content ends with an incomplete Unicode sequence. */
+  if (data_len > 8)
+    checked_len = data_len - 8;
+  else
+    checked_len = data_len;
+
+  if (!g_utf8_validate (str, data_len, &end) && (end - str) < checked_len) {
+    /* Invalid UTF-8, try converting */
    gchar *converted_str;
    gsize tmp;
    const gchar *enc;
@ -337,7 +354,7 @@ gst_sub_parse_type_find (GstTypeFind * tf, gpointer private)
      }
    }
    converted_str =
-        gst_sub_parse_gst_convert_to_utf8 (str, 128, enc, &tmp, NULL);
+        gst_sub_parse_gst_convert_to_utf8 (str, data_len, enc, &tmp, NULL);
    if (converted_str != NULL) {
      g_free (str);
      str = converted_str;
--- a/subprojects/gst-plugins-base/tests/check/gst/typefindfunctions.c
+++ b/subprojects/gst-plugins-base/tests/check/gst/typefindfunctions.c
@ -512,11 +512,25 @@ GST_START_TEST (test_subparse)
 {
  const gchar *type;
  GstCaps *caps = NULL;
+  GstTypeFindProbability prob;
+  guint8 one_byte[] = {
+    'A',
+  };

  caps = typefind_test_file ("subrip.srt");
  type = gst_structure_get_name (gst_caps_get_structure (caps, 0));
  fail_unless_equals_string (type, "application/x-subtitle");
  gst_caps_unref (caps);
+
+  caps = typefind_test_file ("subrip-short.srt");
+  type = gst_structure_get_name (gst_caps_get_structure (caps, 0));
+  fail_unless_equals_string (type, "application/x-subtitle");
+  gst_caps_unref (caps);
+
+  /* check that one byte content does not crash subparse typefinder */
+  prob = 0;
+  caps = typefind_data (one_byte, sizeof (one_byte), &prob);
+  fail_unless (caps == NULL);
 }

 GST_END_TEST;
--- a/subprojects/gst-plugins-base/tests/files/subrip-short.srt
+++ b/subprojects/gst-plugins-base/tests/files/subrip-short.srt
@ -0,0 +1,3 @@
+1
+00:00:01,000 --> 00:00:02,000
+One