splitfilesrc: specify filenames via normal wildcards instead of regular expressions

Less cracktastic in the end.
2024-12-23 08:46:40 +00:00 · 2011-11-30 19:00:42 +00:00 · 2011-11-30 19:00:42 +00:00 · 0584ae8f98
commit 0584ae8f98
parent e6c4979a42
4 changed files with 424 additions and 36 deletions
--- a/gst/multifile/Makefile.am
+++ b/gst/multifile/Makefile.am
@ -5,13 +5,14 @@ libgstmultifile_la_SOURCES = \
 	gstmultifilesink.c   \
 	gstmultifilesrc.c    \
 	gstmultifile.c       \
-	gstsplitfilesrc.c
+	gstsplitfilesrc.c    \
 	patternspec.c
 libgstmultifile_la_CFLAGS = $(GST_BASE_CFLAGS) $(GST_CFLAGS) $(GIO_CFLAGS)
 libgstmultifile_la_LIBADD = $(GST_BASE_LIBS) $(GST_LIBS) $(GIO_LIBS)
 libgstmultifile_la_LDFLAGS = $(GST_PLUGIN_LDFLAGS)
 libgstmultifile_la_LIBTOOLFLAGS = --tag=disable-static
-noinst_HEADERS = gstmultifilesrc.h gstmultifilesink.h gstsplitfilesrc.h
+noinst_HEADERS = gstmultifilesrc.h gstmultifilesink.h gstsplitfilesrc.h patternspec.h
 Android.mk: Makefile.am $(BUILT_SOURCES)
--- a/gst/multifile/gstsplitfilesrc.c
+++ b/gst/multifile/gstsplitfilesrc.c
@ -25,17 +25,14 @@
 * had to be split into multiple parts due to filesystem file size limitations,
 * for example.
 *
- * The files to select are chosen via the location property, which takes a
+ * The files to select are chosen via the location property, which supports
- * regular expression (note: shell-style wildcards will not work). If the
+ * (and expects) shell-style wildcards (but only for the filename, not for
- * location is an absolute path or contains directory components, only the
+ * directories). The results will be sorted.
 * base file name part will be considered a regular expression. The results
 * will be sorted. The location may include directory components, but the
 * regular expression to select the files can only be in the filename part.
 *
 * <refsect2>
 * <title>Example launch line</title>
 * |[
- * gst-launch splitfilesrc location="/path/to/part-.*.mpg" ! decodebin ! ... \
+ * gst-launch splitfilesrc location="/path/to/part-*.mpg" ! decodebin ! ... \
 * ]| Plays the different parts as if they were one single MPEG file.
 * </refsect2>
 *
@ -51,9 +48,16 @@
 #endif
 #include "gstsplitfilesrc.h"
 #include "patternspec.h"
 #include <string.h>
 #ifdef G_OS_WIN32
 #define DEFAULT_PATTERN_MATCH_MODE MATCH_MODE_UTF8
 #else
 #define DEFAULT_PATTERN_MATCH_MODE MATCH_MODE_AUTO
 #endif
 enum
 {
  PROP_LOCATION = 1
@ -105,6 +109,12 @@ gst_split_file_src_base_init (gpointer g_class)
      "Tim-Philipp Müller <tim.muller@collabora.co.uk>");
 }
 #ifdef G_OS_WIN32
 #define WIN32_BLURB " Location string must be in UTF-8 encoding (on Windows)."
 #else
 #define WIN32_BLURB             /* nothing */
 #endif
 static void
 gst_split_file_src_class_init (GstSplitFileSrcClass * klass)
 {
@ -115,16 +125,12 @@ gst_split_file_src_class_init (GstSplitFileSrcClass * klass)
  gobject_class->get_property = gst_split_file_src_get_property;
  gobject_class->finalize = gst_split_file_src_finalize;
  /* We're using a regular expression here instead of wildcards, because
   * GPatternSpec can only handle UTF-8 and filenames on unix tend to be
   * just bytes and are often ISO-8859-X, and we don't feel like
   * re-inventing GPatternSpec */
  g_object_class_install_property (gobject_class, PROP_LOCATION,
      g_param_spec_string ("location", "File Location",
-          "Regular expression to create file names of the input files. If "
+          "Wildcard pattern to match file names of the input files. If "
          "the location is an absolute path or contains directory components, "
-          "only the base file name part will be considered a regular "
+          "only the base file name part will be considered for pattern "
-          "expression. The results will be sorted.",
+          "matching. The results will be sorted." WIN32_BLURB,
          DEFAULT_LOCATION, G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS));
  gstbasesrc_class->start = GST_DEBUG_FUNCPTR (gst_split_file_src_start);
@ -203,6 +209,12 @@ gst_split_file_src_set_property (GObject * object, guint prop_id,
      GST_OBJECT_LOCK (src);
      g_free (src->location);
      src->location = g_value_dup_string (value);
 #ifdef G_OS_WIN32
      if (!g_utf8_validate (src->location, -1, NULL)) {
        g_warning ("splitfilesrc 'location' property must be in UTF-8 "
            "encoding on Windows");
      }
 #endif
      GST_OBJECT_UNLOCK (src);
      break;
    default:
@ -239,10 +251,9 @@ static gchar **
 gst_split_file_src_find_files (GstSplitFileSrc * src, const gchar * dirname,
    const gchar * basename, GError ** err)
 {
  PatternSpec *pspec;
  GPtrArray *files;
  GRegex *regex;
  const gchar *name;
  gchar *regex_string;
  GDir *dir;
  if (dirname == NULL || basename == NULL)
@ -255,25 +266,20 @@ gst_split_file_src_find_files (GstSplitFileSrc * src, const gchar * dirname,
  if (dir == NULL)
    return NULL;
-  /* we want the filename to be the whole filename, not just some match
+  if (DEFAULT_PATTERN_MATCH_MODE == MATCH_MODE_UTF8 &&
-   * in the middle of the filename */
+      !g_utf8_validate (basename, -1, NULL)) {
-  if (g_str_has_suffix (basename, "$"))
+    goto not_utf8;
-    regex_string = g_strdup (basename);
+  }
  else
    regex_string = g_strconcat (basename, "$", NULL);
-  regex = g_regex_new (regex_string, G_REGEX_RAW, (GRegexMatchFlags) 0, err);
+  /* mode will be AUTO on linux/unix and UTF8 on win32 */
-  g_free (regex_string);
+  pspec = pattern_spec_new (basename, DEFAULT_PATTERN_MATCH_MODE);
  if (regex == NULL)
    goto regex_fail;
  files = g_ptr_array_new ();
  while ((name = g_dir_read_name (dir))) {
    GST_TRACE_OBJECT (src, "check: %s", name);
-    if (g_regex_match (regex, name, (GRegexMatchFlags) 0, NULL)) {
+    if (pattern_match_string (pspec, name)) {
-      GST_LOG_OBJECT (src, "match: %s", name);
+      GST_DEBUG_OBJECT (src, "match: %s", name);
      g_ptr_array_add (files, g_build_filename (dirname, name, NULL));
    }
  }
@ -284,7 +290,7 @@ gst_split_file_src_find_files (GstSplitFileSrc * src, const gchar * dirname,
  g_ptr_array_sort (files, (GCompareFunc) gst_split_file_src_array_sortfunc);
  g_ptr_array_add (files, NULL);
-  g_regex_unref (regex);
+  pattern_spec_free (pspec);
  g_dir_close (dir);
  return (gchar **) g_ptr_array_free (files, FALSE);
@ -296,21 +302,21 @@ invalid_location:
        "No filename specified.");
    return NULL;
  }
-regex_fail:
+not_utf8:
  {
    GST_WARNING_OBJECT (src, "g_regex_new() failed: %s", (*err)->message);
    g_dir_close (dir);
    g_set_error_literal (err, G_FILE_ERROR, G_FILE_ERROR_INVAL,
        "Filename pattern must be UTF-8 on Windows.");
    return NULL;
  }
 no_matches:
  {
-    g_regex_unref (regex);
+    pattern_spec_free (pspec);
    g_dir_close (dir);
    g_set_error_literal (err, G_FILE_ERROR, G_FILE_ERROR_NOENT,
        "Found no files matching the pattern.");
    return NULL;
  }
 }
 static gboolean
--- a/gst/multifile/patternspec.c
+++ b/gst/multifile/patternspec.c
@ -0,0 +1,334 @@
 /* GPattern copy that supports raw (non-utf8) matching
 * based on: GLIB - Library of useful routines for C programming
 * Copyright (C) 1995-1997, 1999  Peter Mattis, Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
 #ifdef HAVE_CONFIG_H
 #include "config.h"
 #endif
 #include "patternspec.h"
 #include <string.h>
 typedef enum
 {
  MATCH_ALL,                    /* "*A?A*" */
  MATCH_ALL_TAIL,               /* "*A?AA" */
  MATCH_HEAD,                   /* "AAAA*" */
  MATCH_TAIL,                   /* "*AAAA" */
  MATCH_EXACT,                  /* "AAAAA" */
  MATCH_LAST
 } MatchType;
 /* Compiled form of a wildcard pattern, created by pattern_spec_new() and
 * released with pattern_spec_free(). */
 struct _PatternSpec
 {
  /* mode requested at creation; pattern_spec_new() turns AUTO into RAW when
   * the pattern itself is not valid UTF-8 */
  MatchMode match_mode;
  /* matching strategy chosen from the shape of the pattern */
  MatchType match_type;
  /* length of the stored pattern in bytes, without the NUL terminator */
  guint pattern_length;
  /* bounds on the string length (compared against the byte length in
   * pattern_match()) outside of which a string is rejected right away */
  guint min_length;
  guint max_length;
  /* canonicalized pattern; stored reversed for MATCH_ALL_TAIL and without
   * its '*' for MATCH_HEAD / MATCH_TAIL */
  gchar *pattern;
 };
 /* Byte-wise counterpart of g_utf8_strreverse(): duplicates at most @size
 * bytes of @str and reverses the copy in place, without interpreting the
 * bytes as any particular encoding. @size must be greater than zero.
 * The returned string is newly allocated; release it with g_free(). */
 static inline gchar *
 raw_strreverse (const gchar * str, gssize size)
 {
  gchar *copy;
  g_assert (size > 0);
  copy = g_strndup (str, size);
  return g_strreverse (copy);
 }
 /* General wildcard matcher: checks whether @match_string as a whole matches
 * @match_pattern, where '?' stands for exactly one character and '*' for any
 * (possibly empty) run of characters.
 *
 * @match_mode decides how far '?' and '*' advance through the string: one
 * UTF-8 character at a time for MATCH_MODE_UTF8, one byte at a time for any
 * other mode. Literal pattern bytes are always compared byte by byte.
 *
 * *@wildcard_reached_p is set to TRUE as soon as a '*' is processed; it is
 * never set to FALSE here, so a caller that reads it must initialise it.
 *
 * Returns TRUE on a full match, FALSE otherwise. */
 static inline gboolean
 pattern_ph_match (const gchar * match_pattern, MatchMode match_mode,
    const gchar * match_string, gboolean * wildcard_reached_p)
 {
  register const gchar *pattern, *string;
  register gchar ch;
  pattern = match_pattern;
  string = match_string;
  /* ch always holds the pattern byte being processed, pattern points one
   * past it */
  ch = *pattern;
  pattern++;
  while (ch) {
    switch (ch) {
      case '?':
        /* a joker needs one character to consume */
        if (!*string)
          return FALSE;
        if (match_mode == MATCH_MODE_UTF8)
          string = g_utf8_next_char (string);
        else
          ++string;
        break;
      case '*':
        *wildcard_reached_p = TRUE;
        /* swallow any further '*' and '?' directly following the wildcard;
         * every '?' among them still consumes one character */
        do {
          ch = *pattern;
          pattern++;
          if (ch == '?') {
            if (!*string)
              return FALSE;
            if (match_mode == MATCH_MODE_UTF8)
              string = g_utf8_next_char (string);
            else
              ++string;
          }
        }
        while (ch == '*' || ch == '?');
        /* pattern ends with the wildcard: the rest of the string matches */
        if (!ch)
          return TRUE;
        /* ch is now the literal following the wildcard: try every position
         * in the string where that literal occurs */
        do {
          gboolean next_wildcard_reached = FALSE;
          /* skip ahead to the next occurrence of the literal */
          while (ch != *string) {
            if (!*string)
              return FALSE;
            if (match_mode == MATCH_MODE_UTF8)
              string = g_utf8_next_char (string);
            else
              ++string;
          }
          /* step over the matched literal byte and match the remainder of
           * the pattern against the remainder of the string */
          string++;
          if (pattern_ph_match (pattern, match_mode, string,
                  &next_wildcard_reached))
            return TRUE;
          if (next_wildcard_reached)
            /* the forthcoming pattern substring up to the next wildcard has
             * been matched, but a mismatch occurred for the rest of the
             * pattern, following the next wildcard.
             * there's no need to advance the current match position any
             * further if the rest pattern will not match.
             */
            return FALSE;
        }
        while (*string);
        break;
      default:
        /* literal: must equal the current string byte */
        if (ch == *string)
          string++;
        else
          return FALSE;
        break;
    }
    ch = *pattern;
    pattern++;
  }
  /* pattern exhausted: it is a match only if the string is exhausted too */
  return *string == 0;
 }
 /* Tests @string (@string_length bytes long) against the compiled @pspec.
 *
 * @string_reversed may be NULL. When given, it is matched against the
 * reversed pattern of a MATCH_ALL_TAIL spec instead of reversing @string
 * here; the caller must have reversed it in the mode that will be used.
 *
 * For a MATCH_MODE_AUTO spec the mode is resolved per string: UTF-8 if
 * @string validates as UTF-8, raw bytes otherwise.
 *
 * Returns TRUE if @string matches the pattern, FALSE otherwise. */
 static gboolean
 pattern_match (PatternSpec * pspec, guint string_length,
    const gchar * string, const gchar * string_reversed)
 {
  MatchMode match_mode;
  /* out-parameter required by pattern_ph_match(); its value is not used */
  gboolean dummy = FALSE;
  g_assert (pspec != NULL);
  g_assert (string != NULL);
  /* cheap rejection based on the length bounds computed at creation */
  if (string_length < pspec->min_length || string_length > pspec->max_length)
    return FALSE;
  match_mode = pspec->match_mode;
  if (match_mode == MATCH_MODE_AUTO) {
    if (!g_utf8_validate (string, string_length, NULL))
      match_mode = MATCH_MODE_RAW;
    else
      match_mode = MATCH_MODE_UTF8;
  }
  switch (pspec->match_type) {
    case MATCH_ALL:
      return pattern_ph_match (pspec->pattern, match_mode, string, &dummy);
    case MATCH_ALL_TAIL:
      if (string_reversed)
        return pattern_ph_match (pspec->pattern, match_mode, string_reversed,
            &dummy);
      else {
        gboolean result;
        gchar *tmp;
        if (match_mode == MATCH_MODE_UTF8) {
          tmp = g_utf8_strreverse (string, string_length);
          result = pattern_ph_match (pspec->pattern, match_mode, tmp, &dummy);
        } else if (pspec->match_mode == MATCH_MODE_RAW) {
          tmp = raw_strreverse (string, string_length);
          result = pattern_ph_match (pspec->pattern, match_mode, tmp, &dummy);
        } else {
          /* AUTO spec but the string is not valid UTF-8: the stored pattern
           * was reversed character-wise (it is valid UTF-8), whereas the
           * string could only be reversed byte-wise, so multi-byte literals
           * in the pattern would no longer line up. Reversing the pattern
           * once more restores its original order, and matching that
           * forwards against the untouched string gives the same answer the
           * tail match is meant to give. */
          tmp = g_utf8_strreverse (pspec->pattern, pspec->pattern_length);
          result = pattern_ph_match (tmp, match_mode, string, &dummy);
        }
        g_free (tmp);
        return result;
      }
    case MATCH_HEAD:
      /* pattern (without its trailing '*') must be a prefix of the string */
      if (pspec->pattern_length == string_length)
        return memcmp (pspec->pattern, string, string_length) == 0;
      else if (pspec->pattern_length)
        return memcmp (pspec->pattern, string, pspec->pattern_length) == 0;
      else
        return TRUE;
    case MATCH_TAIL:
      /* pattern (without its leading '*') must be a suffix of the string;
       * the length check above guarantees string_length >= pattern_length */
      if (pspec->pattern_length)
        /* compare incl. NUL terminator */
        return memcmp (pspec->pattern,
            string + (string_length - pspec->pattern_length),
            pspec->pattern_length + 1) == 0;
      else
        return TRUE;
    case MATCH_EXACT:
      if (pspec->pattern_length != string_length)
        return FALSE;
      else
        return memcmp (pspec->pattern, string, string_length) == 0;
    default:
      g_return_val_if_fail (pspec->match_type < MATCH_LAST, FALSE);
      return FALSE;
  }
 }
 /* Compiles the wildcard @pattern ('*' matches any run of characters, '?'
 * matches exactly one character) into a newly allocated PatternSpec.
 * Release the result with pattern_spec_free().
 *
 * @match_mode: with MATCH_MODE_UTF8 the pattern must be valid UTF-8 (this is
 * asserted). MATCH_MODE_AUTO is turned into MATCH_MODE_RAW if the pattern is
 * not valid UTF-8 and kept as AUTO otherwise.
 *
 * The pattern is stored in canonical form: consecutive '*' are collapsed
 * into one, and jokers are emitted only right before the next literal (or at
 * the very end), which places them behind any neighbouring '*'. While doing
 * so the minimum/maximum length of a matching string and the positions of
 * the first/last wildcard and joker are collected; those decide which
 * MatchType is used. */
 PatternSpec *
 pattern_spec_new (const gchar * pattern, MatchMode match_mode)
 {
  PatternSpec *pspec;
  gboolean seen_joker = FALSE, seen_wildcard = FALSE, more_wildcards = FALSE;
  gint hw_pos = -1, tw_pos = -1, hj_pos = -1, tj_pos = -1;
  gboolean follows_wildcard = FALSE;
  guint pending_jokers = 0;
  const gchar *s;
  gchar *d;
  guint i;
  g_assert (pattern != NULL);
  g_assert (match_mode != MATCH_MODE_UTF8
      || g_utf8_validate (pattern, -1, NULL));
  /* canonicalize pattern and collect necessary stats */
  pspec = g_new (PatternSpec, 1);
  pspec->match_mode = match_mode;
  pspec->pattern_length = strlen (pattern);
  pspec->min_length = 0;
  pspec->max_length = 0;
  pspec->pattern = g_new (gchar, pspec->pattern_length + 1);
  if (pspec->match_mode == MATCH_MODE_AUTO) {
    if (!g_utf8_validate (pattern, -1, NULL))
      pspec->match_mode = MATCH_MODE_RAW;
  }
  /* s walks the input, d the canonical output; i is the output index */
  d = pspec->pattern;
  for (i = 0, s = pattern; *s != 0; s++) {
    switch (*s) {
      case '*':
        if (follows_wildcard) { /* compress multiple wildcards */
          pspec->pattern_length--;
          continue;
        }
        follows_wildcard = TRUE;
        if (hw_pos < 0)
          hw_pos = i;
        tw_pos = i;
        break;
      case '?':
        /* not written yet: jokers are flushed in front of the next literal
         * or after the loop */
        pending_jokers++;
        pspec->min_length++;
        if (pspec->match_mode == MATCH_MODE_RAW) {
          pspec->max_length += 1;
        } else {
          pspec->max_length += 4;       /* maximum UTF-8 character length */
        }
        continue;
      default:
        for (; pending_jokers; pending_jokers--, i++) {
          *d++ = '?';
          if (hj_pos < 0)
            hj_pos = i;
          tj_pos = i;
        }
        follows_wildcard = FALSE;
        pspec->min_length++;
        pspec->max_length++;
        break;
    }
    *d++ = *s;
    i++;
  }
  /* flush jokers at the end of the pattern; i has to advance here as well so
   * that tj_pos ends up at the last joker rather than at the first trailing
   * one, otherwise the head/tail decision below is made on a wrong distance
   * and strings get reversed needlessly */
  for (; pending_jokers; pending_jokers--, i++) {
    *d++ = '?';
    if (hj_pos < 0)
      hj_pos = i;
    tj_pos = i;
  }
  *d++ = 0;
  seen_joker = hj_pos >= 0;
  seen_wildcard = hw_pos >= 0;
  more_wildcards = seen_wildcard && hw_pos != tw_pos;
  /* a '*' can absorb any number of characters */
  if (seen_wildcard)
    pspec->max_length = G_MAXUINT;
  /* special case sole head/tail wildcard or exact matches */
  if (!seen_joker && !more_wildcards) {
    if (pspec->pattern[0] == '*') {
      /* "*AAAA": drop the leading '*' and keep only the literal tail */
      pspec->match_type = MATCH_TAIL;
      memmove (pspec->pattern, pspec->pattern + 1, --pspec->pattern_length);
      pspec->pattern[pspec->pattern_length] = 0;
      return pspec;
    }
    if (pspec->pattern_length > 0 &&
        pspec->pattern[pspec->pattern_length - 1] == '*') {
      /* "AAAA*": drop the trailing '*' and keep only the literal head */
      pspec->match_type = MATCH_HEAD;
      pspec->pattern[--pspec->pattern_length] = 0;
      return pspec;
    }
    if (!seen_wildcard) {
      pspec->match_type = MATCH_EXACT;
      return pspec;
    }
  }
  /* now just need to distinguish between head or tail match start */
  tw_pos = pspec->pattern_length - 1 - tw_pos;  /* last pos to tail distance */
  tj_pos = pspec->pattern_length - 1 - tj_pos;  /* last pos to tail distance */
  if (seen_wildcard)
    pspec->match_type = tw_pos > hw_pos ? MATCH_ALL_TAIL : MATCH_ALL;
  else                          /* seen_joker */
    pspec->match_type = tj_pos > hj_pos ? MATCH_ALL_TAIL : MATCH_ALL;
  /* tail matching runs the general matcher on a reversed pattern */
  if (pspec->match_type == MATCH_ALL_TAIL) {
    gchar *tmp = pspec->pattern;
    if (pspec->match_mode == MATCH_MODE_RAW) {
      pspec->pattern = raw_strreverse (pspec->pattern, pspec->pattern_length);
    } else {
      pspec->pattern =
          g_utf8_strreverse (pspec->pattern, pspec->pattern_length);
    }
    g_free (tmp);
  }
  return pspec;
 }
 /* Releases a PatternSpec created with pattern_spec_new(), including the
 * canonicalized pattern string it owns. @pspec must not be NULL. */
 void
 pattern_spec_free (PatternSpec * pspec)
 {
  gchar *canonical;
  g_assert (pspec != NULL);
  /* remember the owned string so both allocations can be released */
  canonical = pspec->pattern;
  g_free (canonical);
  g_free (pspec);
 }
 /* Public entry point: checks the NUL-terminated @string against @pspec.
 * The byte length is measured here and no pre-reversed copy of the string
 * is supplied, so pattern_match() reverses it itself when it needs to.
 * Returns TRUE on a match. */
 gboolean
 pattern_match_string (PatternSpec * pspec, const gchar * string)
 {
  const guint n_bytes = strlen (string);
  return pattern_match (pspec, n_bytes, string, NULL);
 }
--- a/gst/multifile/patternspec.h
+++ b/gst/multifile/patternspec.h
@ -0,0 +1,47 @@
 /* GPattern copy that supports raw (non-utf8) matching
 * based on: GLIB - Library of useful routines for C programming
 * Copyright (C) 1995-1997, 1999  Peter Mattis, Red Hat, Inc.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
 #ifndef __PATTERN_SPEC_H__
 #define __PATTERN_SPEC_H__
 #include <glib.h>
 G_BEGIN_DECLS
 /* How pattern and candidate strings are interpreted while matching. */
 typedef enum
 {
  /* decide per pattern / per string: UTF-8 if it validates, raw otherwise */
  MATCH_MODE_AUTO = 0,
  /* treat text as UTF-8; '?' stands for one UTF-8 character */
  MATCH_MODE_UTF8,
  /* treat text as plain bytes; '?' stands for one byte */
  MATCH_MODE_RAW
 } MatchMode;
 /* Opaque compiled wildcard pattern; the layout is private to patternspec.c */
 typedef struct _PatternSpec PatternSpec;
 /* Compiles @pattern ('*' and '?' wildcards) for matching in @match_mode.
 * The result must be released with pattern_spec_free(). */
 PatternSpec * pattern_spec_new       (const gchar  * pattern,
                                      MatchMode      match_mode);
 /* Frees a PatternSpec obtained from pattern_spec_new(). */
 void          pattern_spec_free      (PatternSpec  * pspec);
 /* Returns TRUE if the NUL-terminated @string matches @pspec. */
 gboolean      pattern_match_string   (PatternSpec  * pspec,
                                      const gchar  * string);
 G_END_DECLS
 #endif /* __PATTERN_SPEC_H__ */