splitfilesrc: specify filenames via normal wildcards instead of regular expressions

Less cracktastic in the end.
This commit is contained in:
Tim-Philipp Müller 2011-11-30 19:00:42 +00:00
parent e6c4979a42
commit 0584ae8f98
4 changed files with 424 additions and 36 deletions

View file

@ -5,13 +5,14 @@ libgstmultifile_la_SOURCES = \
gstmultifilesink.c \
gstmultifilesrc.c \
gstmultifile.c \
gstsplitfilesrc.c
gstsplitfilesrc.c \
patternspec.c
libgstmultifile_la_CFLAGS = $(GST_BASE_CFLAGS) $(GST_CFLAGS) $(GIO_CFLAGS)
libgstmultifile_la_LIBADD = $(GST_BASE_LIBS) $(GST_LIBS) $(GIO_LIBS)
libgstmultifile_la_LDFLAGS = $(GST_PLUGIN_LDFLAGS)
libgstmultifile_la_LIBTOOLFLAGS = --tag=disable-static
noinst_HEADERS = gstmultifilesrc.h gstmultifilesink.h gstsplitfilesrc.h
noinst_HEADERS = gstmultifilesrc.h gstmultifilesink.h gstsplitfilesrc.h patternspec.h
Android.mk: Makefile.am $(BUILT_SOURCES)

View file

@ -25,17 +25,14 @@
* had to be split into multiple parts due to filesystem file size limitations,
* for example.
*
* The files to select are chosen via the location property, which takes a
* regular expression (note: shell-style wildcards will not work). If the
* location is an absolute path or contains directory components, only the
* base file name part will be considered a regular expression. The results
* will be sorted. The location may include directory components, but the
* regular expression to select the files can only be in the filename part.
* The files to select are chosen via the location property, which supports
* (and expects) shell-style wildcards (but only for the filename, not for
* directories). The results will be sorted.
*
* <refsect2>
* <title>Example launch line</title>
* |[
* gst-launch splitfilesrc location="/path/to/part-.*.mpg" ! decodebin ! ... \
* gst-launch splitfilesrc location="/path/to/part-*.mpg" ! decodebin ! ... \
* ]| Plays the different parts as if they were one single MPEG file.
* </refsect2>
*
@ -51,9 +48,16 @@
#endif
#include "gstsplitfilesrc.h"
#include "patternspec.h"
#include <string.h>
#ifdef G_OS_WIN32
#define DEFAULT_PATTERN_MATCH_MODE MATCH_MODE_UTF8
#else
#define DEFAULT_PATTERN_MATCH_MODE MATCH_MODE_AUTO
#endif
enum
{
PROP_LOCATION = 1
@ -105,6 +109,12 @@ gst_split_file_src_base_init (gpointer g_class)
"Tim-Philipp Müller <tim.muller@collabora.co.uk>");
}
#ifdef G_OS_WIN32
#define WIN32_BLURB " Location string must be in UTF-8 encoding (on Windows)."
#else
#define WIN32_BLURB /* nothing */
#endif
static void
gst_split_file_src_class_init (GstSplitFileSrcClass * klass)
{
@ -115,16 +125,12 @@ gst_split_file_src_class_init (GstSplitFileSrcClass * klass)
gobject_class->get_property = gst_split_file_src_get_property;
gobject_class->finalize = gst_split_file_src_finalize;
/* We're using a regular expression here instead of wildcards, because
* GPatternSpec can only handle UTF-8 and filenames on unix tend to be
* just bytes and are often ISO-8859-X, and we don't feel like
* re-inventing GPatternSpec */
g_object_class_install_property (gobject_class, PROP_LOCATION,
g_param_spec_string ("location", "File Location",
"Regular expression to create file names of the input files. If "
"Wildcard pattern to match file names of the input files. If "
"the location is an absolute path or contains directory components, "
"only the base file name part will be considered a regular "
"expression. The results will be sorted.",
"only the base file name part will be considered for pattern "
"matching. The results will be sorted." WIN32_BLURB,
DEFAULT_LOCATION, G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS));
gstbasesrc_class->start = GST_DEBUG_FUNCPTR (gst_split_file_src_start);
@ -203,6 +209,12 @@ gst_split_file_src_set_property (GObject * object, guint prop_id,
GST_OBJECT_LOCK (src);
g_free (src->location);
src->location = g_value_dup_string (value);
#ifdef G_OS_WIN32
if (!g_utf8_validate (src->location, -1, NULL)) {
g_warning ("splitfilesrc 'location' property must be in UTF-8 "
"encoding on Windows");
}
#endif
GST_OBJECT_UNLOCK (src);
break;
default:
@ -239,10 +251,9 @@ static gchar **
gst_split_file_src_find_files (GstSplitFileSrc * src, const gchar * dirname,
const gchar * basename, GError ** err)
{
PatternSpec *pspec;
GPtrArray *files;
GRegex *regex;
const gchar *name;
gchar *regex_string;
GDir *dir;
if (dirname == NULL || basename == NULL)
@ -255,25 +266,20 @@ gst_split_file_src_find_files (GstSplitFileSrc * src, const gchar * dirname,
if (dir == NULL)
return NULL;
/* we want the filename to be the whole filename, not just some match
* in the middle of the filename */
if (g_str_has_suffix (basename, "$"))
regex_string = g_strdup (basename);
else
regex_string = g_strconcat (basename, "$", NULL);
if (DEFAULT_PATTERN_MATCH_MODE == MATCH_MODE_UTF8 &&
!g_utf8_validate (basename, -1, NULL)) {
goto not_utf8;
}
regex = g_regex_new (regex_string, G_REGEX_RAW, (GRegexMatchFlags) 0, err);
g_free (regex_string);
if (regex == NULL)
goto regex_fail;
/* mode will be AUTO on linux/unix and UTF8 on win32 */
pspec = pattern_spec_new (basename, DEFAULT_PATTERN_MATCH_MODE);
files = g_ptr_array_new ();
while ((name = g_dir_read_name (dir))) {
GST_TRACE_OBJECT (src, "check: %s", name);
if (g_regex_match (regex, name, (GRegexMatchFlags) 0, NULL)) {
GST_LOG_OBJECT (src, "match: %s", name);
if (pattern_match_string (pspec, name)) {
GST_DEBUG_OBJECT (src, "match: %s", name);
g_ptr_array_add (files, g_build_filename (dirname, name, NULL));
}
}
@ -284,7 +290,7 @@ gst_split_file_src_find_files (GstSplitFileSrc * src, const gchar * dirname,
g_ptr_array_sort (files, (GCompareFunc) gst_split_file_src_array_sortfunc);
g_ptr_array_add (files, NULL);
g_regex_unref (regex);
pattern_spec_free (pspec);
g_dir_close (dir);
return (gchar **) g_ptr_array_free (files, FALSE);
@ -296,21 +302,21 @@ invalid_location:
"No filename specified.");
return NULL;
}
regex_fail:
not_utf8:
{
GST_WARNING_OBJECT (src, "g_regex_new() failed: %s", (*err)->message);
g_dir_close (dir);
g_set_error_literal (err, G_FILE_ERROR, G_FILE_ERROR_INVAL,
"Filename pattern must be UTF-8 on Windows.");
return NULL;
}
no_matches:
{
g_regex_unref (regex);
pattern_spec_free (pspec);
g_dir_close (dir);
g_set_error_literal (err, G_FILE_ERROR, G_FILE_ERROR_NOENT,
"Found no files matching the pattern.");
return NULL;
}
}
static gboolean

334
gst/multifile/patternspec.c Normal file
View file

@ -0,0 +1,334 @@
/* GPattern copy that supports raw (non-utf8) matching
* based on: GLIB - Library of useful routines for C programming
* Copyright (C) 1995-1997, 1999 Peter Mattis, Red Hat, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "patternspec.h"
#include <string.h>
typedef enum
{
MATCH_ALL, /* "*A?A*" */
MATCH_ALL_TAIL, /* "*A?AA" */
MATCH_HEAD, /* "AAAA*" */
MATCH_TAIL, /* "*AAAA" */
MATCH_EXACT, /* "AAAAA" */
MATCH_LAST
} MatchType;
/* Compiled pattern. Created by pattern_spec_new() and released with
 * pattern_spec_free(). */
struct _PatternSpec
{
MatchMode match_mode; /* requested mode; AUTO is replaced by RAW if the pattern is not valid UTF-8 */
MatchType match_type; /* matching strategy chosen for this pattern */
guint pattern_length; /* length in bytes of the stored pattern */
guint min_length; /* shortest string length in bytes that can match */
guint max_length; /* longest string length in bytes that can match; G_MAXUINT if the pattern contains '*' */
gchar *pattern; /* owned canonicalized pattern; reversed for MATCH_ALL_TAIL, wildcard stripped for MATCH_HEAD and MATCH_TAIL */
};
/* Returns a newly allocated, byte-wise reversed copy of the first @size
 * bytes of @str. The caller owns the result and frees it with g_free().
 * @size must be greater than zero. */
static inline gchar *
raw_strreverse (const gchar * str, gssize size)
{
  gchar *copy;

  g_assert (size > 0);

  copy = g_strndup (str, size);
  /* g_strreverse() reverses in place and returns its argument */
  return g_strreverse (copy);
}
/* Moves @str forward by one unit: a whole UTF-8 character in
 * MATCH_MODE_UTF8, a single byte in every other mode. */
static inline const gchar *
pattern_ph_step (const gchar * str, MatchMode match_mode)
{
  if (match_mode == MATCH_MODE_UTF8)
    return g_utf8_next_char (str);

  return str + 1;
}

/* Matches @match_string against the canonicalized @match_pattern.
 *
 * '?' consumes exactly one character (UTF-8 mode) or one byte (other modes),
 * '*' consumes any run of characters, everything else must compare equal
 * byte for byte.
 *
 * @wildcard_reached_p is set to TRUE as soon as a '*' is processed; it is
 * never cleared here, so the caller has to initialise it if it reads it.
 *
 * Returns TRUE if the whole string is matched by the whole pattern. */
static inline gboolean
pattern_ph_match (const gchar * match_pattern, MatchMode match_mode,
    const gchar * match_string, gboolean * wildcard_reached_p)
{
  const gchar *pat = match_pattern;
  const gchar *str = match_string;
  gchar c;

  for (c = *pat++; c != 0; c = *pat++) {
    if (c == '?') {
      /* a joker needs something to consume */
      if (*str == 0)
        return FALSE;
      str = pattern_ph_step (str, match_mode);
      continue;
    }

    if (c != '*') {
      /* literal: must be identical to the current string byte */
      if (c != *str)
        return FALSE;
      str++;
      continue;
    }

    /* wildcard */
    *wildcard_reached_p = TRUE;

    /* swallow any further wildcards and jokers that follow directly; each
     * joker still consumes one character of the string */
    do {
      c = *pat++;
      if (c == '?') {
        if (*str == 0)
          return FALSE;
        str = pattern_ph_step (str, match_mode);
      }
    } while (c == '*' || c == '?');

    /* pattern ends with the wildcard: the rest of the string matches */
    if (c == 0)
      return TRUE;

    /* c is now the first literal after the wildcard: try every position
     * in the string where that literal occurs */
    do {
      gboolean next_wildcard_reached = FALSE;

      while (c != *str) {
        if (*str == 0)
          return FALSE;
        str = pattern_ph_step (str, match_mode);
      }
      str++;

      if (pattern_ph_match (pat, match_mode, str, &next_wildcard_reached))
        return TRUE;

      /* the forthcoming pattern substring up to the next wildcard has
       * been matched, but a mismatch occurred for the rest of the
       * pattern, following the next wildcard.
       * there's no need to advance the current match position any
       * further if the rest pattern will not match.
       */
      if (next_wildcard_reached)
        return FALSE;
    } while (*str != 0);
  }

  return *str == 0;
}
/* Matches @string against the compiled pattern @pspec.
 *
 * @string_length is the length of @string in bytes. @string_reversed may be
 * NULL; if given, it is used instead of reversing @string for patterns that
 * are matched from the tail.
 *
 * For a MATCH_MODE_AUTO pattern the effective mode is chosen per string:
 * UTF-8 if @string validates as UTF-8, raw bytes otherwise.
 *
 * Returns TRUE if @string matches. */
static gboolean
pattern_match (PatternSpec * pspec, guint string_length,
    const gchar * string, const gchar * string_reversed)
{
  MatchMode match_mode;
  gboolean dummy = FALSE;       /* wildcard flag is not needed at top level */

  g_assert (pspec != NULL);
  g_assert (string != NULL);

  /* cheap rejection based on the length bounds computed at compile time */
  if (string_length < pspec->min_length || string_length > pspec->max_length)
    return FALSE;

  match_mode = pspec->match_mode;
  if (match_mode == MATCH_MODE_AUTO) {
    if (!g_utf8_validate (string, string_length, NULL))
      match_mode = MATCH_MODE_RAW;
    else
      match_mode = MATCH_MODE_UTF8;
  }

  switch (pspec->match_type) {
    case MATCH_ALL:
      return pattern_ph_match (pspec->pattern, match_mode, string, &dummy);
    case MATCH_ALL_TAIL:
    {
      gboolean result;
      gchar *tmp;

      if (pspec->match_mode == MATCH_MODE_AUTO
          && match_mode == MATCH_MODE_RAW) {
        /* The pattern is valid UTF-8 and was therefore reversed character
         * by character, but this string is not valid UTF-8 and would be
         * reversed byte by byte. Multi-byte literals of the pattern would
         * then never line up with the reversed string. Matching from the
         * tail is only an optimisation, so undo the pattern reversal and
         * match front to back instead. */
        tmp = g_utf8_strreverse (pspec->pattern, pspec->pattern_length);
        result = pattern_ph_match (tmp, match_mode, string, &dummy);
        g_free (tmp);
        return result;
      }

      if (string_reversed)
        return pattern_ph_match (pspec->pattern, match_mode, string_reversed,
            &dummy);

      if (match_mode == MATCH_MODE_UTF8) {
        tmp = g_utf8_strreverse (string, string_length);
      } else {
        tmp = raw_strreverse (string, string_length);
      }
      result = pattern_ph_match (pspec->pattern, match_mode, tmp, &dummy);
      g_free (tmp);
      return result;
    }
    case MATCH_HEAD:
      /* an empty prefix matches everything; the length check above
       * guarantees the string is at least as long as the prefix */
      if (pspec->pattern_length == 0)
        return TRUE;
      return memcmp (pspec->pattern, string, pspec->pattern_length) == 0;
    case MATCH_TAIL:
      if (pspec->pattern_length)
        /* compare incl. NUL terminator */
        return memcmp (pspec->pattern,
            string + (string_length - pspec->pattern_length),
            pspec->pattern_length + 1) == 0;
      else
        return TRUE;
    case MATCH_EXACT:
      if (pspec->pattern_length != string_length)
        return FALSE;
      else
        return memcmp (pspec->pattern, string, string_length) == 0;
    default:
      g_return_val_if_fail (pspec->match_type < MATCH_LAST, FALSE);
      return FALSE;
  }
}
/* Compiles the wildcard pattern @pattern into a newly allocated PatternSpec.
 *
 * '*' is a wildcard and '?' a joker; every other byte is a literal.
 * @pattern must not be NULL, and must be valid UTF-8 if @match_mode is
 * MATCH_MODE_UTF8 (both are asserted). With MATCH_MODE_AUTO the spec is
 * switched to MATCH_MODE_RAW when @pattern is not valid UTF-8.
 *
 * Returns the new spec; release it with pattern_spec_free(). */
PatternSpec *
pattern_spec_new (const gchar * pattern, MatchMode match_mode)
{
PatternSpec *pspec;
gboolean seen_joker = FALSE, seen_wildcard = FALSE, more_wildcards = FALSE;
/* positions in the canonicalized pattern of the first (h) and last (t)
 * wildcard (w) and joker (j); -1 while none has been seen */
gint hw_pos = -1, tw_pos = -1, hj_pos = -1, tj_pos = -1;
gboolean follows_wildcard = FALSE;
/* jokers read from the input but not yet written to the output */
guint pending_jokers = 0;
const gchar *s;
gchar *d;
guint i;
g_assert (pattern != NULL);
g_assert (match_mode != MATCH_MODE_UTF8
|| g_utf8_validate (pattern, -1, NULL));
/* canonicalize pattern and collect necessary stats */
pspec = g_new (PatternSpec, 1);
pspec->match_mode = match_mode;
pspec->pattern_length = strlen (pattern);
pspec->min_length = 0;
pspec->max_length = 0;
/* the canonical form is never longer than the input pattern */
pspec->pattern = g_new (gchar, pspec->pattern_length + 1);
if (pspec->match_mode == MATCH_MODE_AUTO) {
if (!g_utf8_validate (pattern, -1, NULL))
pspec->match_mode = MATCH_MODE_RAW;
}
/* s walks the input, d the output, i is the output position */
d = pspec->pattern;
for (i = 0, s = pattern; *s != 0; s++) {
switch (*s) {
case '*':
if (follows_wildcard) { /* compress multiple wildcards */
pspec->pattern_length--;
continue;
}
follows_wildcard = TRUE;
if (hw_pos < 0)
hw_pos = i;
tw_pos = i;
break;
case '?':
/* jokers are not written right away: they are emitted before the next
 * literal or at the end, i.e. after any wildcard that follows them */
pending_jokers++;
pspec->min_length++;
if (pspec->match_mode == MATCH_MODE_RAW) {
pspec->max_length += 1;
} else {
pspec->max_length += 4; /* maximum UTF-8 character length */
}
continue;
default:
/* flush the jokers collected so far, then fall out of the switch to copy
 * the literal itself */
for (; pending_jokers; pending_jokers--, i++) {
*d++ = '?';
if (hj_pos < 0)
hj_pos = i;
tj_pos = i;
}
follows_wildcard = FALSE;
pspec->min_length++;
pspec->max_length++;
break;
}
*d++ = *s;
i++;
}
/* flush jokers that were still pending at the end of the input */
for (; pending_jokers; pending_jokers--) {
*d++ = '?';
if (hj_pos < 0)
hj_pos = i;
tj_pos = i;
}
*d++ = 0;
seen_joker = hj_pos >= 0;
seen_wildcard = hw_pos >= 0;
more_wildcards = seen_wildcard && hw_pos != tw_pos;
/* a wildcard can match any number of characters: no upper bound */
if (seen_wildcard)
pspec->max_length = G_MAXUINT;
/* special case sole head/tail wildcard or exact matches */
if (!seen_joker && !more_wildcards) {
if (pspec->pattern[0] == '*') {
pspec->match_type = MATCH_TAIL;
/* drop the leading wildcard, keep only the literal suffix */
memmove (pspec->pattern, pspec->pattern + 1, --pspec->pattern_length);
pspec->pattern[pspec->pattern_length] = 0;
return pspec;
}
if (pspec->pattern_length > 0 &&
pspec->pattern[pspec->pattern_length - 1] == '*') {
pspec->match_type = MATCH_HEAD;
/* drop the trailing wildcard, keep only the literal prefix */
pspec->pattern[--pspec->pattern_length] = 0;
return pspec;
}
if (!seen_wildcard) {
pspec->match_type = MATCH_EXACT;
return pspec;
}
}
/* now just need to distinguish between head or tail match start */
tw_pos = pspec->pattern_length - 1 - tw_pos; /* last pos to tail distance */
tj_pos = pspec->pattern_length - 1 - tj_pos; /* last pos to tail distance */
if (seen_wildcard)
pspec->match_type = tw_pos > hw_pos ? MATCH_ALL_TAIL : MATCH_ALL;
else /* seen_joker */
pspec->match_type = tj_pos > hj_pos ? MATCH_ALL_TAIL : MATCH_ALL;
/* tail matching runs on reversed strings, so store the pattern reversed
 * too: byte-wise in raw mode, character-wise otherwise */
if (pspec->match_type == MATCH_ALL_TAIL) {
gchar *tmp = pspec->pattern;
if (pspec->match_mode == MATCH_MODE_RAW) {
pspec->pattern = raw_strreverse (pspec->pattern, pspec->pattern_length);
} else {
pspec->pattern =
g_utf8_strreverse (pspec->pattern, pspec->pattern_length);
}
g_free (tmp);
}
return pspec;
}
/* Releases @pspec together with the pattern string it owns.
 * @pspec must not be NULL. */
void
pattern_spec_free (PatternSpec * pspec)
{
  gchar *owned_pattern;

  g_assert (pspec != NULL);

  owned_pattern = pspec->pattern;
  g_free (owned_pattern);
  g_free (pspec);
}
/* Checks whether the NUL-terminated @string matches the compiled pattern
 * @pspec. Neither argument may be NULL.
 *
 * Returns TRUE if @string matches. */
gboolean
pattern_match_string (PatternSpec * pspec, const gchar * string)
{
  /* check the arguments here, before strlen() dereferences @string */
  g_assert (pspec != NULL);
  g_assert (string != NULL);

  return pattern_match (pspec, strlen (string), string, NULL);
}

View file

@ -0,0 +1,47 @@
/* GPattern copy that supports raw (non-utf8) matching
* based on: GLIB - Library of useful routines for C programming
* Copyright (C) 1995-1997, 1999 Peter Mattis, Red Hat, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
#ifndef __PATTERN_SPEC_H__
#define __PATTERN_SPEC_H__
#include <glib.h>
G_BEGIN_DECLS
/* How patterns and strings are interpreted while matching. */
typedef enum
{
/* raw if the pattern is not valid UTF-8; otherwise decided per string:
 * UTF-8 if the string validates as UTF-8, raw if not */
MATCH_MODE_AUTO = 0,
/* '?' matches one UTF-8 character; the pattern must be valid UTF-8 */
MATCH_MODE_UTF8,
/* '?' matches exactly one byte */
MATCH_MODE_RAW
} MatchMode;
/* Opaque compiled wildcard pattern. */
typedef struct _PatternSpec PatternSpec;
/* Compiles @pattern ('*' = wildcard, '?' = joker) for matching in
 * @match_mode. Returns a new spec to be released with pattern_spec_free(). */
PatternSpec * pattern_spec_new (const gchar * pattern,
MatchMode match_mode);
/* Frees a spec created by pattern_spec_new(). @pspec must not be NULL. */
void pattern_spec_free (PatternSpec * pspec);
/* Returns TRUE if the NUL-terminated @string matches @pspec. */
gboolean pattern_match_string (PatternSpec * pspec,
const gchar * string);
G_END_DECLS
#endif /* __PATTERN_SPEC_H__ */