Zsh Mailing List Archive
Messages sorted by:
Reverse Date,
Date,
Thread,
Author
Re: PATCH: short-circuiting glob exclusion operator
On Sat, 26 Mar 2016 08:40:42 -0700
Bart Schaefer <schaefer@xxxxxxxxxxxxxxxx> wrote:
> Maybe this should be a glob qualifier instead of a pattern, since it does
> not make sense in generic pattern context. We have Y3 to short-circuit
> after 3 matches; perhaps Y~ to short-circuit when an exclusion matches?
> Or are qualifiers also parsed too late?
It looks like it has to be Y- (or something else); a ~ in parentheses is
taken as an indication that the parentheses are for grouping, not flags,
and it's too late to change that assumption now. But '-' means 'not' or
'something other than the postive integer you might have been lead to
expect', and stuff.
Otherwise this seems to work.
pws
commit 30bace3e5571530abc68d8629e8f215bd87e7457
Author: Peter Stephenson <p.w.stephenson@xxxxxxxxxxxx>
Date: Mon Mar 21 20:02:22 2016 +0000
(Y-) globbing flag: addapted patch from 38199
This allows exclusions with ~ to match against intermediate directories
in order to prune searches early.
diff --git a/Doc/Zsh/expn.yo b/Doc/Zsh/expn.yo
index c6e7b6f..81bb897 100644
--- a/Doc/Zsh/expn.yo
+++ b/Doc/Zsh/expn.yo
@@ -2082,6 +2082,23 @@ Multiple patterns can be excluded by
`var(foo)tt(~)var(bar)tt(~)var(baz)'.
In the exclusion pattern (var(y)), `tt(/)' and `tt(.)' are not treated
specially the way they usually are in globbing.
+
+In filename generation a top-level pattern after a tt(~) (i.e. outside
+any parentheses) is treated specially if the globbing flag tt(Y-) is
+also preseent. As before, the pattern var(x) must match and anything
+matched by var(y) is excluded from the match. However, in this case
+var(y) is used additionally to prune intermediate directories. In other
+words, var(y) can match not just a complete file name to be returned,
+but also any directory by which the file is reached. This is much more
+efficient than the previous form for deep directory hierarchies as the
+directory is excluded as soon as it is reached and is never entered.
+
+For example, `tt(**/*.o~(*/|)lib+LPAR()Y-+RPAR())' searches recursively
+for files with the suffix `tt(.o)', but if a directory named `tt(lib)'
+is reached at the top level or in any subdirectory found recursively it
+is not searched. With the pattern `tt(**/*~(*/|)lib+LPAR()/Y-+RPAR()),
+which searches recursively for directories, directories named tt(lib)
+are not only not searched but are not returned as a match.
)
item(var(x)tt(#))(
(Requires tt(EXTENDED_GLOB) to be set.)
@@ -2734,6 +2751,12 @@ matches in directory traversal order will be considered.
Implies tt(oN) when no tt(o)var(c) qualifier is used.
)
+item(tt(Y-))(
+Also implies short-circuit, but in this case with exclusions using the
+tt(~) extended glob pattern. Any intermediate directory that matches
+the excluded pattern causes globbing to stop at that point. For
+more detail see the description of the var(x)tt(~)var(y) pattern above.
+)
item(tt(o)var(c))(
specifies how the names of the files should be sorted. If var(c) is
tt(n) they are sorted by name; if it is tt(L) they
diff --git a/Src/glob.c b/Src/glob.c
index 2051016..5c59be5 100644
--- a/Src/glob.c
+++ b/Src/glob.c
@@ -183,8 +183,10 @@ struct globdata {
int gd_gf_nullglob, gd_gf_markdirs, gd_gf_noglobdots, gd_gf_listtypes;
int gd_gf_numsort;
int gd_gf_follow, gd_gf_sorts, gd_gf_nsorts;
+ int gd_exclude_shortcut;
struct globsort gd_gf_sortlist[MAX_SORTS];
LinkList gd_gf_pre_words, gd_gf_post_words;
+ Patprog gd_exclude;
char *gd_glob_pre, *gd_glob_suf;
};
@@ -259,7 +261,7 @@ struct complist {
/**/
static void
-addpath(char *s, int l)
+addpath(const char *s, int l)
{
DPUTS(!pathbuf, "BUG: pathbuf not initialised");
while (pathpos + l + 1 >= pathbufsz)
@@ -459,6 +461,38 @@ insert(char *s, int checked)
return;
}
+/*
+ * Decide whether to prune at this point in scanning. 1 = yes.
+ *
+ * We'll make temporary use of pathbuf for this; it's convenient because
+ * most of a long path won't need copying and if we are later going to
+ * add this file to it we know the buffer will be large enough.
+ *
+ * The filename fn is here metafied.
+ */
+/**/
+static int
+scanner_prune(const char *fn, int len)
+{
+ int ret, savepathpos, saverrsfound, savforceerrs;
+ if (!curglobdata.gd_exclude)
+ return 0;
+ savepathpos = pathpos;
+ addpath(fn, len);
+ if (pathpos && pathbuf[pathpos-1] == '/')
+ pathbuf[pathpos-1] = '\0';
+ /*
+ * As we're in the middle of a glob, be careful about
+ * the counters needed for approximation across directories.
+ */
+ patsaveerrs(&saverrsfound, &savforceerrs);
+ ret = pattry(curglobdata.gd_exclude, pathbuf);
+ patrestoreerrs(saverrsfound, savforceerrs);
+ pathbuf[pathpos = savepathpos] = '\0';
+
+ return ret;
+}
+
/* Do the globbing: scanner is called recursively *
* with successive bits of the path until we've *
* tried all of it. */
@@ -510,6 +544,8 @@ scanner(Complist q, int shortcircuit)
}
pathbufcwd = pathpos;
}
+ if (scanner_prune(str, l))
+ return;
if (q->next) {
/* Not the last path section. Just add it to the path. */
int oppos = pathpos;
@@ -563,6 +599,8 @@ scanner(Complist q, int shortcircuit)
continue;
errsfound = errssofar;
if (pattry(p, fn)) {
+ if (scanner_prune(fn, strlen(fn)))
+ continue;
/* if this name matchs the pattern... */
if (pbcwdsav == pathbufcwd &&
strlen(fn) + pathpos - pathbufcwd >= PATH_MAX) {
@@ -675,9 +713,10 @@ scanner(Complist q, int shortcircuit)
/**/
static Complist
-parsecomplist(char *instr)
+parsecomplist(char *instr, char **post_exclude)
{
Patprog p1;
+ struct patcompargs compargs;
Complist l1;
char *str;
int compflags = gf_noglobdots ? (PAT_FILE|PAT_NOGLD) : PAT_FILE;
@@ -699,11 +738,11 @@ parsecomplist(char *instr)
/* Now get the next path component if there is one. */
l1 = (Complist) zhalloc(sizeof *l1);
- if ((l1->next = parsecomplist(instr)) == NULL) {
+ if ((l1->next = parsecomplist(instr, post_exclude)) == NULL) {
errflag |= ERRFLAG_ERROR;
return NULL;
}
- l1->pat = patcompile(NULL, compflags | PAT_ANY, NULL);
+ l1->pat = patcompile(NULL, compflags | PAT_ANY, &compargs);
l1->closure = 1; /* ...zero or more times. */
l1->follow = follow;
return l1;
@@ -715,8 +754,9 @@ parsecomplist(char *instr)
!skipparens(Inpar, Outpar, (char **)&str) &&
*str == zpc_special[ZPC_HASH] && str[-2] == '/') {
instr++;
- if (!(p1 = patcompile(instr, compflags, &instr)))
+ if (!(p1 = patcompile(instr, compflags, &compargs)))
return NULL;
+ instr = compargs.endexp;
if (instr[0] == '/' && instr[1] == Outpar && instr[2] == Pound) {
int pdflag = 0;
@@ -730,21 +770,28 @@ parsecomplist(char *instr)
/* special case (/)# to avoid infinite recursion */
l1->closure = (*((char *)p1 + p1->startoff)) ? 1 + pdflag : 0;
l1->follow = 0;
- l1->next = parsecomplist(instr);
+ l1->next = parsecomplist(instr, NULL);
return (l1->pat) ? l1 : NULL;
}
} else {
/* parse single path component */
- if (!(p1 = patcompile(instr, compflags|PAT_FILET, &instr)))
+ if (post_exclude)
+ compflags |= PAT_FILET|PAT_EXCL_PRUNE;
+ else
+ compflags |= PAT_FILET;
+ if (!(p1 = patcompile(instr, compflags, &compargs)))
return NULL;
+ instr = compargs.endexp;
/* then do the remaining path components */
if (*instr == '/' || !*instr) {
int ef = *instr == '/';
+ if (post_exclude && compargs.post_exclude)
+ *post_exclude = dupstring(compargs.post_exclude);
l1 = (Complist) zhalloc(sizeof *l1);
l1->pat = p1;
l1->closure = 0;
- l1->next = ef ? parsecomplist(instr+1) : NULL;
+ l1->next = ef ? parsecomplist(instr+1, post_exclude) : NULL;
return (ef && !l1->next) ? NULL : l1;
}
}
@@ -756,7 +803,7 @@ parsecomplist(char *instr)
/**/
static Complist
-parsepat(char *str)
+parsepat(char *str, char **post_exclude)
{
long assert;
int ignore;
@@ -785,7 +832,7 @@ parsepat(char *str)
} else /* pattern is relative to pwd */
pathbuf[pathpos = 0] = '\0';
- return parsecomplist(str);
+ return parsecomplist(str, post_exclude);
}
/* get number after qualifier */
@@ -1187,6 +1234,7 @@ zglob(LinkList list, LinkNode np, int nountok)
int first = 0, end = -1; /* index of first match to return */
/* and index+1 of the last match */
struct globdata saved; /* saved glob state */
+ char *post_exclude = NULL; /* text after ~~ */
int nobareglob = !isset(BAREGLOBQUAL);
int shortcircuit = 0; /* How many files to match; */
/* 0 means no limit */
@@ -1220,7 +1268,7 @@ zglob(LinkList list, LinkNode np, int nountok)
gf_listtypes = gf_follow = 0;
gf_noglobdots = unset(GLOBDOTS);
gf_numsort = isset(NUMERICGLOBSORT);
- gf_sorts = gf_nsorts = 0;
+ gf_sorts = gf_nsorts = curglobdata.gd_exclude_shortcut = 0;
gf_pre_words = gf_post_words = NULL;
/* Check for qualifiers */
@@ -1550,12 +1598,18 @@ zglob(LinkList list, LinkNode np, int nountok)
shortcircuit = !(sense & 1);
if (shortcircuit) {
/* Parse the argument. */
- data = qgetnum(&s);
- if ((shortcircuit = data) != data) {
- /* Integer overflow */
- zerr("value too big: Y%s", s_saved);
- restore_globstate(saved);
- return;
+ if (*s == '-') {
+ curglobdata.gd_exclude_shortcut = 1;
+ shortcircuit = 0; /* handled differently */
+ s++;
+ } else {
+ data = qgetnum(&s);
+ if ((shortcircuit = data) != data) {
+ /* Integer overflow */
+ zerr("value too big: Y%s", s_saved);
+ restore_globstate(saved);
+ return;
+ }
}
}
break;
@@ -1774,7 +1828,7 @@ zglob(LinkList list, LinkNode np, int nountok)
qfirst = qn;
for (qlast = qfirst; qlast->next;
qlast = qlast->next)
- ;
+ ;
} else
qfirst = dup_qual_list(qn, &qlast);
/* ... link into new `or' chain ... */
@@ -1804,8 +1858,20 @@ zglob(LinkList list, LinkNode np, int nountok)
} else if (newquals)
quals = newquals;
}
- q = parsepat(str);
- if (!q || errflag) { /* if parsing failed */
+ q = parsepat(str, curglobdata.gd_exclude_shortcut ? &post_exclude : NULL);
+ if (q && !errflag && post_exclude) {
+ /*
+ * Exclusion after ~ with (Y-).
+ * This is a normal pattern, not a file, but indicate the
+ * special case as if we encounter a further top-level tilde
+ * it indicates patterns to be or'd in the exclusion.
+ */
+ curglobdata.gd_exclude = patcompile(post_exclude, PAT_EXCL_PRUNE, NULL);
+ } else {
+ curglobdata.gd_exclude = NULL;
+ }
+ if (!q || errflag || (post_exclude && !curglobdata.gd_exclude)) {
+ /* if parsing failed */
restore_globstate(saved);
if (unset(BADPATTERN)) {
if (!nountok)
diff --git a/Src/pattern.c b/Src/pattern.c
index 4e2f236..7ef64a7 100644
--- a/Src/pattern.c
+++ b/Src/pattern.c
@@ -521,21 +521,24 @@ patcompstart(void)
* Top level pattern compilation subroutine
* exp is a null-terminated, metafied string.
* inflags is an or of some PAT_* flags.
- * endexp, if non-null, is set to a pointer to the end of the
- * part of exp which was compiled. This is used when
- * compiling patterns for directories which must be
+ *
+ * compargs, if non-null, contains pointers that may be set
+ * to indicate the following. This is used when matching files
+ * as in this case the pattern has multiple parts.
+ * - endexp is set to the end of the part of exp which was compiled.
+ * This is used when compiling patterns for directories which must be
* matched recursively.
*/
/**/
mod_export Patprog
-patcompile(char *exp, int inflags, char **endexp)
+patcompile(char *exp, int inflags, Patcompargs compargs)
{
int flags = 0;
long len = 0;
long startoff;
Upat pscan;
- char *lng, *strp = NULL;
+ char *lng, *strp = NULL, *post_exclude = NULL;
Patprog p;
queue_signals();
@@ -602,7 +605,7 @@ patcompile(char *exp, int inflags, char **endexp)
if (!strp || (*strp && *strp != '/')) {
/* No, do normal compilation. */
strp = NULL;
- if (patcompswitch(0, &flags) == 0) {
+ if (patcompswitch(0, &flags, &post_exclude) == 0) {
unqueue_signals();
return NULL;
}
@@ -735,13 +738,47 @@ patcompile(char *exp, int inflags, char **endexp)
p = newp;
}
- if (endexp)
- *endexp = patparse;
+ if (compargs) {
+ compargs->endexp = patparse;
+ if ((compargs->post_exclude = post_exclude))
+ {
+ /*
+ * The top-level ~~ is the last thing that can appear.
+ */
+ compargs->endexp += strlen(patparse);
+ }
+ }
unqueue_signals();
return p;
}
+/* Flags for exclusion with ~ */
+enum {
+ TILDE_SINGLE = 1,
+ TILDE_SLASH = 2,
+};
+
+/*
+ * Look for an active tilde.
+ *
+ * If term_match, we're looking for a terminator (something that can't
+ * be part of a pattern itself) to follow; otherwise, we're
+ * looking for a tilde without a terminator following.
+ */
+
+static int
+pattilde(int toplevel, int term_match)
+{
+ if (*patparse != zpc_special[ZPC_TILDE])
+ return 0;
+ if (patparse[1] == '/' || !memchr(zpc_special, patparse[1],
+ ZPC_SEG_COUNT))
+ return term_match ? TILDE_SINGLE : 0;
+ else
+ return term_match ? 0 : TILDE_SINGLE;
+}
+
/*
* Main body or parenthesized subexpression in pattern
* Parenthesis (and any ksh_glob gubbins) will have been removed.
@@ -749,7 +786,7 @@ patcompile(char *exp, int inflags, char **endexp)
/**/
static long
-patcompswitch(int paren, int *flagp)
+patcompswitch(int paren, int *flagp, char **post_exclude)
{
long starter, br, ender, excsync = 0;
int parno = 0;
@@ -783,17 +820,45 @@ patcompswitch(int paren, int *flagp)
*flagp |= flags & (P_HSTART|P_PURESTR);
- while (*patparse == zpc_chars[ZPC_BAR] ||
- (*patparse == zpc_special[ZPC_TILDE] &&
- (patparse[1] == '/' ||
- !memchr(zpc_special, patparse[1], ZPC_SEG_COUNT)))) {
- int tilde = *patparse++ == zpc_special[ZPC_TILDE];
- long gfnode = 0, newbr;
+ for (;;) {
+ int tilde;
+ long gfnode, newbr;
+ if (*patparse == zpc_chars[ZPC_BAR]) {
+ /* handle alternation */
+ tilde = 0;
+ patparse++;
+ } else if ((tilde = pattilde(!paren, 1))) {
+ patparse += tilde;
+ if ((patflags & (PAT_FILET|PAT_EXCL_PRUNE)) == PAT_EXCL_PRUNE)
+ {
+ /*
+ * We're looking at something like the second tilde in
+ * foo~bar1~bar2(Y-). We need to treat this like
+ * foo excluding bar1|bar2, so treat the tilde as a bar.
+ */
+ tilde = 0;
+ }
+ /* else handle exclusion... */
+ } else {
+ break;
+ }
+ gfnode = 0;
*flagp &= ~P_PURESTR;
if (tilde) {
union upat up;
+ if ((patflags & (PAT_FILET|PAT_EXCL_PRUNE)) ==
+ (PAT_FILET|PAT_EXCL_PRUNE)) {
+ /*
+ * Prruning a partial path, not just the fully expanded
+ * file: handle specially
+ */
+ *post_exclude = patparse;
+ patparse += strlen(patparse);
+ break;
+ }
+
/* excsync remembers the P_EXCSYNC node before a chain of
* exclusions: all point back to this. only the
* original (non-excluded) branch gets a trailing P_EXCSYNC.
@@ -825,7 +890,7 @@ patcompswitch(int paren, int *flagp)
patadd((char *)&up, 0, sizeof(up), 0);
/* / is not treated as special if we are at top level */
if (!paren && zpc_special[ZPC_SLASH] == '/') {
- tilde++;
+ tilde |= TILDE_SLASH;
zpc_special[ZPC_SLASH] = Marker;
}
} else {
@@ -864,7 +929,7 @@ patcompswitch(int paren, int *flagp)
}
}
newbr = patcompbranch(&flags, paren);
- if (tilde == 2) {
+ if (tilde & TILDE_SLASH) {
/* restore special treatment of / */
zpc_special[ZPC_SLASH] = '/';
}
@@ -935,8 +1000,7 @@ patcompbranch(int *flagp, int paren)
starter = chain = 0;
while (!memchr(zpc_special, *patparse, ZPC_SEG_COUNT) ||
- (*patparse == zpc_special[ZPC_TILDE] && patparse[1] != '/' &&
- memchr(zpc_special, patparse[1], ZPC_SEG_COUNT))) {
+ pattilde(!paren, 0)) {
if ((*patparse == zpc_special[ZPC_INPAR] &&
patparse[1] == zpc_special[ZPC_HASH]) ||
(*patparse == zpc_special[ZPC_KSH_AT] && patparse[1] == Inpar &&
@@ -1298,9 +1362,7 @@ patcomppiece(int *flagp, int paren)
*/
if (kshchar ||
(memchr(zpc_special, *patparse, ZPC_NO_KSH_GLOB) &&
- (*patparse != zpc_special[ZPC_TILDE] ||
- patparse[1] == '/' ||
- !memchr(zpc_special, patparse[1], ZPC_SEG_COUNT)))) {
+ !pattilde(!paren, 0))) {
break;
}
}
@@ -1507,7 +1569,7 @@ patcomppiece(int *flagp, int paren)
*/
if (!(starter = patcompnot(1, &flags2)))
return 0;
- } else if (!(starter = patcompswitch(1, &flags2)))
+ } else if (!(starter = patcompswitch(1, &flags2, NULL)))
return 0;
flags |= flags2 & P_HSTART;
break;
@@ -1759,7 +1821,8 @@ patcompnot(int paren, int *flagsp)
pattail(starter, excl = patnode(P_EXCLUDE));
up.p = NULL;
patadd((char *)&up, 0, sizeof(up), 0);
- if (!(br = (paren ? patcompswitch(1, &dummy) : patcompbranch(&dummy, 0))))
+ if (!(br = (paren ? patcompswitch(1, &dummy, NULL) :
+ patcompbranch(&dummy, 0))))
return 0;
pattail(br, patnode(P_EXCEND));
n = patnode(P_NOTHING); /* just so much easier */
@@ -2040,6 +2103,26 @@ pattrystart(void)
errsfound = 0;
}
+/* Save the error count during a glob. */
+
+/**/
+void
+patsaveerrs(int *errsfoundp, int *forceerrsp)
+{
+ *errsfoundp = errsfound;
+ *forceerrsp = forceerrs;
+}
+
+/* Restore the error count during a glob. */
+
+/**/
+void
+patrestoreerrs(int errsfoundin, int forceerrsin)
+{
+ errsfound = errsfoundin;
+ forceerrs = forceerrsin;
+}
+
/*
* Fix up string length stuff.
*
diff --git a/Src/zsh.h b/Src/zsh.h
index eee31da..7f58354 100644
--- a/Src/zsh.h
+++ b/Src/zsh.h
@@ -511,6 +511,7 @@ typedef struct options *Options;
typedef struct optname *Optname;
typedef struct param *Param;
typedef struct paramdef *Paramdef;
+typedef struct patcompargs *Patcompargs;
typedef struct patstralloc *Patstralloc;
typedef struct patprog *Patprog;
typedef struct prepromptfn *Prepromptfn;
@@ -1500,6 +1501,14 @@ struct patstralloc {
int progstrunmetalen; /* Length of the foregoing */
};
+struct patcompargs {
+ char *endexp; /* End of the part of the input
+ * expression that was compiled.*/
+ char *post_exclude; /* Expression following ~ in top-level
+ * file match: handled specially.
+ * Enabled by PAT_FILET flag. */
+};
+
/* Flags used in pattern matchers (Patprog) and passed down to patcompile */
#define PAT_FILE 0x0001 /* Pattern is a file name */
@@ -1515,6 +1524,12 @@ struct patstralloc {
#define PAT_NOTEND 0x0400 /* End of string is not real end */
#define PAT_HAS_EXCLUDP 0x0800 /* (internal): top-level path1~path2. */
#define PAT_LCMATCHUC 0x1000 /* equivalent to setting (#l) */
+#define PAT_EXCL_PRUNE 0x2000 /* With PAT_FILET, any trailing ~ pattern
+ * can prune directories, too.
+ * Without PAT_FILET, this is that
+ * pattern, so subsequent "~"s have the
+ * effect of "|".
+ */
/**
* Indexes into the array of active pattern characters.
diff --git a/Test/D02glob.ztst b/Test/D02glob.ztst
index a6b704a..83d36f6 100644
--- a/Test/D02glob.ztst
+++ b/Test/D02glob.ztst
@@ -655,3 +655,28 @@
[[ "1" = [$~cset] ]] || print Fail 5
[[ "b" != [$~cset] ]] || print Fail 6
0:character set specified as active variabe
+
+ (cd glob.tmp
+ setopt extendedglob
+ mkdir DT DT/one DT/two DT/three
+ for dir in one two three; do
+ mkdir DT/$dir/sub{1..3}
+ touch DT/$dir/sub{1..3}/file.{a,b,c}
+ done
+ print -l DT/o*/*3/file.*
+ print -l DT/**/*~DT(.NY-)
+ # Attention: EXCLUSIONS BLOW YOUR MIND warning...
+ print -l DT/**/*.*~(*sub[123]*.[ab]|*sub[13]*)(Y-)
+ print -l DT/**/*.*~*/(sub[12]|one|two)(Y-)
+ )
+0:~~ exclusions
+>DT/one/sub3/file.a
+>DT/one/sub3/file.b
+>DT/one/sub3/file.c
+>
+>DT/one/sub2/file.c
+>DT/three/sub2/file.c
+>DT/two/sub2/file.c
+>DT/three/sub3/file.a
+>DT/three/sub3/file.b
+>DT/three/sub3/file.c
Messages sorted by:
Reverse Date,
Date,
Thread,
Author