Zsh Mailing List Archive
Messages sorted by:
Reverse Date,
Date,
Thread,
Author
Re: Quantifier ( braces in regexp )
- X-seq: zsh-workers 23713
- From: Peter Stephenson <p.w.stephenson@xxxxxxxxxxxx>
- To: Zsh Hackers' List <zsh-workers@xxxxxxxxxx>
- Subject: Re: Quantifier ( braces in regexp )
- Date: Fri, 27 Jul 2007 22:40:07 +0100
- In-reply-to: <20070724111354.61a34ad4@xxxxxxxxxxxxxx>
- Mailing-list: contact zsh-workers-help@xxxxxxxxxx; run by ezmlm
- References: <20070723073116.GA5210@xxxxxxxxxxxxxxxxxxxxx> <20070724111354.61a34ad4@xxxxxxxxxxxxxx>
On Tue, 24 Jul 2007 11:13:54 +0100 Peter Stephenson <pws@xxxxxxx> wrote:
> I will have a look at how difficult quantifiers are, but the syntax will be
> different since braces are already used: probably something like an
> extended globbing flag (#c2,3).
I think I've found the pukka way of doing this. It would be possible to
optimise repetitions of individual characters in a similar fashion to
how it's done with # and ##, but it's not clear how worth while that
is. The minimum and maximum are limited to longs; as you get a function
call for each instance of the match it doesn't seem likely that's going
to be a major limitation.
Index: Doc/Zsh/expn.yo
===================================================================
RCS file: /cvsroot/zsh/zsh/Doc/Zsh/expn.yo,v
retrieving revision 1.79
diff -u -r1.79 expn.yo
--- Doc/Zsh/expn.yo 6 Jul 2007 13:10:44 -0000 1.79
+++ Doc/Zsh/expn.yo 27 Jul 2007 21:35:16 -0000
@@ -1636,6 +1636,18 @@
Deactivate backreferences, negating the effect of the tt(b) flag from that
point on.
)
+item(tt(c)var(N)tt(,)var(M))(
+The flag tt(LPAR()#c)var(N)tt(,)var(M)tt(RPAR()) can be used anywhere
+that the tt(#) or tt(##) operators can be used; it cannot be combined
+with other globbing flags and a bad pattern error occurs if it is
+misplaced. It is equivalent to the form tt({)var(N)tt(,)var(M)tt(}) in
+regular expressions. The previous character or group is required to
+match between var(N) and var(M) times, inclusive. The form
+tt(LPAR()#c)var(N)tt(RPAR()) requires exactly tt(N) matches;
+tt(LPAR()#c,)var(M)tt(RPAR()) is equivalent to specifying var(N) as 0;
+tt(LPAR()#c)var(N)tt(,RPAR()) specifies that there is no maximum limit
+on the number of matches.
+)
item(tt(m))(
Set references to the match data for the entire string matched; this is
similar to backreferencing and does not work in filename generation. The
Index: Misc/globtests
===================================================================
RCS file: /cvsroot/zsh/zsh/Misc/globtests,v
retrieving revision 1.5
diff -u -r1.5 globtests
--- Misc/globtests 25 Jul 2006 18:10:38 -0000 1.5
+++ Misc/globtests 27 Jul 2007 21:35:16 -0000
@@ -182,5 +182,15 @@
f path/testy *((#s)|/)test((#e)|/)*
f path/testy/ohyes *((#s)|/)test((#e)|/)*
f path/atest/ohyes *((#s)|/)test((#e)|/)*
+t XabcdabcY X(ab|c|d)(#c5)Y
+t XabcdabcY X(ab|c|d)(#c1,5)Y
+t XabcdabcY X(ab|c|d)(#c5,8)Y
+t XabcdabcY X(ab|c|d)(#c4,)Y
+f XabcdabcY X(ab|c|d)(#c6,)Y
+f XabcdabcY X(ab|c|d)(#c1,4)Y
+t ZX Z(|)(#c1)X
+t froofroo (fro(#c2))(#c2)
+f froofroofroo (fro(#c2))(#c2)
+f froofro (fro(#c2))(#c2)
EOT
print "$failed tests failed."
Index: Src/pattern.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/pattern.c,v
retrieving revision 1.39
diff -u -r1.39 pattern.c
--- Src/pattern.c 1 Aug 2006 11:58:04 -0000 1.39
+++ Src/pattern.c 27 Jul 2007 21:35:17 -0000
@@ -105,6 +105,8 @@
#define P_GFLAGS 0x08 /* long Match nothing and set globbing flags */
#define P_ISSTART 0x09 /* no Match start of string. */
#define P_ISEND 0x0a /* no Match end of string. */
+#define P_COUNTSTART 0x0b /* no Initialise P_COUNT */
+#define P_COUNT 0x0c /* 3*long uc* node Match a number of repetitions */
/* numbered so we can test bit 5 for a branch */
#define P_BRANCH 0x20 /* node Match this alternative, or the next... */
#define P_WBRANCH 0x21 /* uc* node P_BRANCH, but match at least 1 char */
@@ -179,6 +181,17 @@
* for top level file globs, e.g. ** / *.c~*foo.c
* ^ I had to leave this space
* P_NUM*: zl is a zlong if that is 64-bit, else an unsigned long.
+ *
+ * P_COUNTSTART, P_COUNT: a P_COUNTSTART flags the start of a quantified
+ * closure (#cN,M) and is used to initialise the count. Executing
+ * the pattern leads back to the P_COUNT, while the next links of the
+ * P_COUNTSTART and P_COUNT lead to the tail of the pattern:
+ *
+ * ----------------
+ * v ^
+ * <COUNTSTART><COUNT>pattern<BACK> tail
+ * v v ^
+ * ------------------------
*/
#define PP_ALPHA 1
#define PP_ALNUM 2
@@ -211,6 +224,13 @@
#define P_LS_LEN(p) ((p)[1].l) /* can be used as lvalue */
#define P_LS_STR(p) ((char *)((p) + 2))
+/* Specific to P_COUNT: arguments as offset in nodes from operator */
+#define P_CT_CURRENT (1) /* Current count */
+#define P_CT_MIN (2) /* Minimum count */
+#define P_CT_MAX (3) /* Maximum count, -1 for none */
+#define P_CT_PTR (4) /* Pointer to last match start */
+#define P_CT_OPERAND (5) /* Operand of P_COUNT */
+
/* Flags needed when pattern is executed */
#define P_SIMPLE 0x01 /* Simple enough to be #/## operand. */
#define P_HSTART 0x02 /* Starts with # or ##'d pattern. */
@@ -766,7 +786,7 @@
* no top-level |'s.
*
* No gfchanged, as nothing to follow branch at top
- * level.
+ * level.
*/
union upat up;
gfnode = patnode(P_GFLAGS);
@@ -948,7 +968,7 @@
*ignore = 1;
/* (#X): assumes we are still positioned on the first X */
for (; *ptr && *ptr != Outpar; ptr++) {
- if (*ptr == 'q') {
+ if (*ptr == 'q') {
/* Glob qualifiers, ignored in pattern code */
while (*ptr && *ptr != Outpar)
ptr++;
@@ -1044,8 +1064,9 @@
static long
patcomppiece(int *flagp)
{
- long starter = 0, next, pound, op;
+ long starter = 0, next, op, opnd;
int flags, flags2, kshchar, len, ch, patch, nmeta;
+ int pound, count;
union upat up;
char *nptr, *str0, *ptr, *patprev;
zrange_t from, to;
@@ -1098,10 +1119,13 @@
/* more than one character matched? */
morelen = (patprev > str0);
/*
- * If we have more than one character, a following hash only
- * applies to the last, so backtrack one character.
+ * If we have more than one character, a following hash
+ * or (#c...) only applies to the last, so backtrack one character.
*/
- if (isset(EXTENDEDGLOB) && *patparse == Pound && morelen)
+ if (isset(EXTENDEDGLOB) &&
+ (*patparse == Pound ||
+ (*patparse == Inpar && patparse[1] == Pound &&
+ patparse[2] == 'c')) && morelen)
patparse = patprev;
/*
* If len is 1, we can't have an active # following, so doesn't
@@ -1345,7 +1369,10 @@
}
}
+ count = 0;
if (!(pound = (*patparse == Pound && isset(EXTENDEDGLOB))) &&
+ !(count = (isset(EXTENDEDGLOB) && *patparse == Inpar &&
+ patparse[1] == Pound && patparse[2] == 'c')) &&
(kshchar <= 0 || kshchar == '@' || kshchar == '!')) {
*flagp = flags;
return starter;
@@ -1364,6 +1391,10 @@
} else if (kshchar == '?') {
op = 0;
*flagp = 0;
+ } else if (count) {
+ op = P_COUNT;
+ patparse += 3;
+ *flagp = P_HSTART;
} else if (*++patparse == Pound) {
op = P_TWOHASH;
patparse++;
@@ -1383,7 +1414,56 @@
* each time we fail on a non-empty branch, we try the empty branch,
* which is equivalent to backtracking.
*/
- if ((flags & P_SIMPLE) && (op == P_ONEHASH || op == P_TWOHASH) &&
+ if (op == P_COUNT) {
+ /* (#cN,M) */
+ union upat countargs[P_CT_OPERAND];
+ char *opp = patparse;
+
+ countargs[0].l = P_COUNT;
+ countargs[P_CT_CURRENT].l = 0L;
+ countargs[P_CT_MIN].l = (long)zstrtol(patparse, &patparse, 10);
+ if (patparse == opp) {
+ /* missing number treated as zero */
+ countargs[P_CT_MIN].l = 0L;
+ }
+ if (*patparse != ',' && *patparse != Comma) {
+ /* either max = min or error */
+ if (*patparse != Outpar)
+ return 0;
+ countargs[P_CT_MAX].l = countargs[P_CT_MIN].l;
+ } else {
+ opp = ++patparse;
+ countargs[P_CT_MAX].l = (long)zstrtol(patparse, &patparse, 10);
+ if (*patparse != Outpar)
+ return 0;
+ if (patparse == opp) {
+ /* missing number treated as infinity: record as -1 */
+ countargs[P_CT_MAX].l = -1L;
+ }
+ }
+ patparse++;
+ countargs[P_CT_PTR].p = NULL;
+ /* Mark this chain as a min/max count... */
+ patinsert(P_COUNTSTART, starter, (char *)countargs, sizeof(countargs));
+ /*
+ * The next of the operand is a loop back to the P_COUNT. This is
+ * how we get recursion for the count. We don't loop back to
+ * the P_COUNTSTART; that's used for initialising the count
+ * and saving and restoring the count for any enclosing use
+ * of the match.
+ */
+ opnd = P_OPERAND(starter) + P_CT_OPERAND;
+ pattail(opnd, patnode(P_BACK));
+ pattail(opnd, P_OPERAND(starter));
+ /*
+ * The next of the counter operators is what follows the
+ * closure.
+ * This handles matching of the tail.
+ */
+ next = patnode(P_NOTHING);
+ pattail(starter, next);
+ pattail(P_OPERAND(starter), next);
+ } else if ((flags & P_SIMPLE) && (op == P_ONEHASH || op == P_TWOHASH) &&
P_OP((Upat)patout+starter) == P_ANY) {
/* Optimize ?# to *. Silly thing to do, since who would use
* use ?# ? But it makes the later code shorter.
@@ -1397,7 +1477,7 @@
uptr->l = (uptr->l & ~0xff) | P_STAR;
}
} else if ((flags & P_SIMPLE) && op && !(patglobflags & 0xff)) {
- /* Don't simplify if we need to look for approximations. */
+ /* Simplify, but not if we need to look for approximations. */
patinsert(op, starter, NULL, 0);
} else if (op == P_ONEHASH) {
/* Emit x# as (x&|), where & means "self". */
@@ -2830,6 +2910,54 @@
if (patinput < patinend || (patflags & PAT_NOTEND))
fail = 1;
break;
+ case P_COUNTSTART:
+ {
+ /*
+ * Save and restore the current count and the
+ * start pointer in case the pattern has been
+ * executed by a previous repetition of a
+ * closure.
+ */
+ long *curptr = &P_OPERAND(scan)[P_CT_CURRENT].l;
+ long savecount = *curptr;
+ unsigned char *saveptr = scan[P_CT_PTR].p;
+ int ret;
+
+ *curptr = 0L;
+ ret = patmatch(P_OPERAND(scan));
+ *curptr = savecount;
+ scan[P_CT_PTR].p = saveptr;
+ return ret;
+ }
+ case P_COUNT:
+ {
+ /* (#cN,M): execution is relatively straightforward */
+ long cur = scan[P_CT_CURRENT].l;
+ long min = scan[P_CT_MIN].l;
+ long max = scan[P_CT_MAX].l;
+
+ if (cur && cur >= min &&
+ (unsigned char *)patinput == scan[P_CT_PTR].p) {
+ /*
+ * Not at the first attempt to match so
+ * the previous attempt managed zero length.
+ * We can do this indefinitely so there's
+ * no point in going on. Simply try to
+ * match the remainder of the pattern.
+ */
+ return patmatch(next);
+ }
+ scan[P_CT_PTR].p = (unsigned char *)patinput;
+
+ if (max < 0 || cur < max) {
+ scan[P_CT_CURRENT].l = cur + 1;
+ if (patmatch(scan + P_CT_OPERAND))
+ return 1;
+ }
+ if (cur < min)
+ return 0;
+ return patmatch(next);
+ }
case P_END:
if (!(fail = (patinput < patinend && !(patflags & PAT_NOANCH))))
return 1;
Index: Test/D02glob.ztst
===================================================================
RCS file: /cvsroot/zsh/zsh/Test/D02glob.ztst,v
retrieving revision 1.12
diff -u -r1.12 D02glob.ztst
--- Test/D02glob.ztst 25 Jul 2006 18:10:38 -0000 1.12
+++ Test/D02glob.ztst 27 Jul 2007 21:35:17 -0000
@@ -177,6 +177,16 @@
>1: [[ path/testy = *((#s)|/)test((#e)|/)* ]]
>1: [[ path/testy/ohyes = *((#s)|/)test((#e)|/)* ]]
>1: [[ path/atest/ohyes = *((#s)|/)test((#e)|/)* ]]
+>0: [[ XabcdabcY = X(ab|c|d)(#c5)Y ]]
+>0: [[ XabcdabcY = X(ab|c|d)(#c1,5)Y ]]
+>0: [[ XabcdabcY = X(ab|c|d)(#c5,8)Y ]]
+>0: [[ XabcdabcY = X(ab|c|d)(#c4,)Y ]]
+>1: [[ XabcdabcY = X(ab|c|d)(#c6,)Y ]]
+>1: [[ XabcdabcY = X(ab|c|d)(#c1,4)Y ]]
+>0: [[ ZX = Z(|)(#c1)X ]]
+>0: [[ froofroo = (fro(#c2))(#c2) ]]
+>1: [[ froofroofroo = (fro(#c2))(#c2) ]]
+>1: [[ froofro = (fro(#c2))(#c2) ]]
>0 tests failed.
globtest globtests.ksh
@@ -357,3 +367,7 @@
> [[:IFSSPACE:]] yes
>/ [[:WORD:]] no
>/ [[:WORD:]] yes
+
+ [[ foo = (#c0)foo ]]
+1:Misplaced (#c...) flag
+?(eval):1: bad pattern: (#c0)foo
--
Peter Stephenson <p.w.stephenson@xxxxxxxxxxxx>
Web page now at http://homepage.ntlworld.com/p.w.stephenson/
Messages sorted by:
Reverse Date,
Date,
Thread,
Author