Zsh Mailing List Archive
Messages sorted by:
Reverse Date,
Date,
Thread,
Author
PATCH: 4.1.4: POSIX ranges
- X-seq: zsh-workers 4209
- From: Peter Stephenson <pws@xxxxxxxxxxxxxxxxx>
- To: zsh-workers@xxxxxxxxxxxxxxx (Zsh hackers list)
- Subject: PATCH: 4.1.4: POSIX ranges
- Date: Wed, 08 Jul 1998 15:36:15 +0200
Somebody was complaining about the lack of these. See the manual page
change.
I implemented the look up of 'alnum', 'alpha' etc. in the trivial way:
firstly because setting up a hash table uses even more code and
memory, secondly because the corresponding isalnum() etc. are usually
macros rather than functions so are untableable. I'm hoping there
aren't too many underendowed ctype macro sets out there, or we're
going to have to add a configure test.
As usual with zsh, there were some extra issues buried in there.
First, the guarantees in the manual page that []...] would match a
literal ']' and [^]...] wouldn't, were being upheld by a test in
lex.c, instead of the globbing code. This had the effect that
e.g. [...[] didn't work. I've moved the test to where I think it
should be in glob.c: this means that all ]'s, even those not preceded by a
[, get tokenised. The globbing code is supposed to be able to cope
with this sort of thing.
Second, the tokenise() code --- on the fly tokenisation for
substituted variables that should turn into glob patterns, and so on
--- was also getting its paws onto ['s and ]'s and trying to second
guess the globbing code. As above, I've simply made the routine
blindly tokenise every unquoted [ and ] it comes across.
In both cases, if anyone knows somewhere where tokenising an unquoted
[ or ] is definitely wrong, say. As far as I know, the only
difference is that glob() will be called unnecessarily a few times and
will harmlessly untokenise the bracket in question.
By the way, I haven't touched this, but what's happened to
nobadpattern?
% setopt nobadpattern
% [[ [ = [ ]]
zsh: bad pattern: [
Is it really not supposed to work inside tests? Maybe it's safer that
way.
*** Doc/Zsh/expn.yo.range Mon Jun 15 09:52:33 1998
--- Doc/Zsh/expn.yo Wed Jul 8 12:23:06 1998
***************
*** 735,740 ****
--- 735,756 ----
can be specified by separating two characters by a `tt(-)'.
A `tt(-)' or `tt(])' may be matched by including it as the
first character in the list.
+ There are also several named classes of characters, in the form
+ `tt([:)var(name)tt(:])' with the following meanings: `tt([:alnum:])'
+ alphanumeric, `tt([:alpha:])' alphabetic,
+ `tt([:cntrl:])' control character, `tt([:digit:])' decimal
+ digit, `tt([:graph:])' printing character except space,
+ `tt([:lower:])' lowercase letter, `tt([:print:])' printable character,
+ `tt([:punct:])' printing character neither alphanumeric nor space,
+ `tt([:space:])' whitespace character, `tt([:upper:])' uppercase letter,
+ `tt([:xdigit:])' hexadecimal digit. These use the macros provided by
+ the operating system to test for the given character combinations,
+ including any modifications due to local language settings: see
+ manref(ctype)(3). Note that the square brackets are additional
+ to those enclosing the whole set of characters, so to test for a
+ single alphanumeric character you need `tt([[:alnum:]])'. Named
+ character sets can be used alongside other types,
+ e.g. `tt([[:alpha:]0-9])'.
)
xitem(tt([^)...tt(]))
item(tt([!)...tt(]))(
*** Misc/globtests.range Sat Apr 25 19:09:19 1998
--- Misc/globtests Wed Jul 8 15:26:52 1998
***************
*** 1,6 ****
#!/usr/local/bin/zsh -f
! setopt extendedglob
unsetopt kshglob
failed=0
--- 1,6 ----
#!/usr/local/bin/zsh -f
! setopt extendedglob badpattern
unsetopt kshglob
failed=0
***************
*** 95,99 ****
--- 95,107 ----
f foob (^foo)b*
t foobb (^foo)b*
f zsh ^z*
+ t a%1X [[:alpha:][:punct:]]#[[:digit:]][^[:lower:]]
+ f a%1 [[:alpha:][:punct:]]#[[:digit:]][^[:lower:]]
+ t [: [[:]#
+ t :] []:]#
+ t :] [:]]#
+ t [ [[]
+ t ] []]
+ t [] [^]]]
EOT
print "$failed tests failed."
*** Src/glob.c.range Mon May 11 09:43:29 1998
--- Src/glob.c Wed Jul 8 15:25:05 1998
***************
*** 633,641 ****
return NULL;
} else if (*pptr == Inbrack) {
/* Character set: brackets had better match */
! while (*++pptr && *pptr != Outbrack)
! if (itok(*pptr))
*pptr = ztokens[*pptr - Pound];
if (*pptr != Outbrack)
return NULL;
} else if (itok(*pptr) && *pptr != Star && *pptr != Quest)
--- 633,655 ----
return NULL;
} else if (*pptr == Inbrack) {
/* Character set: brackets had better match */
! if (pptr[1] == Outbrack)
! *++pptr = ']';
! else if ((pptr[1] == Hat || pptr[1] == '^' || pptr[1] == '!') &&
! pptr[2] == Outbrack)
! *(pptr += 2) = ']';
! while (*++pptr && *pptr != Outbrack) {
! if (itok(*pptr)) {
! /* POSIX classes: make sure it's a real one, *
! * leave the Inbrack tokenised if so. */
! char *nptr;
! if (*pptr == Inbrack && pptr[1] == ':'
! && (nptr = strchr(pptr+2, ':')) &&
! *++nptr == Outbrack)
! pptr = nptr;
*pptr = ztokens[*pptr - Pound];
+ }
+ }
if (*pptr != Outbrack)
return NULL;
} else if (itok(*pptr) && *pptr != Star && *pptr != Quest)
***************
*** 2192,2197 ****
--- 2206,2293 ----
/**/
static int
+ posix_range(char **patptr, int ch)
+ {
+ /* Match POSIX ranges, which correspond to ctype macros, *
+ * e.g. [:alpha:] -> isalpha. It just doesn't seem worth *
+ * the palaver of creating a hash table for this. */
+ char *start = *patptr;
+ int len;
+
+ /* we made sure in parsecomp() there was a ':' to search for */
+ *patptr = strchr(start, ':');
+ len = *patptr++ - start;
+
+ if (!strncmp(start, "alpha", len))
+ return isalpha(ch);
+ if (!strncmp(start, "alnum", len))
+ return isalnum(ch);
+ if (!strncmp(start, "cntrl", len))
+ return iscntrl(ch);
+ if (!strncmp(start, "digit", len))
+ return isdigit(ch);
+ if (!strncmp(start, "graph", len))
+ return isgraph(ch);
+ if (!strncmp(start, "lower", len))
+ return islower(ch);
+ if (!strncmp(start, "print", len))
+ return isprint(ch);
+ if (!strncmp(start, "punct", len))
+ return ispunct(ch);
+ if (!strncmp(start, "space", len))
+ return isspace(ch);
+ if (!strncmp(start, "upper", len))
+ return isupper(ch);
+ if (!strncmp(start, "xdigit", len))
+ return isxdigit(ch);
+ return 0;
+ }
+
+ /**/
+ static void
+ rangematch(char **patptr, int ch, int rchar)
+ {
+ /* Check for a character in a [...] or [^...]. The [ *
+ * and optional ^ have already been skipped. */
+
+ char *pat = *patptr;
+ #ifdef HAVE_STRCOLL
+ char l_buf[2], r_buf[2], ch_buf[2];
+
+ ch_buf[0] = ch;
+ l_buf[1] = r_buf[1] = ch_buf[1] = '\0';
+ #endif
+
+ #define PAT(X) (pat[X] == Meta ? pat[(X)+1] ^ 32 : untok(pat[X]))
+ #define PPAT(X) (pat[(X)-1] == Meta ? pat[X] ^ 32 : untok(pat[X]))
+
+ for (pat++; *pat != Outbrack && *pat;
+ *pat == Meta ? pat += 2 : pat++) {
+ if (*pat == Inbrack) {
+ /* Inbrack can only occur inside a range if we found [:...:]. */
+ pat += 2;
+ if (posix_range(&pat, ch))
+ break;
+ } else if (*pat == '-' && pat[-1] != rchar &&
+ pat[1] != Outbrack) {
+ #ifdef HAVE_STRCOLL
+ l_buf[0] = PPAT(-1);
+ r_buf[0] = PAT(1);
+ if (strcoll(l_buf, ch_buf) <= 0 &&
+ strcoll(ch_buf, r_buf) <= 0)
+ #else
+ if (PPAT(-1) <= ch && PAT(1) >= ch)
+ #endif
+ break;
+ } else if (ch == PAT(0))
+ break;
+ }
+
+ *patptr = pat;
+ }
+
+ /**/
+ static int
matchonce(Comp c)
{
char *pat = c->str;
***************
*** 2304,2341 ****
}
if (*pat == Inbrack) {
/* Match groups of characters */
- #define PAT(X) (pat[X] == Meta ? pat[(X)+1] ^ 32 : untok(pat[X]))
- #define PPAT(X) (pat[(X)-1] == Meta ? pat[X] ^ 32 : untok(pat[X]))
char ch;
- #ifdef HAVE_STRCOLL
- char l_buf[2], r_buf[2], ch_buf[2];
-
- l_buf[1] = r_buf[1] = ch_buf[1] = '\0';
- #endif
if (!*pptr)
break;
ch = *pptr == Meta ? pptr[1] ^ 32 : *pptr;
- #ifdef HAVE_STRCOLL
- ch_buf[0] = ch;
- #endif
if (pat[1] == Hat || pat[1] == '^' || pat[1] == '!') {
/* group is negated */
! pat[1] = Hat;
! for (pat += 2; *pat != Outbrack && *pat;
! *pat == Meta ? pat += 2 : pat++)
! if (*pat == '-' && pat[-1] != Hat && pat[1] != Outbrack) {
! #ifdef HAVE_STRCOLL
! l_buf[0] = PPAT(-1);
! r_buf[0] = PAT(1);
! if (strcoll(l_buf, ch_buf) <= 0 &&
! strcoll(ch_buf, r_buf) <= 0)
! #else
! if (PPAT(-1) <= ch && PAT(1) >= ch)
! #endif
! break;
! } else if (ch == PAT(0))
! break;
DPUTS(!*pat, "BUG: something is very wrong in doesmatch()");
if (*pat != Outbrack)
break;
--- 2400,2414 ----
}
if (*pat == Inbrack) {
/* Match groups of characters */
char ch;
if (!*pptr)
break;
ch = *pptr == Meta ? pptr[1] ^ 32 : *pptr;
if (pat[1] == Hat || pat[1] == '^' || pat[1] == '!') {
/* group is negated */
! *++pat = Hat;
! rangematch(&pat, ch, Hat);
DPUTS(!*pat, "BUG: something is very wrong in doesmatch()");
if (*pat != Outbrack)
break;
***************
*** 2344,2364 ****
continue;
} else {
/* pattern is not negated (affirmed? asserted?) */
! for (pat++; *pat != Outbrack && *pat;
! *pat == Meta ? pat += 2 : pat++)
! if (*pat == '-' && pat[-1] != Inbrack &&
! pat[1] != Outbrack) {
! #ifdef HAVE_STRCOLL
! l_buf[0] = PPAT(-1);
! r_buf[0] = PAT(1);
! if (strcoll(l_buf, ch_buf) <= 0 &&
! strcoll(ch_buf, r_buf) <= 0)
! #else
! if (PPAT(-1) <= ch && PAT(1) >= ch)
! #endif
! break;
! } else if (ch == PAT(0))
! break;
DPUTS(!pat || !*pat, "BUG: something is very wrong in doesmatch()");
if (*pat == Outbrack)
break;
--- 2417,2423 ----
continue;
} else {
/* pattern is not negated (affirmed? asserted?) */
! rangematch(&pat, ch, Inbrack);
DPUTS(!pat || !*pat, "BUG: something is very wrong in doesmatch()");
if (*pat == Outbrack)
break;
***************
*** 2461,2480 ****
}
bslash = 1;
continue;
- case '[':
- if (bslash) {
- s[-1] = Bnull;
- break;
- }
- t = s;
- if (*++s == '^' || *s == '!')
- s++;
- while (*s && *++s != ']');
- if (!*s)
- return;
- *t = Inbrack;
- *s = Outbrack;
- break;
case '<':
if (isset(SHGLOB))
break;
--- 2520,2525 ----
***************
*** 2502,2507 ****
--- 2547,2554 ----
case ')':
if (isset(SHGLOB))
break;
+ case '[':
+ case ']':
case '*':
case '?':
for (t = ztokens; *t; t++)
*** Src/lex.c.range Wed Jul 8 14:20:46 1998
--- Src/lex.c Wed Jul 8 15:22:14 1998
***************
*** 876,902 ****
}
break;
case LX2_INBRACK:
- add(c);
if (!in_brace_param)
brct++;
! c = hgetc();
! if (c == '!' || c == '^') {
! add(c);
! c = hgetc();
! }
! if (c == ']')
! break;
! if (lexstop)
! goto brk;
! intpos = 0;
! continue;
case LX2_OUTBRACK:
if (!in_brace_param)
brct--;
! if (brct < 0) {
brct = 0;
- break;
- }
c = Outbrack;
break;
case LX2_INPAR:
--- 876,890 ----
}
break;
case LX2_INBRACK:
if (!in_brace_param)
brct++;
! c = Inbrack;
! break;
case LX2_OUTBRACK:
if (!in_brace_param)
brct--;
! if (brct < 0)
brct = 0;
c = Outbrack;
break;
case LX2_INPAR:
--
Peter Stephenson <pws@xxxxxxxxxxxxxxxxx> Tel: +39 50 844536
WWW: http://www.ifh.de/~pws/
Gruppo Teorico, Dipartimento di Fisica
Piazza Torricelli 2, 56100 Pisa, Italy
Messages sorted by:
Reverse Date,
Date,
Thread,
Author