Zsh Mailing List Archive
Messages sorted by:
Reverse Date,
Date,
Thread,
Author
PATCH: parameter operators with multibyte characters
- X-seq: zsh-workers 22562
- From: Peter Stephenson <p.w.stephenson@xxxxxxxxxxxx>
- To: zsh <zsh-workers@xxxxxxxxxx>
- Subject: PATCH: parameter operators with multibyte characters
- Date: Sun, 30 Jul 2006 18:53:04 +0100
- Mailing-list: contact zsh-workers-help@xxxxxxxxxx; run by ezmlm
This fixes up the parameter operators #, ##, % and %%, and their flagged
variants (though the only flag I've tested with multibyte characters is (S)).
My heart sank when I saw this but it wasn't so difficult.
Index: Src/glob.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/glob.c,v
retrieving revision 1.52
diff -u -r1.52 glob.c
--- Src/glob.c 10 Jul 2006 13:08:23 -0000 1.52
+++ Src/glob.c 30 Jul 2006 17:49:52 -0000
@@ -208,18 +208,6 @@
int follow; /* 1 to go thru symlinks */
};
-/* Next character after one which may be a Meta (x is any char *) */
-#define METANEXT(x) (*(x) == Meta ? (x)+2 : (x)+1)
-/*
- * Increment pointer which may be on a Meta (x is a pointer variable),
- * returning the incremented value (i.e. like pre-increment).
- */
-#define METAINC(x) ((x) += (*(x) == Meta) ? 2 : 1)
-/*
- * Return unmetafied char from string (x is any char *)
- */
-#define UNMETA(x) (*(x) == Meta ? (x)[1] ^ 32 : *(x))
-
/* Add a component to pathbuf: This keeps track of how *
* far we are into a file name, since each path component *
* must be matched separately. */
@@ -2226,6 +2214,371 @@
}
/**/
+#ifdef MULTIBYTE_SUPPORT
+
+/*
+ * Advance *tp past one (possibly multibyte, possibly metafied) character.
+ * Return its length in bytes once unmetafied: a Meta pair counts as one.
+ */
+
+/**/
+static int iincchar(char **tp)
+{
+ char *t = *tp;
+ int mbclen = mb_metacharlenconv(t, NULL);
+ int umlen = 0;
+
+ while (mbclen--) {
+ umlen++;
+ if (*t++ == Meta) {
+ t++;
+ mbclen--;
+ }
+ }
+ *tp = t;
+
+ return umlen;
+}
+
+/**/
+static int
+igetmatch(char **sp, Patprog p, int fl, int n, char *replstr)
+{
+ char *s = *sp, *t, *tmatch;
+ /*
+ * Note that ioff counts (possibly multibyte) characters in the
+ * character set (Meta's are not included), while l counts bytes in
+ * the metafied string.
+ *
+ * umlen is a counter for (unmetafied) byte lengths---neither characters
+ * nor raw byte indices; this is simply an optimisation for allocation.
+ * umltot is the full length of the string in this scheme.
+ *
+ * l is the raw string length, used together with any pointers into
+ * the string (typically t).
+ */
+ int ioff, l = strlen(*sp), matched = 1, umltot = ztrlen(*sp);
+ int umlen, nmatches;
+ /*
+ * List of bits of matches to concatenate with replacement string.
+ * The data is a struct repldata. It is not used in cases like
+ * ${...//#foo/bar} even though SUB_GLOBAL is set, since the match
+ * is anchored. It goes on the heap.
+ */
+ LinkList repllist = NULL;
+
+ /* perform must-match test for complex closures */
+ if (p->mustoff)
+ {
+ /*
+ * Yuk. Probably we should rewrite this whole function to
+ * use an unmetafied test string.
+ *
+ * Use META_HEAPDUP because we need a terminating NUL.
+ */
+ char *muststr = metafy((char *)p + p->mustoff,
+ p->patmlen, META_HEAPDUP);
+
+ if (!strstr(s, muststr))
+ matched = 0;
+ }
+
+ /* in case we used the prog before... */
+ p->flags &= ~(PAT_NOTSTART|PAT_NOTEND);
+
+ if (fl & SUB_ALL) {
+ int i = matched && pattry(p, s);
+ *sp = get_match_ret(*sp, 0, i ? l : 0, fl, i ? replstr : 0, repllist);
+ if (! **sp && (((fl & SUB_MATCH) && !i) || ((fl & SUB_REST) && i)))
+ return 0;
+ return 1;
+ }
+ if (matched) {
+ switch (fl & (SUB_END|SUB_LONG|SUB_SUBSTR)) {
+ case 0:
+ case SUB_LONG:
+ /*
+ * Largest/smallest possible match at head of string.
+ * First get the longest match...
+ */
+ if (pattry(p, s)) {
+ /* patmatchlen returns metafied length, as we need */
+ int mlen = patmatchlen();
+ if (!(fl & SUB_LONG) && !(p->flags & PAT_PURES)) {
+ /*
+ * ... now we know whether it's worth looking for the
+ * shortest, which we do by brute force.
+ */
+ mb_metacharinit();
+ for (t = s, umlen = 0; t < s + mlen; ) {
+ set_pat_end(p, *t);
+ if (pattrylen(p, s, t - s, umlen, 0)) {
+ mlen = patmatchlen();
+ break;
+ }
+ umlen += iincchar(&t);
+ }
+ }
+ *sp = get_match_ret(*sp, 0, mlen, fl, replstr, repllist);
+ return 1;
+ }
+ break;
+
+ case SUB_END:
+ /*
+ * Smallest possible match at tail of string.
+ * As we can only be sure we've got wide characters right
+ * when going forwards, we need to match at every point
+ * until we fail and record the last successful match.
+ *
+ * It's important that we return the last successful match
+ * so that match, mbegin, mend and MATCH, MBEGIN, MEND are
+ * correct.
+ */
+ mb_metacharinit();
+ tmatch = NULL;
+ for (ioff = 0, t = s, umlen = umltot; t < s + l; ioff++) {
+ set_pat_start(p, t-s);
+ if (pattrylen(p, t, s + l - t, umlen, ioff))
+ tmatch = t;
+ umlen -= iincchar(&t);
+ }
+ if (tmatch) {
+ *sp = get_match_ret(*sp, tmatch - s, l, fl, replstr, repllist);
+ return 1;
+ }
+ if (pattrylen(p, s + l, 0, 0, ioff)) {
+ *sp = get_match_ret(*sp, l, l, fl, replstr, repllist);
+ return 1;
+ }
+ break;
+
+ case (SUB_END|SUB_LONG):
+ /* Largest possible match at tail of string: *
+ * move forward along string until we get a match. *
+ * Again there's no optimisation. */
+ mb_metacharinit();
+ for (ioff = 0, t = s, umlen = umltot; t < s + l; ioff++) {
+ set_pat_start(p, t-s);
+ if (pattrylen(p, t, s + l - t, umlen, ioff)) {
+ *sp = get_match_ret(*sp, t-s, l, fl, replstr, repllist);
+ return 1;
+ }
+ umlen -= iincchar(&t);
+ }
+ break;
+
+ case SUB_SUBSTR:
+ /* Smallest at start, but matching substrings. */
+ set_pat_start(p, l);
+ if (!(fl & SUB_GLOBAL) && pattry(p, s + l) && !--n) {
+ *sp = get_match_ret(*sp, 0, 0, fl, replstr, repllist);
+ return 1;
+ } /* fall through */
+ case (SUB_SUBSTR|SUB_LONG):
+ /* longest or smallest at start with substrings */
+ t = s;
+ if (fl & SUB_GLOBAL)
+ repllist = newlinklist();
+ ioff = 0; /* offset into string */
+ umlen = umltot;
+ mb_metacharinit();
+ do {
+ /* loop over all matches for global substitution */
+ matched = 0;
+ for (; t < s + l; ioff++) {
+ /* Find the longest match from this position. */
+ set_pat_start(p, t-s);
+ if (pattrylen(p, t, s + l - t, umlen, ioff)) {
+ char *mpos = t + patmatchlen();
+ if (!(fl & SUB_LONG) && !(p->flags & PAT_PURES)) {
+ char *ptr;
+ int umlen2;
+ /*
+ * If searching for the shortest match,
+ * start with a zero length and increase
+ * it until we reach the longest possible
+ * match, accepting the first successful
+ * match.
+ */
+ for (ptr = t, umlen2 = 0; ptr < mpos;) {
+ set_pat_end(p, *ptr);
+ if (pattrylen(p, t, ptr - t, umlen2, ioff)) {
+ mpos = t + patmatchlen();
+ break;
+ }
+ umlen2 += iincchar(&ptr);
+ }
+ }
+ if (!--n || (n <= 0 && (fl & SUB_GLOBAL))) {
+ *sp = get_match_ret(*sp, t-s, mpos-s, fl,
+ replstr, repllist);
+ if (mpos == t)
+ mpos += mb_metacharlenconv(mpos, NULL);
+ }
+ if (!(fl & SUB_GLOBAL)) {
+ if (n) {
+ /*
+ * Looking for a later match: in this case,
+ * we can continue looking for matches from
+ * the next character, even if it overlaps
+ * with what we just found.
+ */
+ umlen -= iincchar(&t);
+ continue;
+ } else {
+ return 1;
+ }
+ }
+ /*
+ * For a global match, we need to skip the stuff
+ * which is already marked for replacement.
+ */
+ matched = 1;
+ while (t < mpos) {
+ ioff++;
+ umlen -= iincchar(&t);
+ }
+ break;
+ }
+ umlen -= iincchar(&t);
+ }
+ } while (matched);
+ /*
+ * check if we can match a blank string, if so do it
+ * at the start. Goodness knows if this is a good idea
+ * with global substitution, so it doesn't happen.
+ */
+ set_pat_start(p, l);
+ if ((fl & (SUB_LONG|SUB_GLOBAL)) == SUB_LONG &&
+ pattry(p, s + l) && !--n) {
+ *sp = get_match_ret(*sp, 0, 0, fl, replstr, repllist);
+ return 1;
+ }
+ break;
+
+ case (SUB_END|SUB_SUBSTR):
+ case (SUB_END|SUB_LONG|SUB_SUBSTR):
+ /* Longest/shortest at end, matching substrings. */
+ if (!(fl & SUB_LONG)) {
+ set_pat_start(p, l);
+ if (pattrylen(p, s + l, 0, 0, umltot) && !--n) {
+ *sp = get_match_ret(*sp, l, l, fl, replstr, repllist);
+ return 1;
+ }
+ }
+ /*
+ * If multibyte characters are present we need to start from the
+ * beginning. This is a bit unpleasant because we can't tell in
+ * advance how many times it will match and from where, so if n is
+ * greater than 1 we will need to count the number of times it
+ * matched and then go through again until we reach the right
+ * point. (Either that or record every single match in a list,
+ * which isn't stupid; it involves more memory management at this
+ * level but less use of the pattern matcher.)
+ */
+ nmatches = 0;
+ tmatch = NULL;
+ mb_metacharinit();
+ for (ioff = 0, t = s, umlen = umltot; t < s + l; ioff++) {
+ set_pat_start(p, t-s);
+ if (pattrylen(p, t, s + l - t, umlen, ioff)) {
+ nmatches++;
+ tmatch = t;
+ }
+ umlen -= iincchar(&t);
+ }
+ if (nmatches) {
+ char *mpos;
+ if (n > 1) {
+ /*
+ * We need to find the n'th last match.
+ */
+ n = nmatches - n;
+ mb_metacharinit();
+ for (ioff = 0, t = s, umlen = umltot; t < s + l; ioff++) {
+ set_pat_start(p, t-s);
+ if (pattrylen(p, t, s + l - t, umlen, ioff) &&
+ !n--) {
+ tmatch = t;
+ break;
+ }
+ umlen -= iincchar(&t);
+ }
+ }
+ mpos = tmatch + patmatchlen();
+ /* Look for the shortest match if necessary */
+ if (!(fl & SUB_LONG) && !(p->flags & PAT_PURES)) {
+ for (t = tmatch, umlen = 0; t < mpos; ) {
+ set_pat_end(p, *t);
+ if (pattrylen(p, tmatch, t - tmatch, umlen, ioff)) {
+ mpos = tmatch + patmatchlen();
+ break;
+ }
+ umlen += iincchar(&t);
+ }
+ }
+ *sp = get_match_ret(*sp, tmatch-s, mpos-s, fl,
+ replstr, repllist);
+ return 1;
+ }
+ set_pat_start(p, l);
+ if ((fl & SUB_LONG) && pattrylen(p, s + l, 0, 0, umltot) && !--n) {
+ *sp = get_match_ret(*sp, l, l, fl, replstr, repllist);
+ return 1;
+ }
+ break;
+ }
+ }
+
+ if (repllist && nonempty(repllist)) {
+ /* Put all the bits of a global search and replace together. */
+ LinkNode nd;
+ Repldata rd;
+ int lleft = 0; /* size of returned string */
+ char *ptr, *start;
+ int i;
+
+ i = 0; /* start of last chunk we got from *sp */
+ for (nd = firstnode(repllist); nd; incnode(nd)) {
+ rd = (Repldata) getdata(nd);
+ lleft += rd->b - i; /* previous chunk of *sp */
+ lleft += strlen(rd->replstr); /* the replaced bit */
+ i = rd->e; /* start of next chunk of *sp */
+ }
+ lleft += l - i; /* final chunk from *sp */
+ start = t = zhalloc(lleft+1);
+ i = 0;
+ for (nd = firstnode(repllist); nd; incnode(nd)) {
+ rd = (Repldata) getdata(nd);
+ memcpy(t, s + i, rd->b - i);
+ t += rd->b - i;
+ ptr = rd->replstr;
+ while (*ptr)
+ *t++ = *ptr++;
+ i = rd->e;
+ }
+ memcpy(t, s + i, l - i);
+ start[lleft] = '\0';
+ *sp = (char *)start;
+ return 1;
+ }
+
+ /* munge the whole string: no match, so no replstr */
+ *sp = get_match_ret(*sp, 0, 0, fl, 0, 0);
+ return 1;
+}
+
+/**/
+#else
+
+/*
+ * Advance pointer variable x past one possibly-metafied byte, yielding the
+ * new value (like pre-increment). x is evaluated twice: no side effects.
+ */
+#define METAINC(x) ((x) += (*(x) == Meta) ? 2 : 1)
+
+/**/
static int
igetmatch(char **sp, Patprog p, int fl, int n, char *replstr)
{
@@ -2496,6 +2849,9 @@
return 1;
}
+/**/
+#endif /* MULTIBYTE_SUPPORT */
+
/* blindly turn a string into a tokenised expression without lexing */
/**/
Index: Test/D07multibyte.ztst
===================================================================
RCS file: /cvsroot/zsh/zsh/Test/D07multibyte.ztst,v
retrieving revision 1.8
diff -u -r1.8 D07multibyte.ztst
--- Test/D07multibyte.ztst 25 Jul 2006 18:10:38 -0000 1.8
+++ Test/D07multibyte.ztst 30 Jul 2006 17:49:52 -0000
@@ -264,3 +264,22 @@
>62: space
>64: space
>70: punct
+
+ ioh="Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος"
+ print ${ioh#[[:alpha:]]##}
+ print ${ioh##[[:alpha:]]##}
+ print ${ioh%[[:alpha:]]##}
+ print ${ioh%%[[:alpha:]]##}
+ print ${(S)ioh#λ*ς}
+ print ${(S)ioh##λ*ς}
+ print ${(S)ioh%θ*ς}
+ print ${(S)ioh%%θ*ς}
+0:Parameter #, ##, %, %% with multibyte characters
+>ν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος
+> ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος
+>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγο
+>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ
+>Ἐν ἀρχῇ ἦν ὁ , καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ θεὸς ἦν ὁ λόγος
+>Ἐν ἀρχῇ ἦν ὁ
+>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ ἦν ὁ λόγος
+>Ἐν ἀρχῇ ἦν ὁ λόγος, καὶ ὁ λόγος ἦν πρὸς τὸν θεόν, καὶ
--
Peter Stephenson <p.w.stephenson@xxxxxxxxxxxx>
Web page now at http://homepage.ntlworld.com/p.w.stephenson/
Messages sorted by:
Reverse Date,
Date,
Thread,
Author