Zsh Mailing List Archive Messages sorted by: Reverse Date, Date, Thread, Author
PATCH: convert lower levels of matching to wide char / multibyte

X-seq: zsh-workers 26047
From: Peter Stephenson <p.w.stephenson@xxxxxxxxxxxx>
To: zsh-workers@xxxxxxxxxx (Zsh hackers list)
Subject: PATCH: convert lower levels of matching to wide char / multibyte
Date: Sat, 15 Nov 2008 21:11:34 +0000
Mailing-list: contact zsh-workers-help@xxxxxxxxxx; run by ezmlm
This continues the process of converting completion matching to support
multibyte strings.  It is a reasonably complete (but not necessarily
bug-free) conversion of the lower level of the system, roughly up to the
level of join_strs() and join_sub(), though there are some fixes to
string lengths that are needed even within those functions.  bld_line()
and below (the pattern matching functions) should be OK since they
mostly deal with patterns and wide strings rather than strings, and
I think I've fixed up the latter.

The remaining task is to convert the higher levels to use lengths
corresponding to multibyte characters instead of single-byte characters.
This looks like it's going to be very long, boring, and intricate.  This
is where I started the last time I attempted this but gave up; this time
with a more solid underpinning I can probably break the task down more easily.

Consequently, I'm aware this is incomplete, but feel free to report
problems anyway since they may give me a useful way into the remaining
work.

This has always been fairly broken even with respect to metafied
strings, not just multibyte strings, so there is at least some hope it
isn't any worse.

Index: Src/pattern.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/pattern.c,v
retrieving revision 1.48
diff -u -r1.48 pattern.c
--- Src/pattern.c	11 Oct 2008 21:57:03 -0000	1.48
+++ Src/pattern.c	15 Nov 2008 21:09:35 -0000
@@ -3344,7 +3344,6 @@
 }
 
 
-#if 0
 /*
  * This is effectively the reverse of mb_patmatchrange().
  * Given a range descriptor of the same form, and an index into it,
@@ -3353,11 +3352,6 @@
  * return the type in mtp instead.  Return 1 if successful, 0 if
  * there was no corresponding index.  Note all pointer arguments
  * must be non-null.
- *
- * TODO: for now the completion matching code does not handle
- * multibyte.  When it does, we will need either this, or
- * patmatchindex(), but not both---unlike user-initiated pattern
- * matching, multibyte mode in the line editor is always on when available.
  */
 
 /**/
@@ -3438,10 +3432,9 @@
     /* No corresponding index. */
     return 0;
 }
-#endif
 
 /**/
-#endif
+#endif /* MULTIBYTE_SUPPORT */
 
 /*
  * Identical function to mb_patmatchrange() above for single-byte
@@ -3572,9 +3565,17 @@
     return 0;
 }
 
+
+/**/
+#ifndef MULTIBYTE_SUPPORT
+
 /*
  * Identical function to mb_patmatchindex() above for single-byte
  * characters.  Here -1 represents a character that needs a special type.
+ *
+ * Unlike patmatchrange, we only need this in ZLE, which always
+ * uses MULTIBYTE_SUPPORT if compiled in; hence we don't use
+ * this function in that case.
  */
 
 /**/
@@ -3658,6 +3659,9 @@
     return 0;
 }
 
+/**/
+#endif /* MULTIBYTE_SUPPORT */
+
 /*
  * Repeatedly match something simple and say how many times.
  * charstart is an array parallel to that starting at patinput
Index: Src/Zle/comp.h
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/comp.h,v
retrieving revision 1.18
diff -u -r1.18 comp.h
--- Src/Zle/comp.h	8 Jun 2008 17:53:55 -0000	1.18
+++ Src/Zle/comp.h	15 Nov 2008 21:09:35 -0000
@@ -190,10 +190,7 @@
 				 * Note the allocated length may be longer
 				 * than the null-terminated string.
 				 */
-	int chr;		/* if a single character, it
-				 * TODO: eventually should be a
-				 * convchar_t.
-				 */
+	convchar_t chr;		/* if a single character, it */
     } u;
 };
 
@@ -201,9 +198,17 @@
  * For now this just handles single-byte characters.
  * TODO: this will change.
  */
+#ifdef MULTIBYTE_SUPPORT
+#define PATMATCHRANGE(r, c, ip, mtp)	mb_patmatchrange(r, c, ip, mtp)
+#define PATMATCHINDEX(r, i, cp, mtp)	mb_patmatchindex(r, i, cp, mtp)
+#define CONVCAST(c)			((wchar_t)(c))
+#define CHR_INVALID			(WEOF)
+#else
 #define PATMATCHRANGE(r, c, ip, mtp)	patmatchrange(r, c, ip, mtp)
 #define PATMATCHINDEX(r, i, cp, mtp)	patmatchindex(r, i, cp, mtp)
-#define CONVCAST(c)	(c)
+#define CONVCAST(c)			(c)
+#define CHR_INVALID			(-1)
+#endif
 
 /* This is a special return value for parse_cmatcher(), *
  * signalling an error. */
Index: Src/Zle/complete.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/complete.c,v
retrieving revision 1.43
diff -u -r1.43 complete.c
--- Src/Zle/complete.c	30 Oct 2008 20:29:55 -0000	1.43
+++ Src/Zle/complete.c	15 Nov 2008 21:09:36 -0000
@@ -381,11 +381,12 @@
 {
     Cpattern ret = NULL, r = NULL, n;
     char *s = *sp;
-    int inchar;
-    int l = 0;
+    convchar_t inchar;
+    int l = 0, inlen;
 
     *err = 0;
 
+    MB_METACHARINIT();
     while (*s && (e ? (*s != e) : !inblank(*s))) {
 	n = (Cpattern) hcalloc(sizeof(*n));
 	n->next = NULL;
@@ -409,11 +410,12 @@
 	    if (*s == '\\' && s[1])
 		s++;
 
-	    if (*s == Meta)
-		inchar = STOUC(*++s) ^ 32;
-	    else
-		inchar = STOUC(*s);
-	    s++;
+	    inlen = MB_METACHARLENCONV(s, &inchar);
+#ifdef MULTIBYTE_SUPPORT
+	    if (inchar == WEOF)
+		inchar = (convchar_t)(*s == Meta ? s[1] ^ 32 : *s);
+#endif
+	    s += inlen;
 	    n->tp = CPAT_CHAR;
 	    n->u.chr = inchar;
 	}
Index: Src/Zle/compmatch.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/compmatch.c,v
retrieving revision 1.61
diff -u -r1.61 compmatch.c
--- Src/Zle/compmatch.c	30 Oct 2008 22:04:03 -0000	1.61
+++ Src/Zle/compmatch.c	15 Nov 2008 21:09:37 -0000
@@ -1152,11 +1152,10 @@
  */
 
 /**/
-mod_export int
-pattern_match1(Cpattern p, int c, int *mtp)
+mod_export convchar_t
+pattern_match1(Cpattern p, convchar_t c, int *mtp)
 {
-    /* TODO: should become convchar_t */
-    int ind;
+    convchar_t ind;
 
     *mtp = 0;
     switch (p->tp) {
@@ -1193,29 +1192,31 @@
  * wind is the index returned by a pattern match on the word pattern,
  * with type wmtp.
  * wchr is the word character.
- * Return -1 if no matching character, else the character.
+ * Return CHR_INVALID if no matching character, else the character.
  *
  * Only makes sense if lp->tp == CPAT_EQUIV and the (unseen) word
  * pattern also has that type.
  */
 
 /**/
-mod_export int
-pattern_match_equivalence(Cpattern lp, int wind, int wmtp, int wchr)
+mod_export convchar_t
+pattern_match_equivalence(Cpattern lp, convchar_t wind, int wmtp,
+			  convchar_t wchr)
 {
-    int lchr, lmtp;
+    convchar_t lchr;
+    int lmtp;
 
     if (!PATMATCHINDEX(lp->u.str, wind-1, &lchr, &lmtp)) {
 	/*
 	 * No equivalent.  No possible match; give up.
 	 */
-	return -1;
+	return CHR_INVALID;
     }
     /*
      * If we matched an exact character rather than a range
      * type, return it.
      */
-    if (lchr != -1)
+    if (lchr != CHR_INVALID)
 	return lchr;
 
     /*
@@ -1223,9 +1224,9 @@
      * version of the word character.
      */
     if (wmtp == PP_UPPER && lmtp == PP_LOWER)
-	return tulower(wchr);
+	return ZC_tolower(wchr);
     else if (wmtp == PP_LOWER && lmtp == PP_UPPER)
-	return tuupper(wchr);
+	return ZC_toupper(wchr);
     else if (wmtp == lmtp) {
 	/*
 	 * Be lenient and allow identical replacements
@@ -1238,25 +1239,21 @@
 	/*
 	 * Non-matching generic types; this can't work.
 	 */
-	return -1;
+	return CHR_INVALID;
     }
 }
 
 /*
  * Check if the given pattern matches the given string.
- * p and  s are either anchor or line pattern and string;
- * wp and ws are word (candidate) pattern and string
- *
- * If only one pattern is given, we just check if characters match.
- * If both line and word are given, we check that characters match
- * for {...} classes by comparing positions in the strings.
+ * p is either an anchor or line pattern and string;
+ * wp and wsc are word (candidate) pattern and string
  *
- * Patterns and strings are always passed in pairs, so it is enough
- * to check for non-NULL wp. p should always be present.
+ * Check that characters match for {...} classes by comparing positions in the
+ * strings.
  *
- * If prestrict is not NULL, it is a chain of patterns at least as long
+ * prestrict is a chain of patterns at least as long
  * as the line string.  In this case we are still assembling the line at
- * s (which has been allocated but doesn't yet contain anything useful)
+ * newline (which has been allocated but doesn't yet contain anything useful)
  * and must continue to do so as we go along; prestrict gives
  * restrictions on the line character to be applied along side the other
  * patterns.  In the simple case a restriction is a character to be put
@@ -1264,27 +1261,22 @@
  * deduce an actual matching character.  Note prestrict is never an
  * equivalence class.  In extreme cases we can't deduce a unique
  * character; then the match fails.
+ *
+ * If prestrict is not NULL, s will be NULL.
  */
 
 /**/
-mod_export int
-pattern_match_restrict(Cpattern p, char *s, Cpattern wp, char *ws,
-		       Cpattern prestrict)
+static int
+pattern_match_restrict(Cpattern p, Cpattern wp, convchar_t *wsc, int wsclen,  
+		       Cpattern prestrict, ZLE_STRING_T newline)
 {
-    int c, ind;
-    int wc, wind;
-    int len = 0, wlen, mt, wmt;
+    convchar_t c;
+    convchar_t ind, wind;
+    int mt, wmt;
 
-    while (p && wp && (prestrict || *s) && *ws) {
+    while (p && wp && wsclen && prestrict) {
 	/* First test the word character */
-	if (*ws == Meta) {
-	    wc = STOUC(ws[1]) ^ 32;
-	    wlen = 2;
-	} else {
-	    wc = STOUC(*ws);
-	    wlen = 1;
-	}
-	wind = pattern_match1(wp, wc, &wmt);
+	wind = pattern_match1(wp, *wsc, &wmt);
 	if (!wind)
 	    return 0;
 
@@ -1292,55 +1284,45 @@
 	 * Now the line character; deal with the case where
 	 * we don't yet have it, only a restriction on it.
 	 */
-	if (prestrict) {
-	    if (prestrict->tp == CPAT_CHAR) {
+	if (prestrict->tp == CPAT_CHAR) {
+	    /*
+	     * Easy case: restricted to an exact character on
+	     * the line.  Procede as normal.
+	     */
+	    c = prestrict->u.chr;
+	} else {
+	    if (p->tp == CPAT_CHAR) {
 		/*
-		 * Easy case: restricted to an exact character on
-		 * the line.  Procede as normal.
+		 * Normal line pattern is an exact character:  as
+		 * long as this matches prestrict, we can proceed
+		 * as usual.
 		 */
-		c = prestrict->u.chr;
-	    } else {
-		if (p->tp == CPAT_CHAR) {
-		    /*
-		     * Normal line pattern is an exact character:  as
-		     * long as this matches prestrict, we can proceed
-		     * as usual.
-		     */
-		    c = p->u.chr;
-		} else if (p->tp == CPAT_EQUIV) {
-		    /*
-		     * An equivalence, so we can deduce the character
-		     * backwards from the word pattern and see if it
-		     * matches prestrict.
-		     */
-		    if ((c = pattern_match_equivalence(p, wind, wmt, wc)) == -1)
-			return 0;
-		} else {
-		    /*
-		     * Not an equivalence, so that means we must match
-		     * the word (not just the word pattern), so grab it
-		     * and make sure it fulfills our needs.  I think.
-		     * Not 100% sure about that, but what else can
-		     * we do?  We haven't actually been passed a string
-		     * from the command line.
-		     */
-		    c = wc;
-		}
-		/* Character so deduced must match the restriction. */
-		if (!pattern_match1(prestrict, c, &mt))
+		c = p->u.chr;
+	    } else if (p->tp == CPAT_EQUIV) {
+		/*
+		 * An equivalence, so we can deduce the character
+		 * backwards from the word pattern and see if it
+		 * matches prestrict.
+		 */
+		if ((c = pattern_match_equivalence(p, wind, wmt, *wsc)) ==
+		    CHR_INVALID)
 		    return 0;
-	    }
-	    len = imeta(c) ? 2 : 1;
-	} else {
-	    /* We have the character itself. */
-	    if (*s == Meta) {
-		c = STOUC(s[1]) ^ 32;
-		len = 2;
 	    } else {
-		c = STOUC(*s);
-		len = 1;
+		/*
+		 * Not an equivalence, so that means we must match
+		 * the word (not just the word pattern), so grab it
+		 * and make sure it fulfills our needs.  I think.
+		 * Not 100% sure about that, but what else can
+		 * we do?  We haven't actually been passed a string
+		 * from the command line.
+		 */
+		c = *wsc;
 	    }
+	    /* Character so deduced must match the restriction. */
+	    if (!pattern_match1(prestrict, c, &mt))
+		return 0;
 	}
+
 	/*
 	 * If either is "?", they match each other; no further tests.
 	 * Apply this even if the character wasn't convertable;
@@ -1364,7 +1346,7 @@
 		 */
 		if ((mt == PP_LOWER || mt == PP_UPPER) &&
 		    (wmt == PP_LOWER || wmt == PP_UPPER)) {
-		    if (tulower(c) != tulower(wc))
+		    if (ZC_tolower(c) != ZC_tolower(*wsc))
 			return 0;
 		} else {
 		    /* Other different classes can't match. */
@@ -1373,71 +1355,46 @@
 	    }
 	}
 
-	if (prestrict) {
-	    /* We need to assemble the line */
-	    if (imeta(c)) {
-		*s++ = Meta;
-		*s++ = c ^ 32;
-	    } else {
-		*s++ = c;
-	    }
-	    prestrict = prestrict->next;
-	} else
-	    s += len;
-	ws += wlen;
+	/* We need to assemble the line */
+	*newline++ = (ZLE_CHAR_T)c;
+	prestrict = prestrict->next;
+	wsc++;
+	wsclen--;
 	p = p->next;
 	wp = wp->next;
     }
 
-    while (p && (prestrict || *s)) {
-	if (prestrict) {
-	    /*
-	     * As above, but with even less info to go on.
-	     * (Can this happen?)  At least handle the cases where
-	     * one of our patterns has given us a specific character.
-	     */
-	    if (prestrict->tp == CPAT_CHAR) {
-		c = prestrict->u.chr;
-	    } else {
-		if (p->tp == CPAT_CHAR) {
-		    c = p->u.chr;
-		} else {
-		    /*
-		     * OK.  Here we are in a function with just a line
-		     * pattern and another pattern to restrict the
-		     * characters that can go on the line, and no actual
-		     * characters.  We're matching two patterns against
-		     * one another to generate a character to insert.
-		     * This is a bit too psychedelic, so I'm going to
-		     * bale out now.  See you on the ground.
-		     */
-		    return 0;
-		}
-		if (!pattern_match1(prestrict, c, &mt))
-		    return 0;
-	    }
+    while (p && prestrict) {
+	/*
+	 * As above, but with even less info to go on.
+	 * (Can this happen?)  At least handle the cases where
+	 * one of our patterns has given us a specific character.
+	 */
+	if (prestrict->tp == CPAT_CHAR) {
+	    c = prestrict->u.chr;
 	} else {
-	    if (*s == Meta) {
-		c = STOUC(s[1]) ^ 32;
-		len = 2;
+	    if (p->tp == CPAT_CHAR) {
+		c = p->u.chr;
 	    } else {
-		c = STOUC(*s);
-		len = 1;
+		/*
+		 * OK.  Here we are in a function with just a line
+		 * pattern and another pattern to restrict the
+		 * characters that can go on the line, and no actual
+		 * characters.  We're matching two patterns against
+		 * one another to generate a character to insert.
+		 * This is a bit too psychedelic, so I'm going to
+		 * bale out now.  See you on the ground.
+		 */
+		return 0;
 	    }
+	    if (!pattern_match1(prestrict, c, &mt))
+		return 0;
 	}
 	if (!pattern_match1(p, c, &mt))
 	    return 0;
 	p = p->next;
-	if (prestrict) {
-	    if (imeta(c)) {
-		*s++ = Meta;
-		*s++ = c ^ 32;
-	    } else {
-		*s++ = c;
-	    }
-	    prestrict = prestrict->next;
-	} else
-	    s += len;
+	*newline++ = (ZLE_CHAR_T)c;
+	prestrict = prestrict->next;
     }
 
     if (prestrict) {
@@ -1445,35 +1402,158 @@
 	return 0;
     }
 
-    while (wp && *ws) {
+    while (wp && wsclen) {
 	/* No funny business when we only have the word pattern. */
-	if (*ws == Meta) {
-	    wc = STOUC(ws[1]) ^ 32;
-	    wlen = 2;
-	} else {
-	    wc = STOUC(*ws);
-	    wlen = 1;
-	}
-	if (!pattern_match1(wp, wc, &wmt))
+	if (!pattern_match1(wp, *wsc, &wmt))
 	    return 0;
 	wp = wp->next;
-	ws += wlen;
+	wsc++;
+	wsclen--;
     }
 
     return 1;
 }
 
+
 /*
  * The usual version of pattern matching, without the line string
  * being handled by restriction.
+ *
+ * Check if the given pattern matches the given string.
+ * p and  s are either anchor or line pattern and string;
+ * wp and ws are word (candidate) pattern and string
+ *
+ * If only one pattern is given, we just check if characters match.
+ * If both line and word are given, we check that characters match
+ * for {...} classes by comparing positions in the strings.
+ *
+ * Patterns and strings are always passed in pairs, so it is enough
+ * to check for non-NULL wp. p should always be present.
  */
 /**/
 mod_export int
 pattern_match(Cpattern p, char *s, Cpattern wp, char *ws)
 {
-    return pattern_match_restrict(p, s, wp, ws, NULL);
+    convchar_t c, wc;
+    convchar_t ind, wind;
+    int len = 0, wlen, mt, wmt;
+#ifdef MULTIBYTE_SUPPORT
+    mbstate_t lstate, wstate;
+
+    memset(&lstate, 0, sizeof(lstate));
+    memset(&wstate, 0, sizeof(wstate));
+#endif
+
+    while (p && wp && *s && *ws) {
+	/* First test the word character */
+#ifdef MULTIBYTE_SUPPORT
+	wlen = mb_metacharlenconv_r(ws, &wc, &wstate);
+#else
+	if (*ws == Meta) {
+	    wc = STOUC(ws[1]) ^ 32;
+	    wlen = 2;
+	} else {
+	    wc = STOUC(*ws);
+	    wlen = 1;
+	}
+#endif
+	wind = pattern_match1(wp, wc, &wmt);
+	if (!wind)
+	    return 0;
+
+	/*
+	 * Now the line character.
+	 */
+#ifdef MULTIBYTE_SUPPORT
+	len = mb_metacharlenconv_r(s, &c, &lstate);
+#else
+	/* We have the character itself. */
+	if (*s == Meta) {
+	    c = STOUC(s[1]) ^ 32;
+	    len = 2;
+	} else {
+	    c = STOUC(*s);
+	    len = 1;
+	}
+#endif
+	/*
+	 * If either is "?", they match each other; no further tests.
+	 * Apply this even if the character wasn't convertable;
+	 * there's no point trying to be clever in that case.
+	 */
+	if (p->tp != CPAT_ANY || wp->tp != CPAT_ANY)
+	{
+	    ind = pattern_match1(p, c, &mt);
+	    if (!ind)
+		return 0;
+	    if (ind != wind)
+		return 0;
+	    if (mt != wmt) {
+		/*
+		 * Special case if matching lower vs. upper or
+		 * vice versa.  The transformed characters must match.
+		 * We don't need to check the transformation is
+		 * the appropriate one for each character separately,
+		 * since that was done in pattern_match1(), so just
+		 * compare lower-cased versions of both.
+		 */
+		if ((mt == PP_LOWER || mt == PP_UPPER) &&
+		    (wmt == PP_LOWER || wmt == PP_UPPER)) {
+		    if (ZC_tolower(c) != ZC_tolower(wc))
+			return 0;
+		} else {
+		    /* Other different classes can't match. */
+		    return 0;
+		}
+	    }
+	}
+
+	s += len;
+	ws += wlen;
+	p = p->next;
+	wp = wp->next;
+    }
+
+    while (p && *s) {
+#ifdef MULTIBYTE_SUPPORT
+	len = mb_metacharlenconv_r(s, &c, &lstate);
+#else
+	if (*s == Meta) {
+	    c = STOUC(s[1]) ^ 32;
+	    len = 2;
+	} else {
+	    c = STOUC(*s);
+	    len = 1;
+	}
+#endif
+	if (!pattern_match1(p, c, &mt))
+	    return 0;
+	p = p->next;
+	s += len;
+    }
+
+    while (wp && *ws) {
+#ifdef MULTIBYTE_SUPPORT
+	wlen = mb_metacharlenconv_r(ws, &wc, &wstate);
+#else
+	if (*ws == Meta) {
+	    wc = STOUC(ws[1]) ^ 32;
+	    wlen = 2;
+	} else {
+	    wc = STOUC(*ws);
+	    wlen = 1;
+	}
+#endif
+	if (!pattern_match1(wp, wc, &wmt))
+	    return 0;
+	wp = wp->next;
+	ws += wlen;
+    }
+
+    return 1;
 }
 
+
 /* This splits the given string into a list of cline structs, separated
  * at those places where one of the anchors of an `*' pattern was found.
  * plen gives the number of characters on the line that matched this
@@ -1575,30 +1655,45 @@
  * buffer line.  Then we test if this line matches the string given by
  * wlen and word.
  *
- * wpat contains pattern that matched previously
- * lpat contains the pattern for line we build
+ * The matcher  ) wpat, containing pattern that matched previously
+ *   mp gives   ) lpat, containing the pattern for line we build
+ * line is the line we are assembling; it is initially empty
  * mword is a string that matched wpat before
  * word is string that we try to match now
  *
  * The return value is the length of the string matched in the word, it
  * is zero if we couldn't build a line that matches the word.
- *
- * TODO: a lot of the nastiness associated with variable string
- * lengths can go when we switch to wide characters.  (Why didn't
- * I just keep line unmetafied and metafy into place at the end?  Er...)
  */
 
 /**/
 static int
-bld_line(Cmatcher mp, char **linep, char *mword, char *word, int wlen, int sfx)
+bld_line(Cmatcher mp, ZLE_STRING_T line, char *mword, char *word,
+	 int wlen, int sfx)
 {
     Cpattern lpat = mp->line;
     Cpattern wpat = mp->word;
     Cpattern curgenpat;
-    VARARR(struct cpattern, genpatarr, mp->llen);
     Cmlist ms;
-    int llen, rl;
-    char *oword = word, *line = *linep;
+    int llen, rl, l;
+    convchar_t convchr, *wordcp;
+    VARARR(convchar_t, wordchars, wlen);
+    VARARR(struct cpattern, genpatarr, mp->llen);
+
+    /*
+     * We may need to start the "word" array from the end.  This
+     * is much easier if we convert it to an array of (possibly wide)
+     * characters.
+     */
+    MB_METACHARINIT();
+    for (l = wlen, wordcp = wordchars; l; l--) {
+	int charlen = MB_METACHARLENCONV(word, &convchr);
+#ifdef MULTIBYTE_SUPPORT
+	if (convchr == WEOF)
+	    convchr = (*word == Meta) ? word[1] ^ 32 : *word;
+#endif
+	*wordcp++ = convchr;
+	word += charlen;
+    }
 
     /*
      * Loop over all characters.  At this stage, line is an empty
@@ -1616,9 +1711,10 @@
      * when we finally match the line against the set of matchers.
      */
     curgenpat = genpatarr;
+    MB_METACHARINIT();
     while (lpat) {
-	int wchr = (*mword == Meta) ? STOUC(mword[1]) ^ 32 : STOUC(*mword);
-	int wmtp, wind;
+	convchar_t wchr, wind;
+	int wmtp, mwordlen;
 	/*
 	 * If the line pattern is an equivalence, query wpat to find the
 	 * word part of the equivalence.  If we don't find one we don't try
@@ -1628,9 +1724,10 @@
 	 * the behaviour of the old logic that this replaces.)
 	 */
 	if (lpat->tp == CPAT_EQUIV && wpat && *mword) {
+	    mwordlen = MB_METACHARLENCONV(mword, &wchr);
 	    wind = pattern_match1(wpat, wchr, &wmtp);
 	    wpat = wpat->next;
-	    mword += (*mword == Meta) ? 2 : 1;
+	    mword += mwordlen;
 	} else
 	    wind = 0;
 	if (wind) {
@@ -1638,9 +1735,9 @@
 	     * Successful match for word side of equivalence.
 	     * Find the line equivalent.
 	     */
-	    int lchr;
+	    convchar_t lchr;
 	    if ((lchr = pattern_match_equivalence(lpat, wind, wmtp, wchr))
-		== -1) {
+		== CHR_INVALID) {
 		/*
 		 * No equivalent.  No possible match; give up.
 		 */
@@ -1694,50 +1791,40 @@
     llen = mp->llen;
     rl = 0;
 
-    *line = '\0';
     if (sfx)
     {
 	/*
 	 * We need to work backwards from the end of both the
 	 * word and the line strings.
-	 *
-	 * Position at the end of the word by counting characters.
 	 */
-	int l = wlen;
-	while (l--)
-	    word += (*word == Meta) ? 2 : 1;
+	wordcp = wordchars + wlen;
 
 	/*
-	 * We construct the line from the end.  We've left
-	 * enough space for possible Meta's.
+	 * We construct the line from the end.
 	 */
-	line += 2 * llen;
-	*line = '\0';
+	line += llen;
 	curgenpat = genpatarr + llen;
-    } else
+    } else {
+	wordcp = wordchars;
 	curgenpat = genpatarr;
+    }
 
     /* we now reuse mp, lpat, wpat for the global matchers */
+    MB_METACHARINIT();
     while (llen && wlen) {
-	int wchr, wmtp;
-	char *wp;
+	convchar_t wchr;
+	int wmtp;
+	convchar_t *wp;
 	Cpattern tmpgenpat;
 
 	if (sfx) {
-	    if (word > oword + 1 && word[-2] == Meta)
-		wp = word - 2;
-	    else
-		wp = word - 1;
+	    wp = wordcp - 1;
 	    curgenpat--;
 	} else
-	    wp = word;
-	if (*wp == Meta)
-	    wchr = STOUC(wp[1]) ^ 32;
-	else
-	    wchr = STOUC(*wp);
-	if (pattern_match1(curgenpat, wchr, &wmtp))
+	    wp = wordcp;
+	if (pattern_match1(curgenpat, *wp, &wmtp))
 	{
-	    int lchr;
+	    convchar_t lchr;
 	    /*
 	     * We can match the line character directly with the word
 	     * character.  If the line character is a fixed one,
@@ -1749,36 +1836,27 @@
 		lchr = curgenpat->u.chr;
 	    else
 		lchr = wchr;
-	    if (imeta(lchr)) {
-		if (sfx)
-		    line -= 2;
-		line[0] = Meta;
-		line[1] = lchr ^ 32;
-		if (!sfx)
-		    line += 2;
-	    } else {
-		if (sfx)
-		    line--;
-		line[0] = lchr;
-		if (!sfx)
-		    line++;
-	    }
+
+	    if (sfx)
+		*--line = lchr;
+	    else
+		*line++ = lchr;
 
 	    llen--;
 	    wlen--;
 	    rl++;
 
 	    if (sfx)
-		word = wp;
+		wordcp = wp;
 	    else {
 		if (llen)
 		    curgenpat++;
-		word += (*word == Meta) ? 2 : 1;
+		wordcp++;
 	    }
 	}
 	else
 	{
-	    char *lp;
+	    ZLE_CHAR_T *lp;
 	    /*
 	     * Need to loop over pattern matchers.
 	     */
@@ -1794,66 +1872,31 @@
 		if (mp && !mp->flags && mp->wlen <= wlen &&
 		    mp->llen <= llen)
 		{
+		    lp = line;
+		    wp = wordcp;
+		    tmpgenpat = curgenpat;
+
 		    if (sfx) {
-			/*
-			 * We haven't assembled the line yet, and with
-			 * Meta characters we don't yet know the length.
-			 * We'll fix this up later.
-			 */
-			lp = line - 2 * mp->llen;
-		    } else
-			lp = line;
-		    wp = word;
-		    if (sfx) {
-			int l = mp->wlen;
-			while (l--) {
-			    if (wp > oword + 1 && wp[-2] == Meta)
-				wp -= 2;
-			    else
-				wp--;
-			}
+			lp -= mp->llen;
+			wp -= mp->wlen;
+			tmpgenpat -= mp->llen;
+		    }
 
-			tmpgenpat = curgenpat - mp->llen;
-		    } else
-			tmpgenpat = curgenpat;
-		    if (pattern_match_restrict(mp->line, lp,
-					       mp->word, wp, tmpgenpat)) {
+		    if (pattern_match_restrict(mp->line, mp->word, wp,
+					       wlen - (wp - wordchars),
+					       tmpgenpat, lp)) {
 			/*
 			 * Matched: advance over as many characters
 			 * of the patterns and strings as
 			 * we've done matches.
 			 */
 			if (sfx) {
-			    int imove = mp->llen, nchar;
-			    char *pmove = lp;
-			    word = wp;
-			    
-			    /* Close the gap we left in the line string */
-			    while (imove--)
-				pmove += (*pmove == Meta) ? 2 : 1;
-			    /* Number of bytes to move */
-			    nchar = (int)(pmove - lp);
-			    /* The size of the gap */
-			    imove = 2 * mp->llen - nchar;
-			    if (imove) {
-				lp = line - imove;
-				/* Moving up, so start at the top */
-				while (nchar--)
-				    *--line = *--lp;
-				/* line is at the start of the moved text */
-			    }
-
+			    line = lp;
+			    wordcp = wp;
 			    curgenpat = tmpgenpat;
 			} else {
-			    int cnt = mp->llen;
-			    while (cnt--) {
-				line += (*line == Meta) ? 2 : 1;
-			    }
-
-			    cnt = mp->wlen;
-			    while (cnt--)
-				word += (*word == Meta) ? 2 : 1;
-
+			    line += mp->llen;
+			    wordcp += mp->wlen;
 			    curgenpat += mp->llen;
 			}
 			llen -= mp->llen;
@@ -1869,10 +1912,6 @@
     }
     if (!llen) {
 	/* Unmatched portion in the line built, return matched length. */
-	if (sfx)
-	    *linep = line;
-	else
-	    *line = '\0';
 	return rl;
     }
     return 0;
@@ -1891,7 +1930,14 @@
 
     Cmlist ms;
     Cmatcher mp;
-    int t, bl, rr = rl;
+    int t, bl;
+    /** rr is the remaining length already allocated in rs */
+    int rr = rl;
+    /*
+     * convlen is the length we need for the string converted to
+     * char * (possibly multibyte).
+     */
+    int convlen;
     char *rp = rs;
 
     while (la && lb) {
@@ -1906,35 +1952,49 @@
 		    if ((t = pattern_match(mp->word, sa, NULL, NULL)) ||
 			pattern_match(mp->word, sb, NULL, NULL)) {
 			/* It matched one of the strings, t says which one. */
-			/* TODO: double to allow Meta, not necessary
-			   when properly unmetafied */
-			VARARR(char, linearr, 2*mp->llen + 1);
-			char **ap, **bp, *line = linearr;
+			VARARR(ZLE_CHAR_T, line, mp->llen);
+			char **ap, **bp;
 			int *alp, *blp;
 
 			if (t) {
-			    ap = &sa; alp = &la;
-			    bp = &sb; blp = &lb;
+			    ap = &sa;
+			    alp = &la;
+
+			    bp = &sb;
+			    blp = &lb;
 			} else {
-			    ap = &sb; alp = &lb;
-			    bp = &sa; blp = &la;
+			    ap = &sb;
+			    alp = &lb;
+
+			    bp = &sa;
+			    blp = &la;
 			}
 			/* Now try to build a string that matches the other
 			 * string. */
-			if ((bl = bld_line(mp, &line, *ap, *bp, *blp, 0))) {
+			if ((bl = bld_line(mp, line, *ap, *bp, *blp, 0))) {
 			    /* Found one, put it into the return string. */
-			    if (rr <= mp->llen) {
+			    char *convstr =
+				zlelineasstring(line, mp->llen, 0, &convlen,
+						NULL, 0);
+			    if (rr <= convlen) {
 				char *or = rs;
+				int alloclen = (convlen > 20) ? convlen : 20;
 
-				rs = realloc(rs, (rl += 20));
-				rr += 20;
+				rs = realloc(rs, (rl += alloclen));
+				rr += alloclen;
 				rp += rs - or;
 			    }
-			    memcpy(rp, line, mp->llen);
-			    rp += mp->llen; rr -= mp->llen;
-			    *ap += mp->wlen; *alp -= mp->wlen;
-			    *bp += bl; *blp -= bl;
+			    memcpy(rp, convstr, convlen);
+			    rp += convlen;
+			    rr -= convlen;
+			    /* HERE: multibyte chars */
+			    *ap += mp->wlen;
+			    *alp -= mp->wlen;
+
+			    *bp += bl;
+			    *blp -= bl;
 			    t = 1;
+			    free(convstr);
 			} else
 			    t = 0;
 		    }
@@ -1944,16 +2004,20 @@
 		break;
 	} else {
 	    /* Same character, just take it. */
-	    if (rr <= 1) {
+	    if (rr <= 1 /* HERE charlen */) {
 		char *or = rs;
 
 		rs = realloc(rs, (rl += 20));
 		rr += 20;
 		rp += rs - or;
 	    }
-	    *rp++ = *sa; rr--;
-	    sa++; sb++;
-	    la--; lb--;
+	    /* HERE: multibyte char */
+	    *rp++ = *sa;
+	    rr--;
+	    sa++;
+	    sb++;
+	    la--;
+	    lb--;
 	}
     }
     if (la || lb)
@@ -2035,9 +2099,11 @@
 	} else {
 	    md->line = 0;
 	    md->len = md->olen = md->cl->wlen;
+	    /* HERE: multibyte */
 	    if ((md->str = md->cl->word) && sfx)
 		md->str += md->len;
 	    md->alen = md->cl->llen;
+	    /* HERE: multibyte */
 	    if ((md->astr = md->cl->line) && sfx)
 		md->astr += md->alen;
 	}
@@ -2060,9 +2126,11 @@
 	r->wlen = 0;
 	r->flags |= CLF_LINE;
 	r->llen = md->len;
+	/* HERE: multibyte */
 	r->line = md->str - (sfx ? md->len : 0);
     } else if (md->len != md->olen) {
 	r->wlen = md->len;
+	/* HERE: multibyte */
 	r->word = md->str - (sfx ? md->len : 0);
 	DPUTS(r->wlen > 0 && !*r->word, "Bad word");
     }
@@ -2116,24 +2184,24 @@
 				       NULL, NULL)) ||
 		     pattern_match(mp->word, nw - (sfx ? mp->wlen : 0),
 				   NULL, NULL))) {
-		    /* TODO: doubled to allow Meta, not necessary
-		     * when properly unmetafied */
-		    VARARR(char, linearr, 2*mp->llen + 1);
+		    VARARR(ZLE_CHAR_T, line, mp->llen);
 		    int bl;
-		    char *mw, *line = linearr;
+		    char *mw;
 
 		    /* Then build all the possible lines and see
 		     * if one of them matches the other string. */
+		    /* HERE: they're multibyte */
 		    if (t)
 			mw = ow - (sfx ? mp->wlen : 0);
 		    else
 			mw = nw - (sfx ? mp->wlen : 0);
 
-		    if ((bl = bld_line(mp, &line, mw, (t ? nw : ow),
+		    if ((bl = bld_line(mp, line, mw, (t ? nw : ow),
 				       (t ? nl : ol), sfx)))  {
 			/* Yep, one of the lines matched the other
 			 * string. */
 
+			/* HERE: multibyte characters */
 			if (t) {
 			    ol = mp->wlen; nl = bl;
 			} else {
@@ -2146,8 +2214,10 @@
 			md->len -= nl;
 			*mlen = ol;
 
-			return get_cline(NULL, 0, dupstring(line), mp->llen,
-					 NULL, 0, CLF_JOIN);
+			return get_cline(NULL, 0,
+					 zlelineasstring(line, mp->llen,
+							 0, NULL, NULL, 1),
+					 mp->llen, NULL, 0, CLF_JOIN);
 		    }
 		}
 	    }
Index: Src/Zle/computil.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/computil.c,v
retrieving revision 1.113
diff -u -r1.113 computil.c
--- Src/Zle/computil.c	2 Nov 2008 17:36:26 -0000	1.113
+++ Src/Zle/computil.c	15 Nov 2008 21:09:38 -0000
@@ -4062,7 +4062,7 @@
 		    len += addlen + 1;
 	    } else {
 		/* The usual set of matcher possibilities. */
-		int ind;
+		convchar_t ind;
 		if (m->line->tp == CPAT_EQUIV &&
 		    m->word->tp == CPAT_EQUIV) {
 		    /*
@@ -4086,7 +4086,7 @@
 			 * word pattern.
 			 */
 			if ((ind = pattern_match_equivalence
-			     (m->word, ind, mt, addc)) != -1) {
+			     (m->word, ind, mt, addc)) != CHR_INVALID) {
 			    if (ret) {
 				if (imeta(ind)) {
 				    *p++ = Meta;
Index: Src/Zle/zle_utils.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle_utils.c,v
retrieving revision 1.53
diff -u -r1.53 zle_utils.c
--- Src/Zle/zle_utils.c	24 Apr 2008 10:19:01 -0000	1.53
+++ Src/Zle/zle_utils.c	15 Nov 2008 21:09:38 -0000
@@ -167,9 +167,10 @@
  * instead of wide characters where appropriate and with the contents
  * metafied.
  *
- * If outll is non-NULL, assign the new length.  If outcs is non-NULL,
- * assign the new character position.  This is the conventional string
- * length, without the NULL byte.
+ * If outllp is non-NULL, assign the new length.  This is the conventional
+ * string length, without the NULL byte.
+ *
+ * If outcsp is non-NULL, assign the new character position.
  *
  * If useheap is 1, memory is returned from the heap, else is allocated
  * for later freeing.


-- 
Peter Stephenson <p.w.stephenson@xxxxxxxxxxxx>
Web page now at http://homepage.ntlworld.com/p.w.stephenson/
Follow-Ups:
- Re: PATCH: convert lower levels of matching to wide char / multibyte
  - From: Peter Stephenson
Messages sorted by: Reverse Date, Date, Thread, Author