Zsh Mailing List Archive Messages sorted by: Reverse Date, Date, Thread, Author
PATCH: multibyte parameter lengths & case

X-seq: zsh-workers 22525
From: Peter Stephenson <pws@xxxxxxx>
To: zsh-workers@xxxxxxxxxx (Zsh hackers list)
Subject: PATCH: multibyte parameter lengths & case
Date: Wed, 28 Jun 2006 14:01:24 +0100
Mailing-list: contact zsh-workers-help@xxxxxxxxxx; run by ezmlm
Some simpler things:
- lengths of strings using ${#...}
- upper and lower case modification and capitalisation of parameters
- also in history modifiers
- $mbegin/$mend and $MBEGIN/$MEND respect the setting of MULTIBYTE

It's still not finished but at this point it should be possible to use
parameters with multibyte strings effectively in completion functions.
I've therefore added the multibyte option to the set that are turned on
when entering the completion system.  This should make, for example,
${#LBUFFER} consistent with $CURSOR when there are multibyte characters
present.

At some point we need to handle character widths better, otherwise the
parameter padding options aren't very useful with outsize characters.

Index: Completion/compinit
===================================================================
RCS file: /cvsroot/zsh/zsh/Completion/compinit,v
retrieving revision 1.15
diff -u -r1.15 compinit
--- Completion/compinit	17 Oct 2005 14:56:17 -0000	1.15
+++ Completion/compinit	28 Jun 2006 12:55:28 -0000
@@ -128,11 +128,12 @@
 # The standard options set in completion functions.
 
 _comp_options=(
-       glob
+       extendedglob
        bareglobqual
+       glob
+       multibyte
        nullglob
        rcexpandparam
-       extendedglob
        unset
     NO_markdirs
     NO_globsubst
Index: Src/hist.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/hist.c,v
retrieving revision 1.64
diff -u -r1.64 hist.c
--- Src/hist.c	30 May 2006 22:35:03 -0000	1.64
+++ Src/hist.c	28 Jun 2006 12:55:30 -0000
@@ -635,10 +635,10 @@
 		quotebreak(&sline);
 		break;
 	    case 'l':
-		downcase(&sline);
+		sline = casemodify(sline, CASMOD_LOWER);
 		break;
 	    case 'u':
-		upcase(&sline);
+		sline = casemodify(sline, CASMOD_UPPER);
 		break;
 	    default:
 		herrflush();
@@ -1503,42 +1503,130 @@
     return 0;
 }
 
-/**/
-int
-makeuppercase(char **junkptr)
-{
-    char *str = *junkptr;
-
-    for (; *str; str++)
-	*str = tuupper(*str);
-    return 1;
-}
+/*
+ * Return modified version of str from the heap with modification
+ * according to one of the CASMOD_* types defined in zsh.h; CASMOD_NONE
+ * is not handled, for obvious reasons.
+ */
 
 /**/
-int
-makelowercase(char **junkptr)
+char *
+casemodify(char *str, int how)
 {
-    char *str = *junkptr;
+    char *str2 = zhalloc(2 * strlen(str) + 1);
+    char *ptr2 = str2;
+    int nextupper = 1;
+
+#ifdef MULTIBYTE_SUPPORT
+    if (isset(MULTIBYTE)) {
+	VARARR(char, mbstr, MB_CUR_MAX);
+	mbstate_t ps;
+
+	mb_metacharinit();
+	memset(&ps, 0, sizeof(ps));
+	while (*str) {
+	    wint_t wc;
+	    int len = mb_metacharlenconv(str, &wc), mod = 0, len2;
+	    /*
+	     * wc is set to WEOF if the start of str couldn't be
+	     * converted.  Presumably WEOF doesn't match iswlower(), but
+	     * better be safe.
+	     */
+	    if (wc == WEOF) {
+		while (len--)
+		    *ptr2++ = *str++;
+		/* not alphanumeric */
+		nextupper = 1;
+		continue;
+	    }
+	    switch (how) {
+	    case CASMOD_LOWER:
+		if (iswupper(wc)) {
+		    wc = towlower(wc);
+		    mod = 1;
+		}
+		break;
 
-    for (; *str; str++)
-	*str = tulower(*str);
-    return 1;
-}
+	    case CASMOD_UPPER:
+		if (iswlower(wc)) {
+		    wc = towupper(wc);
+		    mod = 1;
+		}
+		break;
 
-/**/
-int
-makecapitals(char **junkptr)
-{
-    char *str = *junkptr;
+	    case CASMOD_CAPS:
+	    default:		/* shuts up compiler */
+		if (!iswalnum(wc))
+		    nextupper = 1;
+		else if (nextupper) {
+		    if (iswlower(wc)) {
+			wc = towupper(wc);
+			mod = 1;
+		    }
+		    nextupper = 0;
+		} else if (iswupper(wc)) {
+		    wc = towlower(wc);
+		    mod = 1;
+		}
+		break;
+	    }
+	    if (mod && (len2 = wcrtomb(mbstr, wc, &ps)) > 0) {
+		char *mbptr;
 
-    for (; *str;) {
-	for (; *str && !ialnum(*str); str++);
-	if (*str)
-	    *str = tuupper(*str), str++;
-	for (; *str && ialnum(*str); str++)
-	    *str = tulower(*str);
+		for (mbptr = mbstr; mbptr < mbstr + len2; mbptr++) {
+		    if (imeta(STOUC(*mbptr))) {
+			*ptr2++ = Meta;
+			*ptr2++ = *mbptr ^ 32;
+		    } else
+			*ptr2++ = *mbptr;
+		}
+		str += len;
+	    } else {
+		while (len--)
+		    *ptr2++ = *str++;
+	    }
+	}
     }
-    return 1;
+    else
+#endif
+	while (*str) {
+	    int c;
+	    if (*str == Meta) {
+		c = str[1] ^ 32;
+		str += 2;
+	    } else
+		c = *str++;
+	    switch (how) {
+	    case CASMOD_LOWER:
+		if (isupper(c))
+		    c = tolower(c);
+		break;
+
+	    case CASMOD_UPPER:
+		if (islower(c))
+		    c = toupper(c);
+		break;
+
+	    case CASMOD_CAPS:
+	    default:		/* shuts up compiler */
+		if (!ialnum(c))
+		    nextupper = 1;
+		else if (nextupper) {
+		    if (islower(c))
+			c = toupper(c);
+		    nextupper = 0;
+		} else if (isupper(c))
+		    c = tolower(c);
+		break;
+	    }
+	    if (imeta(c)) {
+		*ptr2++ = Meta;
+		*ptr2++ = c ^ 32;
+	    } else
+		*ptr2++ = c;
+	}
+    *ptr2 = '\0';
+    return str2;
 }
 
 /**/
@@ -1645,26 +1733,6 @@
 }
 
 /**/
-void
-upcase(char **x)
-{
-    char *pp = *(char **)x;
-
-    for (; *pp; pp++)
-	*pp = tuupper(*pp);
-}
-
-/**/
-void
-downcase(char **x)
-{
-    char *pp = *(char **)x;
-
-    for (; *pp; pp++)
-	*pp = tulower(*pp);
-}
-
-/**/
 int
 quote(char **tr)
 {
Index: Src/jobs.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/jobs.c,v
retrieving revision 1.46
diff -u -r1.46 jobs.c
--- Src/jobs.c	30 May 2006 22:35:03 -0000	1.46
+++ Src/jobs.c	28 Jun 2006 12:55:32 -0000
@@ -2014,7 +2014,7 @@
 		    return 1;
 		} else
 		    signame = *argv;
-		makeuppercase(&signame);
+		signame = casemodify(signame, CASMOD_UPPER);
 		if (!strncmp(signame, "SIG", 3))
 		    signame+=3;
 
Index: Src/pattern.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/pattern.c,v
retrieving revision 1.34
diff -u -r1.34 pattern.c
--- Src/pattern.c	31 May 2006 01:02:05 -0000	1.34
+++ Src/pattern.c	28 Jun 2006 12:55:35 -0000
@@ -1644,17 +1644,12 @@
 }
 
 
-#ifndef PARAMETER_CODE_HANDLES_MULTIBYTE
 /*
- * TODO: We should use the other branch, but currently
- * the parameter code doesn't handle multibyte input,
- * so this would produce the wrong subscripts,
- * so just use a raw byte difference for now.
+ * Count the number of characters between two pointers, smaller first
+ *
+ * This is used when setting values in parameters, so we obey
+ * the MULTIBYTE option (even if it's been overridden locally).
  */
-/* Counter the number of characters between two pointers, smaller first */
-# define CHARSUB(x,y)	((y) - (x))
-#else
-/* Counter the number of characters between two pointers, smaller first */
 #define CHARSUB(x,y)	charsub(x, y)
 static ptrdiff_t
 charsub(char *x, char *y)
@@ -1663,6 +1658,9 @@
     size_t ret;
     wchar_t wc;
 
+    if (!isset(MULTIBYTE))
+	return y - x;
+
     while (x < y) {
 	ret = mbrtowc(&wc, x, y-x, &shiftstate);
 
@@ -1674,13 +1672,12 @@
 	/* Treat nulls as normal characters */
 	if (!ret)
 	    ret = 1;
-	res += ret;
+	res++;
 	x += ret;
     }
 
     return res;
 }
-#endif
 
 #else /* no MULTIBYTE_SUPPORT */
 
Index: Src/subst.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/subst.c,v
retrieving revision 1.51
diff -u -r1.51 subst.c
--- Src/subst.c	30 May 2006 22:35:03 -0000	1.51
+++ Src/subst.c	28 Jun 2006 12:55:37 -0000
@@ -1019,7 +1019,7 @@
     /* (u): straightforward. */
     int unique = 0;
     /* combination of (L), (U) and (C) flags. */
-    int casmod = 0;
+    int casmod = CASMOD_NONE;
     /*
      * quotemod says we are doing either (q) (positive), (Q) (negative)
      * or not (0).  quotetype counts the q's for the first case.
@@ -1211,13 +1211,13 @@
 		    break;
 
 		case 'L':
-		    casmod = 2;
+		    casmod = CASMOD_LOWER;
 		    break;
 		case 'U':
-		    casmod = 1;
+		    casmod = CASMOD_UPPER;
 		    break;
 		case 'C':
-		    casmod = 3;
+		    casmod = CASMOD_CAPS;
 		    break;
 
 		case 'o':
@@ -1819,17 +1819,13 @@
 		    break;
 		}
 		switch (v->pm->node.flags & (PM_LOWER | PM_UPPER)) {
-		    char *t;
-
 		case PM_LOWER:
-		    t = val;
-		    for (; (c = *t); t++)
-			*t = tulower(c);
+		    val = casemodify(val, CASMOD_LOWER);
+		    copied = 1;
 		    break;
 		case PM_UPPER:
-		    t = val;
-		    for (; (c = *t); t++)
-			*t = tuupper(c);
+		    val = casemodify(val, CASMOD_UPPER);
+		    copied = 1;
 		    break;
 		}
 	    }
@@ -2316,14 +2312,14 @@
 
 	if (isarr) {
 	    char **ctr;
-	    int sl = sep ? ztrlen(sep) : 1;
+	    int sl = sep ? MB_METASTRLEN(sep) : 1;
 
 	    if (getlen == 1)
 		for (ctr = aval; *ctr; ctr++, len++);
 	    else if (getlen == 2) {
 		if (*aval)
 		    for (len = -sl, ctr = aval;
-			 len += sl + ztrlen(*ctr), *++ctr;);
+			 len += sl + MB_METASTRLEN(*ctr), *++ctr;);
 	    }
 	    else
 		for (ctr = aval;
@@ -2331,7 +2327,7 @@
 		     len += wordcount(*ctr, spsep, getlen > 3), ctr++);
 	} else {
 	    if (getlen < 3)
-		len = ztrlen(val);
+		len = MB_METASTRLEN(val);
 	    else
 		len = wordcount(val, spsep, getlen > 3);
 	}
@@ -2387,33 +2383,19 @@
     /*
      * Perform case modififications.
      */
-    if (casmod) {
+    if (casmod != CASMOD_NONE) {
+	copied = 1;		/* string is always modified by copy */
 	if (isarr) {
-	    char **ap;
+	    char **ap, **ap2;
 
-	    if (!copied)
-		aval = arrdup(aval), copied = 1;
 	    ap = aval;
+	    ap2 = aval = (char **) zhalloc(sizeof(char *) * (arrlen(aval)+1));
 
-	    if (casmod == 1)
-		for (; *ap; ap++)
-		    makeuppercase(ap);
-	    else if (casmod == 2)
-		for (; *ap; ap++)
-		    makelowercase(ap);
-	    else
-		for (; *ap; ap++)
-		    makecapitals(ap);
-
+	    while (*ap)
+		*ap2++ = casemodify(*ap++, casmod);
+	    *ap2++ = NULL;
 	} else {
-	    if (!copied)
-		val = dupstring(val), copied = 1;
-	    if (casmod == 1)
-		makeuppercase(&val);
-	    else if (casmod == 2)
-		makelowercase(&val);
-	    else
-		makecapitals(&val);
+	    val = casemodify(val, casmod);
 	}
     }
     /*
@@ -2975,7 +2957,8 @@
 		for (t = e = *str; (tt = findword(&e, sep));) {
 		    tc = *e;
 		    *e = '\0';
-		    copy = dupstring(tt);
+		    if (c != 'l' && c != 'u')
+			copy = dupstring(tt);
 		    *e = tc;
 		    switch (c) {
 		    case 'h':
@@ -2991,10 +2974,10 @@
 			remlpaths(&copy);
 			break;
 		    case 'l':
-			downcase(&copy);
+			copy = casemodify(tt, CASMOD_LOWER);
 			break;
 		    case 'u':
-			upcase(&copy);
+			copy = casemodify(tt, CASMOD_UPPER);
 			break;
 		    case 's':
 			if (hsubl && hsubr)
@@ -3050,10 +3033,10 @@
 		    remlpaths(str);
 		    break;
 		case 'l':
-		    downcase(str);
+		    *str = casemodify(*str, CASMOD_LOWER);
 		    break;
 		case 'u':
-		    upcase(str);
+		    *str = casemodify(*str, CASMOD_UPPER);
 		    break;
 		case 's':
 		    if (hsubl && hsubr) {
Index: Src/utils.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/utils.c,v
retrieving revision 1.124
diff -u -r1.124 utils.c
--- Src/utils.c	27 Jun 2006 12:00:44 -0000	1.124
+++ Src/utils.c	28 Jun 2006 12:55:41 -0000
@@ -3687,7 +3687,7 @@
 
 /*
  * Initialise multibyte state: called before a sequence of
- * mb_metacharlen().
+ * mb_metacharlenconv().
  */
 
 /**/
@@ -3703,18 +3703,24 @@
  * but character is not valid (e.g. possibly incomplete at end of string).
  * Returned value is guaranteed not to reach beyond the end of the
  * string (assuming correct metafication).
+ *
+ * If wcp is not NULL, the converted wide character is stored there.
+ * If no conversion could be done WEOF is used.
  */
 
 /**/
 int
-mb_metacharlen(char *s)
+mb_metacharlenconv(char *s, wint_t *wcp)
 {
     char inchar, *ptr;
     size_t ret;
     wchar_t wc;
 
-    if (!isset(MULTIBYTE))
+    if (!isset(MULTIBYTE)) {
+	if (wcp)
+	    *wcp = WEOF;
 	return 1 + (*s == Meta);
+    }
 
     ret = MB_INVALID;
     for (ptr = s; *ptr; ) {
@@ -3729,14 +3735,18 @@
 	    break;
 	if (ret == MB_INCOMPLETE)
 	    continue;
+	if (wcp)
+	    *wcp = wc;
 	return ptr - s;
     }
 
+    if (wcp)
+	*wcp = WEOF;
     /* No valid multibyte sequence */
     memset(&mb_shiftstate, 0, sizeof(mb_shiftstate));
-    if (ptr > s)
+    if (ptr > s) {
 	return 1 + (*s == Meta);	/* Treat as single byte character */
-    else
+    } else
 	return 0;		/* Probably shouldn't happen */
 }
 
Index: Src/zsh.h
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/zsh.h,v
retrieving revision 1.90
diff -u -r1.90 zsh.h
--- Src/zsh.h	26 Jun 2006 18:17:32 -0000	1.90
+++ Src/zsh.h	28 Jun 2006 12:55:44 -0000
@@ -1882,6 +1882,17 @@
 #define ZSIG_ALIAS	(1<<3)  /* Trap is stored under an alias */
 #define ZSIG_SHIFT	4
 
+/***********************/
+/* Flags to casemodify */
+/***********************/
+
+enum {
+    CASMOD_NONE,		/* dummy for tests */
+    CASMOD_UPPER,
+    CASMOD_LOWER,
+    CASMOD_CAPS
+};
+
 /**********************************/
 /* Flags to third argument of zle */
 /**********************************/
@@ -1927,7 +1938,7 @@
 #ifdef MULTIBYTE_SUPPORT
 #define nicezputs(str, outs)	(void)mb_niceformat((str), (outs), NULL, 0)
 #define MB_METACHARINIT()	mb_metacharinit()
-#define MB_METACHARLEN(str)	mb_metacharlen(str)
+#define MB_METACHARLEN(str)	mb_metacharlenconv(str, NULL)
 #define MB_METASTRLEN(str)	mb_metastrlen(str)
 
 #define MB_INCOMPLETE	((size_t)-2)
Index: Test/D07multibyte.ztst
===================================================================
RCS file: /cvsroot/zsh/zsh/Test/D07multibyte.ztst,v
retrieving revision 1.2
diff -u -r1.2 D07multibyte.ztst
--- Test/D07multibyte.ztst	27 Jun 2006 16:28:46 -0000	1.2
+++ Test/D07multibyte.ztst	28 Jun 2006 12:55:44 -0000
@@ -121,3 +121,37 @@
 
 # Starting offsets with (R) seem to be so strange as to be hardly
 # worth testing.
+
+  setopt extendedglob
+  [[ $a = (#b)t(én)(éb)reux ]] || print "Failed to match." >&2
+  for i in {1..${#match}}; do
+    print $match[i] $mbegin[i] $mend[i] ${a[$mbegin[i],$mend[i]]}
+  done
+0:Multibyte offsets in pattern tests
+>én 2 3 én
+>éb 4 5 éb
+
+  b=${(U)a}
+  print $b
+  print ${(L)b}
+  desdichado="Je suis le $a, le veuf, l'inconsolé"
+  print ${(C)desdichado}
+  lxiv="l'état c'est moi"
+  print ${(C)lxiv}
+0:Case modification of multibyte strings
+>TÉNÉBREUX
+>ténébreux
+>Je Suis Le Ténébreux, Le Veuf, L'Inconsolé
+>L'État C'Est Moi
+
+  array=(ølaf ødd øpened án encyclopædia)
+  barray=(${(U)array})
+  print $barray
+  print ${(L)barray}
+  print ${(C)array}
+  print ${(C)barray}
+0:Case modification of arrays with multibyte strings
+>ØLAF ØDD ØPENED ÁN ENCYCLOPÆDIA
+>ølaf ødd øpened án encyclopædia
+>Ølaf Ødd Øpened Án Encyclopædia
+>Ølaf Ødd Øpened Án Encyclopædia

-- 
Peter Stephenson <pws@xxxxxxx>                  Software Engineer
CSR PLC, Churchill House, Cambridge Business Park, Cowley Road
Cambridge, CB4 0WZ, UK                          Tel: +44 (0)1223 692070
Messages sorted by: Reverse Date, Date, Thread, Author