Zsh Mailing List Archive
Messages sorted by:
Reverse Date,
Date,
Thread,
Author
PATCH: multibyte parameter lengths & case
- X-seq: zsh-workers 22525
- From: Peter Stephenson <pws@xxxxxxx>
- To: zsh-workers@xxxxxxxxxx (Zsh hackers list)
- Subject: PATCH: multibyte parameter lengths & case
- Date: Wed, 28 Jun 2006 14:01:24 +0100
- Mailing-list: contact zsh-workers-help@xxxxxxxxxx; run by ezmlm
Some simpler things:
- lengths of strings using ${#...}
- upper and lower case modification and capitalisation of parameters
- also in history modifiers
- $mbegin/$mend and $MBEGIN/$MEND respect the setting of MULTIBYTE
It's still not finished but at this point it should be possible to use
parameters with multibyte strings effectively in completion functions.
I've therefore added the multibyte option to the set that are turned on
when entering the completion system. This should make, for example,
${#LBUFFER} consistent with $CURSOR when there are multibyte characters
present.
At some point we need to handle character widths better, otherwise the
parameter padding options aren't very useful with outsize characters.
Index: Completion/compinit
===================================================================
RCS file: /cvsroot/zsh/zsh/Completion/compinit,v
retrieving revision 1.15
diff -u -r1.15 compinit
--- Completion/compinit 17 Oct 2005 14:56:17 -0000 1.15
+++ Completion/compinit 28 Jun 2006 12:55:28 -0000
@@ -128,11 +128,12 @@
# The standard options set in completion functions.
_comp_options=(
- glob
+ extendedglob
bareglobqual
+ glob
+ multibyte
nullglob
rcexpandparam
- extendedglob
unset
NO_markdirs
NO_globsubst
Index: Src/hist.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/hist.c,v
retrieving revision 1.64
diff -u -r1.64 hist.c
--- Src/hist.c 30 May 2006 22:35:03 -0000 1.64
+++ Src/hist.c 28 Jun 2006 12:55:30 -0000
@@ -635,10 +635,10 @@
quotebreak(&sline);
break;
case 'l':
- downcase(&sline);
+ sline = casemodify(sline, CASMOD_LOWER);
break;
case 'u':
- upcase(&sline);
+ sline = casemodify(sline, CASMOD_UPPER);
break;
default:
herrflush();
@@ -1503,42 +1503,130 @@
return 0;
}
-/**/
-int
-makeuppercase(char **junkptr)
-{
- char *str = *junkptr;
-
- for (; *str; str++)
- *str = tuupper(*str);
- return 1;
-}
+/*
+ * Return modified version of str from the heap with modification
+ * according to one of the CASMOD_* types defined in zsh.h; CASMOD_NONE
+ * is not handled, for obvious reasons.
+ */
/**/
-int
-makelowercase(char **junkptr)
+char *
+casemodify(char *str, int how)
{
- char *str = *junkptr;
+ char *str2 = zhalloc(2 * strlen(str) + 1);
+ char *ptr2 = str2;
+ int nextupper = 1;
+
+#ifdef MULTIBYTE_SUPPORT
+ if (isset(MULTIBYTE)) {
+ VARARR(char, mbstr, MB_CUR_MAX);
+ mbstate_t ps;
+
+ mb_metacharinit();
+ memset(&ps, 0, sizeof(ps));
+ while (*str) {
+ wint_t wc;
+ int len = mb_metacharlenconv(str, &wc), mod = 0, len2;
+ /*
+ * wc is set to WEOF if the start of str couldn't be
+ * converted. Presumably WEOF doesn't match iswlower(), but
+ * better be safe.
+ */
+ if (wc == WEOF) {
+ while (len--)
+ *ptr2++ = *str++;
+ /* not alphanumeric */
+ nextupper = 1;
+ continue;
+ }
+ switch (how) {
+ case CASMOD_LOWER:
+ if (iswupper(wc)) {
+ wc = towlower(wc);
+ mod = 1;
+ }
+ break;
- for (; *str; str++)
- *str = tulower(*str);
- return 1;
-}
+ case CASMOD_UPPER:
+ if (iswlower(wc)) {
+ wc = towupper(wc);
+ mod = 1;
+ }
+ break;
-/**/
-int
-makecapitals(char **junkptr)
-{
- char *str = *junkptr;
+ case CASMOD_CAPS:
+ default: /* shuts up compiler */
+ if (!iswalnum(wc))
+ nextupper = 1;
+ else if (nextupper) {
+ if (iswlower(wc)) {
+ wc = towupper(wc);
+ mod = 1;
+ }
+ nextupper = 0;
+ } else if (iswupper(wc)) {
+ wc = towlower(wc);
+ mod = 1;
+ }
+ break;
+ }
+ if (mod && (len2 = wcrtomb(mbstr, wc, &ps)) > 0) {
+ char *mbptr;
- for (; *str;) {
- for (; *str && !ialnum(*str); str++);
- if (*str)
- *str = tuupper(*str), str++;
- for (; *str && ialnum(*str); str++)
- *str = tulower(*str);
+ for (mbptr = mbstr; mbptr < mbstr + len2; mbptr++) {
+ if (imeta(STOUC(*mbptr))) {
+ *ptr2++ = Meta;
+ *ptr2++ = *mbptr ^ 32;
+ } else
+ *ptr2++ = *mbptr;
+ }
+ str += len;
+ } else {
+ while (len--)
+ *ptr2++ = *str++;
+ }
+ }
}
- return 1;
+ else
+#endif
+ while (*str) {
+ int c;
+ if (*str == Meta) {
+ c = str[1] ^ 32;
+ str += 2;
+ } else
+ c = *str++;
+ switch (how) {
+ case CASMOD_LOWER:
+ if (isupper(c))
+ c = tolower(c);
+ break;
+
+ case CASMOD_UPPER:
+ if (islower(c))
+ c = toupper(c);
+ break;
+
+ case CASMOD_CAPS:
+ default: /* shuts up compiler */
+ if (!ialnum(c))
+ nextupper = 1;
+ else if (nextupper) {
+ if (islower(c))
+ c = toupper(c);
+ nextupper = 0;
+ } else if (isupper(c))
+ c = tolower(c);
+ break;
+ }
+ if (imeta(c)) {
+ *ptr2++ = Meta;
+ *ptr2++ = c ^ 32;
+ } else
+ *ptr2++ = c;
+ }
+ *ptr2 = '\0';
+ return str2;
}
/**/
@@ -1645,26 +1733,6 @@
}
/**/
-void
-upcase(char **x)
-{
- char *pp = *(char **)x;
-
- for (; *pp; pp++)
- *pp = tuupper(*pp);
-}
-
-/**/
-void
-downcase(char **x)
-{
- char *pp = *(char **)x;
-
- for (; *pp; pp++)
- *pp = tulower(*pp);
-}
-
-/**/
int
quote(char **tr)
{
Index: Src/jobs.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/jobs.c,v
retrieving revision 1.46
diff -u -r1.46 jobs.c
--- Src/jobs.c 30 May 2006 22:35:03 -0000 1.46
+++ Src/jobs.c 28 Jun 2006 12:55:32 -0000
@@ -2014,7 +2014,7 @@
return 1;
} else
signame = *argv;
- makeuppercase(&signame);
+ signame = casemodify(signame, CASMOD_UPPER);
if (!strncmp(signame, "SIG", 3))
signame+=3;
Index: Src/pattern.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/pattern.c,v
retrieving revision 1.34
diff -u -r1.34 pattern.c
--- Src/pattern.c 31 May 2006 01:02:05 -0000 1.34
+++ Src/pattern.c 28 Jun 2006 12:55:35 -0000
@@ -1644,17 +1644,12 @@
}
-#ifndef PARAMETER_CODE_HANDLES_MULTIBYTE
/*
- * TODO: We should use the other branch, but currently
- * the parameter code doesn't handle multibyte input,
- * so this would produce the wrong subscripts,
- * so just use a raw byte difference for now.
+ * Counter the number of characters between two pointers, smaller first
+ *
+ * This is used when setting values in parameters, so we obey
+ * the MULTIBYTE option (even if it's been overridden locally).
*/
-/* Counter the number of characters between two pointers, smaller first */
-# define CHARSUB(x,y) ((y) - (x))
-#else
-/* Counter the number of characters between two pointers, smaller first */
#define CHARSUB(x,y) charsub(x, y)
static ptrdiff_t
charsub(char *x, char *y)
@@ -1663,6 +1658,9 @@
size_t ret;
wchar_t wc;
+ if (!isset(MULTIBYTE))
+ return y - x;
+
while (x < y) {
ret = mbrtowc(&wc, x, y-x, &shiftstate);
@@ -1674,13 +1672,12 @@
/* Treat nulls as normal characters */
if (!ret)
ret = 1;
- res += ret;
+ res++;
x += ret;
}
return res;
}
-#endif
#else /* no MULTIBYTE_SUPPORT */
Index: Src/subst.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/subst.c,v
retrieving revision 1.51
diff -u -r1.51 subst.c
--- Src/subst.c 30 May 2006 22:35:03 -0000 1.51
+++ Src/subst.c 28 Jun 2006 12:55:37 -0000
@@ -1019,7 +1019,7 @@
/* (u): straightforward. */
int unique = 0;
/* combination of (L), (U) and (C) flags. */
- int casmod = 0;
+ int casmod = CASMOD_NONE;
/*
* quotemod says we are doing either (q) (positive), (Q) (negative)
* or not (0). quotetype counts the q's for the first case.
@@ -1211,13 +1211,13 @@
break;
case 'L':
- casmod = 2;
+ casmod = CASMOD_LOWER;
break;
case 'U':
- casmod = 1;
+ casmod = CASMOD_UPPER;
break;
case 'C':
- casmod = 3;
+ casmod = CASMOD_CAPS;
break;
case 'o':
@@ -1819,17 +1819,13 @@
break;
}
switch (v->pm->node.flags & (PM_LOWER | PM_UPPER)) {
- char *t;
-
case PM_LOWER:
- t = val;
- for (; (c = *t); t++)
- *t = tulower(c);
+ val = casemodify(val, CASMOD_LOWER);
+ copied = 1;
break;
case PM_UPPER:
- t = val;
- for (; (c = *t); t++)
- *t = tuupper(c);
+ val = casemodify(val, CASMOD_UPPER);
+ copied = 1;
break;
}
}
@@ -2316,14 +2312,14 @@
if (isarr) {
char **ctr;
- int sl = sep ? ztrlen(sep) : 1;
+ int sl = sep ? MB_METASTRLEN(sep) : 1;
if (getlen == 1)
for (ctr = aval; *ctr; ctr++, len++);
else if (getlen == 2) {
if (*aval)
for (len = -sl, ctr = aval;
- len += sl + ztrlen(*ctr), *++ctr;);
+ len += sl + MB_METASTRLEN(*ctr), *++ctr;);
}
else
for (ctr = aval;
@@ -2331,7 +2327,7 @@
len += wordcount(*ctr, spsep, getlen > 3), ctr++);
} else {
if (getlen < 3)
- len = ztrlen(val);
+ len = MB_METASTRLEN(val);
else
len = wordcount(val, spsep, getlen > 3);
}
@@ -2387,33 +2383,19 @@
/*
* Perform case modififications.
*/
- if (casmod) {
+ if (casmod != CASMOD_NONE) {
+ copied = 1; /* string is always modified by copy */
if (isarr) {
- char **ap;
+ char **ap, **ap2;
- if (!copied)
- aval = arrdup(aval), copied = 1;
ap = aval;
+ ap2 = aval = (char **) zhalloc(sizeof(char *) * (arrlen(aval)+1));
- if (casmod == 1)
- for (; *ap; ap++)
- makeuppercase(ap);
- else if (casmod == 2)
- for (; *ap; ap++)
- makelowercase(ap);
- else
- for (; *ap; ap++)
- makecapitals(ap);
-
+ while (*ap)
+ *ap2++ = casemodify(*ap++, casmod);
+ *ap2++ = NULL;
} else {
- if (!copied)
- val = dupstring(val), copied = 1;
- if (casmod == 1)
- makeuppercase(&val);
- else if (casmod == 2)
- makelowercase(&val);
- else
- makecapitals(&val);
+ val = casemodify(val, casmod);
}
}
/*
@@ -2975,7 +2957,8 @@
for (t = e = *str; (tt = findword(&e, sep));) {
tc = *e;
*e = '\0';
- copy = dupstring(tt);
+ if (c != 'l' && c != 'u')
+ copy = dupstring(tt);
*e = tc;
switch (c) {
case 'h':
@@ -2991,10 +2974,10 @@
remlpaths(&copy);
break;
case 'l':
- downcase(&copy);
+ copy = casemodify(tt, CASMOD_LOWER);
break;
case 'u':
- upcase(&copy);
+ copy = casemodify(tt, CASMOD_UPPER);
break;
case 's':
if (hsubl && hsubr)
@@ -3050,10 +3033,10 @@
remlpaths(str);
break;
case 'l':
- downcase(str);
+ *str = casemodify(*str, CASMOD_LOWER);
break;
case 'u':
- upcase(str);
+ *str = casemodify(*str, CASMOD_UPPER);
break;
case 's':
if (hsubl && hsubr) {
Index: Src/utils.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/utils.c,v
retrieving revision 1.124
diff -u -r1.124 utils.c
--- Src/utils.c 27 Jun 2006 12:00:44 -0000 1.124
+++ Src/utils.c 28 Jun 2006 12:55:41 -0000
@@ -3687,7 +3687,7 @@
/*
* Initialise multibyte state: called before a sequence of
- * mb_metacharlen().
+ * mb_metacharlenconv().
*/
/**/
@@ -3703,18 +3703,24 @@
* but character is not valid (e.g. possibly incomplete at end of string).
* Returned value is guaranteed not to reach beyond the end of the
* string (assuming correct metafication).
+ *
+ * If wcp is not NULL, the converted wide character is stored there.
+ * If no conversion could be done WEOF is used.
*/
/**/
int
-mb_metacharlen(char *s)
+mb_metacharlenconv(char *s, wint_t *wcp)
{
char inchar, *ptr;
size_t ret;
wchar_t wc;
- if (!isset(MULTIBYTE))
+ if (!isset(MULTIBYTE)) {
+ if (wcp)
+ *wcp = WEOF;
return 1 + (*s == Meta);
+ }
ret = MB_INVALID;
for (ptr = s; *ptr; ) {
@@ -3729,14 +3735,18 @@
break;
if (ret == MB_INCOMPLETE)
continue;
+ if (wcp)
+ *wcp = wc;
return ptr - s;
}
+ if (wcp)
+ *wcp = WEOF;
/* No valid multibyte sequence */
memset(&mb_shiftstate, 0, sizeof(mb_shiftstate));
- if (ptr > s)
+ if (ptr > s) {
return 1 + (*s == Meta); /* Treat as single byte character */
- else
+ } else
return 0; /* Probably shouldn't happen */
}
Index: Src/zsh.h
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/zsh.h,v
retrieving revision 1.90
diff -u -r1.90 zsh.h
--- Src/zsh.h 26 Jun 2006 18:17:32 -0000 1.90
+++ Src/zsh.h 28 Jun 2006 12:55:44 -0000
@@ -1882,6 +1882,17 @@
#define ZSIG_ALIAS (1<<3) /* Trap is stored under an alias */
#define ZSIG_SHIFT 4
+/************************/
+/* Flags to casemodifiy */
+/************************/
+
+enum {
+ CASMOD_NONE, /* dummy for tests */
+ CASMOD_UPPER,
+ CASMOD_LOWER,
+ CASMOD_CAPS
+};
+
/**********************************/
/* Flags to third argument of zle */
/**********************************/
@@ -1927,7 +1938,7 @@
#ifdef MULTIBYTE_SUPPORT
#define nicezputs(str, outs) (void)mb_niceformat((str), (outs), NULL, 0)
#define MB_METACHARINIT() mb_metacharinit()
-#define MB_METACHARLEN(str) mb_metacharlen(str)
+#define MB_METACHARLEN(str) mb_metacharlenconv(str, NULL)
#define MB_METASTRLEN(str) mb_metastrlen(str)
#define MB_INCOMPLETE ((size_t)-2)
Index: Test/D07multibyte.ztst
===================================================================
RCS file: /cvsroot/zsh/zsh/Test/D07multibyte.ztst,v
retrieving revision 1.2
diff -u -r1.2 D07multibyte.ztst
--- Test/D07multibyte.ztst 27 Jun 2006 16:28:46 -0000 1.2
+++ Test/D07multibyte.ztst 28 Jun 2006 12:55:44 -0000
@@ -121,3 +121,37 @@
# Starting offsets with (R) seem to be so strange as to be hardly
# worth testing.
+
+ setopt extendedglob
+ [[ $a = (#b)t(én)(éb)reux ]] || print "Failed to match." >&2
+ for i in {1..${#match}}; do
+ print $match[i] $mbegin[i] $mend[i] ${a[$mbegin[i],$mend[i]]}
+ done
+0:Multibyte offsets in pattern tests
+>én 2 3 én
+>éb 4 5 éb
+
+ b=${(U)a}
+ print $b
+ print ${(L)b}
+ desdichado="Je suis le $a, le veuf, l'inconsolé"
+ print ${(C)desdichado}
+ lxiv="l'état c'est moi"
+ print ${(C)lxiv}
+0:Case modification of multibyte strings
>TÉNÉBREUX
+>ténébreux
+>Je Suis Le Ténébreux, Le Veuf, L'Inconsolé
>L'État C'Est Moi
+
+ array=(ølaf ødd øpened án encyclopædia)
+ barray=(${(U)array})
+ print $barray
+ print ${(L)barray}
+ print ${(C)array}
+ print ${(C)barray}
+0:Case modification of arrays with multibyte strings
>ØLAF ØDD ØPENED ÁN ENCYCLOPÆDIA
+>ølaf ødd øpened án encyclopædia
>Ølaf Ødd Øpened Án Encyclopædia
>Ølaf Ødd Øpened Án Encyclopædia
--
Peter Stephenson <pws@xxxxxxx> Software Engineer
CSR PLC, Churchill House, Cambridge Business Park, Cowley Road
Cambridge, CB4 0WZ, UK Tel: +44 (0)1223 692070
Messages sorted by:
Reverse Date,
Date,
Thread,
Author