Zsh Mailing List Archive
Messages sorted by: Reverse Date, Date, Thread, Author

PATCH: multibyte parameter subscript handling



This is my first attempt to make parameter subscripts work in multibyte
mode.  You need to have MULTIBYTE_SUPPORT compiled in, a sensible
locale, and the option MULTIBYTE turned on.  The following should now
work, where X is a multibyte character:

a=aXbXc
print ${a[2,-2]}  # prints XbX
print ${a[(i)X]}  # prints 2
print ${a[(I)X]}  # prints 4

Note that it addresses nothing else, just the numbers in expressions in
square brackets after parameters, including the numbers returned by
reverse subscripting.  It doesn't handle other forms of parameter
substitution involving multibyte characters (${#a}, ${(L)a} don't work
yet) and it leaves a few things inconsistent, for example I haven't yet
fixed up the fact that $MBEGIN/$MEND and $mbegin/$mend values may now be
incorrect.  Consequently it's of limited practical value.  Nonetheless,
it should work as far as it goes, so please try it and report problems.
I will commit this immediately since it's the only way of teasing out
problems.

Unlike for patterns, there are no flags to turn on or off multibyte
handling, overriding the MULTIBYTE option.  I've come to the conclusion
this is just too confusing.  For example, in

print ${#${${(L)a[2,3]}##pattern}}

there are four different points at which the parameter might or might
not be in multibyte mode
- the pattern
- the subscript
- the lower casing (yes, it's pointless here)
- the character counting.
Forcing people to use the option therefore seems easier.  If there are
problems for which more control is needed we can deal with them later.

Even by the standards of being confused by zsh's internals, this
confused me a lot.  The problem is that there are all sorts of
adjustments of the subscripts by one under obscure conditions, and it's
hard to tell which are to account for the fact that subscripting
sometimes does require offsetting by one, and which are to move by a
single character (which may now be multiple bytes).  In principle the
presence of tests for Meta should have indicated which, but apparently this
wasn't always the case.  The most confusing of the lot is the adjustment
by startprevlen at line 1480 of params.c; I had to dig the character
length out from getarg() and pass it up and apparently there was no
handling of Meta characters at this point before.  So I'm still a bit
suspicious.  However, it apparently works.  If anybody understands
what's going on enough to simplify this then please do so.

All the tests still pass, however they are all with the MULTIBYTE option
off and a simple locale, so that doesn't prove very much.  (I rewrote
one test because when it failed the output was obscure; the original
form should actually still work although explicitly turning off
multibyte mode is better for future proofing.)  I tried indexing an
expression with accented characters until I was happy with it.  We need
a set of multibyte tests, which can be done by attempting to find a
UTF-8 locale using a set of likely choices and doing a simple test to
see if a multibyte character matches as a single character.

Index: Src/params.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/params.c,v
retrieving revision 1.114
diff -u -r1.114 params.c
--- Src/params.c	5 Jun 2006 13:21:56 -0000	1.114
+++ Src/params.c	26 Jun 2006 17:51:02 -0000
@@ -918,9 +918,33 @@
     return !ss[1];
 }
 
+/*
+ * Parse a single argument to a parameter subscript.
+ * The subscript starts at *str; *str is updated (input/output)
+ *
+ * *inv is set to indicate if the subscript is reversed (output)
+ * v is the Value for the parameter being accessed (input; note
+ *  v->isarr may be modified, and if v is a hash the parameter will
+ *  be updated to the element of the hash)
+ * a2 is 1 if this is the second subscript of a range (input)
+ * *w is only set if we need to find the end of a word (input; should
+ *  be set to 0 by the caller).
+ *
+ * The final two arguments are to support multibyte characters.
+ * If supplied they are set to the length of the character before
+ * the index position and the one at the index position.  If
+ * multibyte characters are not in use they are set to 1 for
+ * consistency.
+ *
+ * Returns a raw offset into the value from the start or end (i.e.
+ * after the arithmetic for Meta and possible multibyte characters has
+ * been taken into account).
+ */
+
 /**/
 static zlong
-getarg(char **str, int *inv, Value v, int a2, zlong *w)
+getarg(char **str, int *inv, Value v, int a2, zlong *w,
+       int *prevcharlen, int *nextcharlen)
 {
     int hasbeg = 0, word = 0, rev = 0, ind = 0, down = 0, l, i, ishash;
     int keymatch = 0, needtok = 0;
@@ -929,6 +953,10 @@
     Patprog pprog = NULL;
 
     ishash = (v->pm && PM_TYPE(v->pm->node.flags) == PM_HASHED);
+    if (prevcharlen)
+	*prevcharlen = 1;
+    if (nextcharlen)
+	*nextcharlen = 1;
 
     /* first parse any subscription flags */
     if (v->pm && (*s == '(' || *s == Inpar)) {
@@ -1133,17 +1161,43 @@
 
 	    return (a2 ? s : d + 1) - t;
 	} else if (!v->isarr && !word) {
+	    int lastcharlen = 1;
 	    s = getstrvalue(v);
+	    /*
+	     * Note for the confused (= pws):  the index r we
+	     * have so far is that specified by the user.  The value
+	     * passed back is an offset from the start or end of
+	     * the string.  Hence it needs correcting at least
+	     * for Meta characters and maybe for multibyte characters.
+	     */
 	    if (r > 0) {
-		for (t = s + r - 1; *s && s < t;)
-		    if (*s++ == Meta)
-			s++, t++, r++;
+		zlong nchars = r;
+
+		MB_METACHARINIT();
+		for (t = s; nchars && *t; nchars--)
+		    t += (lastcharlen = MB_METACHARLEN(t));
+		/* for consistency, keep any remainder off the end */
+		r = (zlong)(t - s) + nchars;
+		if (prevcharlen)
+		    *prevcharlen = lastcharlen;
+		if (nextcharlen && *t)
+		    *nextcharlen = MB_METACHARLEN(t);
 	    } else {
-		r += ztrlen(s);
-		for (t = s + r; *s && s < t; r--)
-		    if (*s++ == Meta)
-			t++, r++;
-		r -= strlen(s);
+		zlong nchars = (zlong)MB_METASTRLEN(s) + r;
+
+		if (nchars < 0) {
+		    /* invalid but keep index anyway */
+		    r = nchars;
+		} else {
+		    MB_METACHARINIT();
+		    for (t = s; nchars && *t; nchars--)
+			t += (lastcharlen = MB_METACHARLEN(t));
+		    r = - (zlong)strlen(t); /* keep negative */
+		    if (prevcharlen)
+			*prevcharlen = lastcharlen;
+		    if (nextcharlen && *t)
+			*nextcharlen = MB_METACHARLEN(t);
+		}
 	    }
 	}
     } else {
@@ -1338,19 +1392,57 @@
 	s += 2;
     } else {
 	zlong we = 0, dummy;
+	int startprevlen, startnextlen;
 
-	start = getarg(&s, &inv, v, 0, &we);
+	start = getarg(&s, &inv, v, 0, &we, &startprevlen, &startnextlen);
 
 	if (inv) {
 	    if (!v->isarr && start != 0) {
 		char *t, *p;
 		t = getstrvalue(v);
+		/*
+		 * Note for the confused (= pws): this is an inverse
+		 * offset so at this stage we need to convert from
+		 * the immediate offset into the value that we have
+		 * into a logical character position.
+		 */
 		if (start > 0) {
-		    for (p = t + start - 1; p-- > t; )
-			if (*p == Meta)
-			    start--;
-		} else
-		    start = -ztrlen(t + start + strlen(t));
+		    int nstart = 0;
+		    char *target = t + start - startprevlen;
+
+		    p = t;
+		    MB_METACHARINIT();
+		    while (*p) {
+			/*
+			 * move up characters, counting how many we
+			 * found
+			 */
+			p += MB_METACHARLEN(p);
+			if (p < target)
+			    nstart++;
+			else {
+			    if (p == target)
+				nstart++;
+			    else
+				p = target; /* pretend we hit exactly */
+			    break;
+			}
+		    }
+		    /* if start was too big, keep the difference */
+		    start = nstart + (target - p) + startprevlen;
+		} else {
+		    zlong startoff = start + strlen(t);
+		    if (startoff < 0) {
+			/* invalid: keep index but don't dereference */
+			start = startoff;
+		    } else {
+			/* find start in full characters */
+			MB_METACHARINIT();
+			for (p = t; p < t + startoff;)
+			    p += MB_METACHARLEN(p);
+			start = - MB_METASTRLEN(p);
+		    }
+		}
 	    }
 	    if (start > 0 && (isset(KSHARRAYS) || (v->pm->node.flags & PM_HASHED)))
 		start--;
@@ -1373,15 +1465,21 @@
 
 	    if ((com = (*s == ','))) {
 		s++;
-		end = getarg(&s, &inv, v, 1, &dummy);
+		end = getarg(&s, &inv, v, 1, &dummy, NULL, NULL);
 	    } else {
 		end = we ? we : start;
 	    }
-	    if (start != end) com = 1;
+	    if (start != end)
+		com = 1;
+	    /*
+	     * Somehow the logic sometimes forces us to use the previous
+	     * or next character to what we would expect, which is
+	     * why we had to calculate them in getarg().
+	     */
 	    if (start > 0)
-		start--;
+		start -= startprevlen;
 	    else if (start == 0 && end == 0)
-		end++;
+		end = startnextlen;
 	    if (s == tbrack) {
 		s++;
 		if (v->isarr && !com &&
@@ -1578,13 +1676,19 @@
 	if (v->start < 0)
 	    v->start = 0;
     }
-    if (v->end < 0)
-	v->end += strlen(s) + 1;
+    if (v->end < 0) {
+	v->end += strlen(s);
+	if (v->end >= 0) {
+	    char *eptr = s + v->end;
+	    if (*eptr)
+		v->end += MB_METACHARLEN(eptr);
+	}
+    }
     s = (v->start > (int)strlen(s)) ? dupstring("") : dupstring(s + v->start);
     if (v->end <= v->start)
 	s[0] = '\0';
     else if (v->end - v->start <= (int)strlen(s))
-	s[v->end - v->start + (s[v->end - v->start - 1] == Meta)] = '\0';
+	s[v->end - v->start] = '\0';
 
     return s;
 }
@@ -2791,7 +2895,7 @@
 tiedarrgetfn(Param pm)
 {
     struct tieddata *dptr = (struct tieddata *)pm->u.data;
-    return *dptr->arrptr ? zjoin(*dptr->arrptr, dptr->joinchar, 1) : "";
+    return *dptr->arrptr ? zjoin(*dptr->arrptr, STOUC(dptr->joinchar), 1) : "";
 }
 
 /**/
@@ -3463,7 +3567,7 @@
 	return;
 
     if (pm->node.flags & PM_TIED)
-	joinchar = ((struct tieddata *)pm->u.data)->joinchar;
+	joinchar = STOUC(((struct tieddata *)pm->u.data)->joinchar);
     else
 	joinchar = ':';
 
Index: Src/utils.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/utils.c,v
retrieving revision 1.122
diff -u -r1.122 utils.c
--- Src/utils.c	5 Jun 2006 16:55:38 -0000	1.122
+++ Src/utils.c	26 Jun 2006 17:51:03 -0000
@@ -3683,6 +3683,112 @@
     return width;
 }
 
+static mbstate_t mb_shiftstate;
+
+/*
+ * Initialise multibyte state: called before a sequence of
+ * mb_metacharlen().
+ */
+
+/**/
+void
+mb_metacharinit(void)
+{
+    memset(&mb_shiftstate, 0, sizeof(mb_shiftstate));
+}
+
+/*
+ * Length of metafied string s which contains the next multibyte
+ * character; single (possibly metafied) character if string is not null
+ * but character is not valid (e.g. possibly incomplete at end of string).
+ * Returned value is guaranteed not to reach beyond the end of the
+ * string (assuming correct metafication).
+ */
+
+/**/
+int
+mb_metacharlen(char *s)
+{
+    char inchar, *ptr;
+    size_t ret;
+    wchar_t wc;
+
+    if (!isset(MULTIBYTE))
+	return 1 + (*s == Meta);
+
+    ret = MB_INVALID;
+    for (ptr = s; *ptr; ) {
+	if (*ptr == Meta)
+	    inchar = *++ptr ^ 32;
+	else
+	    inchar = *ptr;
+	ptr++;
+	ret = mbrtowc(&wc, &inchar, 1, &mb_shiftstate);
+
+	if (ret == MB_INVALID)
+	    break;
+	if (ret == MB_INCOMPLETE)
+	    continue;
+	return ptr - s;
+    }
+
+    /* No valid multibyte sequence */
+    memset(&mb_shiftstate, 0, sizeof(mb_shiftstate));
+    if (ptr > s)
+	return 1 + (*s == Meta);	/* Treat as single byte character */
+    else
+	return 0;		/* Probably shouldn't happen */
+}
+
+/*
+ * Total number of multibyte characters in metafied string s.
+ * Same answer as iterating mb_metacharlen() and counting calls
+ * until end of string.
+ */
+
+/**/
+int
+mb_metastrlen(char *ptr)
+{
+    char inchar, *laststart;
+    size_t ret;
+    wchar_t wc;
+    int num, num_in_char;
+
+    if (!isset(MULTIBYTE))
+	return ztrlen(ptr);
+
+    laststart = ptr;
+    ret = MB_INVALID;
+    num = num_in_char = 0;
+
+    memset(&mb_shiftstate, 0, sizeof(mb_shiftstate));
+    while (*ptr) {
+	if (*ptr == Meta)
+	    inchar = *++ptr ^ 32;
+	else
+	    inchar = *ptr;
+	ptr++;
+	ret = mbrtowc(&wc, &inchar, 1, &mb_shiftstate);
+
+	if (ret == MB_INCOMPLETE) {
+	    num_in_char++;
+	} else {
+	    if (ret == MB_INVALID) {
+		/* Reset, treat as single character */
+		memset(&mb_shiftstate, 0, sizeof(mb_shiftstate));
+		ptr = laststart + (*laststart == Meta) + 1;
+	    } else
+		laststart = ptr;
+	    num++;
+	    num_in_char = 0;
+	}
+    }
+
+    /* If incomplete, treat remainder as trailing single bytes */
+    return num + num_in_char;
+}
+
 /**/
 #endif /* MULTIBYTE_SUPPORT */
 
Index: Src/zsh.h
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/zsh.h,v
retrieving revision 1.89
diff -u -r1.89 zsh.h
--- Src/zsh.h	19 Apr 2006 16:09:07 -0000	1.89
+++ Src/zsh.h	26 Jun 2006 17:51:03 -0000
@@ -1926,6 +1926,9 @@
 
 #ifdef MULTIBYTE_SUPPORT
 #define nicezputs(str, outs)	(void)mb_niceformat((str), (outs), NULL, 0)
+#define MB_METACHARINIT()	mb_metacharinit()
+#define MB_METACHARLEN(str)	mb_metacharlen(str)
+#define MB_METASTRLEN(str)	mb_metastrlen(str)
 
 #define MB_INCOMPLETE	((size_t)-2)
 #define MB_INVALID	((size_t)-1)
@@ -1946,6 +1949,9 @@
 #define ZWS(s)	L ## s
 
 #else
+#define MB_METACHARINIT()
+#define MB_METACHARLEN(str)	(*(str) == Meta ? 2 : 1)
+#define MB_METASTRLEN(str)	ztrlen(str)
 
 /* Leave character or string as is. */
 #define ZWC(c)	c
Index: Test/B02typeset.ztst
===================================================================
RCS file: /cvsroot/zsh/zsh/Test/B02typeset.ztst,v
retrieving revision 1.11
diff -u -r1.11 B02typeset.ztst
--- Test/B02typeset.ztst	11 Aug 2005 16:25:10 -0000	1.11
+++ Test/B02typeset.ztst	26 Jun 2006 17:51:03 -0000
@@ -182,13 +182,26 @@
 >l o c a l
 >l:o:c:a l o c a
 
+ (setopt NO_multibyte cbases
+ LC_ALL=C 2>/dev/null
  typeset -T SCALAR=$'l\x83o\x83c\x83a\x83l' array $'\x83'
  print $array
  typeset -U SCALAR
- print $SCALAR $array
+ for (( i = 1; i <= ${#SCALAR}; i++ )); do
+   char=$SCALAR[i]
+   print $(( [#16] #char ))
+ done
+ print $array)
 0:Tied parameters and uniquified arrays with meta-character as separator
 >l o c a l
->lÂ?oÂ?cÂ?a l o c a
+>0x6C
+>0x83
+>0x6F
+>0x83
+>0x63
+>0x83
+>0x61
+>l o c a
 
  typeset -T SCALAR=$'l\000o\000c\000a\000l' array $'\000'
  typeset -U SCALAR

-- 
Peter Stephenson <pws@xxxxxxx>                  Software Engineer
CSR PLC, Churchill House, Cambridge Business Park, Cowley Road
Cambridge, CB4 0WZ, UK                          Tel: +44 (0)1223 692070



Messages sorted by: Reverse Date, Date, Thread, Author