Zsh Mailing List Archive
Messages sorted by: Reverse Date, Date, Thread, Author

PATCH: zle input changes for Unicode



This is thoroughly incomplete, but doesn't break the existing input
system, so the shell should work with no noticeable change, and shows
the way things are going.  The TODOs show the areas in need of most
immediate attention.

Please tell me if you think I'm doing something wrong, or if there
are things that I might have forgotten about lurking nearby.  A lot of
the complexity is in the attempt to maintain compatibility with the
current system in order not to break things just because the user
happens to be (wittingly or otherwise) in a UTF-8 locale.

I moved the definitions from system.h to the more appropriate zle.h.

Note also that lastchar, and its new companion lastchar_wide, are now
set by the routines that actually read the character.  This is obviously
neater and it was hard to see from the code if there was a good reason
why it wasn't done before, but I strongly suspect there wasn't.

The big hack is lastchar_wide_valid and getrestchar().  This allows
us to do handling of input as at present, but ensure we have a complete
wide character when we need one.  There's a good chance I've missed
somewhere where the special handling is necessary.

Note that I've renamed some functions from "getkey" to "getbyte" to
better reflect what they do.

Index: Src/system.h
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/system.h,v
retrieving revision 1.27
diff -u -r1.27 system.h
--- Src/system.h	1 Feb 2005 10:53:18 -0000	1.27
+++ Src/system.h	18 Feb 2005 13:11:41 -0000
@@ -705,40 +705,3 @@
 #   endif
 # endif
 #endif
-
-#ifdef ZLE_UNICODE_SUPPORT
-typedef wchar_t ZLE_CHAR_T;
-typedef wchar_t *ZLE_STRING_T;
-#define ZLE_CHAR_SIZE	sizeof(wchar_t)
-
-/*
- * MB_CUR_MAX is the maximum number of bytes that a single wide
- * character will convert into.  We use it to keep strings
- * sufficiently long.  It should always be defined, but if it isn't
- * just assume we are using Unicode which requires 6 characters.
- * (Note that it's not necessarily defined to a constant.)
- */
-#ifndef MB_CUR_MAX
-#define MB_CUR_MAX 6
-#endif
-
-#define ZLENL	L'\n'
-#define ZLENUL	L'\0'
-#define ZLETAB	L'\t'
-#define ZLENULSTR	L""
-#define ZS_memcpy wmemcpy
-#define ZS_memmove wmemmove
-#define ZC_icntrl iswcntrl
-#else
-typedef int ZLE_CHAR_T;
-typedef unsigned char *ZLE_STRING_T;
-#define ZLE_CHAR_SIZE	sizeof(unsigned char)
-
-#define ZLENL	'\n'
-#define ZLENUL	'\0'
-#define ZLETAB	'\t'
-#define ZLENULSTR	""
-#define ZS_memcpy memcpy
-#define ZS_memmove memmove
-#define ZC_icntrl icntrl
-#endif
Index: Src/Zle/complist.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/complist.c,v
retrieving revision 1.63
diff -u -r1.63 complist.c
--- Src/Zle/complist.c	14 Jan 2005 13:05:23 -0000	1.63
+++ Src/Zle/complist.c	18 Feb 2005 13:11:41 -0000
@@ -1869,6 +1869,10 @@
     msearchpush(ptr, back);
 
     if (ins) {
+	/*
+	 * TODO: probably need to convert back to multibyte character
+	 * string?  Who knows...
+	 */
         s[0] = lastchar;
         s[1] = '\0';
 
@@ -2802,9 +2806,7 @@
                     }
                 }
                 if (cmd == Th(z_selfinsertunmeta)) {
-                    lastchar &= 0x7f;
-                    if (lastchar == '\r')
-                        lastchar = '\n';
+		    fixunmeta();
                 }
                 wrap = 0;
                 np = msearch(p, ins, (ins ? (mode == MM_BSEARCH) : back),
Index: Src/Zle/deltochar.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/deltochar.c,v
retrieving revision 1.3
diff -u -r1.3 deltochar.c
--- Src/Zle/deltochar.c	14 Jan 2005 13:05:23 -0000	1.3
+++ Src/Zle/deltochar.c	18 Feb 2005 13:11:41 -0000
@@ -37,7 +37,8 @@
 static int
 deltochar(UNUSED(char **args))
 {
-    int c = getkey(0), dest = zlecs, ok = 0, n = zmult;
+    ZLE_INT_T c = getfullchar(0);
+    int dest = zlecs, ok = 0, n = zmult;
     int zap = (bindk->widget == w_zaptochar);
 
     if (n > 0) {
Index: Src/Zle/zle.h
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle.h,v
retrieving revision 1.6
diff -u -r1.6 zle.h
--- Src/Zle/zle.h	26 Jan 2005 18:12:18 -0000	1.6
+++ Src/Zle/zle.h	18 Feb 2005 13:11:41 -0000
@@ -27,6 +27,75 @@
  *
  */
 
+#ifdef ZLE_UNICODE_SUPPORT
+typedef wchar_t ZLE_CHAR_T;
+typedef wchar_t *ZLE_STRING_T;
+typedef wint_t  ZLE_INT_T;
+#define ZLE_CHAR_SIZE	sizeof(wchar_t)
+
+/*
+ * MB_CUR_MAX is the maximum number of bytes that a single wide
+ * character will convert into.  We use it to keep strings
+ * sufficiently long.  It should always be defined, but if it isn't
+ * just assume we are using Unicode which requires 6 characters.
+ * (Note that it's not necessarily defined to a constant.)
+ */
+#ifndef MB_CUR_MAX
+#define MB_CUR_MAX 6
+#endif
+
+#define ZLENL	L'\n'
+#define ZLENUL	L'\0'
+#define ZLETAB	L'\t'
+
+#define DIGIT_1		L'1'
+#define DIGIT_9		L'9'
+#define LETTER_a	L'a'
+#define LETTER_z	L'z'
+#define LETTER_A	L'A'
+#define LETTER_Z	L'Z'
+#define LETTER_y	L'y'
+#define LETTER_n	L'n'
+
+#define ZLENULSTR	L""
+#define ZLEEOF	WEOF
+#define ZS_memcpy wmemcpy
+#define ZS_memmove wmemmove
+#define ZC_icntrl iswcntrl
+
+#define LASTFULLCHAR	lastchar_wide
+
+#else  /* Not ZLE_UNICODE_SUPPORT: old single-byte code */
+
+typedef int ZLE_CHAR_T;
+typedef unsigned char *ZLE_STRING_T;
+typedef int ZLE_INT_T;
+#define ZLE_CHAR_SIZE	sizeof(unsigned char)
+
+#define ZLENL	'\n'
+#define ZLENUL	'\0'
+#define ZLETAB	'\t'
+
+#define DIGIT_1		'1'
+#define DIGIT_9		'9'
+#define LETTER_a	'a'
+#define LETTER_z	'z'
+#define LETTER_A	'A'
+#define LETTER_Z	'Z'
+#define LETTER_y	'y'
+#define LETTER_n	'n'
+
+#define ZLENULSTR	""
+#define ZLEEOF	EOF
+#define ZS_memcpy memcpy
+#define ZS_memmove memmove
+#define ZC_icntrl icntrl
+
+#define LASTFULLCHAR	lastchar
+
+#endif
+
+
 typedef struct widget *Widget;
 typedef struct thingy *Thingy;
 
Index: Src/Zle/zle_hist.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle_hist.c,v
retrieving revision 1.17
diff -u -r1.17 zle_hist.c
--- Src/Zle/zle_hist.c	26 Jan 2005 18:12:18 -0000	1.17
+++ Src/Zle/zle_hist.c	18 Feb 2005 13:11:41 -0000
@@ -420,11 +420,12 @@
 int
 insertlastword(char **args)
 {
-    int n, nwords, histstep = -1, wordpos = 0, deleteword = 0;
+    int n, nwords, histstep = -1, wordpos = 0, deleteword = 0, len, sz;
     char *s, *t;
     Histent he = NULL;
     LinkList l = NULL;
     LinkNode node;
+    ZLE_STRING_T zs;
 
     static char *lastinsert;
     static int lasthist, lastpos, lastlen;
@@ -554,7 +555,10 @@
     memcpy(lastinsert, s, lastlen);
     n = zmult;
     zmult = 1;
-    doinsert(s);
+
+    zs = stringaszleline((unsigned char *)s, &len, &sz);
+    doinsert(zs, len);
+    zfree(zs, sz);
     zmult = n;
     *t = save;
     return 0;
@@ -780,7 +784,7 @@
 	char *arg;
 	savekeys = kungetct;
 	arg = getkeystring(*args, &len, 2, NULL);
-	ungetkeys(arg, len);
+	ungetbytes(arg, len);
     }
 
     strcpy(ibuf, ISEARCH_PROMPT);
@@ -951,18 +955,23 @@
 		sbuf[sbptr] = '^';
 		zrefresh();
 	    }
-	    if ((lastchar = getkey(0)) == EOF)
+	    if (getfullchar(0) == ZLEEOF)
 		feep = 1;
 	    else
 		goto ins;
 	} else {
 	    if(cmd == Th(z_selfinsertunmeta)) {
-		lastchar &= 0x7f;
-		if(lastchar == '\r')
-		    lastchar = '\n';
-	    } else if (cmd == Th(z_magicspace))
-		lastchar = ' ';
-	    else if (cmd != Th(z_selfinsert)) {
+		fixunmeta();
+	    } else if (cmd == Th(z_magicspace)) {
+		fixmagicspace();
+	    } else if (cmd == Th(z_selfinsert)) {
+#ifdef ZLE_UNICODE_SUPPORT
+		if (!lastchar_wide_valid)
+		    getrestchar(lastchar);
+#else
+		;
+#endif
+	    } else {
 		ungetkeycmd();
 		if (cmd == Th(z_sendbreak))
 		    sbptr = 0;
@@ -979,6 +988,8 @@
 		sbuf = ibuf + FIRST_SEARCH_CHAR;
 		sibuf *= 2;
 	    }
+	    /* TODO: use lastchar_wide if available, convert back to
+	     * multibyte string.  Yuk.  */
 	    sbuf[sbptr++] = lastchar;
 	}
 	if (feep)
@@ -1093,7 +1104,7 @@
 	    break;
 	}
 	if(cmd == Th(z_magicspace)) {
-	    lastchar = ' ';
+	    fixmagicspace();
 	    cmd = Th(z_selfinsert);
 	}
 	if(cmd == Th(z_redisplay)) {
@@ -1128,15 +1139,20 @@
 		sbuf[sptr] = '^';
 		zrefresh();
 	    }
-	    if ((lastchar = getkey(0)) == EOF)
+	    if (getfullchar(0) == ZLEEOF)
 		feep = 1;
 	    else
 		goto ins;
 	} else if(cmd == Th(z_selfinsertunmeta) || cmd == Th(z_selfinsert)) {
 	    if(cmd == Th(z_selfinsertunmeta)) {
-		lastchar &= 0x7f;
-		if(lastchar == '\r')
-		    lastchar = '\n';
+		fixunmeta();
+	    } else {
+#ifdef ZLE_UNICODE_SUPPORT
+		if (!lastchar_wide_valid)
+		    getrestchar(lastchar);
+#else
+		;
+#endif
 	    }
 	  ins:
 	    if(sptr == ssbuf - 1) {
@@ -1144,6 +1160,7 @@
 		strcpy(newbuf, sbuf);
 		statusline = sbuf = newbuf;
 	    }
+	    /* TODO: may be wide char, convert back to multibyte string */
 	    sbuf[sptr++] = lastchar;
 	} else {
 	    feep = 1;
Index: Src/Zle/zle_keymap.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle_keymap.c,v
retrieving revision 1.15
diff -u -r1.15 zle_keymap.c
--- Src/Zle/zle_keymap.c	2 Jun 2004 22:15:01 -0000	1.15
+++ Src/Zle/zle_keymap.c	18 Feb 2005 13:11:41 -0000
@@ -1272,7 +1272,21 @@
 
     keybuflen = 0;
     keybuf[0] = 0;
-    while((lastchar = getkeybuf(!!lastlen)) != EOF) {
+    /*
+     * getkeybuf returns multibyte strings, which may not
+     * yet correspond to complete wide characters, regardless
+     * of the locale.  This is because we can't be sure whether
+     * the key bindings and keyboard input always return such
+     * characters.  So we always look up bindings for each
+     * chunk of string.  Intelligence within self-insert tries
+     * to fix up insertion of real wide characters properly.
+     *
+     * Note that this does not stop the user binding wide characters to
+     * arbitrary functions, just so long as the string used in the
+     * argument to bindkey is in the correct form for the locale.
+     * That's beyond our control.
+     */
+    while(getkeybuf(!!lastlen) != EOF) {
 	char *s;
 	Thingy f;
 	int loc = 1;
@@ -1296,7 +1310,7 @@
 	lastchar = lastc;
     if(lastlen != keybuflen) {
 	unmetafy(keybuf + lastlen, &keybuflen);
-	ungetkeys(keybuf+lastlen, keybuflen);
+	ungetbytes(keybuf+lastlen, keybuflen);
 	if(vichgflag)
 	    vichgbufptr -= keybuflen;
 	keybuf[lastlen] = 0;
@@ -1306,11 +1320,24 @@
     return keybuf;
 }
 
+/*
+ * Add a (possibly metafied) byte to the key input so far.
+ * This handles individual bytes of a multibyte string separately;
+ * see note in getkeymapcmd.  Hence there is no wide character
+ * support at this level.
+ *
+ * TODO: Need to be careful about whether we return EOF in the
+ * middle of a wide character.  However, I think we're OK since
+ * EOF and 0xff are distinct and we're reading bytes from the
+ * lower level, so EOF really does mean something went wrong.  Even so,
+ * I'm worried enough to leave this note here for now.
+ */
+
 /**/
 static int
 getkeybuf(int w)
 {
-    int c = getkey(w);
+    int c = getbyte(w);
 
     if(c < 0)
 	return EOF;
@@ -1332,7 +1359,7 @@
 mod_export void
 ungetkeycmd(void)
 {
-    ungetkeys(keybuf, keybuflen);
+    ungetbytes(keybuf, keybuflen);
 }
 
 /* read a command from the current keymap, with widgets */
@@ -1359,7 +1386,7 @@
 	    return NULL;
 	}
 	pb = unmetafy(ztrdup(str), &len);
-	ungetkeys(pb, len);
+	ungetbytes(pb, len);
 	zfree(pb, strlen(str) + 1);
 	goto sentstring;
     }
Index: Src/Zle/zle_main.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle_main.c,v
retrieving revision 1.58
diff -u -r1.58 zle_main.c
--- Src/Zle/zle_main.c	26 Jan 2005 18:12:18 -0000	1.58
+++ Src/Zle/zle_main.c	18 Feb 2005 13:11:41 -0000
@@ -78,10 +78,30 @@
 /**/
 int mark;
 
-/* last character pressed */
+/*
+ * Last character pressed.
+ *
+ * Depending how far we are with processing, the last character may
+ * be a single byte read (lastchar_wide_valid is 0, lastchar_wide is not
+ * valid) or a full wide character.  This is needed because we can't be
+ * sure whether the user is typing old \M-style commands or multibyte
+ * input.
+ *
+ * Calling getfullchar or getrestchar is guaranteed to ensure we have
+ * a valid wide character (although this may be WEOF).  In many states
+ * we know this and don't need to test lastchar_wide_valid.
+ */
 
 /**/
-mod_export int lastchar;
+mod_export int
+lastchar;
+#ifdef ZLE_UNICODE_SUPPORT
+/**/
+mod_export ZLE_INT_T lastchar_wide;
+/**/
+mod_export int
+lastchar_wide_valid;
+#endif
 
 /* the bindings for the previous and for this key */
 
@@ -148,7 +168,7 @@
 /**/
 int prefixflag;
 
-/* Number of characters waiting to be read by the ungetkeys mechanism */
+/* Number of characters waiting to be read by the ungetbytes mechanism */
 /**/
 int kungetct;
 
@@ -196,7 +216,7 @@
 	 * we can't set up the terminal for zle *at all* until
 	 * we are sure there is no more typeahead to come.  So
 	 * if there is typeahead, we set the flag delayzsetterm.
-	 * Then getkey() performs another FIONREAD call; if that is
+	 * Then getbyte() performs another FIONREAD call; if that is
 	 * 0, we have finally used up all the typeahead, and it is
 	 * safe to alter the terminal, which we do at that point.
 	 */
@@ -266,7 +286,7 @@
     ti.tio.c_cc[VMIN] = 1;
     ti.tio.c_cc[VTIME] = 0;
     ti.tio.c_iflag |= (INLCR | ICRNL);
- /* this line exchanges \n and \r; it's changed back in getkey
+ /* this line exchanges \n and \r; it's changed back in getbyte
 	so that the net effect is no change at all inside the shell.
 	This double swap is to allow typeahead in common cases, eg.
 
@@ -275,12 +295,12 @@
 	echo foo<return>  <--- typed before sleep returns
 
 	The shell sees \n instead of \r, since it was changed by the kernel
-	while zsh wasn't looking. Then in getkey() \n is changed back to \r,
+	while zsh wasn't looking. Then in getbyte() \n is changed back to \r,
 	and it sees "echo foo<accept line>", as expected. Without the double
 	swap the shell would see "echo foo\n", which is translated to
 	"echo fooecho foo<accept line>" because of the binding.
 	Note that if you type <line-feed> during the sleep the shell just sees
-	\n, which is translated to \r in getkey(), and you just get another
+	\n, which is translated to \r in getbyte(), and you just get another
 	prompt. For type-ahead to work in ALL cases you have to use
 	stty inlcr.
 
@@ -321,9 +341,16 @@
 static char *kungetbuf;
 static int kungetsz;
 
+/*
+ * Note on ungetbyte and ungetbytes for the confused (pws):
+ * these are low level and deal with bytes before they
+ * have been converted into (possibly wide) characters.
+ * Hence the names.
+ */
+
 /**/
 void
-ungetkey(int ch)
+ungetbyte(int ch)
 {
     if (kungetct == kungetsz)
 	kungetbuf = realloc(kungetbuf, kungetsz *= 2);
@@ -332,11 +359,11 @@
 
 /**/
 void
-ungetkeys(char *s, int len)
+ungetbytes(char *s, int len)
 {
     s += len;
     while (len--)
-	ungetkey(*--s);
+	ungetbyte(*--s);
 }
 
 #if defined(pyr) && defined(HAVE_SELECT)
@@ -356,7 +383,7 @@
 #endif
 
 static int
-raw_getkey(int keytmout, char *cptr)
+raw_getbyte(int keytmout, char *cptr)
 {
     long exp100ths;
     int ret;
@@ -591,13 +618,22 @@
 
 /**/
 mod_export int
-getkey(int keytmout)
+getbyte(int keytmout)
 {
     char cc;
     unsigned int ret;
     int die = 0, r, icnt = 0;
     int old_errno = errno, obreaks = breaks;
 
+#ifdef ZLE_UNICODE_SUPPORT
+    /*
+     * Reading a single byte always invalidates the status
+     * of lastchar_wide.  We may fix this up in getrestchar
+     * if this is the last byte of a wide character.
+     */
+    lastchar_wide_valid = 0;
+#endif
+
     if (kungetct)
 	ret = STOUC(kungetbuf[--kungetct]);
     else {
@@ -612,10 +648,10 @@
 	for (;;) {
 	    int q = queue_signal_level();
 	    dont_queue_signals();
-	    r = raw_getkey(keytmout, &cc);
+	    r = raw_getbyte(keytmout, &cc);
 	    restore_queue_signals(q);
 	    if (r == -2)	/* timeout */
-		return EOF;
+		return lastchar = EOF;
 	    if (r == 1)
 		break;
 	    if (r == 0) {
@@ -642,7 +678,7 @@
 		errflag = 0;
 		breaks = obreaks;
 		errno = old_errno;
-		return EOF;
+		return lastchar = EOF;
 	    } else if (errno == EWOULDBLOCK) {
 		fcntl(0, F_SETFL, 0);
 	    } else if (errno == EIO && !die) {
@@ -665,15 +701,96 @@
 
 	ret = STOUC(cc);
     }
+    /*
+     * TODO: if vichgbuf is to be characters instead of a multibyte
+     * string the following needs moving to getfullchar().
+     */
     if (vichgflag) {
 	if (vichgbufptr == vichgbufsz)
 	    vichgbuf = realloc(vichgbuf, vichgbufsz *= 2);
 	vichgbuf[vichgbufptr++] = ret;
     }
     errno = old_errno;
-    return ret;
+    return lastchar = ret;
 }
 
+
+/*
+ * Get a full character rather than just a single byte.
+ * (TODO: Strictly we ought to call this getkey and the above
+ * function getbyte.)
+ */
+
+/**/
+mod_export ZLE_INT_T
+getfullchar(int keytmout)
+{
+    int inchar = getbyte(keytmout);
+
+#ifdef ZLE_UNICODE_SUPPORT
+    return getrestchar(inchar);
+#else
+    return inchar;
+#endif
+}
+
+
+/**/
+#ifdef ZLE_UNICODE_SUPPORT
+/*
+ * Get the remainder of a character if we support multibyte
+ * input strings.  It may not require any more input, but
+ * we haven't yet checked.  The character previously returned
+ * by getbyte() is passed down as inchar.
+ */
+
+/**/
+mod_export ZLE_INT_T
+getrestchar(int inchar)
+{
+    char cnull = '\0';
+    char buf[MB_CUR_MAX], *ptr;
+    wchar_t outchar;
+    int ret;
+
+    /*
+     * We are guaranteed to set a valid wide last character,
+     * although it may be WEOF (which is technically not
+     * a wide character at all...)
+     */
+    lastchar_wide_valid = 1;
+
+    if (inchar == EOF)
+	return lastchar_wide = WEOF;
+
+    /* reset shift state by converting null */
+    mbrtowc(&outchar, &cnull, 1, &ps);
+
+    ptr = buf;
+    *ptr++ = inchar;
+    /*
+     * Return may be zero if we have a NULL; handle this like
+     * any other character.
+     */
+    while ((ret = mbrtowc(&outchar, buf, ptr - buf, &ps)) < 0) {
+	if (ret == -1) {
+	    /*
+	     * Invalid input.  Hmm, what's the right thing to do here?
+	     */
+	    return lastchar_wide = WEOF;
+	}
+	/* No timeout here as we really need the character. */
+	inchar = getbyte(0);
+	if (inchar == EOF)
+	    return lastchar_wide = WEOF;
+	*ptr++ = inchar;
+    }
+    return lastchar_wide = (wint_t)outchar;
+}
+/**/
+#endif
+
+
 /**/
 void
 zlecore(void)
@@ -1445,7 +1562,7 @@
     zlereadptr = zleread;
     zlesetkeymapptr = zlesetkeymap;
 
-    getkeyptr = getkey;
+    getkeyptr = getbyte;
 
     /* initialise the thingies */
     init_thingies();
Index: Src/Zle/zle_misc.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle_misc.c,v
retrieving revision 1.14
diff -u -r1.14 zle_misc.c
--- Src/Zle/zle_misc.c	25 Jan 2005 16:41:24 -0000	1.14
+++ Src/Zle/zle_misc.c	18 Feb 2005 13:11:41 -0000
@@ -34,13 +34,13 @@
 
 /**/
 void
-doinsert(char *str)
+doinsert(ZLE_STRING_T zstr, int len)
 {
-    char *s;
-    int len = ztrlen(str);
-    int c1 = *str == Meta ? STOUC(str[1])^32 : STOUC(*str);/* first character */
+    ZLE_STRING_T s;
+    ZLE_CHAR_T c1 = *zstr;	     /* first character */
     int neg = zmult < 0;             /* insert *after* the cursor? */
     int m = neg ? -zmult : zmult;    /* number of copies to insert */
+    int count;
 
     iremovesuffix(c1, 0);
     invalidatelist();
@@ -50,8 +50,8 @@
     else if(zlecs + m * len > zlell)
 	spaceinline(zlecs + m * len - zlell);
     while(m--)
-	for(s = str; *s; s++)
-	    zleline[zlecs++] = *s == Meta ? *++s ^ 32 : *s;
+	for(s = zstr, count = len; count; s++, count--)
+	    zleline[zlecs++] = *s;
     if(neg)
 	zlecs += zmult * len;
 }
@@ -60,25 +60,41 @@
 mod_export int
 selfinsert(UNUSED(char **args))
 {
-    char s[3], *p = s;
-
-    if(imeta(lastchar)) {
-	*p++ = Meta;
-	lastchar ^= 32;
-    }
-    *p++ = lastchar;
-    *p = 0;
-    doinsert(s);
+#ifdef ZLE_UNICODE_SUPPORT
+    if (!lastchar_wide_valid)
+	getrestchar(lastchar);
+    doinsert(&lastchar_wide, 1);
+#else
+    char s = lastchar;
+    doinsert(&s, 1);
+#endif
     return 0;
 }
 
 /**/
-mod_export int
-selfinsertunmeta(char **args)
+mod_export void
+fixunmeta(void)
 {
     lastchar &= 0x7f;
     if (lastchar == '\r')
 	lastchar = '\n';
+#ifdef ZLE_UNICODE_SUPPORT
+    /*
+     * TODO: can we do this better?
+     * We need a wide character to insert.
+     * selfinsertunmeta is intrinsically problematic
+     * with multibyte input.
+     */
+    lastchar_wide = (ZLE_CHAR_T)lastchar;
+    lastchar_wide_valid = TRUE;
+#endif
+}
+
+/**/
+mod_export int
+selfinsertunmeta(char **args)
+{
+    fixunmeta();
     return selfinsert(args);
 }
 
@@ -490,11 +506,11 @@
     sob.sg_flags = (sob.sg_flags | RAW) & ~ECHO;
     ioctl(SHTTY, TIOCSETN, &sob);
 #endif
-    lastchar = getkey(0);
+    getfullchar(0);
 #ifndef HAS_TIO
     zsetterm();
 #endif
-    if (lastchar < 0)
+    if (LASTFULLCHAR == ZLEEOF)
 	return 1;
     else
 	return selfinsert(args);
@@ -506,9 +522,20 @@
 {
     int sign = (zmult < 0) ? -1 : 1;
 
+#ifdef ZLE_UNICODE_SUPPORT
+    /*
+     * It's too dangerous to allow metafied input.  See
+     * universalargument for comments on (possibly suboptimal) handling
+     * of digits.  We are assuming ASCII is a subset of the multibyte
+     * encoding.
+     */
+    if (lastchar < '0' || lastchar > '9')
+	return 1;
+#else
     /* allow metafied as well as ordinary digits */
     if ((lastchar & 0x7f) < '0' || (lastchar & 0x7f) > '9')
 	return 1;
+#endif
 
     if (!(zmod.flags & MOD_TMULT))
 	zmod.tmult = 0;
@@ -546,7 +573,22 @@
 	zmod.flags |= MOD_MULT;
 	return 0;
     }
-    while ((gotk = getkey(0)) != EOF) {
+    /*
+     * TODO: this is quite tricky to do when trying to maintain
+     * compatibility between the old input system and Unicode.
+     * We don't know what follows the digits, so if we try to
+     * read wide characters we may fail (e.g. we may come across an old
+     * \M-style binding).
+     *
+     * If we assume individual bytes are either explicitly ASCII or
+     * not (a la UTF-8), we get away with it; we can back up individual
+     * bytes and everything will work.  We may want to relax this
+     * assumption later.  ("Much later" - (C) Steven Singer,
+     * CSR BlueCore firmware, ca. 2000.)
+     *
+     * Hence for now this remains byte-by-byte.
+     */
+    while ((gotk = getbyte(0)) != EOF) {
 	if (gotk == '-' && !digcnt) {
 	    minus = -1;
 	    digcnt++;
@@ -554,7 +596,7 @@
 	    pref = pref * 10 + (gotk & 0xf);
 	    digcnt++;
 	} else {
-	    ungetkey(gotk);
+	    ungetbyte(gotk);
 	    break;
 	}
     }
@@ -765,24 +807,32 @@
 	} else if(cmd == Th(z_viquotedinsert)) {
 	    *ptr = '^';
 	    zrefresh();
-	    lastchar = getkey(0);
-	    if(lastchar == EOF || !lastchar || len == NAMLEN)
+	    getfullchar(0);
+	    if(LASTFULLCHAR == ZLEEOF || !LASTFULLCHAR || len == NAMLEN)
 		feep = 1;
-	    else
+	    else {
+		/* TODO: convert back to multibyte string */
 		*ptr++ = lastchar, len++, curlist = 0;
+	    }
 	} else if(cmd == Th(z_quotedinsert)) {
-	    if((lastchar = getkey(0)) == EOF || !lastchar || len == NAMLEN)
+	    if(getfullchar(0) == ZLEEOF ||
+	       !LASTFULLCHAR || len == NAMLEN)
 		feep = 1;
-	    else
+	    else {
+		/* TODO: convert back to multibyte string */
 		*ptr++ = lastchar, len++, curlist = 0;
+	    }
 	} else if(cmd == Th(z_backwarddeletechar) ||
 	    	cmd == Th(z_vibackwarddeletechar)) {
-	    if (len)
+	    if (len) {
+		/* TODO: backward full character in multibyte string. Yuk. */
 		len--, ptr--, curlist = 0;
+	    }
 	} else if(cmd == Th(z_killregion) || cmd == Th(z_backwardkillword) ||
 		  cmd == Th(z_vibackwardkillword)) {
 	    if (len)
 		curlist = 0;
+	    /* TODO: backward full character in multibyte string. Yuk. */
 	    while (len && (len--, *--ptr != '-'));
 	} else if(cmd == Th(z_killwholeline) || cmd == Th(z_vikillline) ||
 	    	cmd == Th(z_backwardkillline)) {
@@ -812,9 +862,7 @@
 		unrefthingy(r);
 	    }
 	    if(cmd == Th(z_selfinsertunmeta)) {
-		lastchar &= 0x7f;
-		if(lastchar == '\r')
-		    lastchar = '\n';
+		fixunmeta();
 		cmd = Th(z_selfinsert);
 	    }
 	    if (cmd == Th(z_listchoices) || cmd == Th(z_deletecharorlist) ||
@@ -867,11 +915,24 @@
 		    len = cmdambig;
 		}
 	    } else {
-		if (len == NAMLEN || icntrl(lastchar) ||
-		    cmd != Th(z_selfinsert))
+		if (len == NAMLEN || cmd != Th(z_selfinsert))
 		    feep = 1;
-		else
-		    *ptr++ = lastchar, len++, curlist = 0;
+		else {
+#ifdef ZLE_UNICODE_SUPPORT
+		    if (!lastchar_wide_valid)
+			getrestchar(lastchar);
+		    if (iswcntrl(lastchar_wide))
+#else
+		    if (icntrl(lastchar))
+#endif
+		    {
+			feep = 1;
+		    }
+		    else {
+			/* TODO: convert back to multibyte string */
+			*ptr++ = lastchar, len++, curlist = 0;
+		    }
+		}
 	    }
 	}
 	if (feep)
@@ -911,6 +972,9 @@
 /* Length of suffix to remove when inserting each possible character value.  *
  * suffixlen[256] is the length to remove for non-insertion editing actions. */
 
+/*
+ * TODO: Aargh, this is completely broken with wide characters.
+ */
 /**/
 mod_export int suffixlen[257];
 
@@ -1000,7 +1064,7 @@
 
 /**/
 mod_export void
-iremovesuffix(int c, int keep)
+iremovesuffix(ZLE_CHAR_T c, int keep)
 {
     if (suffixfunc) {
 	Eprog prog = getshfunc(suffixfunc);
@@ -1024,7 +1088,12 @@
 	zsfree(suffixfunc);
 	suffixfunc = NULL;
     } else {
+#ifdef ZLE_UNICODE_SUPPORT
+	/* TODO: best I can think of for now... */
+	int sl = (unsigned int)c < 256 ? suffixlen[c] : 0;
+#else
 	int sl = suffixlen[c];
+#endif
 	if(sl) {
 	    backdel(sl);
 	    if (!keep)
Index: Src/Zle/zle_move.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle_move.c,v
retrieving revision 1.4
diff -u -r1.4 zle_move.c
--- Src/Zle/zle_move.c	14 Jan 2005 13:05:24 -0000	1.4
+++ Src/Zle/zle_move.c	18 Feb 2005 13:11:41 -0000
@@ -353,13 +353,14 @@
     return 0;
 }
 
-static int vfindchar, vfinddir, tailadd;
+static ZLE_INT_T vfindchar;
+static int vfinddir, tailadd;
 
 /**/
 int
 vifindnextchar(char **args)
 {
-    if ((vfindchar = vigetkey()) != -1) {
+    if ((vfindchar = vigetkey()) != ZLEEOF) {
 	vfinddir = 1;
 	tailadd = 0;
 	return virepeatfind(args);
@@ -371,7 +372,7 @@
 int
 vifindprevchar(char **args)
 {
-    if ((vfindchar = vigetkey()) != -1) {
+    if ((vfindchar = vigetkey()) != ZLEEOF) {
 	vfinddir = -1;
 	tailadd = 0;
 	return virepeatfind(args);
@@ -383,7 +384,7 @@
 int
 vifindnextcharskip(char **args)
 {
-    if ((vfindchar = vigetkey()) != -1) {
+    if ((vfindchar = vigetkey()) != ZLEEOF) {
 	vfinddir = 1;
 	tailadd = -1;
 	return virepeatfind(args);
@@ -395,7 +396,7 @@
 int
 vifindprevcharskip(char **args)
 {
-    if ((vfindchar = vigetkey()) != -1) {
+    if ((vfindchar = vigetkey()) != ZLEEOF) {
 	vfinddir = -1;
 	tailadd = 1;
 	return virepeatfind(args);
@@ -465,12 +466,12 @@
 int
 visetmark(UNUSED(char **args))
 {
-    int ch;
+    ZLE_INT_T ch;
 
-    ch = getkey(0);
-    if (ch < 'a' || ch > 'z')
+    ch = getfullchar(0);
+    if (ch < LETTER_a || ch > LETTER_z)
 	return 1;
-    ch -= 'a';
+    ch -= LETTER_a;
     vimarkcs[ch] = zlecs;
     vimarkline[ch] = histline;
     return 0;
@@ -480,15 +481,15 @@
 int
 vigotomark(UNUSED(char **args))
 {
-    int ch;
+    ZLE_INT_T ch;
 
-    ch = getkey(0);
-    if (ch == lastchar)
+    ch = getfullchar(0);
+    if (ch == LASTFULLCHAR)
 	ch = 26;
     else {
-	if (ch < 'a' || ch > 'z')
+	if (ch < LETTER_a || ch > LETTER_z)
 	    return 1;
-	ch -= 'a';
+	ch -= LETTER_a;
     }
     if (!vimarkline[ch])
 	return 1;
Index: Src/Zle/zle_thingy.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle_thingy.c,v
retrieving revision 1.15
diff -u -r1.15 zle_thingy.c
--- Src/Zle/zle_thingy.c	8 Sep 2004 15:24:07 -0000	1.15
+++ Src/Zle/zle_thingy.c	18 Feb 2005 13:11:42 -0000
@@ -473,7 +473,7 @@
 	return 1;
     }
     while (p > b)
-	ungetkey((int) *--p);
+	ungetbyte((int) *--p);
     return 0;
 }
 
Index: Src/Zle/zle_tricky.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle_tricky.c,v
retrieving revision 1.47
diff -u -r1.47 zle_tricky.c
--- Src/Zle/zle_tricky.c	14 Jan 2005 13:05:24 -0000	1.47
+++ Src/Zle/zle_tricky.c	18 Feb 2005 13:11:42 -0000
@@ -2298,12 +2298,27 @@
 }
 
 /**/
+void
+fixmagicspace(void)
+{
+    lastchar = ' ';
+#ifdef ZLE_UNICODE_SUPPORT
+    /*
+     * This is redundant if the multibyte encoding extends ASCII,
+     * since lastchar is a full character, but it's safer anyway...
+     */
+    lastchar_wide = L' ';
+    lastchar_wide_valid = TRUE;
+#endif
+}
+
+/**/
 int
 magicspace(char **args)
 {
     char *bangq;
     int ret;
-    lastchar = ' ';
+    fixmagicspace();
     for (bangq = (char *)zleline; (bangq = strchr(bangq, bangchar));
 	 bangq += 2)
 	if (bangq[1] == '"' && (bangq == (char *)zleline || bangq[-1] != '\\'))
Index: Src/Zle/zle_utils.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle_utils.c,v
retrieving revision 1.17
diff -u -r1.17 zle_utils.c
--- Src/Zle/zle_utils.c	26 Jan 2005 18:12:18 -0000	1.17
+++ Src/Zle/zle_utils.c	18 Feb 2005 13:11:42 -0000
@@ -510,7 +510,7 @@
 mod_export int
 getzlequery(int yesno)
 {
-    int c;
+    ZLE_INT_T c;
 #ifdef FIONREAD
     int val;
 
@@ -525,18 +525,18 @@
 #endif
 
     /* get a character from the tty and interpret it */
-    c = getkey(0);
+    c = getfullchar(0);
     if (yesno) {
-	if (c == '\t')
-	    c = 'y';
+	if (c == ZLETAB)
+	    c = LETTER_y;
 	else if (icntrl(c) || c == EOF)
-	    c = 'n';
+	    c = LETTER_n;
 	else
 	    c = tulower(c);
     }
     /* echo response and return */
-    if (c != '\n')
-	putc(c, shout);
+    if (c != ZLENL)
+	putc(c, shout);		/* TODO: convert to multibyte */
     return c;
 }
 
Index: Src/Zle/zle_vi.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle_vi.c,v
retrieving revision 1.4
diff -u -r1.4 zle_vi.c
--- Src/Zle/zle_vi.c	14 Jan 2005 13:05:25 -0000	1.4
+++ Src/Zle/zle_vi.c	18 Feb 2005 13:11:42 -0000
@@ -50,6 +50,11 @@
 /**/
 int vichgbufsz, vichgbufptr, vichgflag;
 
+/*
+ * TODO: need consistent handling of vichgbuf: ZLE_STRING_T or
+ * char *?  Consequently, use of lastchar in this file needs fixing
+ * too.
+ */
 /**/
 char *vichgbuf;
 
@@ -95,15 +100,15 @@
 }
 
 /**/
-int
+ZLE_INT_T
 vigetkey(void)
 {
     Keymap mn = openkeymap("main");
     char m[3], *str;
     Thingy cmd;
 
-    if((lastchar = getkey(0)) == EOF)
-	return -1;
+    if(getbyte(0) == EOF)
+	return ZLEEOF;
 
     m[0] = lastchar;
     metafy(m, 1, META_NOALLOC);
@@ -112,23 +117,35 @@
     else
 	cmd = t_undefinedkey;
 
+    /*
+     * TODO: if this was bound to self-insert, we may
+     * be on the first character of a multibyte string
+     * and need to acquire the rest.
+     */
     if (!cmd || cmd == Th(z_sendbreak)) {
-	return -1;
+	return ZLEEOF;
     } else if (cmd == Th(z_quotedinsert)) {
-	if ((lastchar = getkey(0)) == EOF)
-	    return -1;
+	if (getfullchar(0) == ZLEEOF)
+	    return ZLEEOF;
     } else if(cmd == Th(z_viquotedinsert)) {
-	char sav = zleline[zlecs];
+	ZLE_CHAR_T sav = zleline[zlecs];
 
 	zleline[zlecs] = '^';
 	zrefresh();
-	lastchar = getkey(0);
+	getfullchar(0);
 	zleline[zlecs] = sav;
-	if(lastchar == EOF)
-	    return -1;
-    } else if (cmd == Th(z_vicmdmode))
-	return -1;
-    return lastchar;
+	if(LASTFULLCHAR == ZLEEOF)
+	    return ZLEEOF;
+    } else if (cmd == Th(z_vicmdmode)) {
+	return ZLEEOF;
+    }
+#ifdef ZLE_UNICODE_SUPPORT
+    if (!lastchar_wide_valid)
+    {
+	getrestchar(lastchar);
+    }
+#endif
+    return LASTFULLCHAR;
 }
 
 /**/
@@ -489,7 +506,7 @@
 	return 1;
     }
     /* get key */
-    if((ch = vigetkey()) == -1) {
+    if((ch = vigetkey()) == ZLEEOF) {
 	vichgflag = 0;
 	return 1;
     }
@@ -593,7 +610,7 @@
     }
     /* repeat the command */
     inrepeat = 1;
-    ungetkeys(vichgbuf, vichgbufptr);
+    ungetbytes(vichgbuf, vichgbufptr);
     return 0;
 }
 
@@ -817,26 +834,35 @@
     statusline = "press a lowercase key to continue";
     statusll = strlen(statusline);
     zrefresh();
-    while (!islower(getkey(0)));
+#ifdef ZLE_UNICODE_SUPPORT
+    while (!iswlower(getfullchar(0)));
+#else
+    while (!islower(getfullchar(0)));
+#endif
     statusline = NULL;
     return 0;
 }
 
+#ifdef ZLE_UNICODE_SUPPORT
+#else
+#endif
+
 /**/
 int
 visetbuffer(UNUSED(char **args))
 {
-    int ch;
+    ZLE_INT_T ch;
 
     if ((zmod.flags & MOD_VIBUF) ||
-	(((ch = getkey(0)) < '1' || ch > '9') &&
-	 (ch < 'a' || ch > 'z') && (ch < 'A' || ch > 'Z')))
+	(((ch = getfullchar(0)) < DIGIT_1 || ch > DIGIT_9) &&
+	 (ch < LETTER_a || ch > LETTER_z) &&
+	 (ch < LETTER_A || ch > LETTER_Z)))
 	return 1;
-    if (ch >= 'A' && ch <= 'Z')	/* needed in cut() */
+    if (ch >= LETTER_A && ch <= LETTER_Z)	/* needed in cut() */
 	zmod.flags |= MOD_VIAPP;
     else
 	zmod.flags &= ~MOD_VIAPP;
-    zmod.vibuf = tulower(ch) + (idigit(ch) ? -'1' + 26 : -'a');
+    zmod.vibuf = tulower(ch) + (idigit(ch) ? - DIGIT_1 + 26 : -LETTER_a);
     zmod.flags |= MOD_VIBUF;
     prefixflag = 1;
     return 0;
@@ -897,12 +923,12 @@
     sob.sg_flags = (sob.sg_flags | RAW) & ~ECHO;
     ioctl(SHTTY, TIOCSETN, &sob);
 #endif
-    lastchar = getkey(0);
+    getfullchar(0);
 #ifndef HAS_TIO
     zsetterm();
 #endif
     foredel(1);
-    if(lastchar < 0)
+    if(LASTFULLCHAR == ZLEEOF)
 	return 1;
     else
 	return selfinsert(args);

-- 
Peter Stephenson <pws@xxxxxxx>                  Software Engineer
CSR PLC, Churchill House, Cambridge Business Park, Cowley Road
Cambridge, CB4 0WZ, UK                          Tel: +44 (0)1223 692070


**********************************************************************
This email and any files transmitted with it are confidential and
intended solely for the use of the individual or entity to whom they
are addressed. If you have received this email in error please notify
the system manager.

**********************************************************************



Messages sorted by: Reverse Date, Date, Thread, Author