Zsh Mailing List Archive
Messages sorted by: Reverse Date, Date, Thread, Author

PATCH: self-insert key binding for multibyte sequences



This looks like it goes a long way to resolving the remaining problems
with dealing consistently with bindings for multibyte sequences.

What I haven't looked at is how much the remaining uses of getrestchar()
that work at a level higher than the key buffer need tweaking or are
needed at all.  If I do work it out, getrestchar_keybuf() may well
disappear inside zle_keymap.c as a static.

(UTF-8 in the following.  Not crucial to description.)

Top tip: on my GB keyboard, which isn't far from a basic English
language one, AltGr + i generates a right arrow, →, which has a
character in it that needs Meta.  This is useful for testing: e.g. I'd
forgotten that keybuf, despite having an explicit length, was already
metafied.  In X03zlebindkey I've continued to use Mikael's Special
Character for meta effects since it looks nice.

pws

diff --git a/Src/Zle/zle_keymap.c b/Src/Zle/zle_keymap.c
index d6d116b..382eb8d 100644
--- a/Src/Zle/zle_keymap.c
+++ b/Src/Zle/zle_keymap.c
@@ -1449,6 +1449,104 @@ default_bindings(void)
 /*************************/
 /* reading key sequences */
 /*************************/
+/**/
+#ifdef MULTIBYTE_SUPPORT
+/*
+ * Get the remainder of a character if we support multibyte
+ * input strings.  It may not require any more input, but
+ * we haven't yet checked.  What's read in so far is available
+ * in keybuf; if we read more we will top keybuf up.
+ *
+ * This version is used when we are still resolving the input key stream
+ * into bindings.  Once that has been done this function shouldn't be
+ * used: instead, see getrestchar() in zle_main.c.
+ *
+ * This supports a self-insert binding at any stage of a key sequence.
+ * Typically we handle 8-bit characters by having only the first byte
+ * bound to self insert; then we immediately get here and read in as
+ * many further bytes as necessary.  However, it's possible that any set
+ * of bytes up to full character is bound to self-insert; then we get
+ * here later and read as much as possible, which could be a complete
+ * character, from keybuf before attempting further input.
+ *
+ * At the end of the process, the full multibyte character is available
+ * in keybuf, so the return value may be superfluous.
+ */
+
+/**/
+mod_export ZLE_INT_T
+getrestchar_keybuf(void)
+{
+    char c;
+    wchar_t outchar;
+    int inchar, timeout, bufind = 0, buflen = keybuflen;
+    static mbstate_t mbs;
+    size_t cnt;
+
+    /*
+     * We are guaranteed to set a valid wide last character,
+     * although it may be WEOF (which is technically not
+     * a wide character at all...)
+     */
+    lastchar_wide_valid = 1;
+    memset(&mbs, 0, sizeof mbs);
+
+    /*
+     * Return may be zero if we have a NULL; handle this like
+     * any other character.
+     */
+    while (1) {
+	if (bufind < buflen) {
+	    c = STOUC(keybuf[bufind++]);
+	    if (c == Meta) {
+		DPUTS(bufind == buflen, "Meta at end of keybuf");
+		c = STOUC(keybuf[bufind++]) ^ 32;
+	    }
+	} else {
+	    /*
+	     * Always apply KEYTIMEOUT to the remains of the input
+	     * character.  The parts of a multibyte character should
+	     * arrive together.  If we don't do this the input can
+	     * get stuck if an invalid byte sequence arrives.
+	     */
+	    inchar = getbyte(1L, &timeout);
+	    /* getbyte deliberately resets lastchar_wide_valid */
+	    lastchar_wide_valid = 1;
+	    if (inchar == EOF) {
+		memset(&mbs, 0, sizeof mbs);
+		if (timeout)
+		{
+		    /*
+		     * This case means that we got a valid initial byte
+		     * (since we tested for EOF above), but the followup
+		     * timed out.  This probably indicates a duff character.
+		     * Return a '?'.
+		     */
+		    lastchar = '?';
+		    return lastchar_wide = L'?';
+		}
+		else
+		    return lastchar_wide = WEOF;
+	    }
+	    c = inchar;
+	    addkeybuf(inchar);
+	}
+
+	cnt = mbrtowc(&outchar, &c, 1, &mbs);
+	if (cnt == MB_INVALID) {
+	    /*
+	     * Invalid input.  Hmm, what's the right thing to do here?
+	     */
+	    memset(&mbs, 0, sizeof mbs);
+	    return lastchar_wide = WEOF;
+	}
+	if (cnt != MB_INCOMPLETE)
+	    break;
+    }
+    return lastchar_wide = (ZLE_INT_T)outchar;
+}
+/**/
+#endif
 
 /* read a sequence of keys that is bound to some command in a keymap */
 
@@ -1504,15 +1602,8 @@ getkeymapcmd(Keymap km, Thingy *funcp, char **strp)
 #ifdef MULTIBYTE_SUPPORT
 	    if ((f == Th(z_selfinsert) || f == Th(z_selfinsertunmeta)) &&
 		!lastchar_wide_valid && !ispfx) {
-		int len;
-		VARARR(char, mbc, MB_CUR_MAX);
-		ZLE_INT_T inchar = getrestchar(lastchar, mbc, &len);
-		if (inchar != WEOF && len) {
-		    char *ptr = mbc;
-		    while (len--)
-			addkeybuf(STOUC(*ptr++));
-		    lastlen = keybuflen;
-		}
+		(void)getrestchar_keybuf();
+		lastlen = keybuflen;
 	    }
 #endif
 	}
diff --git a/Test/X03zlebindkey.ztst b/Test/X03zlebindkey.ztst
index 70c42f9..e6fead5 100644
--- a/Test/X03zlebindkey.ztst
+++ b/Test/X03zlebindkey.ztst
@@ -1,13 +1,12 @@
 # Tests of the vi mode of ZLE
 
 %prep
-  mb_ok=
+  ZSH_TEST_LANG=
   langs=(en_{US,GB}.{UTF-,utf}8 en.UTF-8
 	 $(locale -a 2>/dev/null | egrep 'utf8|UTF-8'))
   for LANG in $langs; do
     if [[ é = ? ]]; then
       ZSH_TEST_LANG=$LANG
-      mb_ok=1
       break;
     fi
   done
@@ -29,7 +28,7 @@
 >BUFFER: foo
 >CURSOR: 3
 
-  if [[ -z $mb_ok ]]; then
+  if [[ -z $ZSH_TEST_LANG ]]; then
     ZTST_skip="bindkey multibyte test skipped"
   else
     zpty_run 'alias unbind="bindkey -r ホ"'
@@ -37,9 +36,17 @@
     zletest 'ホ'
     zpty_run unbind
     zletest 'ホ'
+    zpty_run 'bindkey ホ self-insert'
+    zletest 'ホ'
+    zpty_run unbind
+    zletest 'ホ'
   fi
 0:bindkey -s multibyte characters
 >BUFFER: bar
 >CURSOR: 3
 >BUFFER: ホ
 >CURSOR: 1
+>BUFFER: ホ
+>CURSOR: 1
+>BUFFER: ホ
+>CURSOR: 1



Messages sorted by: Reverse Date, Date, Thread, Author