Zsh Mailing List Archive
Messages sorted by: Reverse Date, Date, Thread, Author

PATCH: multibyte $WORDCHARS



This fully fixes up WORDCHARS to use multibyte characters, with the
single exception of the [[:WORD:]] test --- I don't want to mess around
with the pattern code until we start work properly on the main shell.

I've rewritten the existing code involving wordtriggers() and
zle_wordchars, which were unused --- in fact I'd forgotten they were there.
(For that reason I tend to find it's better not to add to the support
routines until we know what we're doing with them.)

I've also improved support for testing identifiers.  This isn't quite so
good: since the main shell has no idea about multibyte characters, there
didn't seem any point allowing non-ASCII characters to be identified as
identifiers, so only ASCII characters are allowed in zle, too.

You'll also see I've removed a chunk that made bytes from 160 to 255
behave as if they were alphabet characters when ZLE_UNICODE_SUPPORT is
defined.  This chunk obviously isn't appropriate if we're trying to do
things properly.

Index: Src/init.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/init.c,v
retrieving revision 1.58
diff -u -r1.58 init.c
--- Src/init.c	9 Sep 2005 16:06:48 -0000	1.58
+++ Src/init.c	20 Sep 2005 15:01:57 -0000
@@ -1180,9 +1180,6 @@
 #endif /* !LINKED_XMOD_zshQszle */
 
 /**/
-mod_export ZleVoidFn wordcharstriggerptr = noop_function;
-
-/**/
 unsigned char *
 autoload_zleread(char **lp, char **rp, int ha, int con)
 {
Index: Src/params.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/params.c,v
retrieving revision 1.103
diff -u -r1.103 params.c
--- Src/params.c	9 Sep 2005 16:06:48 -0000	1.103
+++ Src/params.c	20 Sep 2005 15:01:58 -0000
@@ -3346,7 +3346,6 @@
     zsfree(wordchars);
     wordchars = x;
     inittyptab();
-    wordcharstriggerptr();
 }
 
 /* Function to get value for special parameter `_' */
Index: Src/pattern.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/pattern.c,v
retrieving revision 1.27
diff -u -r1.27 pattern.c
--- Src/pattern.c	9 May 2005 10:46:15 -0000	1.27
+++ Src/pattern.c	20 Sep 2005 15:01:58 -0000
@@ -2749,6 +2749,10 @@
 		    return 1;
 		break;
 	    case PP_WORD:
+		/*
+		 * HERE: when we support multibyte characters,
+		 * this test needs to be wcsiword().
+		 */
 		if (iword(ch))
 		    return 1;
 		break;
Index: Src/utils.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/utils.c,v
retrieving revision 1.90
diff -u -r1.90 utils.c
--- Src/utils.c	17 Sep 2005 18:23:50 -0000	1.90
+++ Src/utils.c	20 Sep 2005 15:01:58 -0000
@@ -35,6 +35,16 @@
 /**/
 char *scriptname;
 
+#ifdef ZLE_UNICODE_SUPPORT
+/*
+ * The wordchars variable turned into a wide character array.
+ * This is much more convenient for testing.
+ */
+
+/**/
+mod_export wchar_t *wordchars_wide;
+#endif
+
 /* Print an error */
  
 /**/
@@ -2456,8 +2466,18 @@
 	typtab[t0] = IDIGIT | IALNUM | IWORD | IIDENT | IUSER;
     for (t0 = 'a'; t0 <= 'z'; t0++)
 	typtab[t0] = typtab[t0 - 'a' + 'A'] = IALPHA | IALNUM | IIDENT | IUSER | IWORD;
+#ifndef ZLE_UNICODE_SUPPORT
+    /*
+     * This really doesn't seem to me the right thing to do when
+     * we have multibyte character support...  it was a hack to assume
+     * eight bit characters `worked' for some values of work before
+     * we could test for them properly.  I'm not 100% convinced
+     * having IIDENT here is a good idea at all, but this code
+     * should disappear into history...
+     */
     for (t0 = 0240; t0 != 0400; t0++)
 	typtab[t0] = IALPHA | IALNUM | IIDENT | IUSER | IWORD;
+#endif
     typtab['_'] = IIDENT | IUSER;
     typtab['-'] = IUSER;
     typtab[' '] |= IBLANK | INBLANK;
@@ -2477,8 +2497,44 @@
 	}
 	typtab[STOUC(*s == Meta ? *++s ^ 32 : *s)] |= ISEP;
     }
-    for (s = wordchars ? wordchars : DEFAULT_WORDCHARS; *s; s++)
-	typtab[STOUC(*s == Meta ? *++s ^ 32 : *s)] |= IWORD;
+#ifdef ZLE_UNICODE_SUPPORT
+    if (wordchars) {
+	const char *wordchars_ptr = wordchars;
+	mbstate_t mbs;
+	int nchars;
+
+	memset(&mbs, 0, sizeof(mbs));
+	wordchars_wide = (wchar_t *)
+	    zrealloc(wordchars_wide, (strlen(wordchars)+1)*sizeof(wchar_t));
+	nchars = mbsrtowcs(wordchars_wide, &wordchars_ptr, strlen(wordchars),
+			   &mbs);
+	if (nchars == -1) {
+	    /* Conversion state is undefined: better just set to null */
+	    *wordchars_wide = L'\0';
+	} else {
+	    wordchars_wide[nchars] = L'\0';
+	}
+    } else {
+	wordchars_wide = zrealloc(wordchars_wide, sizeof(wchar_t));
+	*wordchars_wide = L'\0';
+    }
+#endif
+    for (s = wordchars ? wordchars : DEFAULT_WORDCHARS; *s; s++) {
+	int c = STOUC(*s == Meta ? *++s ^ 32 : *s);
+#ifdef ZLE_UNICODE_SUPPORT
+	if (!isascii(c)) {
+	    /*
+	     * If we have support for multibyte characters, we don't
+	     * handle non-ASCII characters here; instead, we turn
+	     * wordchars into a wide character array.
+	     * (We may actually have a single-byte 8-bit character set,
+	     * but it works the same way.)
+	     */
+	    continue;
+	}
+#endif
+	typtab[c] |= IWORD;
+    }
     for (s = SPECCHARS; *s; s++)
 	typtab[STOUC(*s)] |= ISPECIAL;
     if (isset(BANGHIST) && bangchar && interact && isset(SHINSTDIN))
@@ -2503,9 +2559,6 @@
      * produces an ASCII character.  If it does, use iword on that.
      * If it doesn't, use iswalnum on the original character.  This
      * is pretty good most of the time.
-     *
-     * TODO: extend WORDCHARS to handle multibyte chars by some kind
-     * of hierarchical list or hash table.
      */
     len = wctomb(outstr, c);
 
@@ -2515,7 +2568,40 @@
     } else if (len == 1 && isascii(*outstr)) {
 	return iword(*outstr);
     } else {
-	return iswalnum(c);
+	return iswalnum(c) || wcschr(wordchars_wide, c);
+    }
+}
+
+/*
+ * iident() macro extended to support wide characters.
+ *
+ * The macro is intended to test if a character is allowed in an
+ * internal zsh identifier.  Until the main shell handles multibyte
+ * characters it's not a good idea to allow characters other than
+ * ASCII characters; it would cause zle to allow characters that
+ * the main shell would reject.  Eventually we should be able
+ * to allow all alphanumerics.
+ *
+ * Otherwise similar to wcsiword.
+ */
+
+/**/
+mod_export int
+wcsiident(wchar_t c)
+{
+    int len;
+    VARARR(char, outstr, MB_CUR_MAX);
+
+    len = wctomb(outstr, c);
+
+    if (len == 0) {
+	/* NULL is special */
+	return 0;
+    } else if (len == 1 && isascii(*outstr)) {
+	return iword(*outstr);
+    } else {
+	/* not currently allowed, see above */
+	return 0;
     }
 }
 #endif
Index: Src/Zle/zle.h
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle.h,v
retrieving revision 1.18
diff -u -r1.18 zle.h
--- Src/Zle/zle.h	9 Sep 2005 20:34:43 -0000	1.18
+++ Src/Zle/zle.h	20 Sep 2005 15:01:58 -0000
@@ -66,12 +66,7 @@
 
 #define ZC_iblank iswspace
 #define ZC_icntrl iswcntrl
-/*
- * TODO: doesn't work on arguments with side effects.
- * Also YUK.  Not even sure this is guaranteed to work.
- * Should be easy to do along the lines of wcsiword.
- */
-#define ZC_iident(x)	(x < 256 && iident((int)x))
+#define ZC_iident wcsiident
 
 #define ZC_tolower towlower
 #define ZC_toupper towupper
Index: Src/Zle/zle_main.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/zle_main.c,v
retrieving revision 1.75
diff -u -r1.75 zle_main.c
--- Src/Zle/zle_main.c	9 Sep 2005 16:55:21 -0000	1.75
+++ Src/Zle/zle_main.c	20 Sep 2005 15:01:58 -0000
@@ -106,11 +106,6 @@
 /**/
 mod_export int
 lastchar_wide_valid;
-
-/**/
-mod_export ZLE_STRING_T zle_wordchars;
-#else
-# define zle_wordchars wordchars;
 #endif
 
 /* the bindings for the previous and for this key */
@@ -1558,17 +1553,6 @@
 	kungetct = 0;
 }
 
-/**/
-mod_export void
-wordcharstrigger(void)
-{
-#ifdef ZLE_UNICODE_SUPPORT
-    zrealloc(zle_wordchars, strlen(wordchars)*MB_CUR_MAX);
-    mbsrtowcs(zle_wordchars, (const char **)&wordchars,
-	      strlen(wordchars), NULL);
-    /* TODO: error handling here */
-#endif
-}
 
 /* Hook functions. Used to allow access to zle parameters if zle is
  * active. */
@@ -1636,8 +1620,6 @@
     kungetbuf = (char *) zalloc(kungetsz = 32);
     comprecursive = 0;
     rdstrs = NULL;
-    wordcharstriggerptr = wordcharstrigger;
-    wordcharstrigger();
 
     /* initialise the keymap system */
     init_keymaps();
@@ -1712,7 +1694,6 @@
     zlegetlineptr = NULL;
     zlereadptr = fallback_zleread;
     zlesetkeymapptr= noop_function_int;
-    wordcharstriggerptr = noop_function;
 
     getkeyptr = NULL;
 


-- 
Peter Stephenson <pws@xxxxxxx>                  Software Engineer
CSR PLC, Churchill House, Cambridge Business Park, Cowley Road
Cambridge, CB4 0WZ, UK                          Tel: +44 (0)1223 692070


**********************************************************************
This email and any files transmitted with it are confidential and
intended solely for the use of the individual or entity to whom they
are addressed. If you have received this email in error please notify
the system manager.

**********************************************************************



Messages sorted by: Reverse Date, Date, Thread, Author