Zsh Mailing List Archive Messages sorted by: Reverse Date, Date, Thread, Author

Re: Cannot paste unicode <0221>, <0234> - <024f>

X-seq: zsh-workers 41037
From: "Jun T." <takimoto-j@xxxxxxxxxxxxxxxxx>
To: zsh-workers@xxxxxxx
Subject: Re: Cannot paste unicode <0221>, <0234> - <024f>
Date: Tue, 2 May 2017 00:52:03 +0900
In-reply-to: <1B66A5C4-6855-4013-93F9-57857BCE0C45@kba.biglobe.ne.jp>
List-help: <mailto:zsh-workers-help@zsh.org>
List-id: Zsh Workers List <zsh-workers.zsh.org>
List-post: <mailto:zsh-workers@zsh.org>
Mailing-list: contact zsh-workers-help@xxxxxxx; run by ezmlm
References: <CGME20170428111102epcas3p1cc3d86dc54fdafd8cd0e613bbaeba69b@epcas3p1.samsung.com> <etPan.59031f8e.515f007c.15fbc@MacMini.local> <20170428124439.73447db2@pwslap01u.europe.root.pri> <etPan.59033168.1190cde7.15fbc@MacMini.local> <20170428141650.7ed174d6@pwslap01u.europe.root.pri> <etPan.5903498f.140e0f76.15fbc@MacMini.local> <20170428154135.2e2b5626@pwslap01u.europe.root.pri> <etPan.5904351e.41a7c4c9.15fbc@MacMini.local> <1B66A5C4-6855-4013-93F9-57857BCE0C45@kba.biglobe.ne.jp>

Here is a quick (maybe too simple) patch.

wcwidth() on MacOSX was broken for combining characters,
but Apple fixed *this* problem a few years ago,
probably in OSX 10.8 (Mavericks). So BROKEN_WCWIDTH is
NOT defined on recent macOS.

In the patch below, I added a test in configure.ac using U+0234
for both wcwidth() and iswprint() (both are broken on macOS;
wcwidth() returns -1 and iswprint() returns 0=false).

As a replacement for the broken iswprint(), I added a very (perhaps
too) simple function wc_isprint(), which returns false only for
those characters for which mk_wcwidth() returns -1, i.e.,
0 <= wc <= 0x1f and 0x7f <= wc <= 0x9f (8-bit control chars).

Another possibility is to use --enable-unicode9 if wcwidth()
and/or iswprint() are broken (--enable-unicode9 works fine
without any additional libraries). There is no iswprint() replacement
in wcwidth9.h, but implementing one would be easy if we could use the
array wcwidth9_nonprint in wcwidth9.h.
# But I must say I couldn't understand the array; for example,
# why is U+00ad not printable while U+2028 is printable?


diff --git a/Src/Zle/zle_refresh.c b/Src/Zle/zle_refresh.c
index 8391739..d0dd1ef 100644
--- a/Src/Zle/zle_refresh.c
+++ b/Src/Zle/zle_refresh.c
@@ -1278,7 +1278,7 @@ zrefresh(void)
 #ifdef __STDC_ISO_10646__
 		 !ZSH_INVALID_WCHAR_TEST(*t) &&
 #endif
-		 iswprint(*t) && (width = WCWIDTH(*t)) > 0) {
+		 WC_ISPRINT(*t) && (width = WCWIDTH(*t)) > 0) {
 	    int ichars;
 	    if (width > rpms.sen - rpms.s) {
 		int started = 0;
@@ -1460,7 +1460,7 @@ zrefresh(void)
 	u = outputline;
 	for (; u < outputline + outll; u++) {
 #ifdef MULTIBYTE_SUPPORT
-	    if (iswprint(*u)) {
+	    if (WC_ISPRINT(*u)) {
 		int width = WCWIDTH(*u);
 		/* Handle wide characters as above */
 		if (width > rpms.sen - rpms.s) {
@@ -2468,7 +2468,7 @@ singlerefresh(ZLE_STRING_T tmpline, int tmpll, int tmpcs)
 	if (tmpline[t0] == ZWC('\t'))
 	    vsiz = (vsiz | 7) + 2;
 #ifdef MULTIBYTE_SUPPORT
-	else if (iswprint(tmpline[t0]) && ((width = WCWIDTH(tmpline[t0])) > 0)) {
+	else if (WC_ISPRINT(tmpline[t0]) && ((width = WCWIDTH(tmpline[t0])) > 0)) {
 	    vsiz += width;
 	    if (isset(COMBININGCHARS) && IS_BASECHAR(tmpline[t0])) {
 		while (t0 < tmpll-1 && IS_COMBINING(tmpline[t0+1]))
@@ -2556,7 +2556,7 @@ singlerefresh(ZLE_STRING_T tmpline, int tmpll, int tmpcs)
 	    vp->atr = all_atr_on | all_atr_off;
 	    vp++;
 #ifdef MULTIBYTE_SUPPORT
-	} else if (iswprint(tmpline[t0]) &&
+	} else if (WC_ISPRINT(tmpline[t0]) &&
 		   (width = WCWIDTH(tmpline[t0])) > 0) {
 	    int ichars;
 	    if (isset(COMBININGCHARS) && IS_BASECHAR(tmpline[t0])) {
diff --git a/Src/compat.c b/Src/compat.c
index a295694..ca9713b 100644
--- a/Src/compat.c
+++ b/Src/compat.c
@@ -1017,3 +1017,20 @@ isprint_ascii(int c)
 
 /**/
 #endif /* __APPLE__ && BROKEN_ISPRINT */
+
+/**/
+#if defined(__APPLE__) && defined(BROKEN_ISWPRINT)
+
+/**/
+int
+wc_isprint(wint_t ucs)
+{
+    if (ucs <= 0)
+	return 0;
+    if (ucs < 32 || (ucs >= 0x7f && ucs < 0xa0))
+	return 0;
+    return 1;
+}
+
+/**/
+#endif /* __APPLE__ && BROKEN_ISWPRINT */
diff --git a/Src/pattern.c b/Src/pattern.c
index 75db016..fc7c737 100644
--- a/Src/pattern.c
+++ b/Src/pattern.c
@@ -3625,7 +3625,7 @@ mb_patmatchrange(char *range, wchar_t ch, int zmb_ind, wint_t *indptr, int *mtp)
 		    return 1;
 		break;
 	    case PP_PRINT:
-		if (iswprint(ch))
+		if (WC_ISPRINT(ch))
 		    return 1;
 		break;
 	    case PP_PUNCT:
diff --git a/Src/utils.c b/Src/utils.c
index ea4b34b..8aceb79 100644
--- a/Src/utils.c
+++ b/Src/utils.c
@@ -629,7 +629,7 @@ wcs_nicechar_sel(wchar_t c, size_t *widthp, char **swidep, int quotable)
     }
 
     s = buf;
-    if (!iswprint(c) && (c < 0x80 || !isset(PRINTEIGHTBIT))) {
+    if (!WC_ISPRINT(c) && (c < 0x80 || !isset(PRINTEIGHTBIT))) {
 	if (c == 0x7f) {
 	    if (quotable) {
 		*s++ = '\\';
@@ -734,7 +734,7 @@ wcs_nicechar(wchar_t c, size_t *widthp, char **swidep)
 /**/
 mod_export int is_wcs_nicechar(wchar_t c)
 {
-    if (!iswprint(c) && (c < 0x80 || !isset(PRINTEIGHTBIT))) {
+    if (!WC_ISPRINT(c) && (c < 0x80 || !isset(PRINTEIGHTBIT))) {
 	if (c == 0x7f || c == L'\n' || c == L'\t' || c < 0x20)
 	    return 1;
 	if (c >= 0x80) {
diff --git a/Src/ztype.h b/Src/ztype.h
index 76589b1..a8f5fe5 100644
--- a/Src/ztype.h
+++ b/Src/ztype.h
@@ -72,7 +72,11 @@
 
 #ifdef MULTIBYTE_SUPPORT
 #define WC_ZISTYPE(X,Y) wcsitype((X),(Y))
-#define WC_ISPRINT(X)	iswprint(X)
+# if defined(__APPLE__) && defined(BROKEN_ISWPRINT)
+#  define WC_ISPRINT(X)	wc_isprint(X)
+# else
+#  define WC_ISPRINT(X)	iswprint(X)
+# endif
 #else
 #define WC_ZISTYPE(X,Y)	zistype((X),(Y))
 #define WC_ISPRINT(X)	isprint(X)
diff --git a/configure.ac b/configure.ac
index 911cc45..d2f418d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2591,14 +2591,18 @@ fi])
 
 AH_TEMPLATE([BROKEN_WCWIDTH],
 [Define to 1 if the wcwidth() function is present but broken.])
+AH_TEMPLATE([BROKEN_ISWPRINT],
+[Define to 1 if the iswprint() function is present but broken.])
 AH_TEMPLATE([BROKEN_ISPRINT],
 [Define to 1 if the isprint() function is broken under UTF-8 locale.])
 if test x$zsh_cv_c_unicode_support = xyes; then
   AC_DEFINE(MULTIBYTE_SUPPORT)
 
-  dnl Test for a wcwidth() implementation that gives the wrong width for
-  dnl zero-width combining characters.
-  dnl For the test we use a combining acute accent (\u0301).
+  dnl Test for a wcwidth() implementation that gives the wrong width for either
+  dnl   zero-width combining characters, or
+  dnl   some characters in the Latin Extended-B.
+  dnl For the test we use a combining acute accent (\u0301) or
+  dnl a LATIN SMALL LETTER L WITH CURL (\u0234).
   dnl We input it as UTF-8 since that is the standard we can rely
   dnl upon most:  we can't rely on a wchar_t being stored as a
   dnl Unicode code point on all systems.
@@ -2607,9 +2611,8 @@ if test x$zsh_cv_c_unicode_support = xyes; then
   dnl - the programme compiled, linked and ran
   dnl - we successfully set a UTF-8 locale
   dnl - the locale we set plausibly converted the UTF-8 string
-  dnl   for a zero-width combining character (the only way to be
-  dnl   100% sure would be to output it and ask if it looked right)
-  dnl - the converted wide character gave a non-zero width.
+  dnl   into the correct wide character
+  dnl - but the converted wide character gave a wrong width.
   dnl locale -a is a fallback; on most systems we should find en_US.UTF-8.
   [locale_prog='char *my_locales[] = {
   "en_US.UTF-8", "en_GB.UTF-8", "en.UTF-8", '
@@ -2625,17 +2628,19 @@ if test x$zsh_cv_c_unicode_support = xyes; then
   int main() {
     char **localep;
     char comb_acute_mb[] = { (char)0xcc, (char)0x81 };
+    char u_0234[] = { (char)0xc8, (char)0xb4 };
     wchar_t wc;
 
     for (localep = my_locales; *localep; localep++)
-      if (setlocale(LC_ALL, *localep) &&
-          mbtowc(&wc, comb_acute_mb, 2) == 2)
+      if (setlocale(LC_ALL, *localep))
 	  break;
     if (!*localep)
       return 1;
-    if (wcwidth(wc) == 0)
-      return 1;
-    return 0;
+    if (mbtowc(&wc, comb_acute_mb, 2) == 2 && wcwidth(wc) != 0)
+      return 0;
+    if (mbtowc(&wc, u_0234, 2) == 2 && wcwidth(wc) != 1)
+      return 0;
+    return 1;
   }
   "]
 
@@ -2649,6 +2654,43 @@ if test x$zsh_cv_c_unicode_support = xyes; then
     AC_DEFINE(BROKEN_WCWIDTH)
   fi
 
+  dnl Check if iswprint() is broken.
+  [locale_prog='char *my_locales[] = {
+  "en_US.UTF-8", "en_GB.UTF-8", "en.UTF-8", '
+  locale_prog="$locale_prog"`locale -a 2>/dev/null | \
+    sed -e 's/utf8/UTF-8/' | grep UTF-8 | \
+    while read line; do echo " \"$line\","; done;`
+  locale_prog="$locale_prog 0 };
+  #include <stdlib.h>
+  #include <locale.h>
+  #include <wchar.h>
+  #include <wctype.h>
+
+  int main() {
+    char **localep;
+    char u_0234[] = { (char)0xc8, (char)0xb4 };
+    wchar_t wc;
+    for (localep = my_locales; *localep; localep++)
+      if (setlocale(LC_ALL, *localep))
+	break;
+    if (!*localep)
+      return 1;
+    if (mbtowc(&wc, u_0234, 2) == 2 && !iswprint(wc))
+      return 0;
+    return 1;
+  }
+  "]
+
+  AC_CACHE_CHECK(if the iswprint() function is broken,
+  zsh_cv_c_broken_iswprint,
+  [AC_TRY_RUN([$locale_prog],
+  zsh_cv_c_broken_iswprint=yes,
+  zsh_cv_c_broken_iswprint=no,
+  zsh_cv_c_broken_iswprint=no)])
+  if test x$zsh_cv_c_broken_iswprint = xyes; then
+    AC_DEFINE(BROKEN_ISWPRINT)
+  fi
+
   dnl Check if isprint() behaves correctly under UTF-8 locale.
   dnl On some platform (maybe only on Mac OS X), isprint() returns
   dnl true for all characters in the range from 0xa0 to 0xff if

Follow-Ups:
- Re: Cannot paste unicode <0221>, <0234> - <024f>
  - From: Jun T.
- Re: Cannot paste unicode <0221>, <0234> - <024f>
  - From: Sebastian Gniazdowski
- Re: Cannot paste unicode <0221>, <0234> - <024f>
  - From: Sebastian Gniazdowski
- Re: Cannot paste unicode <0221>, <0234> - <024f>
  - From: Peter Stephenson

References:
- Cannot paste unicode <0221>, <0234> - <024f>
  - From: Sebastian Gniazdowski
- Re: Cannot paste unicode <0221>, <0234> - <024f>
  - From: Peter Stephenson
- Re: Cannot paste unicode <0221>, <0234> - <024f>
  - From: Sebastian Gniazdowski
- Re: Cannot paste unicode <0221>, <0234> - <024f>
  - From: Peter Stephenson
- Re: Cannot paste unicode <0221>, <0234> - <024f>
  - From: Sebastian Gniazdowski
- Re: Cannot paste unicode <0221>, <0234> - <024f>
  - From: Peter Stephenson
- Re: Cannot paste unicode <0221>, <0234> - <024f>
  - From: Sebastian Gniazdowski
- Re: Cannot paste unicode <0221>, <0234> - <024f>
  - From: Jun T.

Messages sorted by: Reverse Date, Date, Thread, Author