Zsh Mailing List Archive
Messages sorted by:
Reverse Date,
Date,
Thread,
Author
Re: Cannot paste unicode <0221>, <0234> - <024f>
Here is a quick (maybe too simple) patch.
wcwidth() on MacOSX was broken for combining characters,
but Apple fixed *this* problem a few years ago,
probably in OSX 10.8 (Mavericks). So BROKEN_WCWIDTH is
NOT defined on recent macOS.
In the patch below, I added a test in configure.ac using U+0234
for both wcwidth() and iswprint() (both are broken on macOS for this
character: wcwidth() returns -1 and iswprint() returns 0, i.e. false).
As a replacement for the broken iswprint(), I added a very (perhaps
too) simple function wc_isprint(), which returns false only for
those characters for which mk_wcwidth() returns -1, i.e.,
0 <= wc <= 0x1f or 0x7f <= wc <= 0x9f (8-bit control chars).
Another possibility is to use --enable-unicode9 if wcwidth()
and/or iswprint() are broken (--enable-unicode9 works fine
without any additional libraries). There is no iswprint() replacement
in wcwidth9.h, but implementing one would be easy if we could use the
array wcwidth9_nonprint in wcwidth9.h.
# But I must say I couldn't understand the array; for example,
# why is U+00ad not printable while U+2028 is printable?
diff --git a/Src/Zle/zle_refresh.c b/Src/Zle/zle_refresh.c
index 8391739..d0dd1ef 100644
--- a/Src/Zle/zle_refresh.c
+++ b/Src/Zle/zle_refresh.c
@@ -1278,7 +1278,7 @@ zrefresh(void)
#ifdef __STDC_ISO_10646__
!ZSH_INVALID_WCHAR_TEST(*t) &&
#endif
- iswprint(*t) && (width = WCWIDTH(*t)) > 0) {
+ WC_ISPRINT(*t) && (width = WCWIDTH(*t)) > 0) {
int ichars;
if (width > rpms.sen - rpms.s) {
int started = 0;
@@ -1460,7 +1460,7 @@ zrefresh(void)
u = outputline;
for (; u < outputline + outll; u++) {
#ifdef MULTIBYTE_SUPPORT
- if (iswprint(*u)) {
+ if (WC_ISPRINT(*u)) {
int width = WCWIDTH(*u);
/* Handle wide characters as above */
if (width > rpms.sen - rpms.s) {
@@ -2468,7 +2468,7 @@ singlerefresh(ZLE_STRING_T tmpline, int tmpll, int tmpcs)
if (tmpline[t0] == ZWC('\t'))
vsiz = (vsiz | 7) + 2;
#ifdef MULTIBYTE_SUPPORT
- else if (iswprint(tmpline[t0]) && ((width = WCWIDTH(tmpline[t0])) > 0)) {
+ else if (WC_ISPRINT(tmpline[t0]) && ((width = WCWIDTH(tmpline[t0])) > 0)) {
vsiz += width;
if (isset(COMBININGCHARS) && IS_BASECHAR(tmpline[t0])) {
while (t0 < tmpll-1 && IS_COMBINING(tmpline[t0+1]))
@@ -2556,7 +2556,7 @@ singlerefresh(ZLE_STRING_T tmpline, int tmpll, int tmpcs)
vp->atr = all_atr_on | all_atr_off;
vp++;
#ifdef MULTIBYTE_SUPPORT
- } else if (iswprint(tmpline[t0]) &&
+ } else if (WC_ISPRINT(tmpline[t0]) &&
(width = WCWIDTH(tmpline[t0])) > 0) {
int ichars;
if (isset(COMBININGCHARS) && IS_BASECHAR(tmpline[t0])) {
diff --git a/Src/compat.c b/Src/compat.c
index a295694..ca9713b 100644
--- a/Src/compat.c
+++ b/Src/compat.c
@@ -1017,3 +1017,20 @@ isprint_ascii(int c)
/**/
#endif /* __APPLE__ && BROKEN_ISPRINT */
+
+/**/
+#if defined(__APPLE__) && defined(BROKEN_ISWPRINT)
+
+/**/
+int
+wc_isprint(wint_t ucs)
+{
+ if (ucs <= 0)
+ return 0;
+ if (ucs < 32 || (ucs >= 0x7f && ucs < 0xa0))
+ return 0;
+ return 1;
+}
+
+/**/
+#endif /* __APPLE__ && BROKEN_ISWPRINT */
diff --git a/Src/pattern.c b/Src/pattern.c
index 75db016..fc7c737 100644
--- a/Src/pattern.c
+++ b/Src/pattern.c
@@ -3625,7 +3625,7 @@ mb_patmatchrange(char *range, wchar_t ch, int zmb_ind, wint_t *indptr, int *mtp)
return 1;
break;
case PP_PRINT:
- if (iswprint(ch))
+ if (WC_ISPRINT(ch))
return 1;
break;
case PP_PUNCT:
diff --git a/Src/utils.c b/Src/utils.c
index ea4b34b..8aceb79 100644
--- a/Src/utils.c
+++ b/Src/utils.c
@@ -629,7 +629,7 @@ wcs_nicechar_sel(wchar_t c, size_t *widthp, char **swidep, int quotable)
}
s = buf;
- if (!iswprint(c) && (c < 0x80 || !isset(PRINTEIGHTBIT))) {
+ if (!WC_ISPRINT(c) && (c < 0x80 || !isset(PRINTEIGHTBIT))) {
if (c == 0x7f) {
if (quotable) {
*s++ = '\\';
@@ -734,7 +734,7 @@ wcs_nicechar(wchar_t c, size_t *widthp, char **swidep)
/**/
mod_export int is_wcs_nicechar(wchar_t c)
{
- if (!iswprint(c) && (c < 0x80 || !isset(PRINTEIGHTBIT))) {
+ if (!WC_ISPRINT(c) && (c < 0x80 || !isset(PRINTEIGHTBIT))) {
if (c == 0x7f || c == L'\n' || c == L'\t' || c < 0x20)
return 1;
if (c >= 0x80) {
diff --git a/Src/ztype.h b/Src/ztype.h
index 76589b1..a8f5fe5 100644
--- a/Src/ztype.h
+++ b/Src/ztype.h
@@ -72,7 +72,11 @@
#ifdef MULTIBYTE_SUPPORT
#define WC_ZISTYPE(X,Y) wcsitype((X),(Y))
-#define WC_ISPRINT(X) iswprint(X)
+# if defined(__APPLE__) && defined(BROKEN_ISWPRINT)
+# define WC_ISPRINT(X) wc_isprint(X)
+# else
+# define WC_ISPRINT(X) iswprint(X)
+# endif
#else
#define WC_ZISTYPE(X,Y) zistype((X),(Y))
#define WC_ISPRINT(X) isprint(X)
diff --git a/configure.ac b/configure.ac
index 911cc45..d2f418d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2591,14 +2591,18 @@ fi])
AH_TEMPLATE([BROKEN_WCWIDTH],
[Define to 1 if the wcwidth() function is present but broken.])
+AH_TEMPLATE([BROKEN_ISWPRINT],
+[Define to 1 if the iswprint() function is present but broken.])
AH_TEMPLATE([BROKEN_ISPRINT],
[Define to 1 if the isprint() function is broken under UTF-8 locale.])
if test x$zsh_cv_c_unicode_support = xyes; then
AC_DEFINE(MULTIBYTE_SUPPORT)
- dnl Test for a wcwidth() implementation that gives the wrong width for
- dnl zero-width combining characters.
- dnl For the test we use a combining acute accent (\u0301).
+ dnl Test for a wcwidth() implementation that gives the wrong width for either
+ dnl zero-width combining characters, or
+ dnl some characters in the Latin Extended-B.
+ dnl For the test we use a combining acute accent (\u0301) or
+ dnl a LATIN SMALL LETTER L WITH CURL (\u0234).
dnl We input it as UTF-8 since that is the standard we can rely
dnl upon most: we can't rely on a wchar_t being stored as a
dnl Unicode code point on all systems.
@@ -2607,9 +2611,8 @@ if test x$zsh_cv_c_unicode_support = xyes; then
dnl - the programme compiled, linked and ran
dnl - we successfully set a UTF-8 locale
dnl - the locale we set plausibly converted the UTF-8 string
- dnl for a zero-width combining character (the only way to be
- dnl 100% sure would be to output it and ask if it looked right)
- dnl - the converted wide character gave a non-zero width.
+ dnl into the correct wide character
+ dnl - but the converted wide character gave a wrong width.
dnl locale -a is a fallback; on most systems we should find en_US.UTF-8.
[locale_prog='char *my_locales[] = {
"en_US.UTF-8", "en_GB.UTF-8", "en.UTF-8", '
@@ -2625,17 +2628,19 @@ if test x$zsh_cv_c_unicode_support = xyes; then
int main() {
char **localep;
char comb_acute_mb[] = { (char)0xcc, (char)0x81 };
+ char u_0234[] = { (char)0xc8, (char)0xb4 };
wchar_t wc;
for (localep = my_locales; *localep; localep++)
- if (setlocale(LC_ALL, *localep) &&
- mbtowc(&wc, comb_acute_mb, 2) == 2)
+ if (setlocale(LC_ALL, *localep))
break;
if (!*localep)
return 1;
- if (wcwidth(wc) == 0)
- return 1;
- return 0;
+ if (mbtowc(&wc, comb_acute_mb, 2) == 2 && wcwidth(wc) != 0)
+ return 0;
+ if (mbtowc(&wc, u_0234, 2) == 2 && wcwidth(wc) != 1)
+ return 0;
+ return 1;
}
"]
@@ -2649,6 +2654,43 @@ if test x$zsh_cv_c_unicode_support = xyes; then
AC_DEFINE(BROKEN_WCWIDTH)
fi
+ dnl Check if iswprint() is broken.
+ [locale_prog='char *my_locales[] = {
+ "en_US.UTF-8", "en_GB.UTF-8", "en.UTF-8", '
+ locale_prog="$locale_prog"`locale -a 2>/dev/null | \
+ sed -e 's/utf8/UTF-8/' | grep UTF-8 | \
+ while read line; do echo " \"$line\","; done;`
+ locale_prog="$locale_prog 0 };
+ #include <stdlib.h>
+ #include <locale.h>
+ #include <wchar.h>
+ #include <wctype.h>
+
+ int main() {
+ char **localep;
+ char u_0234[] = { (char)0xc8, (char)0xb4 };
+ wchar_t wc;
+ for (localep = my_locales; *localep; localep++)
+ if (setlocale(LC_ALL, *localep))
+ break;
+ if (!*localep)
+ return 1;
+ if (mbtowc(&wc, u_0234, 2) == 2 && !iswprint(wc))
+ return 0;
+ return 1;
+ }
+ "]
+
+ AC_CACHE_CHECK(if the iswprint() function is broken,
+ zsh_cv_c_broken_iswprint,
+ [AC_TRY_RUN([$locale_prog],
+ zsh_cv_c_broken_iswprint=yes,
+ zsh_cv_c_broken_iswprint=no,
+ zsh_cv_c_broken_iswprint=no)])
+ if test x$zsh_cv_c_broken_iswprint = xyes; then
+ AC_DEFINE(BROKEN_ISWPRINT)
+ fi
+
dnl Check if isprint() behaves correctly under UTF-8 locale.
dnl On some platform (maybe only on Mac OS X), isprint() returns
dnl true for all characters in the range from 0xa0 to 0xff if
Messages sorted by:
Reverse Date,
Date,
Thread,
Author