Zsh Mailing List Archive
Messages sorted by:
Reverse Date,
Date,
Thread,
Author
Re: (LC_ALL=C; set -x 128 129; printf "%s\n" ${(#)@} | hexdump -C)
> 2023/09/13 18:59, Jun T <takimoto-j@xxxxxxxxxxxxxxxxx> wrote:
>
> the test fails on FreeBSD,
> DragonFly and NetBSD for out-of-range characters.
(snip)
> This is due to the peculiar behavior of iconv(3). It converts an
> out-of-range character to '?' (0x3f) and returns 1,
This behavior of iconv(3) is explicitly documented in the manpage
and we can't say it's a bug, but anyway I think we should treat
the positive return value of iconv() in the same way as -1.
But simply replacing (utils.c:7046)
if (count == (size_t)-1) {
by
if (count) {
didn't work because of the complication due to errflag/noerrs.
So I moved the conversion code into a new function ucs4tomb().
Two more modifications:
[1] A negative value, such as ${(#X):--1}, is now an error.
[2] If __STDC_ISO_10646__ is not defined, for example on macOS, and
a UTF-8 locale is in use, then ucs4toutf8() is used for the conversion.
This function now accepts only the range 0 - 0x7fff_ffff because
wctomb(3) on Linux (with UTF-8 locale) accepts this range (the old
range of UCS4).
# But now it seems UCS4 is equivalent to UTF-32 and limited to the
# range 0 - 0x10_ffff (and the maximum length of UTF-8 is 4 bytes).
# We can make ucs4toutf8() accept only this range, if that's better.
# This will also make $'\U110000' an error.
BTW, with or without my recent patch, if the (X) flag is not
given and the conversion fails, then the lowest byte of the number
is output as a single-byte character. Is this really useful?
If so, do we need to document it? Or could we just output ""?
diff --git a/Src/subst.c b/Src/subst.c
index dc2052ee0..347b1b8bd 100644
--- a/Src/subst.c
+++ b/Src/subst.c
@@ -1501,16 +1501,15 @@ substevalchar(char *ptr)
return noerrs ? dupstring(""): NULL;
}
errflag |= saved_errflag;
+ if (ires < 0) {
+ zerr("character not in range");
+ }
#ifdef MULTIBYTE_SUPPORT
- if (isset(MULTIBYTE) && ires > 127) {
- /* '\\' + 'U' + 8 bytes of character + '\0' */
- char buf[11];
-
- /* inefficient: should separate out \U handling from getkeystring */
- sprintf(buf, "\\U%.8x", (unsigned int)ires & 0xFFFFFFFFu);
- ptr = getkeystring(buf, &len, GETKEYS_BINDKEY, NULL);
+ else if (isset(MULTIBYTE) && ires > 127) {
+ ptr = zhalloc(MB_CUR_MAX);
+ len = ucs4tomb((unsigned int)ires & 0xffffffff, ptr);
}
- if (len == 0)
+ if (len <= 0)
#endif
{
ptr = zhalloc(2);
diff --git a/Src/utils.c b/Src/utils.c
index 7040d0954..e8d2613b4 100644
--- a/Src/utils.c
+++ b/Src/utils.c
@@ -6671,12 +6671,15 @@ dquotedzputs(char const *s, FILE *stream)
# if defined(HAVE_NL_LANGINFO) && defined(CODESET) && !defined(__STDC_ISO_10646__)
/* Convert a character from UCS4 encoding to UTF-8 */
-
-static size_t
+
+static int
ucs4toutf8(char *dest, unsigned int wval)
{
- size_t len;
+ int len;
+ /* UCS4 is now equivalent to UTF-32 and limited to 0 - 0x10_FFFF.
+ * This function accepts 0 - 0x7FFF_FFFF (old range of UCS4) to be
+ * compatible with wctomb(3) (in UTF-8 locale) on Linux. */
if (wval < 0x80)
len = 1;
else if (wval < 0x800)
@@ -6687,8 +6690,12 @@ ucs4toutf8(char *dest, unsigned int wval)
len = 4;
else if (wval < 0x4000000)
len = 5;
- else
+ else if (wval < 0x80000000)
len = 6;
+ else {
+ zerr("character not in range");
+ return -1;
+ }
switch (len) { /* falls through except to the last case */
case 6: dest[5] = (wval & 0x3f) | 0x80; wval >>= 6;
@@ -6705,30 +6712,89 @@ ucs4toutf8(char *dest, unsigned int wval)
}
#endif
+/* Convert UCS4 to a multibyte character in current locale.
+ * Result is saved in buf (must be at least MB_CUR_MAX bytes long).
+ * Returns the number of bytes saved in buf, or -1 if conversion fails. */
-/*
- * The following only occurs once or twice in the code, but in different
- * places depending how character set conversion is implemented.
- */
-#define CHARSET_FAILED() \
- if (how & GETKEY_DOLLAR_QUOTE) { \
- while ((*tdest++ = *++s)) { \
- if (how & GETKEY_UPDATE_OFFSET) { \
- if (s - sstart > *misc) \
- (*misc)++; \
- } \
- if (*s == Snull) { \
- *len = (s - sstart) + 1; \
- *tdest = '\0'; \
- return buf; \
- } \
- } \
- *len = tdest - buf; \
- return buf; \
- } \
- *t = '\0'; \
- *len = t - buf; \
- return buf
+/**/
+int
+ucs4tomb(unsigned int wval, char *buf)
+{
+#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) && defined(__STDC_ISO_10646__)
+ int count = wctomb(buf, (wchar_t)wval);
+ if (count == -1)
+ zerr("character not in range");
+ return count;
+#else /* !(HAVE_WCHAR_H && HAVE_WCTOMB && __STDC_ISO_10646__) */
+# if defined(HAVE_NL_LANGINFO) && defined(CODESET)
+ if (!strcmp(nl_langinfo(CODESET), "UTF-8")) {
+ return ucs4toutf8(buf, wval);
+ } else {
+# ifdef HAVE_ICONV
+ iconv_t cd;
+ char inbuf[4], *bsave = buf;
+ ICONV_CONST char *inptr = inbuf;
+ size_t inbytes = 4, outbytes = 6;
+ const char *codesetstr = nl_langinfo(CODESET);
+ size_t count;
+ int i;
+
+ /*
+ * If the code set isn't handled, we'd better assume it's US-ASCII
+ * rather than just failing hopelessly. Solaris has a weird habit
+ * of returning 646. This is handled by the native iconv(), but
+ * not by GNU iconv; what's more, some versions of the native iconv
+ * don't handle standard names like ASCII.
+ *
+ * This should only be a problem if there's a mismatch between the
+ * NLS and the iconv in use, which probably only means if libiconv
+ * is in use. We checked at configure time if our libraries pulled
+ * in _libiconv_version, which should be a good test.
+ *
+ * It shouldn't ever be NULL, but while we're being paranoid...
+ */
+# ifdef ICONV_FROM_LIBICONV
+ if (!codesetstr || !*codesetstr)
+ codesetstr = "US-ASCII";
+# endif
+ cd = iconv_open(codesetstr, "UCS-4BE");
+# ifdef ICONV_FROM_LIBICONV
+ if (cd == (iconv_t)-1 && !strcmp(codesetstr, "646")) {
+ codesetstr = "US-ASCII";
+ cd = iconv_open(codesetstr, "UCS-4BE");
+ }
+# endif
+ if (cd == (iconv_t)-1) {
+ zerr("cannot do charset conversion (iconv failed)");
+ return -1;
+ }
+
+ /* store value in big endian form */
+ for (i=3; i>=0; i--) {
+ inbuf[i] = wval & 0xff;
+ wval >>= 8;
+ }
+ count = iconv(cd, &inptr, &inbytes, &buf, &outbytes);
+ iconv_close(cd);
+ if (count) {
+ /* -1 indicates error. Positive value means number of "invalid"
+ * (or "non-reversible") conversions, which we consider as
+ * "out-of-range" characters. */
+ zerr("character not in range");
+ return -1;
+ }
+ return buf - bsave;
+# else /* !HAVE_ICONV */
+ zerr("cannot do charset conversion (iconv not available)");
+ return -1;
+# endif /* HAVE_ICONV */
+ }
+# else /* !(HAVE_NL_LANGINFO && CODESET) */
+ zerr("cannot do charset conversion (NLS not supported)");
+ return -1;
+# endif /* HAVE_NL_LANGINFO && CODESET */
+#endif /* HAVE_WCHAR_H && HAVE_WCTOMB && __STDC_ISO_10646__ */
+}
/*
* Decode a key string, turning it into the literal characters.
@@ -6785,21 +6851,6 @@ getkeystring(char *s, int *len, int how, int *misc)
char *t, *tdest = NULL, *u = NULL, *sstart = s, *tbuf = NULL;
char svchar = '\0';
int meta = 0, control = 0, ignoring = 0;
- int i;
-#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) && defined(__STDC_ISO_10646__)
- wint_t wval;
- int count;
-#else
- unsigned int wval;
-# if defined(HAVE_NL_LANGINFO) && defined(CODESET)
-# if defined(HAVE_ICONV)
- iconv_t cd;
- char inbuf[4];
- size_t inbytes, outbytes;
-# endif
- size_t count;
-# endif
-#endif
DPUTS((how & GETKEY_UPDATE_OFFSET) &&
(how & ~(GETKEYS_DOLLARS_QUOTE|GETKEY_UPDATE_OFFSET)),
@@ -6864,7 +6915,8 @@ getkeystring(char *s, int *len, int how, int *misc)
}
for (; *s; s++) {
if (*s == '\\' && s[1]) {
- int miscadded;
+ int miscadded, count, i;
+ unsigned int wval;
if ((how & GETKEY_UPDATE_OFFSET) && s - sstart < *misc) {
(*misc)--;
miscadded = 1;
@@ -6979,86 +7031,32 @@ getkeystring(char *s, int *len, int how, int *misc)
*misc = wval;
return s+1;
}
-#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) && defined(__STDC_ISO_10646__)
- count = wctomb(t, (wchar_t)wval);
+ count = ucs4tomb(wval, t);
if (count == -1) {
- zerr("character not in range");
- CHARSET_FAILED();
+ if (how & GETKEY_DOLLAR_QUOTE) {
+ while ((*tdest++ = *++s)) {
+ if (how & GETKEY_UPDATE_OFFSET) {
+ if (s - sstart > *misc)
+ (*misc)++;
+ }
+ if (*s == Snull) {
+ *len = (s - sstart) + 1;
+ *tdest = '\0';
+ return buf;
+ }
+ }
+ *len = tdest - buf;
+ }
+ else {
+ *t = '\0';
+ *len = t - buf;
+ }
+ return buf;
}
if ((how & GETKEY_UPDATE_OFFSET) && s - sstart < *misc)
(*misc) += count;
t += count;
-# else
-# if defined(HAVE_NL_LANGINFO) && defined(CODESET)
- if (!strcmp(nl_langinfo(CODESET), "UTF-8")) {
- count = ucs4toutf8(t, wval);
- t += count;
- if ((how & GETKEY_UPDATE_OFFSET) && s - sstart < *misc)
- (*misc) += count;
- } else {
-# ifdef HAVE_ICONV
- ICONV_CONST char *inptr = inbuf;
- const char *codesetstr = nl_langinfo(CODESET);
- inbytes = 4;
- outbytes = 6;
- /* store value in big endian form */
- for (i=3;i>=0;i--) {
- inbuf[i] = wval & 0xff;
- wval >>= 8;
- }
- /*
- * If the code set isn't handled, we'd better
- * assume it's US-ASCII rather than just failing
- * hopelessly. Solaris has a weird habit of
- * returning 646. This is handled by the
- * native iconv(), but not by GNU iconv; what's
- * more, some versions of the native iconv don't
- * handle standard names like ASCII.
- *
- * This should only be a problem if there's a
- * mismatch between the NLS and the iconv in use,
- * which probably only means if libiconv is in use.
- * We checked at configure time if our libraries
- * pulled in _libiconv_version, which should be
- * a good test.
- *
- * It shouldn't ever be NULL, but while we're
- * being paranoid...
- */
-#ifdef ICONV_FROM_LIBICONV
- if (!codesetstr || !*codesetstr)
- codesetstr = "US-ASCII";
-#endif
- cd = iconv_open(codesetstr, "UCS-4BE");
-#ifdef ICONV_FROM_LIBICONV
- if (cd == (iconv_t)-1 && !strcmp(codesetstr, "646")) {
- codesetstr = "US-ASCII";
- cd = iconv_open(codesetstr, "UCS-4BE");
- }
-#endif
- if (cd == (iconv_t)-1) {
- zerr("cannot do charset conversion (iconv failed)");
- CHARSET_FAILED();
- }
- count = iconv(cd, &inptr, &inbytes, &t, &outbytes);
- iconv_close(cd);
- if (count == (size_t)-1) {
- zerr("character not in range");
- CHARSET_FAILED();
- }
- if ((how & GETKEY_UPDATE_OFFSET) && s - sstart < *misc)
- (*misc) += count;
-# else
- zerr("cannot do charset conversion (iconv not available)");
- CHARSET_FAILED();
-# endif
- }
-# else
- zerr("cannot do charset conversion (NLS not supported)");
- CHARSET_FAILED();
-# endif
-# endif
if (how & GETKEY_DOLLAR_QUOTE) {
char *t2;
for (t2 = tbuf; t2 < t; t2++) {
Messages sorted by:
Reverse Date,
Date,
Thread,
Author