Zsh Mailing List Archive
Messages sorted by:
Reverse Date,
Date,
Thread,
Author
PATCH: improvements to \u and \U
- X-seq: zsh-workers 18525
- From: Oliver Kiddle <okiddle@xxxxxxxxxxx>
- To: Zsh workers <zsh-workers@xxxxxxxxxx>
- Subject: PATCH: improvements to \u and \U
- Date: Tue, 13 May 2003 10:58:50 +0200
- Mailing-list: contact zsh-workers-help@xxxxxxxxxx; run by ezmlm
This is an attempt to make the \u and \U code have a better chance of
working on a variety of platforms.
I've added a manual Unicode (UCS-4) to UTF-8 conversion which can be used if
nl_langinfo(CODESET) is UTF-8. I've also changed the iconv fallback to
convert to the codeset reported by nl_langinfo(CODESET). I may yet also
change it to convert from UTF-8 instead of UCS4.
Oliver
Index: Src/utils.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/utils.c,v
retrieving revision 1.50
diff -u -r1.50 utils.c
--- Src/utils.c 12 May 2003 11:45:30 -0000 1.50
+++ Src/utils.c 13 May 2003 08:51:40 -0000
@@ -30,13 +30,15 @@
#include "zsh.mdh"
#include "utils.pro"
-#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB)
-#include <wchar.h>
-# ifndef __STDC_ISO_10646__
-# if defined(HAVE_ICONV) || defined(HAVE_LIBICONV)
-# include <iconv.h>
-# endif
-# endif
+#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) && defined (__STDC_ISO_10646__)
+# include <wchar.h>
+#else
+# ifdef HAVE_LANGINFO_H
+# include <langinfo.h>
+# if defined(HAVE_ICONV) || defined(HAVE_LIBICONV)
+# include <iconv.h>
+# endif
+# endif
#endif
/* name of script being sourced */
@@ -3271,6 +3273,42 @@
}
#endif
+# if defined(HAVE_NL_LANGINFO) && defined(CODESET) && !defined(__STDC_ISO_10646__)
+/* Convert a character from UCS4 encoding to UTF-8: the encoding of wval is stored in dest */
+/* and its length in bytes (1 to 6) is returned; no terminating NUL is written, so dest needs room for up to 6 bytes */
+size_t
+ucs4toutf8(char *dest, unsigned int wval)
+{
+ size_t len; /* number of bytes in the UTF-8 sequence for wval */
+
+ if (wval < 0x80) /* fits in 7 bits: single byte */
+ len = 1;
+ else if (wval < 0x800) /* fits in 11 bits */
+ len = 2;
+ else if (wval < 0x10000) /* fits in 16 bits */
+ len = 3;
+ else if (wval < 0x200000) /* fits in 21 bits */
+ len = 4;
+ else if (wval < 0x4000000) /* fits in 26 bits */
+ len = 5;
+ else
+ len = 6;
+ /* trailing bytes are written last-to-first, each taking the low 6 bits of wval, then the lead byte */
+ switch (len) { /* falls through except to the last case */
+ case 6: dest[5] = (wval & 0x3f) | 0x80; wval >>= 6; /* 0x80 marks a continuation byte */
+ case 5: dest[4] = (wval & 0x3f) | 0x80; wval >>= 6;
+ case 4: dest[3] = (wval & 0x3f) | 0x80; wval >>= 6;
+ case 3: dest[2] = (wval & 0x3f) | 0x80; wval >>= 6;
+ case 2: dest[1] = (wval & 0x3f) | 0x80; wval >>= 6;
+ *dest = wval | (0xfc << (6 - len)) & 0xfc; /* & binds tighter than |: the prefix (0xfc << (6 - len)) & 0xfc is 0xc0 for len 2 up to 0xfc for len 6 */
+ break;
+ case 1: *dest = wval; /* single byte: value stored unchanged */
+ }
+
+ return len;
+}
+#endif
+
/*
* Decode a key string, turning it into the literal characters.
* The length is returned in len.
@@ -3299,18 +3337,18 @@
char svchar = '\0';
int meta = 0, control = 0;
int i;
-#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB)
-# ifdef __STDC_ISO_10646__
+#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) && defined(__STDC_ISO_10646__)
wint_t wval;
-# elif defined(HAVE_ICONV) || defined(HAVE_LIBICONV)
+ size_t count;
+#else
unsigned int wval;
+# if defined(HAVE_NL_LANGINFO) && defined(CODESET) && (defined(HAVE_ICONV) || defined(HAVE_LIBICONV))
iconv_t cd;
char inbuf[4];
- wchar_t outbuf[1];
size_t inbytes, outbytes;
- char *inptr, *outptr;
-# endif
+ char *inptr;
size_t count;
+# endif
#endif
if (fromwhere == 6)
@@ -3387,8 +3425,6 @@
*misc = 1;
break;
}
-#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB)
-#if defined(__STDC_ISO_10646__) || defined(HAVE_ICONV) || defined(HAVE_LIBICONV)
case 'u':
case 'U':
wval = 0;
@@ -3407,21 +3443,10 @@
*misc = wval;
return s+1;
}
-#ifdef __STDC_ISO_10646__
+#if defined(HAVE_WCHAR_H) && defined(HAVE_WCTOMB) && defined(__STDC_ISO_10646__)
count = wctomb(t, (wchar_t)wval);
-#elif defined(HAVE_ICONV) || defined(HAVE_LIBICONV)
- inbytes = outbytes = 4;
- inptr = inbuf;
- outptr = (char *)outbuf;
- /* assume big endian convention for UCS-4 */
- for (i=3;i>=0;i--) {
- inbuf[i] = wval & 0xff;
- wval >>= 8;
- }
-
- cd = iconv_open("WCHAR_T", "ISO-10646");
- if (cd == (iconv_t)-1) {
- zerr("cannot do charset conversion", NULL, 0);
+ if (count == (size_t)-1) {
+ zerr("character not in range", NULL, 0);
if (fromwhere == 4) {
for (u = t; (*u++ = *++s););
return t;
@@ -3430,24 +3455,58 @@
*len = t - buf;
return buf;
}
- iconv(cd, (const char **)&inptr, &inbytes, &outptr, &outbytes);
- iconv_close(cd);
- count = wctomb(t, *outbuf);
-#endif
- if (count == (size_t)-1) {
- zerr("character not in range", NULL, 0);
- if (fromwhere == 4) {
- for (u = t; (*u++ = *++s););
- return t;
+ t += count;
+ continue;
+# else
+# if defined(HAVE_NL_LANGINFO) && defined(CODESET)
+ if (!strcmp(nl_langinfo(CODESET), "UTF-8")) {
+ t += ucs4toutf8(t, wval);
+ continue;
+ } else {
+# if defined(HAVE_ICONV) || defined(HAVE_LIBICONV)
+ inbytes = 4;
+ outbytes = 6;
+ inptr = inbuf;
+ /* assume big endian convention for UCS-4 */
+ for (i=3;i>=0;i--) {
+ inbuf[i] = wval & 0xff;
+ wval >>= 8;
}
+
+ cd = iconv_open(nl_langinfo(CODESET), "ISO-10646");
+ if (cd == (iconv_t)-1) {
+ zerr("cannot do charset conversion", NULL, 0);
+ if (fromwhere == 4) {
+ for (u = t; (*u++ = *++s););
+ return t;
+ }
+ *t = '\0';
+ *len = t - buf;
+ return buf;
+ }
+ count = iconv(cd, (char **)&inptr, &inbytes, &t, &outbytes);
+ iconv_close(cd);
+ if (count == (size_t)-1) {
+ zerr("cannot do charset conversion", NULL, 0);
+ *t = '\0';
+ *len = t - buf;
+ return buf;
+ }
+ continue;
+# else
+ zerr("cannot do charset conversion", NULL, 0);
*t = '\0';
*len = t - buf;
return buf;
+# endif
}
- t += count;
- continue;
-#endif
-#endif
+# else
+ zerr("cannot do charset conversion", NULL, 0);
+ *t = '\0';
+ *len = t - buf;
+ return buf;
+# endif
+# endif
default:
def:
if ((idigit(*s) && *s < '8') || *s == 'x') {
Messages sorted by:
Reverse Date,
Date,
Thread,
Author