Zsh Mailing List Archive
Messages sorted by:
Reverse Date,
Date,
Thread,
Author
PATCH: count glyphs in multibyte strings
- X-seq: zsh-workers 27831
- From: Peter Stephenson <p.w.stephenson@xxxxxxxxxxxx>
- To: zsh-workers@xxxxxxx (Zsh hackers list)
- Subject: PATCH: count glyphs in multibyte strings
- Date: Thu, 25 Mar 2010 21:35:03 +0000
- List-help: <mailto:zsh-workers-help@zsh.org>
- List-id: Zsh Workers List <zsh-workers.zsh.org>
- List-post: <mailto:zsh-workers@zsh.org>
- Mailing-list: contact zsh-workers-help@xxxxxxx; run by ezmlm
I noticed we were missing this capability; not sure how useful it is in
practice, but it was straightforward to add.
You might want to check that my terminology and assumptions about the
way Unicode works aren't gibberish.
--- ../zsh-git/zsh/Doc/Zsh/expn.yo 2010-03-25 21:01:19.000000000 +0000
+++ Doc/Zsh/expn.yo 2010-03-25 21:23:29.000000000 +0000
@@ -1004,6 +1004,12 @@
length of the string. Most printable characters have a width of one
unit, however certain Asian character sets and certain special effects
use wider characters; combining characters have zero width.
+
+If the tt(m) is repeated, the character counts either zero (if it has
+zero width) or one. For printable character strings this has the
+effect of counting the number of glyphs (visibly separate characters),
+except for the case where combining characters themselves have non-zero
+width (true in certain alphabets).
)
item(tt(r:)var(expr)tt(::)var(string1)tt(::)var(string2)tt(:))(
As tt(l), but pad the words on the right and insert var(string2)
--- ../zsh-git/zsh/Src/subst.c 2010-03-25 21:01:19.000000000 +0000
+++ Src/subst.c 2010-03-25 21:15:21.000000000 +0000
@@ -675,6 +675,35 @@
return dest;
}
+#ifdef MULTIBYTE_SUPPORT
+#define WCPADWIDTH(cchar, mw) wcpadwidth(cchar, mw)
+
+/*
+ * Width of character wc for padding purposes; multi_width selects:
+ * 0: all characters count 1.
+ * 1: use width of multibyte character, as given by WCWIDTH().
+ * 2 or any other value: non-zero width characters count 1, zero width 0.
+ */
+static int
+wcpadwidth(wchar_t wc, int multi_width)
+{
+ switch (multi_width)
+ {
+ case 0:
+ return 1;
+
+ case 1:
+ return WCWIDTH(wc);
+
+ default:
+ return WCWIDTH(wc) ? 1 : 0;
+ }
+}
+
+#else
+#define WCPADWIDTH(cchar, mw) (1)
+#endif
+
/*
* Pad the string str, returning a result from the heap (or str itself,
* if it didn't need padding). If str is too large, it will be truncated.
@@ -703,12 +732,6 @@
#endif
)
{
-#ifdef MULTIBYTE_SUPPORT
-#define WCPADWIDTH(cchar) (multi_width ? WCWIDTH(cchar) : 1)
-#else
-#define WCPADWIDTH(cchar) (1)
-#endif
-
char *def, *ret, *t, *r;
int ls, ls2, lpreone, lpostone, lpremul, lpostmul, lr, f, m, c, cc, cl;
convchar_t cchar;
@@ -775,14 +798,14 @@
MB_METACHARINIT();
while (f > 0) {
str += MB_METACHARLENCONV(str, &cchar);
- f -= WCPADWIDTH(cchar);
+ f -= WCPADWIDTH(cchar, multi_width);
}
/* Now finish the first half. */
for (c = prenum; c > 0; ) {
cl = MB_METACHARLENCONV(str, &cchar);
while (cl--)
*r++ = *str++;
- c -= WCPADWIDTH(cchar);
+ c -= WCPADWIDTH(cchar, multi_width);
}
} else {
if (f <= lpreone) {
@@ -796,7 +819,7 @@
/* So skip. */
for (t = preone; f > 0; ) {
t += MB_METACHARLENCONV(t, &cchar);
- f -= WCPADWIDTH(cchar);
+ f -= WCPADWIDTH(cchar, multi_width);
}
/* Then copy the entire remainder. */
while (*t)
@@ -814,7 +837,7 @@
m = lpremul - m;
for (t = premul; m > 0; ) {
t += MB_METACHARLENCONV(t, &cchar);
- m -= WCPADWIDTH(cchar);
+ m -= WCPADWIDTH(cchar, multi_width);
}
/* Output the rest. */
while (*t)
@@ -827,7 +850,7 @@
cl = MB_METACHARLENCONV(t, &cchar);
while (cl--)
*r++ = *t++;
- c -= WCPADWIDTH(cchar);
+ c -= WCPADWIDTH(cchar, multi_width);
}
}
}
@@ -840,7 +863,7 @@
/* Output the first half width of the original string. */
for (c = ls2; c > 0; ) {
cl = MB_METACHARLENCONV(str, &cchar);
- c -= WCPADWIDTH(cchar);
+ c -= WCPADWIDTH(cchar, multi_width);
while (cl--)
*r++ = *str++;
}
@@ -854,7 +877,7 @@
MB_METACHARINIT();
for (c = postnum; c > 0; ) {
cl = MB_METACHARLENCONV(str, &cchar);
- c -= WCPADWIDTH(cchar);
+ c -= WCPADWIDTH(cchar, multi_width);
while (cl--)
*r++ = *str++;
}
@@ -867,7 +890,7 @@
/* Can't fit unrepeated string, truncate it */
for (c = f; c > 0; ) {
cl = MB_METACHARLENCONV(postone, &cchar);
- c -= WCPADWIDTH(cchar);
+ c -= WCPADWIDTH(cchar, multi_width);
while (cl--)
*r++ = *postone++;
}
@@ -890,7 +913,7 @@
MB_METACHARINIT();
while (m > 0) {
cl = MB_METACHARLENCONV(postmul, &cchar);
- m -= WCPADWIDTH(cchar);
+ m -= WCPADWIDTH(cchar, multi_width);
while (cl--)
*r++ = *postmul++;
}
@@ -914,14 +937,14 @@
MB_METACHARINIT();
while (f > 0) {
str += MB_METACHARLENCONV(str, &cchar);
- f -= WCPADWIDTH(cchar);
+ f -= WCPADWIDTH(cchar, multi_width);
}
/* Copy the rest of the original string */
for (c = prenum; c > 0; ) {
cl = MB_METACHARLENCONV(str, &cchar);
while (cl--)
*r++ = *str++;
- c -= WCPADWIDTH(cchar);
+ c -= WCPADWIDTH(cchar, multi_width);
}
} else {
/*
@@ -942,7 +965,7 @@
MB_METACHARINIT();
for (t = preone; f > 0; ) {
t += MB_METACHARLENCONV(t, &cchar);
- f -= WCPADWIDTH(cchar);
+ f -= WCPADWIDTH(cchar, multi_width);
}
/* Copy the rest of preone */
while (*t)
@@ -966,14 +989,14 @@
MB_METACHARINIT();
for (t = premul; m > 0; ) {
t += MB_METACHARLENCONV(t, &cchar);
- m -= WCPADWIDTH(cchar);
+ m -= WCPADWIDTH(cchar, multi_width);
}
/* Now the rest of the repeated string. */
while (c > 0) {
cl = MB_METACHARLENCONV(t, &cchar);
while (cl--)
*r++ = *t++;
- c -= WCPADWIDTH(cchar);
+ c -= WCPADWIDTH(cchar, multi_width);
}
}
for (cc = f / lpremul; cc--;) {
@@ -985,7 +1008,7 @@
cl = MB_METACHARLENCONV(t, &cchar);
while (cl--)
*r++ = *t++;
- c -= WCPADWIDTH(cchar);
+ c -= WCPADWIDTH(cchar, multi_width);
}
}
}
@@ -1023,7 +1046,7 @@
cl = MB_METACHARLENCONV(str, &cchar);
while (cl--)
*r++ = *str++;
- c -= WCPADWIDTH(cchar);
+ c -= WCPADWIDTH(cchar, multi_width);
}
} else {
/*
@@ -1035,7 +1058,7 @@
cl = MB_METACHARLENCONV(str, &cchar);
while (cl--)
*r++ = *str++;
- c -= WCPADWIDTH(cchar);
+ c -= WCPADWIDTH(cchar, multi_width);
}
MB_METACHARINIT();
if (f <= lpostone) {
@@ -1048,7 +1071,7 @@
cl = MB_METACHARLENCONV(postone, &cchar);
while (cl--)
*r++ = *postone++;
- c -= WCPADWIDTH(cchar);
+ c -= WCPADWIDTH(cchar, multi_width);
}
}
} else {
@@ -1059,7 +1082,7 @@
cl = MB_METACHARLENCONV(postone, &cchar);
while (cl--)
*r++ = *postone++;
- c -= WCPADWIDTH(cchar);
+ c -= WCPADWIDTH(cchar, multi_width);
}
}
if (lpostmul) {
@@ -1070,7 +1093,7 @@
cl = MB_METACHARLENCONV(t, &cchar);
while (cl--)
*r++ = *t++;
- c -= WCPADWIDTH(cchar);
+ c -= WCPADWIDTH(cchar, multi_width);
}
}
/*
@@ -1083,7 +1106,7 @@
cl = MB_METACHARLENCONV(postmul, &cchar);
while (cl--)
*r++ = *postmul++;
- m -= WCPADWIDTH(cchar);
+ m -= WCPADWIDTH(cchar, multi_width);
}
}
}
@@ -1782,7 +1805,7 @@
case 'm':
#ifdef MULTIBYTE_SUPPORT
- multi_width = 1;
+ multi_width++;
#endif
break;
--- ../zsh-git/zsh/Src/utils.c 2010-03-25 21:01:19.000000000 +0000
+++ Src/utils.c 2010-03-25 21:14:17.000000000 +0000
@@ -4406,6 +4406,8 @@
* until end of string.
*
* If width is 1, return total character width rather than number.
+ * If width is greater than 1, count each character as 1 if it has
+ * non-zero width, else 0.
*/
/**/
@@ -4447,9 +4449,12 @@
* turn this into 1 for backward compatibility.
*/
int wcw = WCWIDTH(wc);
- if (wcw >= 0)
- num += wcw;
- else
+ if (wcw >= 0) {
+ if (width == 1)
+ num += wcw;
+ else
+ num += (wcw > 0);
+ } else
num++;
} else
num++;
--
Peter Stephenson <p.w.stephenson@xxxxxxxxxxxx>
Web page now at http://homepage.ntlworld.com/p.w.stephenson/
Messages sorted by:
Reverse Date,
Date,
Thread,
Author