Zsh Mailing List Archive
Messages sorted by: Reverse Date, Date, Thread, Author

PATCH: prompt truncation with multibyte characters



Here's a go at extending prompt truncation to work with multibyte
characters in the prompt string.  It doesn't look very much like the
existing code in the other branch because I didn't really understand
that (possibly I wrote bits of it).  It seems to do roughly the right
thing; if anyone finds any more obscure uses, they can be added to the
tests.

It's hard to write tests that depend on a given locale, however.  If
anyone wants to try doing this in a fail-safe manner, maybe printing a
message if the locale isn't available, please do.

I didn't alter the documentation since it already seems intuitively
obvious that truncation should work on displayed characters, not bytes.

Index: Src/prompt.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/prompt.c,v
retrieving revision 1.26
diff -u -r1.26 prompt.c
--- Src/prompt.c	13 Oct 2005 13:19:30 -0000	1.26
+++ Src/prompt.c	19 Oct 2005 08:24:43 -0000
@@ -87,7 +87,7 @@
 
 /* Non-zero if truncating the current segment of the buffer. */
 
-static int trunclen;
+static int truncwidth;
 
 /* Current level of nesting of %{ / %} sequences. */
 
@@ -179,7 +179,7 @@
     fm = s;
     bp = bufline = buf = zshcalloc(bufspc = 256);
     bp1 = NULL;
-    trunclen = 0;
+    truncwidth = 0;
     putpromptchar(1, '\0');
     addbufspc(1);
     if(dontcount)
@@ -229,7 +229,7 @@
 	    } else if (minus)
 		arg = -1;
 	    if (*fm == '(') {
-		int tc, otrunclen;
+		int tc, otruncwidth;
 
 		if (idigit(*++fm)) {
 		    arg = zstrtol(fm, &fm, 10);
@@ -334,14 +334,14 @@
 		    return 0;
 		fm++;
 		/* Don't do the current truncation until we get back */
-		otrunclen = trunclen;
-		trunclen = 0;
+		otruncwidth = truncwidth;
+		truncwidth = 0;
 		if (!putpromptchar(test == 1 && doprint, sep) || !*++fm ||
 		    !putpromptchar(test == 0 && doprint, ')')) {
-		    trunclen = otrunclen;
+		    truncwidth = otruncwidth;
 		    return 0;
 		}
-		trunclen = otrunclen;
+		truncwidth = otruncwidth;
 		continue;
 	    }
 	    if (!doprint)
@@ -973,14 +973,14 @@
 	 * can be finished, backing up so that the new truncation
 	 * can be started afterwards.
 	 */
-	if (trunclen) {
+	if (truncwidth) {
 	    while (*--fm != '%')
 		;
 	    fm--;
 	    return 0;
 	}
 
-	trunclen = arg;
+	truncwidth = arg;
 	if (*fm != ']')
 	    fm++;
 	while (*fm && *fm != truncchar) {
@@ -996,6 +996,12 @@
 	    *bp++ = '<';
 	}
 	ptr = buf + w;		/* addbufspc() may have realloc()'d buf */
+	/*
+	 * Now:
+	 *   buf is the start of the output prompt buffer
+	 *   ptr is the start of the truncation string
+	 *   bp is the end of the truncation string
+	 */
 	truncstr = ztrduppfx(ptr, bp - ptr);
 
 	bp = ptr;
@@ -1006,24 +1012,237 @@
 	trunccount = 0;
 	ptr = buf + w;		/* putpromptchar() may have realloc()'d */
 	*bp = '\0';
+	/*
+	 * Now:
+	 *   ptr is the start of the truncation string and also
+	 *     where we need to start putting any truncated output
+	 *   bp is the end of the string we have just added, which
+	 *     may need truncating.
+	 */
 
+	/*
+	 * w below is screen width if multibyte support is enabled
+	 * (note that above it was a raw string pointer difference).
+	 * It's the full width of the string we may need to truncate.
+	 *
+	 * truncwidth has come from the user, so we interpret this
+	 * as a screen width, too.
+	 */
 	countprompt(ptr, &w, 0, -1);
-	if (w > trunclen) {
+	if (w > truncwidth) {
 	    /*
-	     * We need to truncate.  t points to the truncation string --
-	     * which is inserted literally, without nice representation.
-	     * tlen is its length, and maxlen is the amount of the main
-	     * string that we want to keep.  Note that if the truncation
-	     * string is longer than the truncation length (tlen >
-	     * trunclen), the truncation string is used in full.
+	     * We need to truncate.  t points to the truncation string
+	     * -- which is inserted literally, without nice
+	     * representation.  twidth is its printing width, and maxwidth
+	     * is the amount of the main string that we want to keep.
+	     * Note that if the truncation string is longer than the
+	     * truncation length (twidth > truncwidth), the truncation
+	     * string is used in full.
 	     *
 	     * TODO: we don't take account of multibyte characters
 	     * in the string we're truncating.
 	     */
 	    char *t = truncstr;
 	    int fullen = bp - ptr;
-	    int tlen = ztrlen(t), maxlen;
-	    maxlen = tlen < trunclen ? trunclen - tlen : 0;
+	    int twidth, maxwidth;
+#ifdef ZLE_UNICODE_SUPPORT
+	    int ntrunc = strlen(t);
+
+	    /* Use screen width of string */
+	    twidth = mb_width(t);
+	    if (twidth < truncwidth) {
+		maxwidth = truncwidth - twidth;
+		/*
+		 * It's not safe to assume there are no invisible substrings
+		 * just because the width is less than the full string
+		 * length since there may be multibyte characters.
+		 */
+		addbufspc(ntrunc+1);
+		/* may have realloc'd */
+		ptr = bp - fullen;
+
+		if (truncatleft) {
+		    /*
+		     * To truncate at the left, selectively copy
+		     * maxwidth bytes from the main prompt, preceeded
+		     * by the truncation string in full.
+		     *
+		     * We're overwriting the string containing the
+		     * text to be truncated, so copy it.  We've
+		     * just ensured there's sufficient space at the
+		     * end of the prompt string.
+		     *
+		     * Pointer into text to be truncated.
+		     */
+		    char *fulltextptr, *fulltext;
+		    int remw;
+		    mbstate_t mbs;
+
+		    fulltextptr = fulltext = bp;
+		    memmove(fulltext, ptr, fullen);
+		    fulltext[fullen] = '\0';
+
+		    /* Copy the truncstr into place. */
+		    while (*t)
+			*ptr++ = *t++;
+
+		    memset(&mbs, 0, sizeof(mbstate_t));
+
+		    /*
+		     * Find the point in the text at which we should
+		     * start copying, i.e. when the remaining width
+		     * is less than or equal to the maximum width.
+		     */
+		    remw = w;
+		    while (remw > maxwidth && *fulltextptr) {
+			if (*fulltextptr == Inpar) {
+			    /*
+			     * Text marked as invisible: copy
+			     * regardless, since we don't know what
+			     * this does but it shouldn't affect
+			     * the width.
+			     */
+			    for (;;) {
+				*ptr++ = *fulltextptr;
+				if (*fulltextptr == Outpar ||
+				    *fulltextptr == '\0')
+				    break;
+				fulltextptr++;
+			    }
+			} else {
+			    /*
+			     * Normal text: build up a multibyte character.
+			     */
+			    char inchar;
+			    wchar_t cc;
+			    int ret;
+
+			    /*
+			     * careful: string is still metafied (we
+			     * need that because we don't know a
+			     * priori when to stop and the resulting
+			     * string must be metafied).
+			     */
+			    if (*fulltextptr == Meta)
+				inchar = *++fulltextptr ^ 32;
+			    else
+				inchar = *fulltextptr;
+			    fulltextptr++;
+			    ret = mbrtowc(&cc, &inchar, 1, &mbs);
+
+			    if (ret != -2) {
+				/* complete */
+				if (ret <= 0) {
+				    /* assume a single-byte character */
+				    remw--;
+				    if (ret < 0) {
+					/* need to reset invalid state */
+					memset(&mbs, 0, sizeof(mbstate_t));
+				    }
+				} else {
+				    remw -= wcwidth(cc);
+				}
+			    }
+			}
+		    }
+
+		    /*
+		     * Now simply copy the rest of the text.  Still
+		     * metafied, so this is easy.
+		     */
+		    while (*fulltextptr)
+			*ptr++ = *fulltextptr++;
+		    /* Mark the end of copying */
+		    bp = ptr;
+		} else {
+		    /*
+		     * Truncating at the right is easier: just leave
+		     * enough characters until we have reached the
+		     * maximum width.
+		     */
+		    char *skiptext = ptr;
+		    mbstate_t mbs;
+
+		    memset(&mbs, 0, sizeof(mbstate_t));
+
+		    while (maxwidth > 0 && *skiptext) {
+			if (*skiptext == Inpar) {
+			    for (; *skiptext != Outpar && *skiptext;
+				 skiptext++);
+			} else {
+			    char inchar;
+			    wchar_t cc;
+			    int ret;
+
+			    if (*skiptext == Meta)
+				inchar = *++skiptext ^ 32;
+			    else
+				inchar = *skiptext;
+			    skiptext++;
+			    ret = mbrtowc(&cc, &inchar, 1, &mbs);
+
+			    if (ret != -2) {
+				/* complete or invalid character */
+				if (ret <= 0) {
+				    /* assume single byte */
+				    maxwidth--;
+				    if (ret < 0) {
+					/* need to reset invalid state */
+					memset(&mbs, 0, sizeof(mbstate_t));
+				    }
+				} else {
+				    maxwidth -= wcwidth(cc);
+				}
+			    }
+			}
+		    }
+		    /*
+		     * We don't need the visible text from now on,
+		     * but we'd better copy any invisible bits.
+		     * History dictates that these go after the
+		     * truncation string.  This is sensible since
+		     * they may, for example, turn off an effect which
+		     * should apply to all text at this point.
+		     *
+		     * Copy the truncstr.
+		     */
+		    ptr = skiptext;
+		    while (*t)
+			*ptr++ = *t++;
+		    bp = ptr;
+		    if (*skiptext) {
+			/* Move remaining text so we don't overwrite it */
+			memmove(bp, skiptext, strlen(skiptext)+1);
+			skiptext = bp;
+
+			/*
+			 * Copy anything we want, updating bp
+			 */
+			while (*skiptext) {
+			    if (*skiptext == Inpar) {
+				for (;;) {
+				    *bp++ = *skiptext;
+				    if (*skiptext == Outpar ||
+					*skiptext == '\0')
+					break;
+				    skiptext++;
+				}
+			    }
+			    else
+				skiptext++;
+			}
+		    }
+		}
+	    } else {
+		/* Just copy truncstr; no other text appears. */
+		while (*t)
+		    *ptr++ = *t++;
+		bp = ptr;
+	    }
+	    *bp = '\0';
+#else
+	    twidth = ztrlen(t);
+	    maxwidth = twidth < truncwidth ? truncwidth - twidth : 0;
 	    if (w < fullen) {
 		/* Invisible substrings, lots of shuffling. */
 		int n = strlen(t);
@@ -1035,6 +1254,13 @@
 		    p = ptr + n;
 		    q = p;
 
+		    /*
+		     * I don't think we need n and the test below since
+		     * we must have enough space (we are using a subset
+		     * of the existing text with no repetition) and the
+		     * string is null-terminated, so I haven't copied it
+		     * to the ZLE_UNICODE_SUPPORT section.
+		     */
 		    n = fullen - w;
 
 		    /* Shift the whole string right, then *
@@ -1047,7 +1273,7 @@
 				--n;
 			    } while (*p++ != Outpar && *p && n);
 			else if (w) {
-			    if (--w < maxlen)
+			    if (--w < maxwidth)
 				*q++ = *p;
 			    ++p;
 			}
@@ -1058,11 +1284,11 @@
 		    q = ptr + fullen;
 
 		    /* First skip over as much as will "fit". */
-		    while (w > 0 && maxlen > 0) {
+		    while (w > 0 && maxwidth > 0) {
 			if (*ptr == Inpar)
 			    while (*ptr++ != Outpar && *ptr) {;}
 			else
-			    ++ptr, --w, --maxlen;
+			    ++ptr, --w, --maxwidth;
 		    }
 		    if (ptr < q) {
 			/* We didn't reach the end of the string. *
@@ -1087,25 +1313,26 @@
 		}
 	    } else {
 		/* No invisible substrings. */
-		if (tlen > fullen) {
-		    addbufspc(tlen - fullen);
+		if (twidth > fullen) {
+		    addbufspc(twidth - fullen);
 		    ptr = bp;	/* addbufspc() may have realloc()'d buf */
-		    bp += tlen - fullen;
+		    bp += twidth - fullen;
 		} else
-		    bp -= fullen - trunclen;
+		    bp -= fullen - truncwidth;
 		if (truncatleft) {
-		    if (maxlen)
-			memmove(ptr + strlen(t), ptr + fullen - maxlen,
-				maxlen);
+		    if (maxwidth)
+			memmove(ptr + strlen(t), ptr + fullen - maxwidth,
+				maxwidth);
 		} else
-		    ptr += maxlen;
+		    ptr += maxwidth;
 	    }
 	    /* Finally, copy the truncstr into place. */
 	    while (*t)
 		*ptr++ = *t++;
+#endif
 	}
 	zsfree(truncstr);
-	trunclen = 0;
+	truncwidth = 0;
 	/*
 	 * We may have returned early from the previous putpromptchar *
 	 * because we found another truncation following this one.    *
@@ -1116,7 +1343,7 @@
 	if (*fm != endchar) {
 	    fm++;
 	    /*
-	     * With trunclen set to zero, we always reach endchar *
+	     * With truncwidth set to zero, we always reach endchar *
 	     * (or the terminating NULL) this time round.         *
 	     */
 	    if (!putpromptchar(doprint, endchar))
@@ -1132,7 +1359,7 @@
 		fm++;
 	    fm++;
 	}
-	if (trunclen || !*fm)
+	if (truncwidth || !*fm)
 	    return 0;
     }
     return 1;
Index: Src/utils.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/utils.c,v
retrieving revision 1.99
diff -u -r1.99 utils.c
--- Src/utils.c	13 Oct 2005 16:30:14 -0000	1.99
+++ Src/utils.c	19 Oct 2005 08:24:43 -0000
@@ -3520,6 +3520,49 @@
     return retstr;
 }
 
+/*
+ * Return the screen width of a multibyte string.  The input
+ * string is metafied.
+ */
+/**/
+mod_export int
+mb_width(const char *s)
+{
+    char *ums = ztrdup(s), *umptr;
+    int umlen;
+    int width = 0;
+    mbstate_t mbs;
+
+    memset(&mbs, 0, sizeof(mbs));
+    umptr = unmetafy(ums, &umlen);
+    /*
+     * Convert one wide character at a time.  We could convet
+     * the entire string using mbsrtowcs(), but that terminates on
+     * a NUL and we might have embedded NULs.
+     */
+    while (umlen > 0) {
+	wchar_t cc;
+	int ret = mbrtowc(&cc, umptr, umlen, &mbs);
+
+	if (ret <= 0) {
+	    /* Assume a single-width character. */
+	    width++;
+	    ret = 1;
+	} else {
+	    int wret = wcwidth(cc);
+	    if (wret > 0)
+		width += wret;
+	}
+
+	umlen -= ret;
+	umptr += ret;
+    }
+
+    free(ums);
+
+    return width;
+}
+
 /**/
 #endif /* ZLE_UNICODE_SUPPORT */
 
Index: Test/D01prompt.ztst
===================================================================
RCS file: /cvsroot/zsh/zsh/Test/D01prompt.ztst,v
retrieving revision 1.2
diff -u -r1.2 D01prompt.ztst
--- Test/D01prompt.ztst	9 Jul 2001 18:31:25 -0000	1.2
+++ Test/D01prompt.ztst	19 Oct 2005 08:24:43 -0000
@@ -68,11 +68,11 @@
 >true
 >false
 
-  print -P '%10<...<truncated at 10%<< Not truncated'
-  print -P '%10>...>truncated at 10%>> Not truncated'
+  print -P 'start %10<...<truncated at 10%<< Not truncated%3< ...<Not shown'
+  print -P 'start %10>...>truncated at 10%>> Not truncated%3> ...>Not shown'
 0:prompt truncation
->...d at 10 Not truncated
->truncat... Not truncated
+>start ...d at 10 Not truncated ...
+>start truncat... Not truncated ...
 
 # It's hard to check the time and date as they are moving targets.
 # We therefore just check that various forms of the date are consistent.

-- 
Peter Stephenson <pws@xxxxxxx>                  Software Engineer
CSR PLC, Churchill House, Cambridge Business Park, Cowley Road
Cambridge, CB4 0WZ, UK                          Tel: +44 (0)1223 692070


This message has been scanned for viruses by BlackSpider MailControl - www.blackspider.com



Messages sorted by: Reverse Date, Date, Thread, Author