Zsh Mailing List Archive
Messages sorted by: Reverse Date, Date, Thread, Author

Another idea on how to insert illegal multibyte characters



Here's another idea on how filenames containing illegal byte sequences
could be inserted into the command line:  insert a $'\321'-style string
(the byte written as a three-digit octal escape) for each invalid byte.
Since this idiom uses plain ASCII, it inserts into the line just fine.
It also has the advantage that zsh will interpret the sequence back into
the original byte when the command line is parsed.

I created an initial patch for this.  It works to insert the necessary
characters into the command line, but has a bug: when moving from match
to match, tab completion does not remove enough characters if the
previous match contained one or more expanded $'\321' sequences (each
sequence is longer than the single byte it replaced).  If folks like
this idea, I imagine this bug wouldn't be too hard to fix.

Opinions?

..wayne..
--- Src/Zle/zle_utils.c	12 Jan 2006 01:04:17 -0000	1.36
+++ Src/Zle/zle_utils.c	12 Jan 2006 03:12:34 -0000
@@ -233,8 +233,9 @@ mod_export ZLE_STRING_T
 stringaszleline(char *instr, int incs, int *outll, int *outsz, int *outcs)
 {
     ZLE_STRING_T outstr;
-    int ll, sz;
+    int ll;
 #ifdef MULTIBYTE_SUPPORT
+    int eol = 0;
     mbstate_t mbs;
 #endif
 
@@ -256,17 +257,15 @@ stringaszleline(char *instr, int incs, i
     }
     unmetafy(instr, &ll);
 
-    /*
-     * ll is the maximum number of characters there can be in
-     * the output string; the closer to ASCII the string, the
-     * better the guess.  For the 2 see above.
-     */
-    sz = (ll + 2) * ZLE_CHAR_SIZE;
+#ifdef MULTIBYTE_SUPPORT
+    /* Compute the maximum amount of memory we'll need, which takes the
+     * pessimistic view that every character in the input needs to turn
+     * into a $'\321' string in the output.  For the reason for the +2,
+     * see the function comments. */
     if (outsz)
-	*outsz = ll;
-    outstr = (ZLE_STRING_T)zalloc(sz);
+	*outsz = ll * 7;
+    outstr = (ZLE_STRING_T)zalloc((ll*7 + 2) * ZLE_CHAR_SIZE);
 
-#ifdef MULTIBYTE_SUPPORT
     if (ll) {
 	char *inptr = instr;
 	wchar_t *outptr = outstr;
@@ -275,22 +274,36 @@ stringaszleline(char *instr, int incs, i
 	memset(&mbs, '\0', sizeof mbs);
 
 	while (ll > 0) {
-	    size_t cnt = mbrtowc(outptr, inptr, ll, &mbs);
+	    size_t cnt = eol ? MB_INVALID : mbrtowc(outptr, inptr, ll, &mbs);
 
-	    /*
-	     * At this point we don't handle either incomplete (-2) or
-	     * invalid (-1) multibyte sequences.  Use the current length
-	     * and return.
-	     */
-	    if (cnt == MB_INCOMPLETE || cnt == MB_INVALID)
+	    switch (cnt) {
+	    case MB_INCOMPLETE:
+		eol = 1;
+		/* FALL THROUGH */
+	    case MB_INVALID:
+		/* Get mbs out of its undefined state. */
+		memset(&mbs, '\0', sizeof mbs);
+		/* Transform invalid character sequences into $'\321'
+		 * strings that will be converted by the shell into
+		 * the appropriate character. */
+		*outptr++ = L'$';
+		*outptr++ = L'\'';
+		*outptr++ = L'\\';
+		*outptr++ = L'0' + (STOUC(*inptr) / 0100);
+		*outptr++ = L'0' + ((STOUC(*inptr) / 010) & 07);
+		*outptr++ = L'0' + (STOUC(*inptr) & 07);
+		*outptr = L'\'';
+		cnt = 1;
 		break;
-
-	    if (cnt == 0) {
+	    case 0:
 		/* Converting '\0' returns 0, but a '\0' is a real
 		 * character for us, so we should consume 1 byte
 		 * (certainly true for Unicode and unlikely to be false
 		 * in any non-pathological multibyte representation). */
 		cnt = 1;
+		/* FALL THROUGH */
+	    default:
+		break;
 	    }
 
 	    if (outcs) {
@@ -311,7 +324,15 @@ stringaszleline(char *instr, int incs, i
 	if (outcs)
 	    *outcs = 0;
     }
-#else
+
+#else /* !MULTIBYTE_SUPPORT */
+
+    if (outsz)
+	*outsz = ll;
+    /* ll is the number of characters in the unmetafied string.  For the
+     * reason for the +2, see the function comments. */
+    outstr = (ZLE_STRING_T)zalloc(ll + 2);
+
     memcpy(outstr, instr, ll);
     *outll = ll;
     if (outcs)


Messages sorted by: Reverse Date, Date, Thread, Author