Zsh Mailing List Archive
Messages sorted by: Reverse Date, Date, Thread, Author

[PATCH] reset mbstate when locale changes



On NetBSD, D07multibyte is skipped even if --disable-multibyte is NOT specified.
This means that no valid UTF-8 locale was found in the %prep section of D07.
But this is strange. If I manually run the following

$ zsh -f
% LANG=en_US.UTF-8
% [[ é = ? ]] && echo OK
OK

then it works. But if I try (as in ztst.zsh)

$ zsh -f
% LANG=C
% unset -m LC_\*
% LANG=en_US.UTF-8
% [[ é = ? ]] || echo failed
failed

then it fails. After spending some time on this strange behavior, I found that
"do pattern match in C-locale and then switch to UTF-8" causes the problem:

$ zsh -f
% LANG=C
% [[ a = ? ]] || echo failed     # do any pattern matching here
% LANG=en_US.UTF-8
% [[ é = ? ]] || echo failed
failed

In pattern.c, there is a global variable
static mbstate_t shiftstate;                                           

On NetBSD, when mbrtowc(..., &shiftstate) returns, even if the return value
is not MB_{INVALID,INCOMPLETE}, it seems shiftstate is set to a locale-specific
value and can't be used again if LC_CTYPE changes.


In the patch below, shiftstate (and another global variable "mb_shiftstate" in
utils.c) is reset when LC_CTYPE changes (or may have changed). Resetting
mb_shiftstate (by calling mb_charinit()) may not be necessary, but doing so
should not cause any problem.

Another possibility would be to use local mb_state (initialized to zero) in
each function (metacharinc(), charref(), ...).
# This should work for UTF-8. It would cause a problem for more "stateful"
# encodings, but I think such encodings are not supported in the current zsh
# anyway. Or are they?
Maybe the global variables are used to improve performance?

I also added a memset() to charsub().


With this patch, only one test in D07multibyte fails:

@@ -1,3 +1,2 @@
 OK
 OK
-OK
Test ./D07multibyte.ztst failed: output differs from expected as shown above for:
  if zmodload zsh/regex 2>/dev/null; then
    [[ $'\ua0' =~ '^.$' ]] && print OK
    [[ $'\ua0' =~ $'^\ua0$' ]] && print OK
    [[ $'\ua0'X =~ '^X$' ]] || print OK
  else
    ZTST_skip="regexp library not found."
  fi
Was testing: Ensure no confusion on metafied input to regex module
./D07multibyte.ztst: test failed.
The following may (or may not) help identifying the cause:
  A failure here may indicate the system regex library does not
  support character sets outside the portable 7-bit range.

The same test also fails on OpenBSD. The regex libraries on these OSes do not
support UTF-8, so we can ignore the failure.



diff --git a/Src/params.c b/Src/params.c
index 970a207e4..27ea82298 100644
--- a/Src/params.c
+++ b/Src/params.c
@@ -4594,6 +4594,19 @@ static struct localename {
     {NULL, 0}
 };
 
+/* On some systems (at least on NetBSD-9), when LC_CTYPE changes,
+ * global variables (type mbstate_t) used by mbrtowc() etc. need be
+ * reset by clear_mbstate() */
+
+/**/
+static void
+clear_mbstate(void) {
+#ifdef MULTIBYTE_SUPPORT
+    mb_charinit();	/* utils.c */
+    clear_shiftstate();	/* pattern.c */
+#endif
+}
+
 /**/
 static void
 setlang(char *x)
@@ -4616,6 +4629,7 @@ setlang(char *x)
      * that case.
      */
     setlocale(LC_ALL, x ? unmeta(x) : "");
+    clear_mbstate();
     queue_signals();
     for (ln = lc_names; ln->name; ln++)
 	if ((x = getsparam_u(ln->name)) && *x)
@@ -4641,8 +4655,10 @@ lc_allsetfn(Param pm, char *x)
 	    unqueue_signals();
 	}
     }
-    else
+    else {
 	setlocale(LC_ALL, unmeta(x));
+	clear_mbstate();
+    }
 }
 
 /**/
@@ -4679,6 +4695,7 @@ lcsetfn(Param pm, char *x)
 		setlocale(ln->category, unmeta(x));
     }
     unqueue_signals();
+    clear_mbstate();	/* LC_CTYPE may have changed */
 }
 #endif /* USE_LOCALE */
 
@@ -5627,6 +5644,7 @@ endparamscope(void)
 		    setlocale(ln->category, val);
 	    }
 	}
+	clear_mbstate();    /* LC_CTYPE may have changed */
     }
 #endif /* USE_LOCALE */
     unqueue_signals();
diff --git a/Src/pattern.c b/Src/pattern.c
index c0e31b78e..e947d1216 100644
--- a/Src/pattern.c
+++ b/Src/pattern.c
@@ -320,6 +320,14 @@ typedef wint_t patint_t;
  */
 static mbstate_t shiftstate;
 
+/* See clear_mbstate() in params.c for the use of clear_shiftstate() */
+
+/**/
+mod_export void
+clear_shiftstate(void) {
+    memset(&shiftstate, 0, sizeof(shiftstate));
+}
+
 /*
  * Multibyte version: it's (almost) as easy to return the
  * value as not, so do so since we sometimes need it..
@@ -1999,6 +2007,8 @@ charsub(char *x, char *y)
 
 	if (ret == MB_INVALID || ret == MB_INCOMPLETE) {
 	    /* Error.  Treat remainder as single characters */
+	    /* Reset the shift state for next time. */
+	    memset(&shiftstate, 0, sizeof(shiftstate));
 	    return res + (y - x);
 	}
 






Messages sorted by: Reverse Date, Date, Thread, Author