Zsh Mailing List Archive
Messages sorted by:
Reverse Date,
Date,
Thread,
Author
PATCH: migrate pcre module to pcre2
- X-seq: zsh-workers 51723
- From: Oliver Kiddle <opk@xxxxxxx>
- To: Zsh workers <zsh-workers@xxxxxxx>
- Subject: PATCH: migrate pcre module to pcre2
- Date: Sat, 06 May 2023 01:35:46 +0200
- Archived-at: <https://zsh.org/workers/51723>
- List-id: <zsh-workers.zsh.org>
The old PCRE library that zsh uses is considered "end of life, and is no
longer being actively maintained". The newer PCRE2 has a slighty
different interface. It has been around since 2015 so this is not new.
Actual version numbers are 8.x and 10.x rather than 1 and 2. Migrating
seems to be the sensible way forward rather than, for example, adding a
separate zsh/pcre2 module. That way, zsh users don't need to be
concerned with the change.
I've mapped the pcre_study builtin which was named according to the
old interface onto pcre2_jit_compile. We previously had autoconf tests
for all of pcre_compile, pcre_study and pcre_exec. Building PCRE2 with
--disable-jit doesn't affect what symbols the resulting library contains
so I saw no point in checking for anything more than pcre2_compile_8
(and pcre2.h).
It remains the case that if you build with --enable-pcre, then pcre is
linked into everything. As before, LDFLAGS='-Wl,--as-needed' (or
-zignore depending on your linker) is a workaround.
The new interface handles embedded NULs in the regex so the warnings
that were added for that in workers/41308 could be removed and test
cases adapted. Aside from that this adds a test case for an empty capture
which came about as part of satisfying myself that I wasn't introducing
a bug when removing the *x++ = NULL; line in zpcre_get_substrings().
Would it perhaps be useful to add support for PCRE2's alternative DFA
algorithm? This might meet the needs Phil was considering with the re2
based module he once posted but never committed. What form might that
take?
When looking at PCRE documentation for this, other ideas for extensions
I saw would be to support:
* named capture groups
(either to an associative array or to variables directly?)
* callouts
(would need to consider the interface, either eval a string
or name a function, then either variables or parameters for
the callout interface. And reentrancy may be a concern.)
Oliver
diff --git a/Src/Modules/pcre.c b/Src/Modules/pcre.c
index 46875a59b..079ecc2c5 100644
--- a/Src/Modules/pcre.c
+++ b/Src/Modules/pcre.c
@@ -34,11 +34,11 @@
#define CPCRE_PLAIN 0
/**/
-#if defined(HAVE_PCRE_COMPILE) && defined(HAVE_PCRE_EXEC)
-#include <pcre.h>
+#if defined(HAVE_PCRE2_COMPILE_8) && defined(HAVE_PCRE2_H)
+#define PCRE2_CODE_UNIT_WIDTH 8
+#include <pcre2.h>
-static pcre *pcre_pattern;
-static pcre_extra *pcre_hints;
+static pcre2_code *pcre_pattern;
/**/
static int
@@ -54,8 +54,8 @@ zpcre_utf8_enabled(void)
return 0;
if ((have_utf8_pcre == -1) &&
- (pcre_config(PCRE_CONFIG_UTF8, &have_utf8_pcre))) {
- have_utf8_pcre = -2; /* erk, failed to ask */
+ (pcre2_config(PCRE2_CONFIG_UNICODE, &have_utf8_pcre))) {
+ have_utf8_pcre = -2; /* erk, failed to ask */
}
return (have_utf8_pcre == 1) && (!strcmp(nl_langinfo(CODESET), "UTF-8"));
@@ -69,115 +69,87 @@ zpcre_utf8_enabled(void)
static int
bin_pcre_compile(char *nam, char **args, Options ops, UNUSED(int func))
{
- int pcre_opts = 0, pcre_errptr, target_len;
- const char *pcre_error;
+ uint32_t pcre_opts = 0;
+ int target_len;
+ int pcre_error;
+ PCRE2_SIZE pcre_offset;
char *target;
- if(OPT_ISSET(ops,'a')) pcre_opts |= PCRE_ANCHORED;
- if(OPT_ISSET(ops,'i')) pcre_opts |= PCRE_CASELESS;
- if(OPT_ISSET(ops,'m')) pcre_opts |= PCRE_MULTILINE;
- if(OPT_ISSET(ops,'x')) pcre_opts |= PCRE_EXTENDED;
- if(OPT_ISSET(ops,'s')) pcre_opts |= PCRE_DOTALL;
+ if (OPT_ISSET(ops, 'a')) pcre_opts |= PCRE2_ANCHORED;
+ if (OPT_ISSET(ops, 'i')) pcre_opts |= PCRE2_CASELESS;
+ if (OPT_ISSET(ops, 'm')) pcre_opts |= PCRE2_MULTILINE;
+ if (OPT_ISSET(ops, 'x')) pcre_opts |= PCRE2_EXTENDED;
+ if (OPT_ISSET(ops, 's')) pcre_opts |= PCRE2_DOTALL;
if (zpcre_utf8_enabled())
- pcre_opts |= PCRE_UTF8;
-
-#ifdef HAVE_PCRE_STUDY
- if (pcre_hints)
-#ifdef PCRE_CONFIG_JIT
- pcre_free_study(pcre_hints);
-#else
- pcre_free(pcre_hints);
-#endif
- pcre_hints = NULL;
-#endif
+ pcre_opts |= PCRE2_UTF;
if (pcre_pattern)
- pcre_free(pcre_pattern);
+ pcre2_code_free(pcre_pattern);
pcre_pattern = NULL;
target = ztrdup(*args);
unmetafy(target, &target_len);
- if ((int)strlen(target) != target_len) {
- zwarnnam(nam, "embedded NULs in PCRE pattern terminate pattern");
- }
-
- pcre_pattern = pcre_compile(target, pcre_opts, &pcre_error, &pcre_errptr, NULL);
+ pcre_pattern = pcre2_compile((PCRE2_SPTR) target, (PCRE2_SIZE) target_len,
+ pcre_opts, &pcre_error, &pcre_offset, NULL);
free(target);
if (pcre_pattern == NULL)
{
- zwarnnam(nam, "error in regex: %s", pcre_error);
+ PCRE2_UCHAR buffer[256];
+ pcre2_get_error_message(pcre_error, buffer, sizeof(buffer));
+ zwarnnam(nam, "error in regex: %s", buffer);
return 1;
}
return 0;
}
-/**/
-#ifdef HAVE_PCRE_STUDY
-
/**/
static int
bin_pcre_study(char *nam, UNUSED(char **args), UNUSED(Options ops), UNUSED(int func))
{
- const char *pcre_error;
-
if (pcre_pattern == NULL)
{
zwarnnam(nam, "no pattern has been compiled for study");
return 1;
}
-
- if (pcre_hints)
-#ifdef PCRE_CONFIG_JIT
- pcre_free_study(pcre_hints);
-#else
- pcre_free(pcre_hints);
-#endif
- pcre_hints = NULL;
- pcre_hints = pcre_study(pcre_pattern, 0, &pcre_error);
- if (pcre_error != NULL)
- {
- zwarnnam(nam, "error while studying regex: %s", pcre_error);
- return 1;
+ int jit = 0;
+ if (!pcre2_config(PCRE2_CONFIG_JIT, &jit) && jit) {
+ if (pcre2_jit_compile(pcre_pattern, PCRE2_JIT_COMPLETE) < 0) {
+ zwarnnam(nam, "error while studying regex");
+ return 1;
+ }
}
return 0;
}
-/**/
-#else /* !HAVE_PCRE_STUDY */
-
-# define bin_pcre_study bin_notavail
-
-/**/
-#endif /* !HAVE_PCRE_STUDY */
-
-/**/
static int
-zpcre_get_substrings(char *arg, int *ovec, int captured_count, char *matchvar,
- char *substravar, int want_offset_pair, int matchedinarr,
- int want_begin_end)
+zpcre_get_substrings(char *arg, pcre2_match_data *mdata, int captured_count,
+ char *matchvar, char *substravar, int want_offset_pair,
+ int matchedinarr, int want_begin_end)
{
- char **captures, *match_all, **matches;
+ PCRE2_SIZE *ovec;
+ char *match_all, **matches;
char offset_all[50];
int capture_start = 1;
if (matchedinarr) {
- /* bash-style captures[0] entire-matched string in the array */
+ /* bash-style ovec[0] entire-matched string in the array */
capture_start = 0;
}
- /* captures[0] will be entire matched string, [1] first substring */
- if (!pcre_get_substring_list(arg, ovec, captured_count, (const char ***)&captures)) {
- int nelem = arrlen(captures)-1;
+ /* ovec[0] will be entire matched string, [1] first substring */
+ ovec = pcre2_get_ovector_pointer(mdata);
+ if (ovec) {
+ int nelem = captured_count - 1;
/* Set to the offsets of the complete match */
if (want_offset_pair) {
- sprintf(offset_all, "%d %d", ovec[0], ovec[1]);
+ sprintf(offset_all, "%ld %ld", ovec[0], ovec[1]);
setsparam("ZPCRE_OP", ztrdup(offset_all));
}
/*
@@ -186,7 +158,7 @@ zpcre_get_substrings(char *arg, int *ovec, int captured_count, char *matchvar,
* ovec is length 2*(1+capture_list_length)
*/
if (matchvar) {
- match_all = metafy(captures[0], ovec[1] - ovec[0], META_DUP);
+ match_all = metafy(arg + ovec[0], ovec[1] - ovec[0], META_DUP);
setsparam(matchvar, match_all);
}
/*
@@ -201,16 +173,12 @@ zpcre_get_substrings(char *arg, int *ovec, int captured_count, char *matchvar,
*/
if (substravar &&
(!want_begin_end || nelem)) {
- char **x, **y;
+ char **x;
int vec_off, i;
- y = &captures[capture_start];
matches = x = (char **) zalloc(sizeof(char *) * (captured_count+1-capture_start));
- for (i = capture_start; i < captured_count; i++, y++) {
+ for (i = capture_start; i < captured_count; i++) {
vec_off = 2*i;
- if (*y)
- *x++ = metafy(*y, ovec[vec_off+1]-ovec[vec_off], META_DUP);
- else
- *x++ = NULL;
+ *x++ = metafy(arg + ovec[vec_off], ovec[vec_off+1]-ovec[vec_off], META_DUP);
}
*x = NULL;
setaparam(substravar, matches);
@@ -247,7 +215,8 @@ zpcre_get_substrings(char *arg, int *ovec, int captured_count, char *matchvar,
setiparam("MEND", offs + !isset(KSHARRAYS) - 1);
if (nelem) {
char **mbegin, **mend, **bptr, **eptr;
- int i, *ipair;
+ int i;
+ size_t *ipair;
bptr = mbegin = zalloc(sizeof(char*)*(nelem+1));
eptr = mend = zalloc(sizeof(char*)*(nelem+1));
@@ -287,8 +256,6 @@ zpcre_get_substrings(char *arg, int *ovec, int captured_count, char *matchvar,
setaparam("mend", mend);
}
}
-
- pcre_free_substring_list((const char **)captures);
}
return 0;
@@ -314,7 +281,8 @@ getposint(char *instr, char *nam)
static int
bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
{
- int ret, capcount, *ovec, ovecsize, c;
+ int ret, c;
+ pcre2_match_data *pcre_mdata = NULL;
char *matched_portion = NULL;
char *plaintext = NULL;
char *receptacle = NULL;
@@ -344,36 +312,30 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
/* For the entire match, 'Return' the offset byte positions instead of the matched string */
if(OPT_ISSET(ops,'b')) want_offset_pair = 1;
- if ((ret = pcre_fullinfo(pcre_pattern, pcre_hints, PCRE_INFO_CAPTURECOUNT, &capcount)))
- {
- zwarnnam(nam, "error %d in fullinfo", ret);
- return 1;
- }
-
- ovecsize = (capcount+1)*3;
- ovec = zalloc(ovecsize*sizeof(int));
-
plaintext = ztrdup(*args);
unmetafy(plaintext, &subject_len);
if (offset_start > 0 && offset_start >= subject_len)
- ret = PCRE_ERROR_NOMATCH;
- else
- ret = pcre_exec(pcre_pattern, pcre_hints, plaintext, subject_len, offset_start, 0, ovec, ovecsize);
+ ret = PCRE2_ERROR_NOMATCH;
+ else {
+ pcre_mdata = pcre2_match_data_create_from_pattern(pcre_pattern, NULL);
+ ret = pcre2_match(pcre_pattern, (PCRE2_SPTR) plaintext, subject_len,
+ offset_start, 0, pcre_mdata, NULL);
+ }
if (ret==0) return_value = 0;
- else if (ret==PCRE_ERROR_NOMATCH) /* no match */;
+ else if (ret == PCRE2_ERROR_NOMATCH) /* no match */;
else if (ret>0) {
- zpcre_get_substrings(plaintext, ovec, ret, matched_portion, receptacle,
+ zpcre_get_substrings(plaintext, pcre_mdata, ret, matched_portion, receptacle,
want_offset_pair, 0, 0);
return_value = 0;
}
else {
- zwarnnam(nam, "error in pcre_exec [%d]", ret);
+ zwarnnam(nam, "error in pcre2_match [%d]", ret);
}
- if (ovec)
- zfree(ovec, ovecsize*sizeof(int));
+ if (pcre_mdata)
+ pcre2_match_data_free(pcre_mdata);
zsfree(plaintext);
return return_value;
@@ -383,17 +345,19 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
static int
cond_pcre_match(char **a, int id)
{
- pcre *pcre_pat;
- const char *pcre_err;
+ pcre2_code *pcre_pat = NULL;
+ int pcre_err;
+ PCRE2_SIZE pcre_erroff;
char *lhstr, *rhre, *lhstr_plain, *rhre_plain, *avar, *svar;
- int r = 0, pcre_opts = 0, pcre_errptr, capcnt, *ov, ovsize;
+ int r = 0, pcre_opts = 0;
+ pcre2_match_data *pcre_mdata = NULL;
int lhstr_plain_len, rhre_plain_len;
int return_value = 0;
if (zpcre_utf8_enabled())
- pcre_opts |= PCRE_UTF8;
+ pcre_opts |= PCRE2_UTF;
if (isset(REMATCHPCRE) && !isset(CASEMATCH))
- pcre_opts |= PCRE_CASELESS;
+ pcre_opts |= PCRE2_CASELESS;
lhstr = cond_str(a,0,0);
rhre = cond_str(a,1,0);
@@ -401,9 +365,6 @@ cond_pcre_match(char **a, int id)
rhre_plain = ztrdup(rhre);
unmetafy(lhstr_plain, &lhstr_plain_len);
unmetafy(rhre_plain, &rhre_plain_len);
- pcre_pat = NULL;
- ov = NULL;
- ovsize = 0;
if (isset(BASHREMATCH)) {
svar = NULL;
@@ -415,27 +376,27 @@ cond_pcre_match(char **a, int id)
switch(id) {
case CPCRE_PLAIN:
- if ((int)strlen(rhre_plain) != rhre_plain_len) {
- zwarn("embedded NULs in PCRE pattern terminate pattern");
- }
- pcre_pat = pcre_compile(rhre_plain, pcre_opts, &pcre_err, &pcre_errptr, NULL);
- if (pcre_pat == NULL) {
- zwarn("failed to compile regexp /%s/: %s", rhre, pcre_err);
+ if (!(pcre_pat = pcre2_compile((PCRE2_SPTR) rhre_plain,
+ (PCRE2_SIZE) rhre_plain_len, pcre_opts,
+ &pcre_err, &pcre_erroff, NULL)))
+ {
+ PCRE2_UCHAR buffer[256];
+ pcre2_get_error_message(pcre_err, buffer, sizeof(buffer));
+ zwarn("failed to compile regexp /%s/: %s", rhre, buffer);
break;
}
- pcre_fullinfo(pcre_pat, NULL, PCRE_INFO_CAPTURECOUNT, &capcnt);
- ovsize = (capcnt+1)*3;
- ov = zalloc(ovsize*sizeof(int));
- r = pcre_exec(pcre_pat, NULL, lhstr_plain, lhstr_plain_len, 0, 0, ov, ovsize);
- /* r < 0 => error; r==0 match but not enough size in ov
+ pcre_mdata = pcre2_match_data_create_from_pattern(pcre_pat, NULL);
+ r = pcre2_match(pcre_pat, (PCRE2_SPTR8) lhstr_plain, lhstr_plain_len,
+ 0, 0, pcre_mdata, NULL);
+ /* r < 0 => error; r==0 match but not enough size in match data
* r > 0 => (r-1) substrings found; r==1 => no substrings
*/
if (r==0) {
- zwarn("reportable zsh problem: pcre_exec() returned 0");
+ zwarn("reportable zsh problem: pcre2_match() returned 0");
return_value = 1;
break;
}
- else if (r==PCRE_ERROR_NOMATCH) {
+ else if (r == PCRE2_ERROR_NOMATCH) {
return_value = 0; /* no match */
break;
}
@@ -444,7 +405,7 @@ cond_pcre_match(char **a, int id)
break;
}
else if (r>0) {
- zpcre_get_substrings(lhstr_plain, ov, r, svar, avar, 0,
+ zpcre_get_substrings(lhstr_plain, pcre_mdata, r, svar, avar, 0,
isset(BASHREMATCH),
!isset(BASHREMATCH));
return_value = 1;
@@ -457,10 +418,10 @@ cond_pcre_match(char **a, int id)
free(lhstr_plain);
if(rhre_plain)
free(rhre_plain);
+ if (pcre_mdata)
+ pcre2_match_data_free(pcre_mdata);
if (pcre_pat)
- pcre_free(pcre_pat);
- if (ov)
- zfree(ov, ovsize*sizeof(int));
+ pcre2_code_free(pcre_pat);
return return_value;
}
@@ -489,11 +450,11 @@ static struct builtin bintab[] = {
static struct features module_features = {
bintab, sizeof(bintab)/sizeof(*bintab),
-#if defined(HAVE_PCRE_COMPILE) && defined(HAVE_PCRE_EXEC)
+#if defined(HAVE_PCRE2_COMPILE_8) && defined(HAVE_PCRE2_H)
cotab, sizeof(cotab)/sizeof(*cotab),
-#else /* !(HAVE_PCRE_COMPILE && HAVE_PCRE_EXEC) */
+#else /* !(HAVE_PCRE2_COMPILE_8 && HAVE_PCRE2_H) */
NULL, 0,
-#endif /* !(HAVE_PCRE_COMPILE && HAVE_PCRE_EXEC) */
+#endif /* !(HAVE_PCRE2_COMPILE_8 && HAVE_PCRE2_H) */
NULL, 0,
NULL, 0,
0
@@ -540,19 +501,9 @@ cleanup_(Module m)
int
finish_(UNUSED(Module m))
{
-#if defined(HAVE_PCRE_COMPILE) && defined(HAVE_PCRE_EXEC)
-#ifdef HAVE_PCRE_STUDY
- if (pcre_hints)
-#ifdef PCRE_CONFIG_JIT
- pcre_free_study(pcre_hints);
-#else
- pcre_free(pcre_hints);
-#endif
- pcre_hints = NULL;
-#endif
-
+#if defined(HAVE_PCRE2_COMPILE_8) && defined(HAVE_PCRE2_H)
if (pcre_pattern)
- pcre_free(pcre_pattern);
+ pcre2_code_free(pcre_pattern);
pcre_pattern = NULL;
#endif
diff --git a/Test/V07pcre.ztst b/Test/V07pcre.ztst
index 22a0b64c7..6eb366964 100644
--- a/Test/V07pcre.ztst
+++ b/Test/V07pcre.ztst
@@ -117,12 +117,17 @@
>78884; ZPCRE_OP: 25 30
>90210; ZPCRE_OP: 31 36
-# Embedded NULs allowed in plaintext, but not in RE (although \0 as two-chars allowed)
+# Embedded NULs allowed in plaintext, in RE, pcre supports \0 as two-chars
[[ $'a\0bc\0d' =~ '^(a\0.)(.+)$' ]]
print "${#MATCH}; ${#match[1]}; ${#match[2]}"
0:ensure ASCII NUL passes in and out of matched plaintext
>6; 3; 3
+# PCRE2 supports NULs also in the RE
+ [[ $'a\0b\0c' =~ $'^(.\0)+' ]] && print "${#MATCH}; ${#match[1]}"
+0:ensure ASCII NUL works also in the regex
+>4; 2
+
# Ensure the long-form infix operator works
[[ foo -pcre-match ^f..$ ]]
print $?
@@ -169,7 +174,11 @@
[[ é =~ '^..\z' ]]; echo $?
LANG=$LANG_SAVE
[[ é =~ '^.\z' ]]; echo $?
-0:swich between C/UTF-8 locales
+0:switch between C/UTF-8 locales
>0
>0
>0
+
+ [[ abc =~ 'a(d*)bc' ]] && print "$#MATCH; $#match; ${#match[1]}"
+0:empty capture
+>3; 1; 0
diff --git a/configure.ac b/configure.ac
index d8a17791a..4710d1659 100644
--- a/configure.ac
+++ b/configure.ac
@@ -438,7 +438,7 @@ fi],
dnl Do you want to look for pcre support?
AC_ARG_ENABLE(pcre,
-AS_HELP_STRING([--enable-pcre],[enable the search for the pcre library (may create run-time library dependencies)]))
+AS_HELP_STRING([--enable-pcre],[enable the search for the pcre2 library (may create run-time library dependencies)]))
dnl Do you want to look for capability support?
AC_ARG_ENABLE(cap,
@@ -652,13 +652,12 @@ AC_HEADER_SYS_WAIT
oldcflags="$CFLAGS"
if test x$enable_pcre = xyes; then
-AC_CHECK_PROG([PCRECONF], pcre-config, pcre-config)
-dnl Typically (meaning on this single RedHat 9 box in front of me)
-dnl pcre-config --cflags produces a -I output which needs to go into
+AC_CHECK_PROG([PCRECONF], pcre2-config, pcre2-config)
+dnl pcre2-config --cflags may produce a -I output which needs to go into
dnl CPPFLAGS else configure's preprocessor tests don't pick it up,
dnl producing a warning.
-if test "x$ac_cv_prog_PCRECONF" = xpcre-config; then
- CPPFLAGS="$CPPFLAGS `pcre-config --cflags`"
+if test "x$ac_cv_prog_PCRECONF" = xpcre2-config; then
+ CPPFLAGS="$CPPFLAGS `pcre2-config --cflags`"
fi
fi
@@ -668,9 +667,10 @@ AC_CHECK_HEADERS(sys/time.h sys/times.h sys/select.h termcap.h termio.h \
locale.h errno.h stdio.h stdarg.h varargs.h stdlib.h \
unistd.h sys/capability.h \
utmp.h utmpx.h sys/types.h pwd.h grp.h poll.h sys/mman.h \
- netinet/in_systm.h pcre.h langinfo.h wchar.h stddef.h \
+ netinet/in_systm.h langinfo.h wchar.h stddef.h \
sys/stropts.h iconv.h ncurses.h ncursesw/ncurses.h \
ncurses/ncurses.h)
+AC_CHECK_HEADERS([pcre2.h],,,[#define PCRE2_CODE_UNIT_WIDTH 8])
if test x$dynamic = xyes; then
AC_CHECK_HEADERS(dlfcn.h)
AC_CHECK_HEADERS(dl.h)
@@ -948,9 +948,7 @@ if test "x$ac_found_iconv" = "xyes"; then
fi
if test x$enable_pcre = xyes; then
-dnl pcre-config should probably be employed here
-dnl AC_SEARCH_LIBS(pcre_compile, pcre)
- LIBS="`$ac_cv_prog_PCRECONF --libs` $LIBS"
+ LIBS="`$ac_cv_prog_PCRECONF --libs8` $LIBS"
fi
dnl ---------------------
@@ -1313,7 +1311,7 @@ AC_CHECK_FUNCS(strftime strptime mktime timelocal \
pathconf sysconf \
tgetent tigetflag tigetnum tigetstr setupterm initscr resize_term \
getcchar setcchar waddwstr wget_wch win_wch use_default_colors \
- pcre_compile pcre_study pcre_exec \
+ pcre2_compile_8 \
nl_langinfo \
erand48 open_memstream \
posix_openpt \
Messages sorted by:
Reverse Date,
Date,
Thread,
Author