Zsh Mailing List Archive
Messages sorted by: Reverse Date, Date, Thread, Author

Re: named capture groups with PCRE matching (Was: PATCH: migrate pcre module to pcre2)



Bart Schaefer wrote:
> > >  * named capture groups

> Since this would be new functionality anyway, perhaps it's a good
> opportunity to take advantage of namespaces, e.g. ${.pcre.match[foo]}
> etc., instead of overloading the existing variables.

You can mix named and unnamed captures in Perl so overloading is perhaps
better avoided. And I agree we should make good use of the namespaces.
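This patch does so, storing named captures in the associative array
.pcre.match by default, and also adds a -A option to pcre_match for
naming a different array, as suggested by Stephane.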

Oliver

diff --git a/Doc/Zsh/mod_pcre.yo b/Doc/Zsh/mod_pcre.yo
index c2817f519..6d073985d 100644
--- a/Doc/Zsh/mod_pcre.yo
+++ b/Doc/Zsh/mod_pcre.yo
@@ -20,12 +20,12 @@ including those that indicate newline.
 )
 findex(pcre_study)
 item(tt(pcre_study))(
-Studies the previously-compiled PCRE which may result in faster
-matching.
+Requests JIT compilation for the previously-compiled PCRE which
+may result in faster matching.
 )
 findex(pcre_match)
 item(tt(pcre_match) [ tt(-v) var(var) ] [ tt(-a) var(arr) ] \
-[ tt(-n) var(offset) ] [ tt(-b) ] var(string))(
+[ tt(-A) var(assoc) ] [ tt(-n) var(offset) ] [ tt(-b) ] var(string))(
 Returns successfully if tt(string) matches the previously-compiled
 PCRE.
 
@@ -36,7 +36,9 @@ substrings, unless the tt(-a) option is given, in which
 case it will set the array var(arr).  Similarly, the variable
 tt(MATCH) will be set to the entire matched portion of the
 string, unless the tt(-v) option is given, in which case the variable
-var(var) will be set.
+var(var) will be set. Furthermore, any named captures will
+be stored in the associative array tt(.pcre.match) unless an
+alternative is given with tt(-A).
 No variables are altered if there is no successful match.
 A tt(-n) option starts searching for a match from the
 byte var(offset) position in var(string).  If the tt(-b) option is given,
diff --git a/Src/Modules/pcre.c b/Src/Modules/pcre.c
index 079ecc2c5..6be1f76e2 100644
--- a/Src/Modules/pcre.c
+++ b/Src/Modules/pcre.c
@@ -129,14 +129,17 @@ bin_pcre_study(char *nam, UNUSED(char **args), UNUSED(Options ops), UNUSED(int f
 }
 
 static int
-zpcre_get_substrings(char *arg, pcre2_match_data *mdata, int captured_count,
-	char *matchvar, char *substravar, int want_offset_pair,
-	int matchedinarr, int want_begin_end)
+zpcre_get_substrings(pcre2_code *pat, char *arg, pcre2_match_data *mdata,
+	int captured_count, char *matchvar, char *substravar, char *namedassoc,
+	int want_offset_pair, int matchedinarr, int want_begin_end)
 {
     PCRE2_SIZE *ovec;
     char *match_all, **matches;
     char offset_all[50];
     int capture_start = 1;
+    int vec_off;
+    PCRE2_SPTR ntable; /* table of named captures */
+    uint32_t ncount, nsize;
 
     if (matchedinarr) {
 	/* bash-style ovec[0] entire-matched string in the array */
@@ -174,7 +177,7 @@ zpcre_get_substrings(char *arg, pcre2_match_data *mdata, int captured_count,
 	if (substravar &&
 	    (!want_begin_end || nelem)) {
 	    char **x;
-	    int vec_off, i;
+	    int i;
 	    matches = x = (char **) zalloc(sizeof(char *) * (captured_count+1-capture_start));
 	    for (i = capture_start; i < captured_count; i++) {
 		vec_off = 2*i;
@@ -184,6 +187,23 @@ zpcre_get_substrings(char *arg, pcre2_match_data *mdata, int captured_count,
 	    setaparam(substravar, matches);
 	}
 
+	if (!pcre2_pattern_info(pat, PCRE2_INFO_NAMECOUNT, &ncount) && ncount
+		&& !pcre2_pattern_info(pat, PCRE2_INFO_NAMEENTRYSIZE, &nsize)
+		&& !pcre2_pattern_info(pat, PCRE2_INFO_NAMETABLE, &ntable))
+	{
+	    char **hash, **hashptr;
+	    uint32_t nidx;
+	    hashptr = hash = (char **)zshcalloc((ncount+1)*2*sizeof(char *));
+	    for (nidx = 0; nidx < ncount; nidx++) {
+		vec_off = (ntable[nsize * nidx] << 9) + 2 * ntable[nsize * nidx + 1];
+		/* would metafy the key but pcre limits characters in the name */
+		*hashptr++ = ztrdup((char *) ntable + nsize * nidx + 2);
+		*hashptr++ = metafy(arg + ovec[vec_off],
+			ovec[vec_off+1]-ovec[vec_off], META_DUP);
+	    }
+	    sethparam(namedassoc, hash);
+	}
+
 	if (want_begin_end) {
 	    /*
 	     * cond-infix rather than builtin; also not bash; so we set a bunch
@@ -286,6 +306,7 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
     char *matched_portion = NULL;
     char *plaintext = NULL;
     char *receptacle = NULL;
+    char *named = ".pcre.match";
     int return_value = 1;
     /* The subject length and offset start are both int values in pcre_exec */
     int subject_len;
@@ -305,6 +326,9 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
     if(OPT_HASARG(ops,c='v')) {
 	matched_portion = OPT_ARG(ops,c);
     }
+    if (OPT_HASARG(ops, c='A')) {
+	named = OPT_ARG(ops, c);
+    }
     if(OPT_HASARG(ops,c='n')) { /* The offset position to start the search, in bytes. */
 	if ((offset_start = getposint(OPT_ARG(ops,c), nam)) < 0)
 	    return 1;
@@ -326,8 +350,8 @@ bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
     if (ret==0) return_value = 0;
     else if (ret == PCRE2_ERROR_NOMATCH) /* no match */;
     else if (ret>0) {
-	zpcre_get_substrings(plaintext, pcre_mdata, ret, matched_portion, receptacle,
-			     want_offset_pair, 0, 0);
+	zpcre_get_substrings(pcre_pattern, plaintext, pcre_mdata, ret, matched_portion,
+		receptacle, named, want_offset_pair, 0, 0);
 	return_value = 0;
     }
     else {
@@ -405,9 +429,8 @@ cond_pcre_match(char **a, int id)
 		    break;
 		}
                 else if (r>0) {
-		    zpcre_get_substrings(lhstr_plain, pcre_mdata, r, svar, avar, 0,
-					 isset(BASHREMATCH),
-					 !isset(BASHREMATCH));
+		    zpcre_get_substrings(pcre_pat, lhstr_plain, pcre_mdata, r, svar, avar,
+			    ".pcre.match", 0, isset(BASHREMATCH), !isset(BASHREMATCH));
 		    return_value = 1;
 		    break;
 		}
@@ -443,7 +466,7 @@ static struct conddef cotab[] = {
 
 static struct builtin bintab[] = {
     BUILTIN("pcre_compile", 0, bin_pcre_compile, 1, 1, 0, "aimxs",  NULL),
-    BUILTIN("pcre_match",   0, bin_pcre_match,   1, 1, 0, "a:v:n:b",    NULL),
+    BUILTIN("pcre_match",   0, bin_pcre_match,   1, 1, 0, "A:a:v:n:b",    NULL),
     BUILTIN("pcre_study",   0, bin_pcre_study,   0, 0, 0, NULL,    NULL)
 };
 
diff --git a/Test/V07pcre.ztst b/Test/V07pcre.ztst
index 6eb366964..027fea3aa 100644
--- a/Test/V07pcre.ztst
+++ b/Test/V07pcre.ztst
@@ -182,3 +182,17 @@
   [[ abc =~ 'a(d*)bc' ]] && print "$#MATCH; $#match; ${#match[1]}"
 0:empty capture
 >3; 1; 0
+
+  [[ category/name-12345 =~ '(?x)^
+    (?<category> [^/]* ) /
+    (?<package>
+      (?<name> \w+ ) -
+      (?<version> \d+ ))$' ]]
+  typeset -p1 .pcre.match
+0:named captures
+>typeset -g -A .pcre.match=(
+>  [category]=category
+>  [name]=name
+>  [package]=name-12345
+>  [version]=12345
+>)




Messages sorted by: Reverse Date, Date, Thread, Author