Zsh Mailing List Archive
Messages sorted by:
Reverse Date,
Date,
Thread,
Author
PATCH: New options for the PCRE module
- X-seq: zsh-workers 26620
- From: Jon Strait <jstrait@xxxxxxxxxxxx>
- To: zsh workers <zsh-workers@xxxxxxxxxx>
- Subject: PATCH: New options for the PCRE module
- Date: Thu, 26 Feb 2009 23:15:50 -0800
- Mailing-list: contact zsh-workers-help@xxxxxxxxxx; run by ezmlm
Hi all,
I thought the PCRE module could use a little enhancement, so I added a
few things that may be useful to some.
Let's see if I can get through all of this before the coffee wears off...
1. A new '-s' option to pcre_compile. This sets the frequently used
PCRE_DOTALL option, which lets the dot metacharacter match a newline as well.
2. For pcre_match, a '-n offset' option for starting the search at the
given byte offset in the match string, and a '-b' option for setting the
variable ZPCRE_OP to the pair of byte offsets that delimit the entire
successful pattern match. For example, if a pattern matches with the
'-b' option set, a ZPCRE_OP set to the string "32 45" indicates that the
entire match started at byte position 32 and ended at byte position 44.
In PCRE's terms, the match is the zero-based byte range from 32 up to, but not including, 45.
All of this is to enable the 'find all' functionality. For example, if
I want all of the non-overlapping matches within a string, I can now do:
accum=()
pcre_match -b -- $match_string
while [[ -n $ZPCRE_OP ]] do
b=($=ZPCRE_OP)
accum+=$MATCH
pcre_match -b -n $(( b[2] )) -- $match_string
done
print -l $accum
To be on the safe side regarding multi-byte characters: the returned
offsets are byte counts, not character counts, so I'm assuming that they
are only passed back to pcre_match (via '-n') and are not used for
indexing into the match string.
3. A needed correction: all of the module's external variables are now
unset on each match attempt, so that a failed match will be obvious.
Could someone please point me to the doc files that would need updating
(for the zshmodule man page), or if someone here has that part
automated, I can send them whatever targeted write-up they want.
--- pcre.c 2007-07-09 02:30:42.000000000 -0700
+++ pcre-new.c 2009-02-26 22:10:46.000000000 -0800
@@ -82,6 +82,7 @@
if(OPT_ISSET(ops,'i')) pcre_opts |= PCRE_CASELESS;
if(OPT_ISSET(ops,'m')) pcre_opts |= PCRE_MULTILINE;
if(OPT_ISSET(ops,'x')) pcre_opts |= PCRE_EXTENDED;
+ if(OPT_ISSET(ops,'s')) pcre_opts |= PCRE_DOTALL;
if (zpcre_utf8_enabled())
pcre_opts |= PCRE_UTF8;
@@ -137,20 +138,23 @@
/**/
static int
-zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substravar, int matchedinarr)
+zpcre_get_substrings(char *arg, int *ovec, int ret, char *matchvar, char *substravar,
+ int want_offset_pair, int matchedinarr)
{
char **captures, *match_all, **matches;
+ char offset_all[50];
int capture_start = 1;
if (matchedinarr)
capture_start = 0;
- if (matchvar == NULL)
- matchvar = "MATCH";
- if (substravar == NULL)
- substravar = "match";
-
+
/* captures[0] will be entire matched string, [1] first substring */
- if(!pcre_get_substring_list(arg, ovec, ret, (const char ***)&captures)) {
+ if (!pcre_get_substring_list(arg, ovec, ret, (const char ***)&captures)) {
+ /* Set to the offsets of the complete match */
+ if (want_offset_pair) {
+ sprintf(offset_all, "%d %d", ovec[0], ovec[1]);
+ setsparam("ZPCRE_OP", ztrdup(offset_all));
+ }
match_all = ztrdup(captures[0]);
setsparam(matchvar, match_all);
matches = zarrdup(&captures[capture_start]);
@@ -163,12 +167,30 @@
/**/
static int
+getposint(char *instr, char *nam)
+{
+ char *eptr;
+ int ret;
+
+ ret = (int)zstrtol(instr, &eptr, 10);
+ if (*eptr || ret < 0) {
+ zwarnnam(nam, "integer expected: %s", instr);
+ return -1;
+ }
+
+ return ret;
+}
+
+/**/
+static int
bin_pcre_match(char *nam, char **args, Options ops, UNUSED(int func))
{
int ret, capcount, *ovec, ovecsize, c;
char *matched_portion = NULL;
char *receptacle = NULL;
int return_value = 1;
+ int offset_start = 0;
+ int want_offset_pair = 0;
if (pcre_pattern == NULL) {
zwarnnam(nam, "no pattern has been compiled");
@@ -181,6 +203,12 @@
if(OPT_HASARG(ops,c='v')) {
matched_portion = OPT_ARG(ops,c);
}
+ if(OPT_HASARG(ops,c='n')) { /* The offset position to start the search */
+ offset_start = getposint(OPT_ARG(ops,c), nam);
+ }
+ /* For the entire match, 'Return' the offset positions instead of the matched string */
+ if(OPT_ISSET(ops,'b')) want_offset_pair = 1;
+
if(!*args) {
zwarnnam(nam, "not enough arguments");
}
@@ -194,12 +222,22 @@
ovecsize = (capcount+1)*3;
ovec = zalloc(ovecsize*sizeof(int));
- ret = pcre_exec(pcre_pattern, pcre_hints, *args, strlen(*args), 0, 0, ovec, ovecsize);
+ ret = pcre_exec(pcre_pattern, pcre_hints, *args, strlen(*args), offset_start, 0, ovec, ovecsize);
+
+ if (matched_portion == NULL)
+ matched_portion = "MATCH";
+ if (receptacle == NULL)
+ receptacle = "match";
+
+ /* Reset the external variables */
+ unsetparam(matched_portion);
+ unsetparam(receptacle);
+ unsetparam("ZPCRE_OP");
if (ret==0) return_value = 0;
else if (ret==PCRE_ERROR_NOMATCH) /* no match */;
else if (ret>0) {
- zpcre_get_substrings(*args, ovec, ret, matched_portion, receptacle, 0);
+ zpcre_get_substrings(*args, ovec, ret, matched_portion, receptacle, want_offset_pair, 0);
return_value = 0;
}
else {
@@ -258,7 +296,7 @@
break;
}
else if (r>0) {
- zpcre_get_substrings(lhstr, ov, r, NULL, avar, isset(BASHREMATCH));
+ zpcre_get_substrings(lhstr, ov, r, NULL, avar, 0, isset(BASHREMATCH));
return_value = 1;
break;
}
@@ -289,8 +327,8 @@
#endif /* !(HAVE_PCRE_COMPILE && HAVE_PCRE_EXEC) */
static struct builtin bintab[] = {
- BUILTIN("pcre_compile", 0, bin_pcre_compile, 1, 1, 0, "aimx", NULL),
- BUILTIN("pcre_match", 0, bin_pcre_match, 1, 1, 0, "a:v:", NULL),
+ BUILTIN("pcre_compile", 0, bin_pcre_compile, 1, 1, 0, "aimxs", NULL),
+ BUILTIN("pcre_match", 0, bin_pcre_match, 1, 1, 0, "a:v:n:b", NULL),
BUILTIN("pcre_study", 0, bin_pcre_study, 0, 0, 0, NULL, NULL)
};
Messages sorted by:
Reverse Date,
Date,
Thread,
Author