Zsh Mailing List Archive
Messages sorted by:
Reverse Date,
Date,
Thread,
Author
[patch] Re: UTF-8 and PCRE and metafy
- X-seq: zsh-workers 29837
- From: Phil Pennock <zsh-workers+phil.pennock@xxxxxxxxxxxx>
- To: Peter Stephenson <Peter.Stephenson@xxxxxxx>
- Subject: [patch] Re: UTF-8 and PCRE and metafy
- Date: Fri, 21 Oct 2011 05:56:25 -0400
- Cc: zsh-workers@xxxxxxx
- Dkim-signature: v=1; a=rsa-sha256; q=dns/txt; c=relaxed/relaxed; d=spodhuis.org; s=d201107; h=In-Reply-To:Content-Transfer-Encoding:Content-Type:MIME-Version:References:Message-ID:Subject:Cc:To:From:Date; bh=rxa8zRPmyj6XBpnaj44Ll241TM0pKlQpgiGmpgofPuI=; b=jyVp36jLlzR14g5fGG3r7iPi27GCuNVhPfS8dy1cvI2YgFi2Xuy8AJ0lnFSMAUXuE0FNISpO8UBI3MDqGY8W2XrmGPb2NRlE/u2d0tsjG4k7T6HUQjOLXAhX+VaFSSQLPim9VdFSPol5qNeOpRA0Tj11INGIhRXuS8gwvXXSgLQ=;
- In-reply-to: <20110308095850.12843492@pwslap01u.europe.root.pri>
- List-help: <mailto:zsh-workers-help@zsh.org>
- List-id: Zsh Workers List <zsh-workers.zsh.org>
- List-post: <mailto:zsh-workers@zsh.org>
- Mail-followup-to: Peter Stephenson <Peter.Stephenson@xxxxxxx>, zsh-workers@xxxxxxx
- Mailing-list: contact zsh-workers-help@xxxxxxx; run by ezmlm
- References: <20110308065216.GB79682@redoubt.spodhuis.org> <20110308095850.12843492@pwslap01u.europe.root.pri>
On 2011-03-08 at 09:58 +0000, Peter Stephenson wrote:
> On Tue, 8 Mar 2011 01:52:16 -0500
> Phil Pennock <zsh-workers+phil.pennock@xxxxxxxxxxxx> wrote:
> > I'm guessing I need a bunch of calls to metafy() to process the
> > results of extraction in zpcre_get_substrings() ?
>
> You'll need to unmetafy any string getting passed into
> pcre_get_substring_list() and metafy() the resulting captures coming
> out. You should duplicate any string that needs unmetafying, since
> otherwise it's in place and you may need the metafied form later (you do
> for the string passed in as the first argument).
Okay, it took me far too long to get back around to this, sorry. :(
Attached is what looks to me to be a correct patch. With bash_rematch
set, I can do:
% [[ 'foo→bar' =~ .([^[:ascii:]]). ]]
% echo $BASH_REMATCH
o→b →
% [[ 'foo→bar' =~ .(→.). ]]
% echo $BASH_REMATCH
o→ba →b
I'm not sure when I should be using the wcs_strdup() functions and
the like; what I've got appears to work. None of what I've added
appears to be specific to UTF-8.
Is it reasonable to add tests to D07multibyte.ztst for this, with the
zsh/pcre dependency?
Can anyone spot any cases I've missed in zsh/pcre?
Does anyone know of a system extended regexp library which supports
multibyte characters? I think I should be making the same changes to
zsh/regex but am not sure where to actually test those changes.
Regards,
-Phil
Index: Src/Modules/pcre.c
===================================================================
RCS file: /home/cvsroot/zsh/Src/Modules/pcre.c,v
retrieving revision 1.18
diff -a -u -p -r1.18 pcre.c
--- Src/Modules/pcre.c 20 Jan 2010 11:17:11 -0000 1.18
+++ Src/Modules/pcre.c 21 Oct 2011 09:43:29 -0000
@@ -77,6 +77,7 @@ bin_pcre_compile(char *nam, char **args,
{
int pcre_opts = 0, pcre_errptr;
const char *pcre_error;
+ char *target;
if(OPT_ISSET(ops,'a')) pcre_opts |= PCRE_ANCHORED;
if(OPT_ISSET(ops,'i')) pcre_opts |= PCRE_CASELESS;
@@ -92,8 +93,13 @@ bin_pcre_compile(char *nam, char **args,
if (pcre_pattern)
pcre_free(pcre_pattern);
- pcre_pattern = pcre_compile(*args, pcre_opts, &pcre_error, &pcre_errptr, NULL);
+ target = ztrdup(*args);
+ unmetafy(target, NULL);
+
+ pcre_pattern = pcre_compile(target, pcre_opts, &pcre_error, &pcre_errptr, NULL);
+ free(target);
+
if (pcre_pattern == NULL)
{
zwarnnam(nam, "error in regex: %s", pcre_error);
@@ -161,7 +167,7 @@ zpcre_get_substrings(char *arg, int *ove
sprintf(offset_all, "%d %d", ovec[0], ovec[1]);
setsparam("ZPCRE_OP", ztrdup(offset_all));
}
- match_all = ztrdup(captures[0]);
+ match_all = metafy(captures[0], -1, META_DUP);
setsparam(matchvar, match_all);
/*
* If we're setting match, mbegin, mend we only do
@@ -169,7 +175,15 @@ zpcre_get_substrings(char *arg, int *ove
* (c.f. regex.c).
*/
if (!want_begin_end || nelem) {
- matches = zarrdup(&captures[capture_start]);
+ char **x, **y;
+ y = &captures[capture_start];
+ matches = x = (char **) zalloc(sizeof(char *) * (arrlen(y) + 1));
+ do {
+ if (*y)
+ *x++ = metafy(*y, -1, META_DUP);
+ else
+ *x++ = NULL;
+ } while (*y++);
setaparam(substravar, matches);
}
@@ -255,6 +269,7 @@ bin_pcre_match(char *nam, char **args, O
{
int ret, capcount, *ovec, ovecsize, c;
char *matched_portion = NULL;
+ char *plaintext = NULL;
char *receptacle = NULL;
int return_value = 1;
/* The subject length and offset start are both int values in pcre_exec */
@@ -292,22 +307,23 @@ bin_pcre_match(char *nam, char **args, O
ovecsize = (capcount+1)*3;
ovec = zalloc(ovecsize*sizeof(int));
- subject_len = (int)strlen(*args);
+ plaintext = ztrdup(*args);
+ subject_len = (int)strlen(plaintext);
if (offset_start < 0 || offset_start >= subject_len)
ret = PCRE_ERROR_NOMATCH;
else
- ret = pcre_exec(pcre_pattern, pcre_hints, *args, subject_len, offset_start, 0, ovec, ovecsize);
+ ret = pcre_exec(pcre_pattern, pcre_hints, plaintext, subject_len, offset_start, 0, ovec, ovecsize);
if (ret==0) return_value = 0;
else if (ret==PCRE_ERROR_NOMATCH) /* no match */;
else if (ret>0) {
- zpcre_get_substrings(*args, ovec, ret, matched_portion, receptacle,
+ zpcre_get_substrings(plaintext, ovec, ret, matched_portion, receptacle,
want_offset_pair, 0, 0);
return_value = 0;
}
else {
- zwarnnam(nam, "error in pcre_exec");
+ zwarnnam(nam, "error in pcre_exec [%d]", ret);
}
if (ovec)
@@ -322,7 +338,8 @@ cond_pcre_match(char **a, int id)
{
pcre *pcre_pat;
const char *pcre_err;
- char *lhstr, *rhre, *avar=NULL;
+ char *lhstr, *rhre, *lhstr_plain, *rhre_plain, *avar=NULL;
+ char *p;
int r = 0, pcre_opts = 0, pcre_errptr, capcnt, *ov, ovsize;
int return_value = 0;
@@ -331,6 +348,10 @@ cond_pcre_match(char **a, int id)
lhstr = cond_str(a,0,0);
rhre = cond_str(a,1,0);
+ lhstr_plain = ztrdup(lhstr);
+ rhre_plain = ztrdup(rhre);
+ unmetafy(lhstr_plain, NULL);
+ unmetafy(rhre_plain, NULL);
pcre_pat = NULL;
ov = NULL;
@@ -339,7 +360,7 @@ cond_pcre_match(char **a, int id)
switch(id) {
case CPCRE_PLAIN:
- pcre_pat = pcre_compile(rhre, pcre_opts, &pcre_err, &pcre_errptr, NULL);
+ pcre_pat = pcre_compile(rhre_plain, pcre_opts, &pcre_err, &pcre_errptr, NULL);
if (pcre_pat == NULL) {
zwarn("failed to compile regexp /%s/: %s", rhre, pcre_err);
break;
@@ -347,7 +368,7 @@ cond_pcre_match(char **a, int id)
pcre_fullinfo(pcre_pat, NULL, PCRE_INFO_CAPTURECOUNT, &capcnt);
ovsize = (capcnt+1)*3;
ov = zalloc(ovsize*sizeof(int));
- r = pcre_exec(pcre_pat, NULL, lhstr, strlen(lhstr), 0, 0, ov, ovsize);
+ r = pcre_exec(pcre_pat, NULL, lhstr_plain, strlen(lhstr_plain), 0, 0, ov, ovsize);
/* r < 0 => error; r==0 match but not enough size in ov
* r > 0 => (r-1) substrings found; r==1 => no substrings
*/
@@ -356,13 +377,16 @@ cond_pcre_match(char **a, int id)
return_value = 1;
break;
}
- else if (r==PCRE_ERROR_NOMATCH) return 0; /* no match */
+ else if (r==PCRE_ERROR_NOMATCH) {
+ return_value = 0; /* no match */
+ break;
+ }
else if (r<0) {
- zwarn("pcre_exec() error: %d", r);
+ zwarn("pcre_exec() error [%d]", r);
break;
}
else if (r>0) {
- zpcre_get_substrings(lhstr, ov, r, NULL, avar, 0,
+ zpcre_get_substrings(lhstr_plain, ov, r, NULL, avar, 0,
isset(BASHREMATCH),
!isset(BASHREMATCH));
return_value = 1;
@@ -371,6 +395,10 @@ cond_pcre_match(char **a, int id)
break;
}
+ if (lhstr_plain)
+ free(lhstr_plain);
+ if(rhre_plain)
+ free(rhre_plain);
if (pcre_pat)
pcre_free(pcre_pat);
if (ov)
Messages sorted by:
Reverse Date,
Date,
Thread,
Author