Zsh Mailing List Archive Messages sorted by: Reverse Date, Date, Thread, Author

Re: quest for bld_line (was: Re: Stuff to do)

X-seq: zsh-workers 22867
From: Andrey Borzenkov <arvidjaar@xxxxxxxxxx>
To: zsh-workers@xxxxxxxxxx
Subject: Re: quest for bld_line (was: Re: Stuff to do)
Date: Wed, 11 Oct 2006 21:54:27 +0400
In-reply-to: <200610092028.50896.arvidjaar@xxxxxxxxxx>
Mailing-list: contact zsh-workers-help@xxxxxxxxxx; run by ezmlm
References: <200609271211.k8RCBW5N023914@xxxxxxxxxxxxxx> <20061009130024.76925819.pws@xxxxxxx> <200610092028.50896.arvidjaar@xxxxxxxxxx>

On Monday 09 October 2006 20:28, Andrey Borzenkov wrote:
> On Monday 09 October 2006 16:00, Peter Stephenson wrote:
> > Andrey Borzenkov <arvidjaar@xxxxxxxxxx> wrote:
> > > The point of patch is to replace exhaustive enumeration of all possible
> > > combinations by comparison of patterns. I.e. it checks if two patterns
> > > may have something in common - this can be generalized later using
> > > different pattern representations.
> > >
> > > I would be happy if we could just toss away this function.
> > >
> > > Comments?
> >
> > I don't know enough about this stage of completion to comment in any
> > detail, but it certainly looks like we need to do something about
> > it...
>
> to avoid duplication of efforts - I know now how to do it properly (that
> is, to preserve full functionality without relying on current fixed 256
> byte tables), I just have to find some time to write it down.
>

OK so here is the updated patch; all tests passed; unfortunately I still do 
not know how to write regression tests that would specifically check this 
function.

I dare to presume that code became legible enough to be understandable now. It 
checks if it can match next character(s) and then recursively tries to match 
the rest. The net effect is that instead of answering "how to find all 
characters matching given pattern" we have to answer "can two patterns match 
the same character". It is obvious for literal character, ? and equivalence 
class; it is also obvious for future [:upper:] and [:lower:] patterns; and it 
is straightforward (albeit not necessarily fast) for [...] set.

Question - how does zsh internally represent [...] now? If this is a (set of) 
bit strings, comparison is trivial.

Comments (now attached)?

Index: Src/Zle/compmatch.c
===================================================================
RCS file: /cvsroot/zsh/zsh/Src/Zle/compmatch.c,v
retrieving revision 1.50
diff -u -p -r1.50 compmatch.c
--- Src/Zle/compmatch.c	30 Sep 2006 06:53:15 -0000	1.50
+++ Src/Zle/compmatch.c	11 Oct 2006 17:43:58 -0000
@@ -1087,51 +1087,70 @@ comp_match(char *pfx, char *sfx, char *w
     return r;
 }
 
-/* Check if the given pattern matches the given string.             *
- *  p and  s are either anchor or line pattern and string;
+/*
+ * Try to match single pair of characters from line and word
+ * Either of line or word patterns (and characters) may be missing
+ */
+static int
+cm_try_single(Cpattern lp, char lc, Cpattern wp, char wc)
+{
+    char li = 0, wi = 0;
+
+    if (lp) {
+	li = lp->tab[STOUC(lc)];
+	if (!li)
+	    return 0;
+    }
+    if (wp) {
+	wi = wp->tab[STOUC(wc)];
+	if (!wi)
+	    return 0;
+    }
+
+    /* Equiv is expected to be defined on both sides */
+    if (lp && lp->equiv && wp && wp->equiv && li != wi)
+	return 0;
+
+    return 1;
+}
+
+/*
+ * Check if the given pattern matches the given string.             *
+ * lp and ls are either anchor or line pattern and string;
  * wp and ws are word (candidate) pattern and string
- *
- * If only one pattern is given, we just check if characters match
- * If both line and word are given, we check that characters match
- * for {...} classes by comparing relative numbers in sequence.
- *
- * Patterns and strings are always passed in pairs, so it is enough
- * to check for non-NULL wp. p should always be present.
  */
 
 /**/
 mod_export int
-pattern_match(Cpattern p, char *s, Cpattern wp, char *ws)
+pattern_match(Cpattern lp, char *ls, Cpattern wp, char *ws)
 {
-    unsigned char c;
-    unsigned char wc;
-
-    while (p && wp && *s && *ws) {
-	c = p->tab[*((unsigned char *) s)];
-	wc = wp->tab[*((unsigned char *) ws)];
-
-	if (!c || !wc || c != wc)
+    while (lp && wp && *ls && *ws) {
+	if (!cm_try_single(lp, *ls, wp, *ws))
 	    return 0;
 
-	s++;
+	ls++;
 	ws++;
-	p = p->next;
+	lp = lp->next;
 	wp = wp->next;
     }
 
-    while (p && *s) {
-	if (!p->tab[*((unsigned char *) s)])
+    while (lp && *ls) {
+	if (!cm_try_single(lp, *ls, NULL, '\0'))
 	    return 0;
-	p = p->next;
-	s++;
+	lp = lp->next;
+	ls++;
     }
 
     while (wp && *ws) {
-	if (!wp->tab[*((unsigned char *) ws)])
+	if (!cm_try_single(NULL, '\0', wp, *ws))
 	    return 0;
 	wp = wp->next;
 	ws++;
     }
+    
+    /* If any line is shorter than pattern, return falure */
+    if ((lp && !*ls) || (wp && !*ws))
+	return 0;
 
     return 1;
 }
@@ -1214,106 +1233,164 @@ bld_parts(char *str, int len, int plen, 
     return ret;
 }
 
-/* This builds all the possible line patterns for the pattern pat in the
- * buffer line. Initially line is the same as lp, but during recursive
- * calls lp is incremented for storing successive characters. Whenever
- * a full possible string is build, we test if this line matches the
- * string given by wlen and word.
+/*
+ * Compare two different line patterns if they can have some common character
+ * Insert one of common characters in line we are building (it does not matter
+ * which one)
+ * mlp	- line pattern which has matched before
+ * mwp	- word pattern which has matched before
+ * nlp	- new line pattern that we currently test against mlp
+ * nwp	- new word pattern that we currently test against mwp
+ * line	- line we build; we insert characters there
+ */
+
+/**/
+static int
+pattern_compare(Cpattern mlp, Cpattern mwp, Cpattern nlp, Cpattern nwp,
+		char *line)
+{
+    while (nlp) {
+	int i;
+
+	/*
+	 * test to see if mlp and nlp have something in commons
+	 * nlp cannot be less than mlp (we check pattern length before)
+	 * but word pattern may of course be shorter than line ...
+	 */
+	for (i = 0; i < 256; i++)
+	    if (mlp->tab[i] && nlp->tab[i]) {
+		/* for equiv. class they must also match word pattern */
+		if (mlp->equiv) {
+		    if (!mwp || !nwp || (mlp->tab[i] == mwp->tab[i] &&
+			nlp->tab[i] == nwp->tab[i]))
+			break;
+		} else
+		    break;
+	    }
+	if (i < 256) {
+	    /* OK we found character that matches both matchers */
+	    *line++ = (char)i;
+	} else {
+	    /* No matching character */
+	    return 0;
+	}
+	mlp = mlp->next;
+	nlp = nlp->next;
+	if (mwp) mwp = mwp->next;
+	if (nwp) nwp = nwp->next;
+    }
+
+    return 1;
+}
+
+/* This tries to find out, if there is common line that may match two
+ * words (possible matches or parts thereof). When this function is called,
+ * it is ensured that `mword' has matched word pattern in `matcher';
+ * we try to find a string that both matches line pattern in `matcher'
+ * and another word `word'
  *
- * wpat contains pattern that matched previously
- * lpat contains the pattern for line we build
- * mword is a string that matched wpat before
- * word is string that we try to match now
+ * matcher - matcher that `mword' has been matched against
+ * line    - buffer for string we build
+ * lmatch  - length of line built so far
+ * wmatch  - length of word matching the above
+ * mword   - word that has matched word pattern in `matcher' before
+ * word    - is string that we try to match now
+ * wlen    - length of `word'
+ * sfx     - if we should match bacwards
  *
- * The return value is the length of the string matched in the word, it
+ * The return value is the length of the string matched in the `word', it
  * is zero if we couldn't build a line that matches the word.
  */
 
-
 /**/
 static int
-bld_line(Cpattern wpat, Cpattern lpat, char *line, char *lp,
+bld_line(Cmatcher matcher,  char *line, int lmatch, int wmatch,
 	 char *mword, char *word, int wlen, int sfx)
 {
-    if (lpat) {
-	/* Still working on the pattern. */
-
-	int i, l;
-	unsigned char c = 0;
+    static Cpattern *mlpa;
+    static Cpattern *mwpa;
+    Cmlist ms;
+    Cmatcher mp;
+    Cpattern pat;
+    char *lp;
+    int llen = matcher->llen, rl = 0, i, clpos, cwpos;
 
-	/* Get the number of the character for a correspondence class
-	 * if it has a corresponding class. */
-	if (lpat->equiv)
-	    if (wpat && *mword) {
-		c = wpat->tab[STOUC(*mword)];
-		wpat = wpat->next;
-		mword++;
-	    }
+    if (!lmatch) {
+	/*
+	 * Setup array instead of list; this is required for suffix matchs
+	 * Array size is that of line; we are not interested in patterns
+	 * above this length
+	 */
+	mlpa = zalloc(sizeof(Cpattern) * llen);
+	mwpa = zalloc(sizeof(Cpattern) * llen);
 
+	for (i = 0, pat = matcher->line; pat; i++, pat = pat->next)
+	    mlpa[i] = pat;
+	for (i = 0, pat = matcher->word; pat && i < llen; i++, pat = pat->next)
+	    mwpa[i] = pat;
+	/* In case mword is smaller than line ... */
+	for (i = matcher->wlen; i < llen; i++)
+	    mwpa[i] = NULL;
+    }
 
-	/* Walk through the table in the pattern and try the characters
-	 * that may appear in the current position. */
-	for (i = 0; i < 256; i++)
-	    if ((lpat->equiv && c) ? (c == lpat->tab[i]) : lpat->tab[i]) {
-		*lp = i;
-		/* We stored the character, now call ourselves to build
-		 * the rest. */
-		if ((l = bld_line(wpat, lpat->next, line, lp + 1,
-				  mword, word, wlen, sfx)))
-		    return l;
-	    }
+    if (sfx) {
+	lp = line + llen - lmatch;
+	clpos = llen - lmatch - 1;
+	cwpos = llen - wmatch - 1;
     } else {
-	/* We reached the end, i.e. the line string is fully build, now
-	 * see if it matches the given word. */
-
-	Cmlist ms;
-	Cmatcher mp;
-	int l = lp - line, t, rl = 0, ind, add;
-
-	/* Quick test if the strings are exactly the same. */
-	if (l == wlen && !strncmp(line, word, l))
-	    return l;
-
-	if (sfx) {
-	    line = lp; word += wlen;
-	    ind = -1; add = -1;
-	} else {
-	    ind = 0; add = 1;
+	lp = line + lmatch;
+	clpos = lmatch;
+	cwpos = wmatch;
+    }
+
+    /*
+     * First try if next character in word fits mword directly;
+     * if yes, try to build the rest of line
+     * Do it only if word is at least as large as line
+     */
+    if (wmatch + 1 <= llen &&
+	cm_try_single(mlpa[clpos], word[clpos],
+		      mwpa[clpos], mword[clpos])) {
+	/* last character? */
+	if (lmatch + 1 == llen) {
+	    rl = 1;
+	    goto cleanup;
 	}
-	/* We loop through the whole line string built. */
-	while (l && wlen) {
-	    if (word[ind] == line[ind]) {
-		/* The same character in both strings, skip over. */
-		line += add; word += add;
-		l--; wlen--; rl++;
-	    } else {
-		t = 0;
-		for (ms = bmatchers; ms && !t; ms = ms->next) {
-		    mp = ms->matcher;
-		    if (mp && !mp->flags && mp->wlen <= wlen && mp->llen <= l &&
-			pattern_match(mp->line, (sfx ? line - mp->llen : line),
-				      mp->word, (sfx ? word - mp->wlen : word))) {
-			/* Both the line and the word pattern matched,
-			 * now skip over the matched portions. */
-			if (sfx) {
-			    line -= mp->llen; word -= mp->wlen;
-			} else {
-			    line += mp->llen; word += mp->wlen;
-			}
-			l -= mp->llen; wlen -= mp->wlen; rl += mp->wlen;
-			t = 1;
-		    }
-		}
-		if (!t)
-		    /* Didn't match, give up. */
-		    return 0;
+	if ((rl = bld_line(matcher, line, lmatch + 1, wmatch + 1,
+			   mword, word, wlen, sfx)) >= 0) {
+	    rl++;
+	    goto cleanup;
+	}
+    }
+
+    /* nope; try each matcher in turn and see if we can match with it */
+    for (ms = bmatchers; ms; ms = ms->next) {
+	mp = ms->matcher;
+	if (mp && !mp->flags && mp->wlen <= wlen - wmatch && mp->llen <= llen - lmatch &&
+	    pattern_match(mp->word, (sfx ? word - wmatch - mp->wlen : word + wmatch),
+			  NULL, NULL) &&
+	    pattern_compare(mlpa[clpos], mwpa[clpos], mp->line, mp->word,
+			    (sfx ? lp - mp->llen : lp))) {
+	    /* Both the line and the word pattern matched,
+	     * now try to match the rest */
+	    rl = bld_line(matcher, line, lmatch + mp->llen, wmatch + mp->wlen,
+			  mword, word, wlen, sfx);
+	    if (rl) {
+		rl += mp->wlen;
+		goto cleanup;
 	    }
 	}
-	if (!l)
-	    /* Unmatched portion in the line built, return matched length. */
-	    return rl;
     }
-    return 0;
+
+    /* Nothing match; return failure */
+
+cleanup:
+    if (!lmatch) {
+	zfree(mlpa, sizeof(Cpattern) * matcher->llen);
+	zfree(mwpa, sizeof(Cpattern) * matcher->wlen);
+    }
+
+    return rl;
 }
 
 /* This builds a string that may be put on the line that fully matches the
@@ -1357,7 +1434,7 @@ join_strs(int la, char *sa, int lb, char
 			}
 			/* Now try to build a string that matches the other
 			 * string. */
-			if ((bl = bld_line(mp->word, mp->line, line, line,
+			if ((bl = bld_line(mp, line, 0, 0,
 					   *ap, *bp, *blp, 0))) {
 			    /* Found one, put it into the return string. */
 			    line[mp->llen] = '\0';
@@ -1560,7 +1637,7 @@ join_sub(Cmdata md, char *str, int len, 
 		    else
 			mw = nw - (sfx ? mp->wlen : 0);
 
-		    if ((bl = bld_line(mp->word, mp->line, line, line,
+		    if ((bl = bld_line(mp, line, 0, 0,
 				       mw, (t ? nw : ow), (t ? nl : ol), sfx)))  {
 			/* Yep, one of the lines matched the other
 			 * string. */

Attachment: pgpMmSySe8qPC.pgp
Description: PGP signature

References:
- Stuff to do
  - From: Peter Stephenson
- Re: quest for bld_line (was: Re: Stuff to do)
  - From: Peter Stephenson
- Re: quest for bld_line (was: Re: Stuff to do)
  - From: Andrey Borzenkov

Messages sorted by: Reverse Date, Date, Thread, Author