summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authordcs <dcs@FreeBSD.org>2000-06-29 04:48:34 +0000
committerdcs <dcs@FreeBSD.org>2000-06-29 04:48:34 +0000
commit83f8b91f10d1b498ad3400df259bf55f4eba5e43 (patch)
tree555fba08ae3d1d411a94ab499f9ed6cf0144a059
parent12d92ca4840d8a4774f880ce5fffc9e30b05d810 (diff)
downloadFreeBSD-src-83f8b91f10d1b498ad3400df259bf55f4eba5e43.zip
FreeBSD-src-83f8b91f10d1b498ad3400df259bf55f4eba5e43.tar.gz
Add Boyler-Moore algorithm to pre-matching test.
The BM algorithm works by scanning the pattern from right to left, and jumping as many characters as viable based on the text's mismatched character and the pattern's already matched suffix. This typically enable us to test only a fraction of the text's characters, but has a worse performance than the straight-forward method for small patterns. Because of this, the BM algorithm will only be used if the pattern size is at least 4 characters. Notice that this pre-matching is done on the largest substring of the regular expression that _must_ be present on the text for a succesful match to be possible at all. For instance, "(xyzzy|grues)" will yield a null "must" substring, and, therefore, not benefit from the BM algorithm at all. Because of the lack of intelligence of the algorithm that finds the "must" string, things like "charjump|matchjump" will also yield a null string. To optimize that, "(char|match)jump" should be used. The setup time (at regcomp()) for the BM algorithm will most likely outweight any benefits for one-time matches. Given the slow regex(3) we have, this is unlikely to be even perceptible, though. The size of a regex_t structure is increased by 2*sizeof(char*) + 256*sizeof(int) + strlen(must)*sizeof(int). This is all inside the regex_t's "guts", which is allocated dynamically by regcomp(). If allocation of either of the two tables fail, the other one is freed. In this case, the straight-forward algorithm is used for pre-matching. Tests exercising the code path affected have shown a speed increase of 50% for "must" strings of length four or five. API and ABI remain unchanged by this commit. The patch submitted on the PR was not used, as it was non-functional. PR: 14342
-rw-r--r--lib/libc/regex/engine.c60
-rw-r--r--lib/libc/regex/regcomp.c144
-rw-r--r--lib/libc/regex/regex2.h4
3 files changed, 202 insertions, 6 deletions
diff --git a/lib/libc/regex/engine.c b/lib/libc/regex/engine.c
index be569b1..9d11f54 100644
--- a/lib/libc/regex/engine.c
+++ b/lib/libc/regex/engine.c
@@ -35,6 +35,8 @@
* SUCH DAMAGE.
*
* @(#)engine.c 8.5 (Berkeley) 3/20/94
+ *
+ * $FreeBSD$
*/
/*
@@ -152,6 +154,16 @@ int eflags;
register const sopno gl = g->laststate;
char *start;
char *stop;
+ /* Boyer-Moore algorithms variables */
+ register unsigned char *pp;
+ int cj, mj;
+ register unsigned char *mustfirst;
+ register unsigned char *mustlast;
+ register int mustlen;
+ register int *matchjump;
+ register int *charjump;
+ register unsigned char *bmp;
+ register unsigned char *bmps;
/* simplify the situation where possible */
if (g->cflags&REG_NOSUB)
@@ -168,12 +180,48 @@ int eflags;
/* prescreening; this does wonders for this rather slow code */
if (g->must != NULL) {
- for (dp = start; dp < stop; dp++)
- if (*dp == g->must[0] && stop - dp >= g->mlen &&
- memcmp(dp, g->must, (size_t)g->mlen) == 0)
- break;
- if (dp == stop) /* we didn't find g->must */
- return(REG_NOMATCH);
+ if(g->charjump != NULL && g->matchjump != NULL) {
+ mustlen = -g->mlen;
+ mustfirst = g->must;
+ mustlast = g->must + g->mlen - 1;
+ charjump = g->charjump;
+ matchjump = g->matchjump;
+ bmps = stop;
+ pp = mustlast;
+ for(bmp = start+g->mlen-1; bmp < bmps;) {
+ /* Fast skip non-matches */
+ while(bmp < bmps && charjump[*bmp])
+ bmp += charjump[*bmp];
+
+ if(bmp >= bmps)
+ break;
+
+ /* Greedy matcher */
+ /* We depend on not being used for
+ * for strings of length 1
+ */
+ while(*--bmp == *--pp && pp != mustfirst);
+
+ if(*bmp == *pp)
+ break;
+
+ /* Jump to next possible match */
+ mj = matchjump[pp - mustfirst];
+ cj = charjump[*bmp];
+ bmp += (cj < mj ? mj : cj);
+ pp = mustlast;
+ }
+ if(pp != mustfirst)
+ return(REG_NOMATCH);
+ } else {
+ for (dp = start; dp < stop; dp++)
+ if (*dp == g->must[0] &&
+ stop - dp >= g->mlen &&
+ memcmp(dp, g->must, (size_t)g->mlen) == 0)
+ break;
+ if (dp == stop) /* we didn't find g->must */
+ return(REG_NOMATCH);
+ }
}
/* match struct setup */
diff --git a/lib/libc/regex/regcomp.c b/lib/libc/regex/regcomp.c
index a4e9e1d..f71d63c 100644
--- a/lib/libc/regex/regcomp.c
+++ b/lib/libc/regex/regcomp.c
@@ -35,6 +35,8 @@
* SUCH DAMAGE.
*
* @(#)regcomp.c 8.5 (Berkeley) 3/20/94
+ *
+ * $FreeBSD$
*/
#if defined(LIBC_SCCS) && !defined(lint)
@@ -122,6 +124,8 @@ static void dofwd __P((struct parse *p, sopno pos, sop value));
static void enlarge __P((struct parse *p, sopno size));
static void stripsnug __P((struct parse *p, struct re_guts *g));
static void findmust __P((struct parse *p, struct re_guts *g));
+static void computejumps __P((struct parse *p, struct re_guts *g));
+static void computematchjumps __P((struct parse *p, struct re_guts *g));
static sopno pluscount __P((struct parse *p, struct re_guts *g));
#ifdef __cplusplus
@@ -167,6 +171,9 @@ static int never = 0; /* for use in asserts; shuts lint up */
#define never 0 /* some <assert.h>s have bugs too */
#endif
+/* Macro used by computejump()/computematchjump() */
+#define MIN(a,b) ((a)<(b)?(a):(b))
+
/*
- regcomp - interface for parser and compilation
= extern int regcomp(regex_t *, const char *, int);
@@ -262,6 +269,17 @@ int cflags;
categorize(p, g);
stripsnug(p, g);
findmust(p, g);
+ /* only use Boyer-Moore algorithm if the pattern is bigger
+ * than three characters
+ */
+ if(g->mlen > 3) {
+ computejumps(p, g);
+ computematchjumps(p, g);
+ if(g->matchjump == NULL) {
+ free(g->charjump);
+ g->charjump = NULL;
+ }
+ }
g->nplus = pluscount(p, g);
g->magic = MAGIC2;
preg->re_nsub = g->nsub;
@@ -1741,6 +1759,132 @@ register struct re_guts *g;
}
/*
+ - computejumps - compute char jumps for BM scan
+ == static void computejumps(register struct parse *p, register struct re_guts *g);
+ *
+ * This algorithm assumes g->must exists and is has size greater than
+ * zero. It's based on the algorithm found on Computer Algorithms by
+ * Sara Baase.
+ *
+ * A char jump is the number of characters one needs to jump based on
+ * the value of the character from the text that was mismatched.
+ */
+static void
+computejumps(p, g)
+struct parse *p;
+struct re_guts *g;
+{
+ int ch;
+ int mindex;
+
+ /* Avoid making errors worse */
+ if (p->error != 0)
+ return;
+
+ g->charjump = malloc(256 * sizeof(int));
+ if (g->charjump == NULL) /* Not a fatal error */
+ return;
+
+ /* If the character does not exist in the pattern, the jump
+ * is equal to the number of characters in the pattern.
+ */
+ for (ch = 0; ch < 256; ch++)
+ g->charjump[ch] = g->mlen;
+
+ /* If the character does exist, compute the jump that would
+ * take us to the last character in the pattern equal to it
+ * (notice that we match right to left, so that last character
+ * is the first one that would be matched).
+ */
+ for (mindex = 0; mindex < g->mlen; mindex++)
+ g->charjump[g->must[mindex]] = g->mlen - mindex - 1;
+}
+
+/*
+ - computematchjumps - compute match jumps for BM scan
+ == static void computematchjumps(register struct parse *p, register struct re_guts *g);
+ *
+ * This algorithm assumes g->must exists and is has size greater than
+ * zero. It's based on the algorithm found on Computer Algorithms by
+ * Sara Baase.
+ *
+ * A match jump is the number of characters one needs to advance based
+ * on the already-matched suffix.
+ * Notice that all values here are minus (g->mlen-1), because of the way
+ * the search algorithm works.
+ */
+static void
+computematchjumps(p, g)
+struct parse *p;
+struct re_guts *g;
+{
+ int mindex; /* General "must" iterator */
+ int suffix; /* Keeps track of matching suffix */
+ int ssuffix; /* Keeps track of suffixes' suffix */
+ int* pmatches; /* pmatches[k] points to the next i
+ * such that i+1...mlen is a substring
+ * of k+1...k+mlen-i-1
+ */
+
+ /* Avoid making errors worse */
+ if (p->error != 0)
+ return;
+
+ pmatches = malloc(g->mlen * sizeof(unsigned int));
+ if (pmatches == NULL) {
+ g->matchjump = NULL;
+ return;
+ }
+
+ g->matchjump = malloc(g->mlen * sizeof(unsigned int));
+ if (g->matchjump == NULL) /* Not a fatal error */
+ return;
+
+ /* Set maximum possible jump for each character in the pattern */
+ for (mindex = 0; mindex < g->mlen; mindex++)
+ g->matchjump[mindex] = 2*g->mlen - mindex - 1;
+
+ /* Compute pmatches[] */
+ for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0;
+ mindex--, suffix--) {
+ pmatches[mindex] = suffix;
+
+ /* If a mismatch is found, interrupting the substring,
+ * compute the matchjump for that position. If no
+ * mismatch is found, then a text substring mismatched
+ * against the suffix will also mismatch against the
+ * substring.
+ */
+ while (suffix < g->mlen
+ && g->must[mindex] != g->must[suffix]) {
+ g->matchjump[suffix] = MIN(g->matchjump[suffix],
+ g->mlen - mindex - 1);
+ suffix = pmatches[suffix];
+ }
+ }
+
+ /* Compute the matchjump up to the last substring found to jump
+ * to the beginning of the largest must pattern prefix matching
+ * it's own suffix.
+ */
+ for (mindex = 0; mindex <= suffix; mindex++)
+ g->matchjump[mindex] = MIN(g->matchjump[mindex],
+ g->mlen + suffix - mindex);
+
+ ssuffix = pmatches[suffix];
+ while(suffix < g->mlen) {
+ while(suffix <= ssuffix) {
+ g->matchjump[suffix] = MIN(g->matchjump[suffix],
+ g->mlen + ssuffix - suffix);
+ suffix++;
+ }
+ ssuffix = pmatches[ssuffix];
+ }
+
+ free(pmatches);
+}
+
+/*
- pluscount - count + nesting
== static sopno pluscount(register struct parse *p, register struct re_guts *g);
*/
diff --git a/lib/libc/regex/regex2.h b/lib/libc/regex/regex2.h
index cd7b962..af9cc58 100644
--- a/lib/libc/regex/regex2.h
+++ b/lib/libc/regex/regex2.h
@@ -35,6 +35,8 @@
* SUCH DAMAGE.
*
* @(#)regex2.h 8.4 (Berkeley) 3/20/94
+ *
+ * $FreeBSD$
*/
/*
@@ -160,6 +162,8 @@ struct re_guts {
int ncategories; /* how many character categories */
cat_t *categories; /* ->catspace[-CHAR_MIN] */
char *must; /* match must contain this string */
+ int *charjump; /* Boyer-Moore char jump table */
+ int *matchjump; /* Boyer-Moore match jump table */
int mlen; /* length of must */
size_t nsub; /* copy of re_nsub */
int backrefs; /* does it use back references? */
OpenPOWER on IntegriCloud