diff options
-rw-r--r-- | lib/libc/regex/engine.c | 60 | ||||
-rw-r--r-- | lib/libc/regex/regcomp.c | 144 | ||||
-rw-r--r-- | lib/libc/regex/regex2.h | 4 |
3 files changed, 202 insertions, 6 deletions
diff --git a/lib/libc/regex/engine.c b/lib/libc/regex/engine.c index be569b1..9d11f54 100644 --- a/lib/libc/regex/engine.c +++ b/lib/libc/regex/engine.c @@ -35,6 +35,8 @@ * SUCH DAMAGE. * * @(#)engine.c 8.5 (Berkeley) 3/20/94 + * + * $FreeBSD$ */ /* @@ -152,6 +154,16 @@ int eflags; register const sopno gl = g->laststate; char *start; char *stop; + /* Boyer-Moore algorithms variables */ + register unsigned char *pp; + int cj, mj; + register unsigned char *mustfirst; + register unsigned char *mustlast; + register int mustlen; + register int *matchjump; + register int *charjump; + register unsigned char *bmp; + register unsigned char *bmps; /* simplify the situation where possible */ if (g->cflags®_NOSUB) @@ -168,12 +180,48 @@ int eflags; /* prescreening; this does wonders for this rather slow code */ if (g->must != NULL) { - for (dp = start; dp < stop; dp++) - if (*dp == g->must[0] && stop - dp >= g->mlen && - memcmp(dp, g->must, (size_t)g->mlen) == 0) - break; - if (dp == stop) /* we didn't find g->must */ - return(REG_NOMATCH); + if(g->charjump != NULL && g->matchjump != NULL) { + mustlen = -g->mlen; + mustfirst = g->must; + mustlast = g->must + g->mlen - 1; + charjump = g->charjump; + matchjump = g->matchjump; + bmps = stop; + pp = mustlast; + for(bmp = start+g->mlen-1; bmp < bmps;) { + /* Fast skip non-matches */ + while(bmp < bmps && charjump[*bmp]) + bmp += charjump[*bmp]; + + if(bmp >= bmps) + break; + + /* Greedy matcher */ + /* We depend on not being used for + * for strings of length 1 + */ + while(*--bmp == *--pp && pp != mustfirst); + + if(*bmp == *pp) + break; + + /* Jump to next possible match */ + mj = matchjump[pp - mustfirst]; + cj = charjump[*bmp]; + bmp += (cj < mj ? mj : cj); + pp = mustlast; + } + if(pp != mustfirst) + return(REG_NOMATCH); + } else { + for (dp = start; dp < stop; dp++) + if (*dp == g->must[0] && + stop - dp >= g->mlen && + memcmp(dp, g->must, (size_t)g->mlen) == 0) + break; + if (dp == stop) /* we didn't find g->must */ + return(REG_NOMATCH); + } } /* match struct setup */ diff --git a/lib/libc/regex/regcomp.c b/lib/libc/regex/regcomp.c index a4e9e1d..f71d63c 100644 --- a/lib/libc/regex/regcomp.c +++ b/lib/libc/regex/regcomp.c @@ -35,6 +35,8 @@ * SUCH DAMAGE. * * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 + * + * $FreeBSD$ */ #if defined(LIBC_SCCS) && !defined(lint) @@ -122,6 +124,8 @@ static void dofwd __P((struct parse *p, sopno pos, sop value)); static void enlarge __P((struct parse *p, sopno size)); static void stripsnug __P((struct parse *p, struct re_guts *g)); static void findmust __P((struct parse *p, struct re_guts *g)); +static void computejumps __P((struct parse *p, struct re_guts *g)); +static void computematchjumps __P((struct parse *p, struct re_guts *g)); static sopno pluscount __P((struct parse *p, struct re_guts *g)); #ifdef __cplusplus @@ -167,6 +171,9 @@ static int never = 0; /* for use in asserts; shuts lint up */ #define never 0 /* some <assert.h>s have bugs too */ #endif +/* Macro used by computejump()/computematchjump() */ +#define MIN(a,b) ((a)<(b)?(a):(b)) + /* - regcomp - interface for parser and compilation = extern int regcomp(regex_t *, const char *, int); @@ -262,6 +269,17 @@ int cflags; categorize(p, g); stripsnug(p, g); findmust(p, g); + /* only use Boyer-Moore algorithm if the pattern is bigger + * than three characters + */ + if(g->mlen > 3) { + computejumps(p, g); + computematchjumps(p, g); + if(g->matchjump == NULL) { + free(g->charjump); + g->charjump = NULL; + } + } g->nplus = pluscount(p, g); g->magic = MAGIC2; preg->re_nsub = g->nsub; @@ -1741,6 +1759,132 @@ register struct re_guts *g; } /* + - computejumps - compute char jumps for BM scan + == static void computejumps(register struct parse *p, register struct re_guts *g); + * + * This algorithm assumes g->must exists and is has size greater than + * zero. It's based on the algorithm found on Computer Algorithms by + * Sara Baase. + * + * A char jump is the number of characters one needs to jump based on + * the value of the character from the text that was mismatched. + */ +static void +computejumps(p, g) +struct parse *p; +struct re_guts *g; +{ + int ch; + int mindex; + + /* Avoid making errors worse */ + if (p->error != 0) + return; + + g->charjump = malloc(256 * sizeof(int)); + if (g->charjump == NULL) /* Not a fatal error */ + return; + + /* If the character does not exist in the pattern, the jump + * is equal to the number of characters in the pattern. + */ + for (ch = 0; ch < 256; ch++) + g->charjump[ch] = g->mlen; + + /* If the character does exist, compute the jump that would + * take us to the last character in the pattern equal to it + * (notice that we match right to left, so that last character + * is the first one that would be matched). + */ + for (mindex = 0; mindex < g->mlen; mindex++) + g->charjump[g->must[mindex]] = g->mlen - mindex - 1; +} + +/* + - computematchjumps - compute match jumps for BM scan + == static void computematchjumps(register struct parse *p, register struct re_guts *g); + * + * This algorithm assumes g->must exists and is has size greater than + * zero. It's based on the algorithm found on Computer Algorithms by + * Sara Baase. + * + * A match jump is the number of characters one needs to advance based + * on the already-matched suffix. + * Notice that all values here are minus (g->mlen-1), because of the way + * the search algorithm works. + */ +static void +computematchjumps(p, g) +struct parse *p; +struct re_guts *g; +{ + int mindex; /* General "must" iterator */ + int suffix; /* Keeps track of matching suffix */ + int ssuffix; /* Keeps track of suffixes' suffix */ + int* pmatches; /* pmatches[k] points to the next i + * such that i+1...mlen is a substring + * of k+1...k+mlen-i-1 + */ + + /* Avoid making errors worse */ + if (p->error != 0) + return; + + pmatches = malloc(g->mlen * sizeof(unsigned int)); + if (pmatches == NULL) { + g->matchjump = NULL; + return; + } + + g->matchjump = malloc(g->mlen * sizeof(unsigned int)); + if (g->matchjump == NULL) /* Not a fatal error */ + return; + + /* Set maximum possible jump for each character in the pattern */ + for (mindex = 0; mindex < g->mlen; mindex++) + g->matchjump[mindex] = 2*g->mlen - mindex - 1; + + /* Compute pmatches[] */ + for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; + mindex--, suffix--) { + pmatches[mindex] = suffix; + + /* If a mismatch is found, interrupting the substring, + * compute the matchjump for that position. If no + * mismatch is found, then a text substring mismatched + * against the suffix will also mismatch against the + * substring. + */ + while (suffix < g->mlen + && g->must[mindex] != g->must[suffix]) { + g->matchjump[suffix] = MIN(g->matchjump[suffix], + g->mlen - mindex - 1); + suffix = pmatches[suffix]; + } + } + + /* Compute the matchjump up to the last substring found to jump + * to the beginning of the largest must pattern prefix matching + * it's own suffix. + */ + for (mindex = 0; mindex <= suffix; mindex++) + g->matchjump[mindex] = MIN(g->matchjump[mindex], + g->mlen + suffix - mindex); + + ssuffix = pmatches[suffix]; + while(suffix < g->mlen) { + while(suffix <= ssuffix) { + g->matchjump[suffix] = MIN(g->matchjump[suffix], + g->mlen + ssuffix - suffix); + suffix++; + } + ssuffix = pmatches[ssuffix]; + } + + free(pmatches); +} + +/* - pluscount - count + nesting == static sopno pluscount(register struct parse *p, register struct re_guts *g); */ diff --git a/lib/libc/regex/regex2.h b/lib/libc/regex/regex2.h index cd7b962..af9cc58 100644 --- a/lib/libc/regex/regex2.h +++ b/lib/libc/regex/regex2.h @@ -35,6 +35,8 @@ * SUCH DAMAGE. * * @(#)regex2.h 8.4 (Berkeley) 3/20/94 + * + * $FreeBSD$ */ /* @@ -160,6 +162,8 @@ struct re_guts { int ncategories; /* how many character categories */ cat_t *categories; /* ->catspace[-CHAR_MIN] */ char *must; /* match must contain this string */ + int *charjump; /* Boyer-Moore char jump table */ + int *matchjump; /* Boyer-Moore match jump table */ int mlen; /* length of must */ size_t nsub; /* copy of re_nsub */ int backrefs; /* does it use back references? */ |