diff options
Diffstat (limited to 'lib/libc/regex/regcomp.c')
-rw-r--r-- | lib/libc/regex/regcomp.c | 144 |
1 files changed, 144 insertions, 0 deletions
diff --git a/lib/libc/regex/regcomp.c b/lib/libc/regex/regcomp.c index a4e9e1d..f71d63c 100644 --- a/lib/libc/regex/regcomp.c +++ b/lib/libc/regex/regcomp.c @@ -35,6 +35,8 @@ * SUCH DAMAGE. * * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 + * + * $FreeBSD$ */ #if defined(LIBC_SCCS) && !defined(lint) @@ -122,6 +124,8 @@ static void dofwd __P((struct parse *p, sopno pos, sop value)); static void enlarge __P((struct parse *p, sopno size)); static void stripsnug __P((struct parse *p, struct re_guts *g)); static void findmust __P((struct parse *p, struct re_guts *g)); +static void computejumps __P((struct parse *p, struct re_guts *g)); +static void computematchjumps __P((struct parse *p, struct re_guts *g)); static sopno pluscount __P((struct parse *p, struct re_guts *g)); #ifdef __cplusplus @@ -167,6 +171,9 @@ static int never = 0; /* for use in asserts; shuts lint up */ #define never 0 /* some <assert.h>s have bugs too */ #endif +/* Macro used by computejump()/computematchjump() */ +#define MIN(a,b) ((a)<(b)?(a):(b)) + /* - regcomp - interface for parser and compilation = extern int regcomp(regex_t *, const char *, int); @@ -262,6 +269,17 @@ int cflags; categorize(p, g); stripsnug(p, g); findmust(p, g); + /* only use Boyer-Moore algorithm if the pattern is bigger + * than three characters + */ + if(g->mlen > 3) { + computejumps(p, g); + computematchjumps(p, g); + if(g->matchjump == NULL) { + free(g->charjump); + g->charjump = NULL; + } + } g->nplus = pluscount(p, g); g->magic = MAGIC2; preg->re_nsub = g->nsub; @@ -1741,6 +1759,132 @@ register struct re_guts *g; } /* + - computejumps - compute char jumps for BM scan + == static void computejumps(register struct parse *p, register struct re_guts *g); + * + * This algorithm assumes g->must exists and is has size greater than + * zero. It's based on the algorithm found on Computer Algorithms by + * Sara Baase. + * + * A char jump is the number of characters one needs to jump based on + * the value of the character from the text that was mismatched. + */ +static void +computejumps(p, g) +struct parse *p; +struct re_guts *g; +{ + int ch; + int mindex; + + /* Avoid making errors worse */ + if (p->error != 0) + return; + + g->charjump = malloc(256 * sizeof(int)); + if (g->charjump == NULL) /* Not a fatal error */ + return; + + /* If the character does not exist in the pattern, the jump + * is equal to the number of characters in the pattern. + */ + for (ch = 0; ch < 256; ch++) + g->charjump[ch] = g->mlen; + + /* If the character does exist, compute the jump that would + * take us to the last character in the pattern equal to it + * (notice that we match right to left, so that last character + * is the first one that would be matched). + */ + for (mindex = 0; mindex < g->mlen; mindex++) + g->charjump[g->must[mindex]] = g->mlen - mindex - 1; +} + +/* + - computematchjumps - compute match jumps for BM scan + == static void computematchjumps(register struct parse *p, register struct re_guts *g); + * + * This algorithm assumes g->must exists and is has size greater than + * zero. It's based on the algorithm found on Computer Algorithms by + * Sara Baase. + * + * A match jump is the number of characters one needs to advance based + * on the already-matched suffix. + * Notice that all values here are minus (g->mlen-1), because of the way + * the search algorithm works. + */ +static void +computematchjumps(p, g) +struct parse *p; +struct re_guts *g; +{ + int mindex; /* General "must" iterator */ + int suffix; /* Keeps track of matching suffix */ + int ssuffix; /* Keeps track of suffixes' suffix */ + int* pmatches; /* pmatches[k] points to the next i + * such that i+1...mlen is a substring + * of k+1...k+mlen-i-1 + */ + + /* Avoid making errors worse */ + if (p->error != 0) + return; + + pmatches = malloc(g->mlen * sizeof(unsigned int)); + if (pmatches == NULL) { + g->matchjump = NULL; + return; + } + + g->matchjump = malloc(g->mlen * sizeof(unsigned int)); + if (g->matchjump == NULL) /* Not a fatal error */ + return; + + /* Set maximum possible jump for each character in the pattern */ + for (mindex = 0; mindex < g->mlen; mindex++) + g->matchjump[mindex] = 2*g->mlen - mindex - 1; + + /* Compute pmatches[] */ + for (mindex = g->mlen - 1, suffix = g->mlen; mindex >= 0; + mindex--, suffix--) { + pmatches[mindex] = suffix; + + /* If a mismatch is found, interrupting the substring, + * compute the matchjump for that position. If no + * mismatch is found, then a text substring mismatched + * against the suffix will also mismatch against the + * substring. + */ + while (suffix < g->mlen + && g->must[mindex] != g->must[suffix]) { + g->matchjump[suffix] = MIN(g->matchjump[suffix], + g->mlen - mindex - 1); + suffix = pmatches[suffix]; + } + } + + /* Compute the matchjump up to the last substring found to jump + * to the beginning of the largest must pattern prefix matching + * it's own suffix. + */ + for (mindex = 0; mindex <= suffix; mindex++) + g->matchjump[mindex] = MIN(g->matchjump[mindex], + g->mlen + suffix - mindex); + + ssuffix = pmatches[suffix]; + while(suffix < g->mlen) { + while(suffix <= ssuffix) { + g->matchjump[suffix] = MIN(g->matchjump[suffix], + g->mlen + ssuffix - suffix); + suffix++; + } + ssuffix = pmatches[ssuffix]; + } + + free(pmatches); +} + +/* - pluscount - count + nesting == static sopno pluscount(register struct parse *p, register struct re_guts *g); */ |