diff options
Diffstat (limited to 'lib/libc/regex/engine.c')
-rw-r--r-- | lib/libc/regex/engine.c | 132 |
1 files changed, 92 insertions, 40 deletions
diff --git a/lib/libc/regex/engine.c b/lib/libc/regex/engine.c index 9d69c1e..e6484ef 100644 --- a/lib/libc/regex/engine.c +++ b/lib/libc/regex/engine.c @@ -69,6 +69,17 @@ __FBSDID("$FreeBSD$"); #define at lat #define match lmat #endif +#ifdef MNAMES +#define matcher mmatcher +#define fast mfast +#define slow mslow +#define dissect mdissect +#define backref mbackref +#define step mstep +#define print mprint +#define at mat +#define match mmat +#endif /* another structure passed up and down to avoid zillions of parameters */ struct match { @@ -85,6 +96,7 @@ struct match { states fresh; /* states for a fresh start */ states tmp; /* temporary */ states empty; /* empty set of states */ + mbstate_t mbs; /* multibyte conversion state */ }; /* ========= begin header generated by ./mkh ========= */ @@ -98,16 +110,15 @@ static char *dissect(struct match *m, char *start, char *stop, sopno startst, so static char *backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev); static char *fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst); static char *slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst); -static states step(struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft); -#define BOL (OUT+1) -#define EOL (BOL+1) -#define BOLEOL (BOL+2) -#define NOTHING (BOL+3) -#define BOW (BOL+4) -#define EOW (BOL+5) -#define CODEMAX (BOL+5) /* highest code used */ -#define NONCHAR(c) ((c) > CHAR_MAX) -#define NNONCHAR (CODEMAX-CHAR_MAX) +static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft); +#define BOL (OUT-1) +#define EOL (BOL-1) +#define BOLEOL (BOL-2) +#define NOTHING (BOL-3) +#define BOW (BOL-4) +#define EOW (BOL-5) +#define BADCHAR (BOL-6) +#define NONCHAR(c) ((c) <= OUT) #ifdef REDEBUG static void print(struct match *m, char *caption, states st, int ch, FILE *d); #endif @@ -234,6 +245,7 @@ int eflags; SETUP(m->tmp); SETUP(m->empty); CLEAR(m->empty); + ZAPSTATE(&m->mbs); /* Adjust start according to moffset, to speed things up */ if (g->moffset > -1) @@ -257,7 +269,8 @@ int eflags; if (endp != NULL) break; assert(m->coldp < m->endp); - m->coldp++; + m->coldp += XMBRTOWC(NULL, m->coldp, + m->endp - m->coldp, &m->mbs, 0); } if (nmatch == 1 && !g->backrefs) break; /* no further info needed */ @@ -316,7 +329,9 @@ int eflags; /* despite initial appearances, there is no match here */ NOTE("false alarm"); - start = m->coldp + 1; /* recycle starting later */ + /* recycle starting later */ + start = m->coldp + XMBRTOWC(NULL, m->coldp, + m->endp - m->coldp, &m->mbs, 0); assert(start <= stop); } @@ -394,7 +409,7 @@ sopno stopst; assert(nope); break; case OCHAR: - sp++; + sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); break; case OBOL: case OEOL: @@ -403,7 +418,7 @@ sopno stopst; break; case OANY: case OANYOF: - sp++; + sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0); break; case OBACK_: case O_BACK: @@ -558,6 +573,7 @@ sopno lev; /* PLUS nesting level */ sop s; regoff_t offsave; cset *cs; + wint_t wc; AT("back", start, stop, startst, stopst); sp = start; @@ -567,17 +583,25 @@ sopno lev; /* PLUS nesting level */ for (ss = startst; !hard && ss < stopst; ss++) switch (OP(s = m->g->strip[ss])) { case OCHAR: - if (sp == stop || *sp++ != (char)OPND(s)) + if (sp == stop) + return(NULL); + sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); + if (wc != OPND(s)) return(NULL); break; case OANY: if (sp == stop) return(NULL); - sp++; + sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); + if (wc == BADCHAR) + return (NULL); break; case OANYOF: + if (sp == stop) + return (NULL); cs = &m->g->sets[OPND(s)]; - if (sp == stop || !CHIN(cs, *sp++)) + sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR); + if (wc == BADCHAR || !CHIN(cs, wc)) return(NULL); break; case OBOL: @@ -754,11 +778,12 @@ sopno stopst; states fresh = m->fresh; states tmp = m->tmp; char *p = start; - int c = (start == m->beginp) ? OUT : *(start-1); - int lastc; /* previous c */ - int flagch; + wint_t c; + wint_t lastc; /* previous c */ + wint_t flagch; int i; char *coldp; /* last p after which no match was underway */ + size_t clen; CLEAR(st); SET1(st, startst); @@ -766,10 +791,23 @@ sopno stopst; ASSIGN(fresh, st); SP("start", st, *p); coldp = NULL; + if (start == m->beginp) + c = OUT; + else { + /* + * XXX Wrong if the previous character was multi-byte. + * Newline never is (in encodings supported by FreeBSD), + * so this only breaks the ISWORD tests below. + */ + c = (uch)*(start - 1); + } for (;;) { /* next character */ lastc = c; - c = (p == m->endp) ? OUT : *p; + if (p == m->endp) + c = OUT; + else + clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); if (EQ(st, fresh)) coldp = p; @@ -817,13 +855,13 @@ sopno stopst; st = step(m->g, startst, stopst, tmp, c, st); SP("aft", st, c); assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); - p++; + p += clen; } assert(coldp != NULL); m->coldp = coldp; if (ISSET(st, stopst)) - return(p+1); + return(p+XMBRTOWC(NULL, p, m->endp - p, &m->mbs, 0)); else return(NULL); } @@ -845,11 +883,12 @@ sopno stopst; states empty = m->empty; states tmp = m->tmp; char *p = start; - int c = (start == m->beginp) ? OUT : *(start-1); - int lastc; /* previous c */ - int flagch; + wint_t c; + wint_t lastc; /* previous c */ + wint_t flagch; int i; char *matchp; /* last p at which a match ended */ + size_t clen; AT("slow", start, stop, startst, stopst); CLEAR(st); @@ -857,10 +896,24 @@ sopno stopst; SP("sstart", st, *p); st = step(m->g, startst, stopst, st, NOTHING, st); matchp = NULL; + if (start == m->beginp) + c = OUT; + else { + /* + * XXX Wrong if the previous character was multi-byte. + * Newline never is (in encodings supported by FreeBSD), + * so this only breaks the ISWORD tests below. + */ + c = (uch)*(start - 1); + } for (;;) { /* next character */ lastc = c; - c = (p == m->endp) ? OUT : *p; + if (p == m->endp) { + c = OUT; + clen = 0; + } else + clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR); /* is there an EOL and/or BOL between lastc and c? */ flagch = '\0'; @@ -908,7 +961,7 @@ sopno stopst; st = step(m->g, startst, stopst, tmp, c, st); SP("saft", st, c); assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); - p++; + p += clen; } return(matchp); @@ -919,15 +972,14 @@ sopno stopst; - step - map set of states reachable before char to set reachable after == static states step(struct re_guts *g, sopno start, sopno stop, \ == states bef, int ch, states aft); - == #define BOL (OUT+1) - == #define EOL (BOL+1) - == #define BOLEOL (BOL+2) - == #define NOTHING (BOL+3) - == #define BOW (BOL+4) - == #define EOW (BOL+5) - == #define CODEMAX (BOL+5) // highest code used - == #define NONCHAR(c) ((c) > CHAR_MAX) - == #define NNONCHAR (CODEMAX-CHAR_MAX) + == #define BOL (OUT-1) + == #define EOL (BOL-1) + == #define BOLEOL (BOL-2) + == #define NOTHING (BOL-3) + == #define BOW (BOL-4) + == #define EOW (BOL-5) + == #define BADCHAR (BOL-6) + == #define NONCHAR(c) ((c) <= OUT) */ static states step(g, start, stop, bef, ch, aft) @@ -935,7 +987,7 @@ struct re_guts *g; sopno start; /* start state within strip */ sopno stop; /* state after stop state within strip */ states bef; /* states reachable before */ -int ch; /* character or NONCHAR code */ +wint_t ch; /* character or NONCHAR code */ states aft; /* states already known reachable after */ { cset *cs; @@ -953,8 +1005,8 @@ states aft; /* states already known reachable after */ break; case OCHAR: /* only characters can match */ - assert(!NONCHAR(ch) || ch != (char)OPND(s)); - if (ch == (char)OPND(s)) + assert(!NONCHAR(ch) || ch != OPND(s)); + if (ch == OPND(s)) FWD(aft, bef, 1); break; case OBOL: |