summaryrefslogtreecommitdiffstats
path: root/lib/libc/regex/engine.c
diff options
context:
space:
mode:
authortjr <tjr@FreeBSD.org>2004-07-12 07:35:59 +0000
committertjr <tjr@FreeBSD.org>2004-07-12 07:35:59 +0000
commitba689b40433c4dbba7960454bbb126e6da5bdf59 (patch)
tree0f9f638917b5d0fb257cbfab36551506ddb3b43f /lib/libc/regex/engine.c
parent031e087d2c3a8f23b40ec3cd62b741f935e1e2e6 (diff)
downloadFreeBSD-src-ba689b40433c4dbba7960454bbb126e6da5bdf59.zip
FreeBSD-src-ba689b40433c4dbba7960454bbb126e6da5bdf59.tar.gz
Make regular expression matching aware of multibyte characters. The general
idea is that we perform multibyte->wide character conversion while parsing and compiling, then convert byte sequences to wide characters when they're needed for comparison and stepping through the string during execution. As with tr(1), the main complication is to efficiently represent sets of characters in bracket expressions. The old bitmap representation is replaced by a bitmap for the first 256 characters combined with a vector of individual wide characters, a vector of character ranges (for [A-Z] etc.), and a vector of character classes (for [[:alpha:]] etc.). One other point of interest is that although the Boyer-Moore algorithm had to be disabled in the general multibyte case, it is still enabled for UTF-8 because of its self-synchronizing nature. This greatly speeds up matching by reducing the number of multibyte conversions that need to be done.
Diffstat (limited to 'lib/libc/regex/engine.c')
-rw-r--r--lib/libc/regex/engine.c132
1 files changed, 92 insertions, 40 deletions
diff --git a/lib/libc/regex/engine.c b/lib/libc/regex/engine.c
index 9d69c1e..e6484ef 100644
--- a/lib/libc/regex/engine.c
+++ b/lib/libc/regex/engine.c
@@ -69,6 +69,17 @@ __FBSDID("$FreeBSD$");
#define at lat
#define match lmat
#endif
+#ifdef MNAMES
+#define matcher mmatcher
+#define fast mfast
+#define slow mslow
+#define dissect mdissect
+#define backref mbackref
+#define step mstep
+#define print mprint
+#define at mat
+#define match mmat
+#endif
/* another structure passed up and down to avoid zillions of parameters */
struct match {
@@ -85,6 +96,7 @@ struct match {
states fresh; /* states for a fresh start */
states tmp; /* temporary */
states empty; /* empty set of states */
+ mbstate_t mbs; /* multibyte conversion state */
};
/* ========= begin header generated by ./mkh ========= */
@@ -98,16 +110,15 @@ static char *dissect(struct match *m, char *start, char *stop, sopno startst, so
static char *backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev);
static char *fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst);
static char *slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst);
-static states step(struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft);
-#define BOL (OUT+1)
-#define EOL (BOL+1)
-#define BOLEOL (BOL+2)
-#define NOTHING (BOL+3)
-#define BOW (BOL+4)
-#define EOW (BOL+5)
-#define CODEMAX (BOL+5) /* highest code used */
-#define NONCHAR(c) ((c) > CHAR_MAX)
-#define NNONCHAR (CODEMAX-CHAR_MAX)
+static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft);
+#define BOL (OUT-1)
+#define EOL (BOL-1)
+#define BOLEOL (BOL-2)
+#define NOTHING (BOL-3)
+#define BOW (BOL-4)
+#define EOW (BOL-5)
+#define BADCHAR (BOL-6)
+#define NONCHAR(c) ((c) <= OUT)
#ifdef REDEBUG
static void print(struct match *m, char *caption, states st, int ch, FILE *d);
#endif
@@ -234,6 +245,7 @@ int eflags;
SETUP(m->tmp);
SETUP(m->empty);
CLEAR(m->empty);
+ ZAPSTATE(&m->mbs);
/* Adjust start according to moffset, to speed things up */
if (g->moffset > -1)
@@ -257,7 +269,8 @@ int eflags;
if (endp != NULL)
break;
assert(m->coldp < m->endp);
- m->coldp++;
+ m->coldp += XMBRTOWC(NULL, m->coldp,
+ m->endp - m->coldp, &m->mbs, 0);
}
if (nmatch == 1 && !g->backrefs)
break; /* no further info needed */
@@ -316,7 +329,9 @@ int eflags;
/* despite initial appearances, there is no match here */
NOTE("false alarm");
- start = m->coldp + 1; /* recycle starting later */
+ /* recycle starting later */
+ start = m->coldp + XMBRTOWC(NULL, m->coldp,
+ m->endp - m->coldp, &m->mbs, 0);
assert(start <= stop);
}
@@ -394,7 +409,7 @@ sopno stopst;
assert(nope);
break;
case OCHAR:
- sp++;
+ sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);
break;
case OBOL:
case OEOL:
@@ -403,7 +418,7 @@ sopno stopst;
break;
case OANY:
case OANYOF:
- sp++;
+ sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);
break;
case OBACK_:
case O_BACK:
@@ -558,6 +573,7 @@ sopno lev; /* PLUS nesting level */
sop s;
regoff_t offsave;
cset *cs;
+ wint_t wc;
AT("back", start, stop, startst, stopst);
sp = start;
@@ -567,17 +583,25 @@ sopno lev; /* PLUS nesting level */
for (ss = startst; !hard && ss < stopst; ss++)
switch (OP(s = m->g->strip[ss])) {
case OCHAR:
- if (sp == stop || *sp++ != (char)OPND(s))
+ if (sp == stop)
+ return(NULL);
+ sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
+ if (wc != OPND(s))
return(NULL);
break;
case OANY:
if (sp == stop)
return(NULL);
- sp++;
+ sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
+ if (wc == BADCHAR)
+ return (NULL);
break;
case OANYOF:
+ if (sp == stop)
+ return (NULL);
cs = &m->g->sets[OPND(s)];
- if (sp == stop || !CHIN(cs, *sp++))
+ sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
+ if (wc == BADCHAR || !CHIN(cs, wc))
return(NULL);
break;
case OBOL:
@@ -754,11 +778,12 @@ sopno stopst;
states fresh = m->fresh;
states tmp = m->tmp;
char *p = start;
- int c = (start == m->beginp) ? OUT : *(start-1);
- int lastc; /* previous c */
- int flagch;
+ wint_t c;
+ wint_t lastc; /* previous c */
+ wint_t flagch;
int i;
char *coldp; /* last p after which no match was underway */
+ size_t clen;
CLEAR(st);
SET1(st, startst);
@@ -766,10 +791,23 @@ sopno stopst;
ASSIGN(fresh, st);
SP("start", st, *p);
coldp = NULL;
+ if (start == m->beginp)
+ c = OUT;
+ else {
+ /*
+ * XXX Wrong if the previous character was multi-byte.
+ * Newline never is (in encodings supported by FreeBSD),
+ * so this only breaks the ISWORD tests below.
+ */
+ c = (uch)*(start - 1);
+ }
for (;;) {
/* next character */
lastc = c;
- c = (p == m->endp) ? OUT : *p;
+ if (p == m->endp)
+ c = OUT;
+ else
+ clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR);
if (EQ(st, fresh))
coldp = p;
@@ -817,13 +855,13 @@ sopno stopst;
st = step(m->g, startst, stopst, tmp, c, st);
SP("aft", st, c);
assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
- p++;
+ p += clen;
}
assert(coldp != NULL);
m->coldp = coldp;
if (ISSET(st, stopst))
- return(p+1);
+ return(p+XMBRTOWC(NULL, p, m->endp - p, &m->mbs, 0));
else
return(NULL);
}
@@ -845,11 +883,12 @@ sopno stopst;
states empty = m->empty;
states tmp = m->tmp;
char *p = start;
- int c = (start == m->beginp) ? OUT : *(start-1);
- int lastc; /* previous c */
- int flagch;
+ wint_t c;
+ wint_t lastc; /* previous c */
+ wint_t flagch;
int i;
char *matchp; /* last p at which a match ended */
+ size_t clen;
AT("slow", start, stop, startst, stopst);
CLEAR(st);
@@ -857,10 +896,24 @@ sopno stopst;
SP("sstart", st, *p);
st = step(m->g, startst, stopst, st, NOTHING, st);
matchp = NULL;
+ if (start == m->beginp)
+ c = OUT;
+ else {
+ /*
+ * XXX Wrong if the previous character was multi-byte.
+ * Newline never is (in encodings supported by FreeBSD),
+ * so this only breaks the ISWORD tests below.
+ */
+ c = (uch)*(start - 1);
+ }
for (;;) {
/* next character */
lastc = c;
- c = (p == m->endp) ? OUT : *p;
+ if (p == m->endp) {
+ c = OUT;
+ clen = 0;
+ } else
+ clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR);
/* is there an EOL and/or BOL between lastc and c? */
flagch = '\0';
@@ -908,7 +961,7 @@ sopno stopst;
st = step(m->g, startst, stopst, tmp, c, st);
SP("saft", st, c);
assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
- p++;
+ p += clen;
}
return(matchp);
@@ -919,15 +972,14 @@ sopno stopst;
- step - map set of states reachable before char to set reachable after
== static states step(struct re_guts *g, sopno start, sopno stop, \
== states bef, int ch, states aft);
- == #define BOL (OUT+1)
- == #define EOL (BOL+1)
- == #define BOLEOL (BOL+2)
- == #define NOTHING (BOL+3)
- == #define BOW (BOL+4)
- == #define EOW (BOL+5)
- == #define CODEMAX (BOL+5) // highest code used
- == #define NONCHAR(c) ((c) > CHAR_MAX)
- == #define NNONCHAR (CODEMAX-CHAR_MAX)
+ == #define BOL (OUT-1)
+ == #define EOL (BOL-1)
+ == #define BOLEOL (BOL-2)
+ == #define NOTHING (BOL-3)
+ == #define BOW (BOL-4)
+ == #define EOW (BOL-5)
+ == #define BADCHAR (BOL-6)
+ == #define NONCHAR(c) ((c) <= OUT)
*/
static states
step(g, start, stop, bef, ch, aft)
@@ -935,7 +987,7 @@ struct re_guts *g;
sopno start; /* start state within strip */
sopno stop; /* state after stop state within strip */
states bef; /* states reachable before */
-int ch; /* character or NONCHAR code */
+wint_t ch; /* character or NONCHAR code */
states aft; /* states already known reachable after */
{
cset *cs;
@@ -953,8 +1005,8 @@ states aft; /* states already known reachable after */
break;
case OCHAR:
/* only characters can match */
- assert(!NONCHAR(ch) || ch != (char)OPND(s));
- if (ch == (char)OPND(s))
+ assert(!NONCHAR(ch) || ch != OPND(s));
+ if (ch == OPND(s))
FWD(aft, bef, 1);
break;
case OBOL:
OpenPOWER on IntegriCloud