summaryrefslogtreecommitdiffstats
path: root/lib/libc/regex
diff options
context:
space:
mode:
authortjr <tjr@FreeBSD.org>2004-07-12 07:35:59 +0000
committertjr <tjr@FreeBSD.org>2004-07-12 07:35:59 +0000
commitba689b40433c4dbba7960454bbb126e6da5bdf59 (patch)
tree0f9f638917b5d0fb257cbfab36551506ddb3b43f /lib/libc/regex
parent031e087d2c3a8f23b40ec3cd62b741f935e1e2e6 (diff)
downloadFreeBSD-src-ba689b40433c4dbba7960454bbb126e6da5bdf59.zip
FreeBSD-src-ba689b40433c4dbba7960454bbb126e6da5bdf59.tar.gz
Make regular expression matching aware of multibyte characters. The general
idea is that we perform multibyte->wide character conversion while parsing and compiling, then convert byte sequences to wide characters when they're needed for comparison and stepping through the string during execution. As with tr(1), the main complication is to efficiently represent sets of characters in bracket expressions. The old bitmap representation is replaced by a bitmap for the first 256 characters combined with a vector of individual wide characters, a vector of character ranges (for [A-Z] etc.), and a vector of character classes (for [[:alpha:]] etc.). One other point of interest is that although the Boyer-Moore algorithm had to be disabled in the general multibyte case, it is still enabled for UTF-8 because of its self-synchronizing nature. This greatly speeds up matching by reducing the number of multibyte conversions that need to be done.
Diffstat (limited to 'lib/libc/regex')
-rw-r--r--lib/libc/regex/engine.c132
-rw-r--r--lib/libc/regex/regcomp.c516
-rw-r--r--lib/libc/regex/regex2.h74
-rw-r--r--lib/libc/regex/regexec.c67
-rw-r--r--lib/libc/regex/regfree.c13
5 files changed, 478 insertions, 324 deletions
diff --git a/lib/libc/regex/engine.c b/lib/libc/regex/engine.c
index 9d69c1e..e6484ef 100644
--- a/lib/libc/regex/engine.c
+++ b/lib/libc/regex/engine.c
@@ -69,6 +69,17 @@ __FBSDID("$FreeBSD$");
#define at lat
#define match lmat
#endif
+#ifdef MNAMES
+#define matcher mmatcher
+#define fast mfast
+#define slow mslow
+#define dissect mdissect
+#define backref mbackref
+#define step mstep
+#define print mprint
+#define at mat
+#define match mmat
+#endif
/* another structure passed up and down to avoid zillions of parameters */
struct match {
@@ -85,6 +96,7 @@ struct match {
states fresh; /* states for a fresh start */
states tmp; /* temporary */
states empty; /* empty set of states */
+ mbstate_t mbs; /* multibyte conversion state */
};
/* ========= begin header generated by ./mkh ========= */
@@ -98,16 +110,15 @@ static char *dissect(struct match *m, char *start, char *stop, sopno startst, so
static char *backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst, sopno lev);
static char *fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst);
static char *slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst);
-static states step(struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft);
-#define BOL (OUT+1)
-#define EOL (BOL+1)
-#define BOLEOL (BOL+2)
-#define NOTHING (BOL+3)
-#define BOW (BOL+4)
-#define EOW (BOL+5)
-#define CODEMAX (BOL+5) /* highest code used */
-#define NONCHAR(c) ((c) > CHAR_MAX)
-#define NNONCHAR (CODEMAX-CHAR_MAX)
+static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft);
+#define BOL (OUT-1)
+#define EOL (BOL-1)
+#define BOLEOL (BOL-2)
+#define NOTHING (BOL-3)
+#define BOW (BOL-4)
+#define EOW (BOL-5)
+#define BADCHAR (BOL-6)
+#define NONCHAR(c) ((c) <= OUT)
#ifdef REDEBUG
static void print(struct match *m, char *caption, states st, int ch, FILE *d);
#endif
@@ -234,6 +245,7 @@ int eflags;
SETUP(m->tmp);
SETUP(m->empty);
CLEAR(m->empty);
+ ZAPSTATE(&m->mbs);
/* Adjust start according to moffset, to speed things up */
if (g->moffset > -1)
@@ -257,7 +269,8 @@ int eflags;
if (endp != NULL)
break;
assert(m->coldp < m->endp);
- m->coldp++;
+ m->coldp += XMBRTOWC(NULL, m->coldp,
+ m->endp - m->coldp, &m->mbs, 0);
}
if (nmatch == 1 && !g->backrefs)
break; /* no further info needed */
@@ -316,7 +329,9 @@ int eflags;
/* despite initial appearances, there is no match here */
NOTE("false alarm");
- start = m->coldp + 1; /* recycle starting later */
+ /* recycle starting later */
+ start = m->coldp + XMBRTOWC(NULL, m->coldp,
+ m->endp - m->coldp, &m->mbs, 0);
assert(start <= stop);
}
@@ -394,7 +409,7 @@ sopno stopst;
assert(nope);
break;
case OCHAR:
- sp++;
+ sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);
break;
case OBOL:
case OEOL:
@@ -403,7 +418,7 @@ sopno stopst;
break;
case OANY:
case OANYOF:
- sp++;
+ sp += XMBRTOWC(NULL, sp, stop - start, &m->mbs, 0);
break;
case OBACK_:
case O_BACK:
@@ -558,6 +573,7 @@ sopno lev; /* PLUS nesting level */
sop s;
regoff_t offsave;
cset *cs;
+ wint_t wc;
AT("back", start, stop, startst, stopst);
sp = start;
@@ -567,17 +583,25 @@ sopno lev; /* PLUS nesting level */
for (ss = startst; !hard && ss < stopst; ss++)
switch (OP(s = m->g->strip[ss])) {
case OCHAR:
- if (sp == stop || *sp++ != (char)OPND(s))
+ if (sp == stop)
+ return(NULL);
+ sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
+ if (wc != OPND(s))
return(NULL);
break;
case OANY:
if (sp == stop)
return(NULL);
- sp++;
+ sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
+ if (wc == BADCHAR)
+ return (NULL);
break;
case OANYOF:
+ if (sp == stop)
+ return (NULL);
cs = &m->g->sets[OPND(s)];
- if (sp == stop || !CHIN(cs, *sp++))
+ sp += XMBRTOWC(&wc, sp, stop - sp, &m->mbs, BADCHAR);
+ if (wc == BADCHAR || !CHIN(cs, wc))
return(NULL);
break;
case OBOL:
@@ -754,11 +778,12 @@ sopno stopst;
states fresh = m->fresh;
states tmp = m->tmp;
char *p = start;
- int c = (start == m->beginp) ? OUT : *(start-1);
- int lastc; /* previous c */
- int flagch;
+ wint_t c;
+ wint_t lastc; /* previous c */
+ wint_t flagch;
int i;
char *coldp; /* last p after which no match was underway */
+ size_t clen;
CLEAR(st);
SET1(st, startst);
@@ -766,10 +791,23 @@ sopno stopst;
ASSIGN(fresh, st);
SP("start", st, *p);
coldp = NULL;
+ if (start == m->beginp)
+ c = OUT;
+ else {
+ /*
+ * XXX Wrong if the previous character was multi-byte.
+ * Newline never is (in encodings supported by FreeBSD),
+ * so this only breaks the ISWORD tests below.
+ */
+ c = (uch)*(start - 1);
+ }
for (;;) {
/* next character */
lastc = c;
- c = (p == m->endp) ? OUT : *p;
+ if (p == m->endp)
+ c = OUT;
+ else
+ clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR);
if (EQ(st, fresh))
coldp = p;
@@ -817,13 +855,13 @@ sopno stopst;
st = step(m->g, startst, stopst, tmp, c, st);
SP("aft", st, c);
assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
- p++;
+ p += clen;
}
assert(coldp != NULL);
m->coldp = coldp;
if (ISSET(st, stopst))
- return(p+1);
+ return(p+XMBRTOWC(NULL, p, m->endp - p, &m->mbs, 0));
else
return(NULL);
}
@@ -845,11 +883,12 @@ sopno stopst;
states empty = m->empty;
states tmp = m->tmp;
char *p = start;
- int c = (start == m->beginp) ? OUT : *(start-1);
- int lastc; /* previous c */
- int flagch;
+ wint_t c;
+ wint_t lastc; /* previous c */
+ wint_t flagch;
int i;
char *matchp; /* last p at which a match ended */
+ size_t clen;
AT("slow", start, stop, startst, stopst);
CLEAR(st);
@@ -857,10 +896,24 @@ sopno stopst;
SP("sstart", st, *p);
st = step(m->g, startst, stopst, st, NOTHING, st);
matchp = NULL;
+ if (start == m->beginp)
+ c = OUT;
+ else {
+ /*
+ * XXX Wrong if the previous character was multi-byte.
+ * Newline never is (in encodings supported by FreeBSD),
+ * so this only breaks the ISWORD tests below.
+ */
+ c = (uch)*(start - 1);
+ }
for (;;) {
/* next character */
lastc = c;
- c = (p == m->endp) ? OUT : *p;
+ if (p == m->endp) {
+ c = OUT;
+ clen = 0;
+ } else
+ clen = XMBRTOWC(&c, p, m->endp - p, &m->mbs, BADCHAR);
/* is there an EOL and/or BOL between lastc and c? */
flagch = '\0';
@@ -908,7 +961,7 @@ sopno stopst;
st = step(m->g, startst, stopst, tmp, c, st);
SP("saft", st, c);
assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
- p++;
+ p += clen;
}
return(matchp);
@@ -919,15 +972,14 @@ sopno stopst;
- step - map set of states reachable before char to set reachable after
== static states step(struct re_guts *g, sopno start, sopno stop, \
== states bef, int ch, states aft);
- == #define BOL (OUT+1)
- == #define EOL (BOL+1)
- == #define BOLEOL (BOL+2)
- == #define NOTHING (BOL+3)
- == #define BOW (BOL+4)
- == #define EOW (BOL+5)
- == #define CODEMAX (BOL+5) // highest code used
- == #define NONCHAR(c) ((c) > CHAR_MAX)
- == #define NNONCHAR (CODEMAX-CHAR_MAX)
+ == #define BOL (OUT-1)
+ == #define EOL (BOL-1)
+ == #define BOLEOL (BOL-2)
+ == #define NOTHING (BOL-3)
+ == #define BOW (BOL-4)
+ == #define EOW (BOL-5)
+ == #define BADCHAR (BOL-6)
+ == #define NONCHAR(c) ((c) <= OUT)
*/
static states
step(g, start, stop, bef, ch, aft)
@@ -935,7 +987,7 @@ struct re_guts *g;
sopno start; /* start state within strip */
sopno stop; /* state after stop state within strip */
states bef; /* states reachable before */
-int ch; /* character or NONCHAR code */
+wint_t ch; /* character or NONCHAR code */
states aft; /* states already known reachable after */
{
cset *cs;
@@ -953,8 +1005,8 @@ states aft; /* states already known reachable after */
break;
case OCHAR:
/* only characters can match */
- assert(!NONCHAR(ch) || ch != (char)OPND(s));
- if (ch == (char)OPND(s))
+ assert(!NONCHAR(ch) || ch != OPND(s));
+ if (ch == OPND(s))
FWD(aft, bef, 1);
break;
case OBOL:
diff --git a/lib/libc/regex/regcomp.c b/lib/libc/regex/regcomp.c
index 95a2874..7c10f62 100644
--- a/lib/libc/regex/regcomp.c
+++ b/lib/libc/regex/regcomp.c
@@ -50,13 +50,14 @@ __FBSDID("$FreeBSD$");
#include <limits.h>
#include <stdlib.h>
#include <regex.h>
+#include <wchar.h>
+#include <wctype.h>
#include "collate.h"
#include "utils.h"
#include "regex2.h"
-#include "cclass.h"
#include "cname.h"
/*
@@ -83,29 +84,30 @@ extern "C" {
#endif
/* === regcomp.c === */
-static void p_ere(struct parse *p, int stop);
+static void p_ere(struct parse *p, wint_t stop);
static void p_ere_exp(struct parse *p);
static void p_str(struct parse *p);
-static void p_bre(struct parse *p, int end1, int end2);
+static void p_bre(struct parse *p, wint_t end1, wint_t end2);
static int p_simp_re(struct parse *p, int starordinary);
static int p_count(struct parse *p);
static void p_bracket(struct parse *p);
static void p_b_term(struct parse *p, cset *cs);
static void p_b_cclass(struct parse *p, cset *cs);
static void p_b_eclass(struct parse *p, cset *cs);
-static char p_b_symbol(struct parse *p);
-static char p_b_coll_elem(struct parse *p, int endc);
-static char othercase(int ch);
-static void bothcases(struct parse *p, int ch);
-static void ordinary(struct parse *p, int ch);
+static wint_t p_b_symbol(struct parse *p);
+static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
+static wint_t othercase(wint_t ch);
+static void bothcases(struct parse *p, wint_t ch);
+static void ordinary(struct parse *p, wint_t ch);
static void nonnewline(struct parse *p);
static void repeat(struct parse *p, sopno start, int from, int to);
static int seterr(struct parse *p, int e);
static cset *allocset(struct parse *p);
static void freeset(struct parse *p, cset *cs);
-static int freezeset(struct parse *p, cset *cs);
-static int firstch(struct parse *p, cset *cs);
-static int nch(struct parse *p, cset *cs);
+static void CHadd(struct parse *p, cset *cs, wint_t ch);
+static void CHaddrange(struct parse *p, cset *cs, wint_t min, wint_t max);
+static void CHaddtype(struct parse *p, cset *cs, wctype_t wct);
+static wint_t singleton(cset *cs);
static sopno dupl(struct parse *p, sopno start, sopno finish);
static void doemit(struct parse *p, sop op, size_t opnd);
static void doinsert(struct parse *p, sop op, size_t opnd, sopno pos);
@@ -117,6 +119,7 @@ static int altoffset(sop *scan, int offset);
static void computejumps(struct parse *p, struct re_guts *g);
static void computematchjumps(struct parse *p, struct re_guts *g);
static sopno pluscount(struct parse *p, struct re_guts *g);
+static wint_t wgetnext(struct parse *p);
#ifdef __cplusplus
}
@@ -141,6 +144,7 @@ static char nuls[10]; /* place to point scanner in event of error */
#define NEXT2() (p->next += 2)
#define NEXTn(n) (p->next += (n))
#define GETNEXT() (*p->next++)
+#define WGETNEXT() wgetnext(p)
#define SETERROR(e) seterr(p, (e))
#define REQUIRE(co, e) ((co) || SETERROR(e))
#define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e))
@@ -226,9 +230,7 @@ int cflags;
p->pbegin[i] = 0;
p->pend[i] = 0;
}
- g->csetsize = NC;
g->sets = NULL;
- g->setbits = NULL;
g->ncsets = 0;
g->cflags = cflags;
g->iflags = 0;
@@ -340,6 +342,7 @@ p_ere_exp(p)
struct parse *p;
{
char c;
+ wint_t wc;
sopno pos;
int count;
int count2;
@@ -409,14 +412,16 @@ struct parse *p;
break;
case '\\':
(void)REQUIRE(MORE(), REG_EESCAPE);
- c = GETNEXT();
- ordinary(p, c);
+ wc = WGETNEXT();
+ ordinary(p, wc);
break;
case '{': /* okay as ordinary except if digit follows */
(void)REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
/* FALLTHROUGH */
default:
- ordinary(p, c);
+ p->next--;
+ wc = WGETNEXT();
+ ordinary(p, wc);
break;
}
@@ -490,7 +495,7 @@ struct parse *p;
{
(void)REQUIRE(MORE(), REG_EMPTY);
while (MORE())
- ordinary(p, GETNEXT());
+ ordinary(p, WGETNEXT());
}
/*
@@ -546,6 +551,7 @@ int starordinary; /* is a leading * an ordinary character? */
int count2;
sopno pos;
int i;
+ wint_t wc;
sopno subno;
# define BACKSL (1<<CHAR_BIT)
@@ -617,7 +623,9 @@ int starordinary; /* is a leading * an ordinary character? */
(void)REQUIRE(starordinary, REG_BADRPT);
/* FALLTHROUGH */
default:
- ordinary(p, (char)c);
+ p->next--;
+ wc = WGETNEXT();
+ ordinary(p, wc);
break;
}
@@ -673,16 +681,13 @@ struct parse *p;
/*
- p_bracket - parse a bracketed character list
== static void p_bracket(struct parse *p);
- *
- * Note a significant property of this code: if the allocset() did SETERROR,
- * no set operations are done.
*/
static void
p_bracket(p)
struct parse *p;
{
- cset *cs = allocset(p);
- int invert = 0;
+ cset *cs;
+ wint_t ch;
/* Dept of Truly Sickening Special-Case Kludges */
if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
@@ -696,49 +701,34 @@ struct parse *p;
return;
}
+ if ((cs = allocset(p)) == NULL)
+ return;
+
+ if (p->g->cflags&REG_ICASE)
+ cs->icase = 1;
if (EAT('^'))
- invert++; /* make note to invert set at end */
+ cs->invert = 1;
if (EAT(']'))
- CHadd(cs, ']');
+ CHadd(p, cs, ']');
else if (EAT('-'))
- CHadd(cs, '-');
+ CHadd(p, cs, '-');
while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
p_b_term(p, cs);
if (EAT('-'))
- CHadd(cs, '-');
+ CHadd(p, cs, '-');
(void)MUSTEAT(']', REG_EBRACK);
if (p->error != 0) /* don't mess things up further */
return;
- if (p->g->cflags&REG_ICASE) {
- int i;
- int ci;
-
- for (i = p->g->csetsize - 1; i >= 0; i--)
- if (CHIN(cs, i) && isalpha(i)) {
- ci = othercase(i);
- if (ci != i)
- CHadd(cs, ci);
- }
- }
- if (invert) {
- int i;
-
- for (i = p->g->csetsize - 1; i >= 0; i--)
- if (CHIN(cs, i))
- CHsub(cs, i);
- else
- CHadd(cs, i);
- if (p->g->cflags&REG_NEWLINE)
- CHsub(cs, '\n');
- }
+ if (cs->invert && p->g->cflags&REG_NEWLINE)
+ cs->bmp['\n' >> 3] |= 1 << ('\n' & 7);
- if (nch(p, cs) == 1) { /* optimize singleton sets */
- ordinary(p, firstch(p, cs));
+ if ((ch = singleton(cs)) != OUT) { /* optimize singleton sets */
+ ordinary(p, ch);
freeset(p, cs);
} else
- EMIT(OANYOF, freezeset(p, cs));
+ EMIT(OANYOF, (int)(cs - p->g->sets));
}
/*
@@ -751,8 +741,8 @@ struct parse *p;
cset *cs;
{
char c;
- char start, finish;
- int i;
+ wint_t start, finish;
+ wint_t i;
/* classify what we've got */
switch ((MORE()) ? PEEK() : '\0') {
@@ -799,19 +789,18 @@ cset *cs;
} else
finish = start;
if (start == finish)
- CHadd(cs, start);
+ CHadd(p, cs, start);
else {
if (__collate_load_error) {
(void)REQUIRE((uch)start <= (uch)finish, REG_ERANGE);
- for (i = (uch)start; i <= (uch)finish; i++)
- CHadd(cs, i);
+ CHaddrange(p, cs, start, finish);
} else {
(void)REQUIRE(__collate_range_cmp(start, finish) <= 0, REG_ERANGE);
- for (i = CHAR_MIN; i <= CHAR_MAX; i++) {
+ for (i = 0; i <= UCHAR_MAX; i++) {
if ( __collate_range_cmp(start, i) <= 0
&& __collate_range_cmp(i, finish) <= 0
)
- CHadd(cs, i);
+ CHadd(p, cs, i);
}
}
}
@@ -828,85 +817,25 @@ p_b_cclass(p, cs)
struct parse *p;
cset *cs;
{
- int c;
char *sp = p->next;
- struct cclass *cp;
size_t len;
+ wctype_t wct;
+ char clname[16];
while (MORE() && isalpha((uch)PEEK()))
NEXT();
len = p->next - sp;
- for (cp = cclasses; cp->name != NULL; cp++)
- if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
- break;
- if (cp->name == NULL) {
- /* oops, didn't find it */
+ if (len >= sizeof(clname) - 1) {
SETERROR(REG_ECTYPE);
return;
}
-
- switch (cp->fidx) {
- case CALNUM:
- for (c = CHAR_MIN; c <= CHAR_MAX; c++)
- if (isalnum((uch)c))
- CHadd(cs, c);
- break;
- case CALPHA:
- for (c = CHAR_MIN; c <= CHAR_MAX; c++)
- if (isalpha((uch)c))
- CHadd(cs, c);
- break;
- case CBLANK:
- for (c = CHAR_MIN; c <= CHAR_MAX; c++)
- if (isblank((uch)c))
- CHadd(cs, c);
- break;
- case CCNTRL:
- for (c = CHAR_MIN; c <= CHAR_MAX; c++)
- if (iscntrl((uch)c))
- CHadd(cs, c);
- break;
- case CDIGIT:
- for (c = CHAR_MIN; c <= CHAR_MAX; c++)
- if (isdigit((uch)c))
- CHadd(cs, c);
- break;
- case CGRAPH:
- for (c = CHAR_MIN; c <= CHAR_MAX; c++)
- if (isgraph((uch)c))
- CHadd(cs, c);
- break;
- case CLOWER:
- for (c = CHAR_MIN; c <= CHAR_MAX; c++)
- if (islower((uch)c))
- CHadd(cs, c);
- break;
- case CPRINT:
- for (c = CHAR_MIN; c <= CHAR_MAX; c++)
- if (isprint((uch)c))
- CHadd(cs, c);
- break;
- case CPUNCT:
- for (c = CHAR_MIN; c <= CHAR_MAX; c++)
- if (ispunct((uch)c))
- CHadd(cs, c);
- break;
- case CSPACE:
- for (c = CHAR_MIN; c <= CHAR_MAX; c++)
- if (isspace((uch)c))
- CHadd(cs, c);
- break;
- case CUPPER:
- for (c = CHAR_MIN; c <= CHAR_MAX; c++)
- if (isupper((uch)c))
- CHadd(cs, c);
- break;
- case CXDIGIT:
- for (c = CHAR_MIN; c <= CHAR_MAX; c++)
- if (isxdigit((uch)c))
- CHadd(cs, c);
- break;
+ memcpy(clname, sp, len);
+ clname[len] = '\0';
+ if ((wct = wctype(clname)) == 0) {
+ SETERROR(REG_ECTYPE);
+ return;
}
+ CHaddtype(p, cs, wct);
}
/*
@@ -920,25 +849,25 @@ p_b_eclass(p, cs)
struct parse *p;
cset *cs;
{
- char c;
+ wint_t c;
c = p_b_coll_elem(p, '=');
- CHadd(cs, c);
+ CHadd(p, cs, c);
}
/*
- p_b_symbol - parse a character or [..]ed multicharacter collating symbol
== static char p_b_symbol(struct parse *p);
*/
-static char /* value of symbol */
+static wint_t /* value of symbol */
p_b_symbol(p)
struct parse *p;
{
- char value;
+ wint_t value;
(void)REQUIRE(MORE(), REG_EBRACK);
if (!EATTWO('[', '.'))
- return(GETNEXT());
+ return(WGETNEXT());
/* collating symbol */
value = p_b_coll_elem(p, '.');
@@ -950,14 +879,17 @@ struct parse *p;
- p_b_coll_elem - parse a collating-element name and look it up
== static char p_b_coll_elem(struct parse *p, int endc);
*/
-static char /* value of collating element */
+static wint_t /* value of collating element */
p_b_coll_elem(p, endc)
struct parse *p;
-int endc; /* name ended by endc,']' */
+wint_t endc; /* name ended by endc,']' */
{
char *sp = p->next;
struct cname *cp;
int len;
+ mbstate_t mbs;
+ wchar_t wc;
+ size_t clen;
while (MORE() && !SEETWO(endc, ']'))
NEXT();
@@ -969,9 +901,13 @@ int endc; /* name ended by endc,']' */
for (cp = cnames; cp->name != NULL; cp++)
if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
return(cp->code); /* known name */
- if (len == 1)
- return(*sp); /* single character */
- SETERROR(REG_ECOLLATE); /* neither */
+ memset(&mbs, 0, sizeof(mbs));
+ if ((clen = mbrtowc(&wc, sp, len, &mbs)) == len)
+ return (wc); /* single character */
+ else if (clen == (size_t)-1 || clen == (size_t)-2)
+ SETERROR(REG_ILLSEQ);
+ else
+ SETERROR(REG_ECOLLATE); /* neither */
return(0);
}
@@ -979,16 +915,15 @@ int endc; /* name ended by endc,']' */
- othercase - return the case counterpart of an alphabetic
== static char othercase(int ch);
*/
-static char /* if no counterpart, return ch */
+static wint_t /* if no counterpart, return ch */
othercase(ch)
-int ch;
+wint_t ch;
{
- ch = (uch)ch;
- assert(isalpha(ch));
- if (isupper(ch))
- return(tolower(ch));
- else if (islower(ch))
- return(toupper(ch));
+ assert(iswalpha(ch));
+ if (iswupper(ch))
+ return(towlower(ch));
+ else if (iswlower(ch))
+ return(towupper(ch));
else /* peculiar, but could happen */
return(ch);
}
@@ -1002,21 +937,24 @@ int ch;
static void
bothcases(p, ch)
struct parse *p;
-int ch;
+wint_t ch;
{
char *oldnext = p->next;
char *oldend = p->end;
- char bracket[3];
+ char bracket[3 + MB_LEN_MAX];
+ size_t n;
+ mbstate_t mbs;
- ch = (uch)ch;
assert(othercase(ch) != ch); /* p_bracket() would recurse */
p->next = bracket;
- p->end = bracket+2;
- bracket[0] = ch;
- bracket[1] = ']';
- bracket[2] = '\0';
+ memset(&mbs, 0, sizeof(mbs));
+ n = wcrtomb(bracket, ch, &mbs);
+ assert(n != (size_t)-1);
+ bracket[n] = ']';
+ bracket[n + 1] = '\0';
+ p->end = bracket+n+1;
p_bracket(p);
- assert(p->next == bracket+2);
+ assert(p->next == p->end);
p->next = oldnext;
p->end = oldend;
}
@@ -1028,13 +966,24 @@ int ch;
static void
ordinary(p, ch)
struct parse *p;
-int ch;
+wint_t ch;
{
+ cset *cs;
- if ((p->g->cflags&REG_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
+ if ((p->g->cflags&REG_ICASE) && iswalpha(ch) && othercase(ch) != ch)
bothcases(p, ch);
- else
- EMIT(OCHAR, (uch)ch);
+ else if ((ch & OPDMASK) == ch)
+ EMIT(OCHAR, ch);
+ else {
+ /*
+ * Kludge: character is too big to fit into an OCHAR operand.
+ * Emit a singleton set.
+ */
+ if ((cs = allocset(p)) == NULL)
+ return;
+ CHadd(p, cs, ch);
+ EMIT(OANYOF, (int)(cs - p->g->sets));
+ }
}
/*
@@ -1136,6 +1085,31 @@ int to; /* to this number of times (maybe INFINITY) */
}
/*
+ - wgetnext - helper function for WGETNEXT() macro. Gets the next wide
+ - character from the parse struct, signals a REG_ILLSEQ error if the
+ - character can't be converted. Returns the number of bytes consumed.
+ */
+static wint_t
+wgetnext(p)
+struct parse *p;
+{
+ mbstate_t mbs;
+ wchar_t wc;
+ size_t n;
+
+ memset(&mbs, 0, sizeof(mbs));
+ n = mbrtowc(&wc, p->next, p->end - p->next, &mbs);
+ if (n == (size_t)-1 || n == (size_t)-2) {
+ SETERROR(REG_ILLSEQ);
+ return (0);
+ }
+ if (n == 0)
+ n = 1;
+ p->next += n;
+ return (wc);
+}
+
+/*
- seterr - set an error condition
== static int seterr(struct parse *p, int e);
*/
@@ -1159,47 +1133,16 @@ static cset *
allocset(p)
struct parse *p;
{
- int no = p->g->ncsets++;
- size_t nc;
- size_t nbytes;
- cset *cs;
- size_t css = (size_t)p->g->csetsize;
- int i;
+ cset *cs, *ncs;
- if (no >= p->ncsalloc) { /* need another column of space */
- p->ncsalloc += CHAR_BIT;
- nc = p->ncsalloc;
- assert(nc % CHAR_BIT == 0);
- nbytes = nc / CHAR_BIT * css;
- if (p->g->sets == NULL)
- p->g->sets = (cset *)malloc(nc * sizeof(cset));
- else
- p->g->sets = (cset *)reallocf((char *)p->g->sets,
- nc * sizeof(cset));
- if (p->g->setbits == NULL)
- p->g->setbits = (uch *)malloc(nbytes);
- else {
- p->g->setbits = (uch *)reallocf((char *)p->g->setbits,
- nbytes);
- /* xxx this isn't right if setbits is now NULL */
- for (i = 0; i < no; i++)
- p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
- }
- if (p->g->sets != NULL && p->g->setbits != NULL)
- (void) memset((char *)p->g->setbits + (nbytes - css),
- 0, css);
- else {
- no = 0;
- SETERROR(REG_ESPACE);
- /* caller's responsibility not to do set ops */
- }
+ ncs = realloc(p->g->sets, (p->g->ncsets + 1) * sizeof(*ncs));
+ if (ncs == NULL) {
+ SETERROR(REG_ESPACE);
+ return (NULL);
}
-
- assert(p->g->sets != NULL); /* xxx */
- cs = &p->g->sets[no];
- cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
- cs->mask = 1 << ((no) % CHAR_BIT);
- cs->hash = 0;
+ p->g->sets = ncs;
+ cs = &p->g->sets[p->g->ncsets++];
+ memset(cs, 0, sizeof(*cs));
return(cs);
}
@@ -1213,92 +1156,124 @@ freeset(p, cs)
struct parse *p;
cset *cs;
{
- int i;
cset *top = &p->g->sets[p->g->ncsets];
- size_t css = (size_t)p->g->csetsize;
- for (i = 0; i < css; i++)
- CHsub(cs, i);
+ free(cs->wides);
+ free(cs->ranges);
+ free(cs->types);
+ memset(cs, 0, sizeof(*cs));
if (cs == top-1) /* recover only the easy case */
p->g->ncsets--;
}
/*
- - freezeset - final processing on a set of characters
- == static int freezeset(struct parse *p, cset *cs);
- *
- * The main task here is merging identical sets. This is usually a waste
- * of time (although the hash code minimizes the overhead), but can win
- * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
- * is done using addition rather than xor -- all ASCII [aA] sets xor to
- * the same value!
+ - singleton - Determine whether a set contains only one character,
+ - returning it if so, otherwise returning OUT.
*/
-static int /* set number */
-freezeset(p, cs)
-struct parse *p;
+static wint_t
+singleton(cs)
cset *cs;
{
- short h = cs->hash;
- int i;
- cset *top = &p->g->sets[p->g->ncsets];
- cset *cs2;
- size_t css = (size_t)p->g->csetsize;
-
- /* look for an earlier one which is the same */
- for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
- if (cs2->hash == h && cs2 != cs) {
- /* maybe */
- for (i = 0; i < css; i++)
- if (!!CHIN(cs2, i) != !!CHIN(cs, i))
- break; /* no */
- if (i == css)
- break; /* yes */
+ wint_t i, s, n;
+
+ for (i = n = 0; i < NC; i++)
+ if (CHIN(cs, i)) {
+ n++;
+ s = i;
}
+ if (n == 1)
+ return (s);
+ if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
+ cs->icase == 0)
+ return (cs->wides[0]);
+ /* Don't bother handling the other cases. */
+ return (OUT);
+}
- if (cs2 < top) { /* found one */
- freeset(p, cs);
- cs = cs2;
+/*
+ - CHadd - add character to character set.
+ */
+static void
+CHadd(p, cs, ch)
+struct parse *p;
+cset *cs;
+wint_t ch;
+{
+ wint_t *newwides;
+ assert(ch >= 0);
+ if (ch < NC) {
+ cs->bmp[ch >> 3] |= 1 << (ch & 7);
+ if (cs->icase) {
+ cs->bmp[towlower(ch) >> 3] |= 1 << (towlower(ch) & 7);
+ cs->bmp[towupper(ch) >> 3] |= 1 << (towupper(ch) & 7);
+ }
+ } else {
+ newwides = realloc(cs->wides, (cs->nwides + 1) *
+ sizeof(*cs->wides));
+ if (newwides == NULL) {
+ SETERROR(REG_ESPACE);
+ return;
+ }
+ cs->wides = newwides;
+ cs->wides[cs->nwides++] = ch;
}
-
- return((int)(cs - p->g->sets));
}
/*
- - firstch - return first character in a set (which must have at least one)
- == static int firstch(struct parse *p, cset *cs);
+ - CHaddrange - add all characters in the range [min,max] to a character set.
*/
-static int /* character; there is no "none" value */
-firstch(p, cs)
+static void
+CHaddrange(p, cs, min, max)
struct parse *p;
cset *cs;
+wint_t min, max;
{
- int i;
- size_t css = (size_t)p->g->csetsize;
+ crange *newranges;
- for (i = 0; i < css; i++)
- if (CHIN(cs, i))
- return((char)i);
- assert(never);
- return(0); /* arbitrary */
+ for (; min < NC && min <= max; min++)
+ CHadd(p, cs, min);
+ if (min >= max)
+ return;
+ newranges = realloc(cs->ranges, (cs->nranges + 1) *
+ sizeof(*cs->ranges));
+ if (newranges == NULL) {
+ SETERROR(REG_ESPACE);
+ return;
+ }
+ cs->ranges = newranges;
+ cs->ranges[cs->nranges].min = min;
+ cs->ranges[cs->nranges].min = max;
+ cs->nranges++;
}
/*
- - nch - number of characters in a set
- == static int nch(struct parse *p, cset *cs);
+ - CHaddtype - add all characters of a certain type to a character set.
*/
-static int
-nch(p, cs)
+static void
+CHaddtype(p, cs, wct)
struct parse *p;
cset *cs;
+wctype_t wct;
{
- int i;
- size_t css = (size_t)p->g->csetsize;
- int n = 0;
-
- for (i = 0; i < css; i++)
- if (CHIN(cs, i))
- n++;
- return(n);
+ wint_t i;
+ wctype_t *newtypes;
+
+ for (i = 0; i < NC; i++) {
+ if (iswctype(i, wct))
+ CHadd(p, cs, i);
+ if (cs->icase && i != towlower(i))
+ CHadd(p, cs, towlower(i));
+ if (cs->icase && i != towupper(i))
+ CHadd(p, cs, towupper(i));
+ }
+ newtypes = realloc(cs->types, (cs->ntypes + 1) *
+ sizeof(*cs->types));
+ if (newtypes == NULL) {
+ SETERROR(REG_ESPACE);
+ return;
+ }
+ cs->types = newtypes;
+ cs->types[cs->ntypes++] = wct;
}
/*
@@ -1474,13 +1449,24 @@ struct re_guts *g;
sopno newlen;
sop s;
char *cp;
- sopno i;
int offset;
+ char buf[MB_LEN_MAX];
+ size_t clen;
+ mbstate_t mbs;
/* avoid making error situations worse */
if (p->error != 0)
return;
+ /*
+ * It's not generally safe to do a ``char'' substring search on
+ * multibyte character strings, but it's safe for at least
+ * UTF-8 (see RFC 3629).
+ */
+ if (MB_CUR_MAX > 1 &&
+ strcmp(_CurrentRuneLocale->__encoding, "UTF-8") != 0)
+ return;
+
/* find the longest OCHAR sequence in strip */
newlen = 0;
offset = 0;
@@ -1490,9 +1476,14 @@ struct re_guts *g;
s = *scan++;
switch (OP(s)) {
case OCHAR: /* sequence member */
- if (newlen == 0) /* new sequence */
+ if (newlen == 0) { /* new sequence */
+ memset(&mbs, 0, sizeof(mbs));
newstart = scan - 1;
- newlen++;
+ }
+ clen = wcrtomb(buf, OPND(s), &mbs);
+ if (clen == (size_t)-1)
+ goto toohard;
+ newlen += clen;
break;
case OPLUS_: /* things that don't break one */
case OLPAREN:
@@ -1569,6 +1560,7 @@ struct re_guts *g;
offset++;
newlen = 0;
break;
+ toohard:
default:
/* Anything here makes it impossible or too hard
* to calculate the offset -- so we give up;
@@ -1603,11 +1595,13 @@ struct re_guts *g;
}
cp = g->must;
scan = start;
- for (i = g->mlen; i > 0; i--) {
+ memset(&mbs, 0, sizeof(mbs));
+ while (cp < g->must + g->mlen) {
while (OP(s = *scan++) != OCHAR)
continue;
- assert(cp < g->must + g->mlen);
- *cp++ = (char)OPND(s);
+ clen = wcrtomb(cp, OPND(s), &mbs);
+ assert(clen != (size_t)-1);
+ cp += clen;
}
assert(cp == g->must + g->mlen);
*cp++ = '\0'; /* just on general principles */
diff --git a/lib/libc/regex/regex2.h b/lib/libc/regex/regex2.h
index 4678824..6beea48 100644
--- a/lib/libc/regex/regex2.h
+++ b/lib/libc/regex/regex2.h
@@ -87,7 +87,7 @@ typedef long sopno;
/* operators meaning operand */
/* (back, fwd are offsets) */
#define OEND (1L<<OPSHIFT) /* endmarker - */
-#define OCHAR (2L<<OPSHIFT) /* character unsigned char */
+#define OCHAR (2L<<OPSHIFT) /* character wide character */
#define OBOL (3L<<OPSHIFT) /* left anchor - */
#define OEOL (4L<<OPSHIFT) /* right anchor - */
#define OANY (5L<<OPSHIFT) /* . - */
@@ -108,21 +108,63 @@ typedef long sopno;
#define OEOW (20L<<OPSHIFT) /* end word - */
/*
- * Structure for [] character-set representation. Character sets are
- * done as bit vectors, grouped 8 to a byte vector for compactness.
- * The individual set therefore has both a pointer to the byte vector
- * and a mask to pick out the relevant bit of each byte. A hash code
- * simplifies testing whether two sets could be identical.
+ * Structures for [] character-set representation.
*/
typedef struct {
- uch *ptr; /* -> uch [csetsize] */
- uch mask; /* bit within array */
- short hash; /* hash code */
+ wint_t min;
+ wint_t max;
+} crange;
+typedef struct {
+ unsigned char bmp[NC / 8];
+ wctype_t *types;
+ int ntypes;
+ wint_t *wides;
+ int nwides;
+ crange *ranges;
+ int nranges;
+ int invert;
+ int icase;
} cset;
-/* note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1 */
-#define CHadd(cs, c) ((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (uch)(c))
-#define CHsub(cs, c) ((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (uch)(c))
-#define CHIN(cs, c) ((cs)->ptr[(uch)(c)] & (cs)->mask)
+
+static int
+CHIN1(cs, ch)
+cset *cs;
+wint_t ch;
+{
+ int i;
+
+ assert(ch >= 0);
+ if (ch < NC)
+ return (((cs->bmp[ch >> 3] & (1 << (ch & 7))) != 0) ^
+ cs->invert);
+ for (i = 0; i < cs->nwides; i++)
+ if (ch == cs->wides[i])
+ return (!cs->invert);
+ for (i = 0; i < cs->nranges; i++)
+ if (cs->ranges[i].min <= ch && ch <= cs->ranges[i].max)
+ return (!cs->invert);
+ for (i = 0; i < cs->ntypes; i++)
+ if (iswctype(ch, cs->types[i]))
+ return (!cs->invert);
+ return (cs->invert);
+}
+
+static __inline int
+CHIN(cs, ch)
+cset *cs;
+wint_t ch;
+{
+
+ assert(ch >= 0);
+ if (ch < NC)
+ return (((cs->bmp[ch >> 3] & (1 << (ch & 7))) != 0) ^
+ cs->invert);
+ else if (cs->icase)
+ return (CHIN1(cs, ch) || CHIN1(cs, towlower(ch)) ||
+ CHIN1(cs, towupper(ch)));
+ else
+ return (CHIN1(cs, ch));
+}
/*
* main compiled-expression structure
@@ -131,10 +173,8 @@ struct re_guts {
int magic;
# define MAGIC2 ((('R'^0200)<<8)|'E')
sop *strip; /* malloced area for strip */
- int csetsize; /* number of bits in a cset vector */
int ncsets; /* number of csets in use */
cset *sets; /* -> cset [ncsets] */
- uch *setbits; /* -> uch[csetsize][ncsets/CHAR_BIT] */
int cflags; /* copy of regcomp() cflags argument */
sopno nstates; /* = number of sops */
sopno firststate; /* the initial OEND (normally 0) */
@@ -156,5 +196,5 @@ struct re_guts {
};
/* misc utilities */
-#define OUT (CHAR_MAX+1) /* a non-character value */
-#define ISWORD(c) (isalnum((uch)(c)) || (c) == '_')
+#define OUT (-2) /* a non-character value */
+#define ISWORD(c) (iswalnum((uch)(c)) || (c) == '_')
diff --git a/lib/libc/regex/regexec.c b/lib/libc/regex/regexec.c
index c13c72d..c596bdd 100644
--- a/lib/libc/regex/regexec.c
+++ b/lib/libc/regex/regexec.c
@@ -46,9 +46,9 @@ __FBSDID("$FreeBSD$");
/*
* the outer shell of regexec()
*
- * This file includes engine.c *twice*, after muchos fiddling with the
+ * This file includes engine.c three times, after muchos fiddling with the
* macros that code uses. This lets the same code operate on two different
- * representations for state sets.
+ * representations for state sets and characters.
*/
#include <sys/types.h>
#include <stdio.h>
@@ -57,12 +57,53 @@ __FBSDID("$FreeBSD$");
#include <limits.h>
#include <ctype.h>
#include <regex.h>
+#include <wchar.h>
+#include <wctype.h>
#include "utils.h"
#include "regex2.h"
static int nope __unused = 0; /* for use in asserts; shuts lint up */
+static __inline size_t
+xmbrtowc(wi, s, n, mbs, dummy)
+wint_t *wi;
+const char *s;
+size_t n;
+mbstate_t *mbs;
+wint_t dummy;
+{
+ size_t nr;
+ wchar_t wc;
+
+ nr = mbrtowc(&wc, s, n, mbs);
+ if (wi != NULL)
+ *wi = wc;
+ if (nr == 0)
+ return (1);
+ else if (nr == (size_t)-1 || nr == (size_t)-2) {
+ memset(mbs, 0, sizeof(*mbs));
+ if (wi != NULL)
+ *wi = dummy;
+ return (1);
+ } else
+ return (nr);
+}
+
+static __inline size_t
+xmbrtowc_dummy(wi, s, n, mbs, dummy)
+wint_t *wi;
+const char *s;
+size_t n __unused;
+mbstate_t *mbs __unused;
+wint_t dummy __unused;
+{
+
+ if (wi != NULL)
+ *wi = (unsigned char)*s;
+ return (1);
+}
+
/* macros for manipulating states, small version */
#define states long
#define states1 states /* for later use in regexec() decision */
@@ -85,6 +126,9 @@ static int nope __unused = 0; /* for use in asserts; shuts lint up */
#define FWD(dst, src, n) ((dst) |= ((unsigned long)(src)&(here)) << (n))
#define BACK(dst, src, n) ((dst) |= ((unsigned long)(src)&(here)) >> (n))
#define ISSETBACK(v, n) (((v) & ((unsigned long)here >> (n))) != 0)
+/* no multibyte support */
+#define XMBRTOWC xmbrtowc_dummy
+#define ZAPSTATE(mbs) ((void)(mbs))
/* function names */
#define SNAMES /* engine.c looks after details */
@@ -110,6 +154,8 @@ static int nope __unused = 0; /* for use in asserts; shuts lint up */
#undef BACK
#undef ISSETBACK
#undef SNAMES
+#undef XMBRTOWC
+#undef ZAPSTATE
/* macros for manipulating states, large version */
#define states char *
@@ -134,11 +180,24 @@ static int nope __unused = 0; /* for use in asserts; shuts lint up */
#define FWD(dst, src, n) ((dst)[here+(n)] |= (src)[here])
#define BACK(dst, src, n) ((dst)[here-(n)] |= (src)[here])
#define ISSETBACK(v, n) ((v)[here - (n)])
+/* no multibyte support */
+#define XMBRTOWC xmbrtowc_dummy
+#define ZAPSTATE(mbs) ((void)(mbs))
/* function names */
#define LNAMES /* flag */
#include "engine.c"
+/* multibyte character & large states version */
+#undef LNAMES
+#undef XMBRTOWC
+#undef ZAPSTATE
+#define XMBRTOWC xmbrtowc
+#define ZAPSTATE(mbs) memset((mbs), 0, sizeof(*(mbs)))
+#define MNAMES
+
+#include "engine.c"
+
/*
- regexec - interface for matching
= extern int regexec(const regex_t *, const char *, size_t, \
@@ -176,7 +235,9 @@ int eflags;
return(REG_BADPAT);
eflags = GOODFLAGS(eflags);
- if (g->nstates <= CHAR_BIT*sizeof(states1) && !(eflags&REG_LARGE))
+ if (MB_CUR_MAX > 1)
+ return(mmatcher(g, (char *)string, nmatch, pmatch, eflags));
+ else if (g->nstates <= CHAR_BIT*sizeof(states1) && !(eflags&REG_LARGE))
return(smatcher(g, (char *)string, nmatch, pmatch, eflags));
else
return(lmatcher(g, (char *)string, nmatch, pmatch, eflags));
diff --git a/lib/libc/regex/regfree.c b/lib/libc/regex/regfree.c
index 96f2997..b1e6a35 100644
--- a/lib/libc/regex/regfree.c
+++ b/lib/libc/regex/regfree.c
@@ -48,6 +48,8 @@ __FBSDID("$FreeBSD$");
#include <stdlib.h>
#include <limits.h>
#include <regex.h>
+#include <wchar.h>
+#include <wctype.h>
#include "utils.h"
#include "regex2.h"
@@ -61,6 +63,7 @@ regfree(preg)
regex_t *preg;
{
struct re_guts *g;
+ int i;
if (preg->re_magic != MAGIC1) /* oops */
return; /* nice to complain, but hard */
@@ -73,10 +76,14 @@ regex_t *preg;
if (g->strip != NULL)
free((char *)g->strip);
- if (g->sets != NULL)
+ if (g->sets != NULL) {
+ for (i = 0; i < g->ncsets; i++) {
+ free(g->sets[i].ranges);
+ free(g->sets[i].wides);
+ free(g->sets[i].types);
+ }
free((char *)g->sets);
- if (g->setbits != NULL)
- free((char *)g->setbits);
+ }
if (g->must != NULL)
free(g->must);
if (g->charjump != NULL)
OpenPOWER on IntegriCloud