diff options
author | tjr <tjr@FreeBSD.org> | 2004-07-12 07:35:59 +0000 |
---|---|---|
committer | tjr <tjr@FreeBSD.org> | 2004-07-12 07:35:59 +0000 |
commit | ba689b40433c4dbba7960454bbb126e6da5bdf59 (patch) | |
tree | 0f9f638917b5d0fb257cbfab36551506ddb3b43f /lib/libc/regex/regex2.h | |
parent | 031e087d2c3a8f23b40ec3cd62b741f935e1e2e6 (diff) | |
download | FreeBSD-src-ba689b40433c4dbba7960454bbb126e6da5bdf59.zip FreeBSD-src-ba689b40433c4dbba7960454bbb126e6da5bdf59.tar.gz |
Make regular expression matching aware of multibyte characters. The general
idea is that we perform multibyte->wide character conversion while parsing
and compiling, then convert byte sequences to wide characters when they're
needed for comparison and stepping through the string during execution.
As with tr(1), the main complication is to efficiently represent sets of
characters in bracket expressions. The old bitmap representation is replaced
by a bitmap for the first 256 characters combined with a vector of individual
wide characters, a vector of character ranges (for [A-Z] etc.), and a vector
of character classes (for [[:alpha:]] etc.).
One other point of interest is that although the Boyer-Moore algorithm had
to be disabled in the general multibyte case, it is still enabled for UTF-8
because of its self-synchronizing nature. This greatly speeds up matching
by reducing the number of multibyte conversions that need to be done.
Diffstat (limited to 'lib/libc/regex/regex2.h')
-rw-r--r-- | lib/libc/regex/regex2.h | 74 |
1 files changed, 57 insertions, 17 deletions
diff --git a/lib/libc/regex/regex2.h b/lib/libc/regex/regex2.h index 4678824..6beea48 100644 --- a/lib/libc/regex/regex2.h +++ b/lib/libc/regex/regex2.h @@ -87,7 +87,7 @@ typedef long sopno; /* operators meaning operand */ /* (back, fwd are offsets) */ #define OEND (1L<<OPSHIFT) /* endmarker - */ -#define OCHAR (2L<<OPSHIFT) /* character unsigned char */ +#define OCHAR (2L<<OPSHIFT) /* character wide character */ #define OBOL (3L<<OPSHIFT) /* left anchor - */ #define OEOL (4L<<OPSHIFT) /* right anchor - */ #define OANY (5L<<OPSHIFT) /* . - */ @@ -108,21 +108,63 @@ typedef long sopno; #define OEOW (20L<<OPSHIFT) /* end word - */ /* - * Structure for [] character-set representation. Character sets are - * done as bit vectors, grouped 8 to a byte vector for compactness. - * The individual set therefore has both a pointer to the byte vector - * and a mask to pick out the relevant bit of each byte. A hash code - * simplifies testing whether two sets could be identical. + * Structures for [] character-set representation. */ typedef struct { - uch *ptr; /* -> uch [csetsize] */ - uch mask; /* bit within array */ - short hash; /* hash code */ + wint_t min; + wint_t max; +} crange; +typedef struct { + unsigned char bmp[NC / 8]; + wctype_t *types; + int ntypes; + wint_t *wides; + int nwides; + crange *ranges; + int nranges; + int invert; + int icase; } cset; -/* note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1 */ -#define CHadd(cs, c) ((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (uch)(c)) -#define CHsub(cs, c) ((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (uch)(c)) -#define CHIN(cs, c) ((cs)->ptr[(uch)(c)] & (cs)->mask) + +static int +CHIN1(cs, ch) +cset *cs; +wint_t ch; +{ + int i; + + assert(ch >= 0); + if (ch < NC) + return (((cs->bmp[ch >> 3] & (1 << (ch & 7))) != 0) ^ + cs->invert); + for (i = 0; i < cs->nwides; i++) + if (ch == cs->wides[i]) + return (!cs->invert); + for (i = 0; i < cs->nranges; i++) + if (cs->ranges[i].min <= ch && ch <= cs->ranges[i].max) + return (!cs->invert); + for (i = 0; i < cs->ntypes; i++) + if (iswctype(ch, cs->types[i])) + return (!cs->invert); + return (cs->invert); +} + +static __inline int +CHIN(cs, ch) +cset *cs; +wint_t ch; +{ + + assert(ch >= 0); + if (ch < NC) + return (((cs->bmp[ch >> 3] & (1 << (ch & 7))) != 0) ^ + cs->invert); + else if (cs->icase) + return (CHIN1(cs, ch) || CHIN1(cs, towlower(ch)) || + CHIN1(cs, towupper(ch))); + else + return (CHIN1(cs, ch)); +} /* * main compiled-expression structure @@ -131,10 +173,8 @@ struct re_guts { int magic; # define MAGIC2 ((('R'^0200)<<8)|'E') sop *strip; /* malloced area for strip */ - int csetsize; /* number of bits in a cset vector */ int ncsets; /* number of csets in use */ cset *sets; /* -> cset [ncsets] */ - uch *setbits; /* -> uch[csetsize][ncsets/CHAR_BIT] */ int cflags; /* copy of regcomp() cflags argument */ sopno nstates; /* = number of sops */ sopno firststate; /* the initial OEND (normally 0) */ @@ -156,5 +196,5 @@ struct re_guts { }; /* misc utilities */ -#define OUT (CHAR_MAX+1) /* a non-character value */ -#define ISWORD(c) (isalnum((uch)(c)) || (c) == '_') +#define OUT (-2) /* a non-character value */ +#define ISWORD(c) (iswalnum((uch)(c)) || (c) == '_') |