summaryrefslogtreecommitdiffstats
path: root/lib/libc/regex/regexec.c
diff options
context:
space:
mode:
author tjr <tjr@FreeBSD.org> 2004-07-12 07:35:59 +0000
committer tjr <tjr@FreeBSD.org> 2004-07-12 07:35:59 +0000
commit ba689b40433c4dbba7960454bbb126e6da5bdf59 (patch)
tree 0f9f638917b5d0fb257cbfab36551506ddb3b43f /lib/libc/regex/regexec.c
parent 031e087d2c3a8f23b40ec3cd62b741f935e1e2e6 (diff)
download FreeBSD-src-ba689b40433c4dbba7960454bbb126e6da5bdf59.zip
FreeBSD-src-ba689b40433c4dbba7960454bbb126e6da5bdf59.tar.gz
Make regular expression matching aware of multibyte characters. The general
idea is that we perform multibyte->wide character conversion while parsing and compiling, then convert byte sequences to wide characters when they're needed for comparison and stepping through the string during execution. As with tr(1), the main complication is to efficiently represent sets of characters in bracket expressions. The old bitmap representation is replaced by a bitmap for the first 256 characters combined with a vector of individual wide characters, a vector of character ranges (for [A-Z] etc.), and a vector of character classes (for [[:alpha:]] etc.). One other point of interest is that although the Boyer-Moore algorithm had to be disabled in the general multibyte case, it is still enabled for UTF-8 because of its self-synchronizing nature. This greatly speeds up matching by reducing the number of multibyte conversions that need to be done.
Diffstat (limited to 'lib/libc/regex/regexec.c')
-rw-r--r-- lib/libc/regex/regexec.c 67
1 files changed, 64 insertions, 3 deletions
diff --git a/lib/libc/regex/regexec.c b/lib/libc/regex/regexec.c
index c13c72d..c596bdd 100644
--- a/lib/libc/regex/regexec.c
+++ b/lib/libc/regex/regexec.c
@@ -46,9 +46,9 @@ __FBSDID("$FreeBSD$");
/*
* the outer shell of regexec()
*
- * This file includes engine.c *twice*, after muchos fiddling with the
+ * This file includes engine.c three times, after muchos fiddling with the
* macros that code uses. This lets the same code operate on two different
- * representations for state sets.
+ * representations for state sets and characters.
*/
#include <sys/types.h>
#include <stdio.h>
@@ -57,12 +57,53 @@ __FBSDID("$FreeBSD$");
#include <limits.h>
#include <ctype.h>
#include <regex.h>
+#include <wchar.h>
+#include <wctype.h>
#include "utils.h"
#include "regex2.h"
static int nope __unused = 0; /* for use in asserts; shuts lint up */
+static __inline size_t /* xmbrtowc: decode one multibyte character at s via mbrtowc(); returns a byte count that is 1 for the null character or a decoding error, otherwise mbrtowc()'s result */
+xmbrtowc(wi, s, n, mbs, dummy)
+wint_t *wi; /* out: receives the decoded character; may be NULL */
+const char *s; /* in: bytes to decode */
+size_t n; /* in: byte limit handed to mbrtowc() */
+mbstate_t *mbs; /* in/out: conversion state; zeroed on a decoding error */
+wint_t dummy; /* in: value stored through wi when decoding fails */
+{
+ size_t nr;
+ wchar_t wc;
+/*
+ * NOTE(review): the store of wc through wi below happens before nr is
+ * checked, so on the error path it copies whatever wc holds (mbrtowc()
+ * may not have written it); the error branch then overwrites *wi with
+ * dummy.
+ */
+ nr = mbrtowc(&wc, s, n, mbs);
+ if (wi != NULL)
+ *wi = wc;
+ if (nr == 0) /* mbrtowc() returns 0 for the null character; report a length of one byte instead */
+ return (1);
+ else if (nr == (size_t)-1 || nr == (size_t)-2) { /* invalid or incomplete sequence */
+ memset(mbs, 0, sizeof(*mbs)); /* reset *mbs to all-zero */
+ if (wi != NULL)
+ *wi = dummy;
+ return (1); /* report a length of one byte */
+ } else
+ return (nr);
+}
+/*
+ * xmbrtowc_dummy - single-byte stand-in with the same parameter list as
+ * xmbrtowc(): every byte is taken to be one character.  Stores
+ * (unsigned char)*s through wi when wi is non-NULL and always returns 1.
+ */
+static __inline size_t
+xmbrtowc_dummy(wi, s, n, mbs, dummy)
+wint_t *wi; /* out: receives the byte value; may be NULL */
+const char *s; /* in: byte to read */
+size_t n __unused;
+mbstate_t *mbs __unused;
+wint_t dummy __unused;
+{
+/* n, mbs and dummy are ignored; the cast keeps bytes >= 0x80 from going negative where char is signed. */
+ if (wi != NULL)
+ *wi = (unsigned char)*s;
+ return (1);
+}
+
/* macros for manipulating states, small version */
#define states long
#define states1 states /* for later use in regexec() decision */
@@ -85,6 +126,9 @@ static int nope __unused = 0; /* for use in asserts; shuts lint up */
#define FWD(dst, src, n) ((dst) |= ((unsigned long)(src)&(here)) << (n))
#define BACK(dst, src, n) ((dst) |= ((unsigned long)(src)&(here)) >> (n))
#define ISSETBACK(v, n) (((v) & ((unsigned long)here >> (n))) != 0)
+/* no multibyte support */
+#define XMBRTOWC xmbrtowc_dummy /* one byte per character, no conversion */
+#define ZAPSTATE(mbs) ((void)(mbs)) /* nothing to reset; only evaluates the argument */
/* function names */
#define SNAMES /* engine.c looks after details */
@@ -110,6 +154,8 @@ static int nope __unused = 0; /* for use in asserts; shuts lint up */
#undef BACK
#undef ISSETBACK
#undef SNAMES
+#undef XMBRTOWC
+#undef ZAPSTATE
/* macros for manipulating states, large version */
#define states char *
@@ -134,11 +180,24 @@ static int nope __unused = 0; /* for use in asserts; shuts lint up */
#define FWD(dst, src, n) ((dst)[here+(n)] |= (src)[here])
#define BACK(dst, src, n) ((dst)[here-(n)] |= (src)[here])
#define ISSETBACK(v, n) ((v)[here - (n)])
+/* no multibyte support */
+#define XMBRTOWC xmbrtowc_dummy /* one byte per character, no conversion */
+#define ZAPSTATE(mbs) ((void)(mbs)) /* nothing to reset; only evaluates the argument */
/* function names */
#define LNAMES /* flag */
#include "engine.c"
+/* multibyte character & large states version */
+#undef LNAMES
+#undef XMBRTOWC
+#undef ZAPSTATE
+#define XMBRTOWC xmbrtowc /* decode through mbrtowc() via xmbrtowc() */
+#define ZAPSTATE(mbs) memset((mbs), 0, sizeof(*(mbs))) /* zero the object mbs points to */
+#define MNAMES /* flag; presumably selects the m-prefixed names (e.g. mmatcher) in engine.c -- confirm there */
+/* Another inclusion of engine.c; FWD, BACK and ISSETBACK defined above are not #undef'd here. */
+#include "engine.c"
+
/*
- regexec - interface for matching
= extern int regexec(const regex_t *, const char *, size_t, \
@@ -176,7 +235,9 @@ int eflags;
return(REG_BADPAT);
eflags = GOODFLAGS(eflags);
- if (g->nstates <= CHAR_BIT*sizeof(states1) && !(eflags&REG_LARGE))
+ if (MB_CUR_MAX > 1)
+ return(mmatcher(g, (char *)string, nmatch, pmatch, eflags));
+ else if (g->nstates <= CHAR_BIT*sizeof(states1) && !(eflags&REG_LARGE))
return(smatcher(g, (char *)string, nmatch, pmatch, eflags));
else
return(lmatcher(g, (char *)string, nmatch, pmatch, eflags));
OpenPOWER on IntegriCloud