summaryrefslogtreecommitdiffstats
path: root/lib
diff options
context:
space:
mode:
authorkevans <kevans@FreeBSD.org>2018-01-18 22:10:00 +0000
committerkevans <kevans@FreeBSD.org>2018-01-18 22:10:00 +0000
commit59d04d9396d6ca2f06abbde724cd61bbd9049d47 (patch)
tree0b8cf203f5366a2b2fa677e825fd45a94e783fa5 /lib
parent95b8f69ef3baf8a9b45e0963cecac957b54eb93c (diff)
downloadFreeBSD-src-59d04d9396d6ca2f06abbde724cd61bbd9049d47.zip
FreeBSD-src-59d04d9396d6ca2f06abbde724cd61bbd9049d47.tar.gz
MFC r320742, r320750, r320796: Refactor regex(3) for maintainability
MFC r320742: The impending libregex will implement GNU extensions to bring BREs and EREs closer together. Prepare for this and reduce the diff of libregex changes by refactoring and combining the top-level parsers for EREs/BREs ahead of time. Branching functionality has been split out to make it easier to follow the combined version of the top-level parser. It may also be enabled in the parsing context to make it easier when libregex enables branching for BREs. A branching context was also added for the various branching functions and so that BREs, for instance, can determine if they're the first expression in a chain of expressions within the current branch and treat '*' as ordinary if so. This should have no functional impact and negligible performance impact. MFC r320750: Fix sparc64 libc build after r320742. p_branch_empty was declared but never used due to an oversight. Use it as designed, further comment on its return value. MFC r320796: Correctly ignore branch operators in the top-level parser when applicable. An oversight in r320742 caused BREs to become sensitive to the branching operator prematurely, which caused breakage in some limited situations -- namely, those that tried to use branching in a BRE. Most of these scenarios had already been corrected beforehand to properly use gsed or grep for GNU extensions, so the damage is slightly mitigated.
Diffstat (limited to 'lib')
-rw-r--r--lib/libc/regex/regcomp.c323
-rw-r--r--lib/libc/regex/regex2.h1
2 files changed, 232 insertions, 92 deletions
diff --git a/lib/libc/regex/regcomp.c b/lib/libc/regex/regcomp.c
index 1967ff1..7012d8d 100644
--- a/lib/libc/regex/regcomp.c
+++ b/lib/libc/regex/regcomp.c
@@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$");
#include <limits.h>
#include <stdlib.h>
#include <regex.h>
+#include <stdbool.h>
#include <wchar.h>
#include <wctype.h>
@@ -62,6 +63,24 @@ __FBSDID("$FreeBSD$");
#include "cname.h"
/*
+ * Branching context, used to keep track of branch state for all of the branch-
+ * aware functions. In addition to keeping track of branch positions for the
+ * p_branch_* functions, we use this to simplify some clumsiness in BREs for
+ * detection of whether ^ is acting as an anchor or being used erroneously and
+ * also for whether we're in a sub-expression or not.
+ */
+struct branchc {
+ sopno start;
+ sopno back;
+ sopno fwd;
+
+ int nbranch;
+ int nchain;
+ bool outer;
+ bool terminate;
+};
+
+/*
* parse structure, passed up and down to avoid global variables and
* other clumsinesses
*/
@@ -77,6 +96,11 @@ struct parse {
# define NPAREN 10 /* we need to remember () 1-9 for back refs */
sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
sopno pend[NPAREN]; /* -> ) ([0] unused) */
+ bool allowbranch; /* can this expression branch? */
+ bool bre; /* convenience; is this a BRE? */
+ bool (*parse_expr)(struct parse *, struct branchc *);
+ void (*pre_parse)(struct parse *, struct branchc *);
+ void (*post_parse)(struct parse *, struct branchc *);
};
/* ========= begin header generated by ./mkh ========= */
@@ -85,11 +109,17 @@ extern "C" {
#endif
/* === regcomp.c === */
-static void p_ere(struct parse *p, int stop);
-static void p_ere_exp(struct parse *p);
+static bool p_ere_exp(struct parse *p, struct branchc *bc);
static void p_str(struct parse *p);
-static void p_bre(struct parse *p, int end1, int end2);
-static int p_simp_re(struct parse *p, int starordinary);
+static int p_branch_eat_delim(struct parse *p, struct branchc *bc);
+static void p_branch_ins_offset(struct parse *p, struct branchc *bc);
+static void p_branch_fix_tail(struct parse *p, struct branchc *bc);
+static bool p_branch_empty(struct parse *p, struct branchc *bc);
+static bool p_branch_do(struct parse *p, struct branchc *bc);
+static void p_bre_pre_parse(struct parse *p, struct branchc *bc);
+static void p_bre_post_parse(struct parse *p, struct branchc *bc);
+static void p_re(struct parse *p, int end1, int end2);
+static bool p_simp_re(struct parse *p, struct branchc *bc);
static int p_count(struct parse *p);
static void p_bracket(struct parse *p);
static void p_b_term(struct parse *p, cset *cs);
@@ -139,6 +169,7 @@ static char nuls[10]; /* place to point scanner in event of error */
#define MORE2() (p->next+1 < p->end)
#define SEE(c) (MORE() && PEEK() == (c))
#define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
+#define SEESPEC(a) (p->bre ? SEETWO('\\', a) : SEE(a))
#define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
#define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
#define NEXT() (p->next++)
@@ -247,6 +278,19 @@ regcomp(regex_t * __restrict preg,
p->pbegin[i] = 0;
p->pend[i] = 0;
}
+ if (cflags & REG_EXTENDED) {
+ p->allowbranch = true;
+ p->bre = false;
+ p->parse_expr = p_ere_exp;
+ p->pre_parse = NULL;
+ p->post_parse = NULL;
+ } else {
+ p->allowbranch = false;
+ p->bre = true;
+ p->parse_expr = p_simp_re;
+ p->pre_parse = p_bre_pre_parse;
+ p->post_parse = p_bre_post_parse;
+ }
g->sets = NULL;
g->ncsets = 0;
g->cflags = cflags;
@@ -264,12 +308,10 @@ regcomp(regex_t * __restrict preg,
/* do it */
EMIT(OEND, 0);
g->firststate = THERE();
- if (cflags&REG_EXTENDED)
- p_ere(p, OUT);
- else if (cflags&REG_NOSPEC)
+ if (cflags & REG_NOSPEC)
p_str(p);
else
- p_bre(p, OUT, OUT);
+ p_re(p, OUT, OUT);
EMIT(OEND, 0);
g->laststate = THERE();
@@ -305,56 +347,12 @@ regcomp(regex_t * __restrict preg,
}
/*
- - p_ere - ERE parser top level, concatenation and alternation
- == static void p_ere(struct parse *p, int_t stop);
- */
-static void
-p_ere(struct parse *p,
- int stop) /* character this ERE should end at */
-{
- char c;
- sopno prevback;
- sopno prevfwd;
- sopno conc;
- int first = 1; /* is this the first alternative? */
-
- for (;;) {
- /* do a bunch of concatenated expressions */
- conc = HERE();
- while (MORE() && (c = PEEK()) != '|' && c != stop)
- p_ere_exp(p);
- (void)REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
-
- if (!EAT('|'))
- break; /* NOTE BREAK OUT */
-
- if (first) {
- INSERT(OCH_, conc); /* offset is wrong */
- prevfwd = conc;
- prevback = conc;
- first = 0;
- }
- ASTERN(OOR1, prevback);
- prevback = THERE();
- AHEAD(prevfwd); /* fix previous offset */
- prevfwd = HERE();
- EMIT(OOR2, 0); /* offset is very wrong */
- }
-
- if (!first) { /* tail-end fixups */
- AHEAD(prevfwd);
- ASTERN(O_CH, prevback);
- }
-
- assert(!MORE() || SEE(stop));
-}
-
-/*
- - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
- == static void p_ere_exp(struct parse *p);
+ - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op,
+ - return whether we should terminate or not
+ == static bool p_ere_exp(struct parse *p);
*/
-static void
-p_ere_exp(struct parse *p)
+static bool
+p_ere_exp(struct parse *p, struct branchc *bc)
{
char c;
wint_t wc;
@@ -377,7 +375,7 @@ p_ere_exp(struct parse *p)
p->pbegin[subno] = HERE();
EMIT(OLPAREN, subno);
if (!SEE(')'))
- p_ere(p, ')');
+ p_re(p, ')', IGN);
if (subno < NPAREN) {
p->pend[subno] = HERE();
assert(p->pend[subno] != 0);
@@ -445,7 +443,7 @@ p_ere_exp(struct parse *p)
/* FALLTHROUGH */
default:
if (p->error != 0)
- return;
+ return (false);
p->next--;
wc = WGETNEXT();
ordinary(p, wc);
@@ -453,12 +451,12 @@ p_ere_exp(struct parse *p)
}
if (!MORE())
- return;
+ return (false);
c = PEEK();
/* we call { a repetition if followed by a digit */
if (!( c == '*' || c == '+' || c == '?' ||
(c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
- return; /* no repetition, we're done */
+ return (false); /* no repetition, we're done */
NEXT();
(void)REQUIRE(!wascaret, REG_BADRPT);
@@ -504,12 +502,13 @@ p_ere_exp(struct parse *p)
}
if (!MORE())
- return;
+ return (false);
c = PEEK();
if (!( c == '*' || c == '+' || c == '?' ||
(c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
- return;
+ return (false);
SETERROR(REG_BADRPT);
+ return (false);
}
/*
@@ -525,50 +524,185 @@ p_str(struct parse *p)
}
/*
- - p_bre - BRE parser top level, anchoring and concatenation
- == static void p_bre(struct parse *p, int end1, \
- == int end2);
- * Giving end1 as OUT essentially eliminates the end1/end2 check.
- *
- * This implementation is a bit of a kludge, in that a trailing $ is first
- * taken as an ordinary character and then revised to be an anchor.
- * The amount of lookahead needed to avoid this kludge is excessive.
+ * Eat consecutive branch delimiters for the kind of expression that we are
+ * parsing, return the number of delimiters that we ate.
+ */
+static int
+p_branch_eat_delim(struct parse *p, struct branchc *bc)
+{
+ int nskip;
+
+ nskip = 0;
+ while (EAT('|'))
+ ++nskip;
+ return (nskip);
+}
+
+/*
+ * Insert necessary branch book-keeping operations. This emits a
+ * bogus 'next' offset, since we still have more to parse
+ */
+static void
+p_branch_ins_offset(struct parse *p, struct branchc *bc)
+{
+
+ if (bc->nbranch == 0) {
+ INSERT(OCH_, bc->start); /* offset is wrong */
+ bc->fwd = bc->start;
+ bc->back = bc->start;
+ }
+
+ ASTERN(OOR1, bc->back);
+ bc->back = THERE();
+ AHEAD(bc->fwd); /* fix previous offset */
+ bc->fwd = HERE();
+ EMIT(OOR2, 0); /* offset is very wrong */
+ ++bc->nbranch;
+}
+
+/*
+ * Fix the offset of the tail branch, if we actually had any branches.
+ * This is to correct the bogus placeholder offset that we use.
+ */
+static void
+p_branch_fix_tail(struct parse *p, struct branchc *bc)
+{
+
+ /* Fix bogus offset at the tail if we actually have branches */
+ if (bc->nbranch > 0) {
+ AHEAD(bc->fwd);
+ ASTERN(O_CH, bc->back);
+ }
+}
+
+/*
+ * Signal to the parser that an empty branch has been encountered; this will,
+ * in the future, be used to allow for more permissive behavior with empty
+ * branches. The return value should indicate whether parsing may continue
+ * or not.
+ */
+static bool
+p_branch_empty(struct parse *p, struct branchc *bc)
+{
+
+ SETERROR(REG_EMPTY);
+ return (false);
+}
+
+/*
+ * Take care of any branching requirements. This includes inserting the
+ * appropriate branching instructions as well as eating all of the branch
+ * delimiters until we either run out of pattern or need to parse more pattern.
*/
+static bool
+p_branch_do(struct parse *p, struct branchc *bc)
+{
+ int ate = 0;
+
+ ate = p_branch_eat_delim(p, bc);
+ if (ate == 0)
+ return (false);
+ else if ((ate > 1 || (bc->outer && !MORE())) && !p_branch_empty(p, bc))
+ /*
+ * Halt parsing only if we have an empty branch and p_branch_empty
+ * indicates that we must not continue. In the future, this will not
+ * necessarily be an error.
+ */
+ return (false);
+ p_branch_ins_offset(p, bc);
+
+ return (true);
+}
+
static void
-p_bre(struct parse *p,
- int end1, /* first terminating character */
- int end2) /* second terminating character */
+p_bre_pre_parse(struct parse *p, struct branchc *bc)
{
- sopno start = HERE();
- int first = 1; /* first subexpression? */
- int wasdollar = 0;
+ (void) bc;
+ /*
+ * Does not move cleanly into expression parser because of
+ * ordinary interpration of * at the beginning position of
+ * an expression.
+ */
if (EAT('^')) {
EMIT(OBOL, 0);
p->g->iflags |= USEBOL;
p->g->nbol++;
}
- while (MORE() && !SEETWO(end1, end2)) {
- wasdollar = p_simp_re(p, first);
- first = 0;
- }
- if (wasdollar) { /* oops, that was a trailing anchor */
+}
+
+static void
+p_bre_post_parse(struct parse *p, struct branchc *bc)
+{
+
+ /* Expression is terminating due to EOL token */
+ if (bc->terminate) {
DROP(1);
EMIT(OEOL, 0);
p->g->iflags |= USEEOL;
p->g->neol++;
}
+}
+
+/*
+ - p_re - Top level parser, concatenation and BRE anchoring
+ == static void p_re(struct parse *p, int end1, int end2);
+ * Giving end1 as OUT essentially eliminates the end1/end2 check.
+ *
+ * This implementation is a bit of a kludge, in that a trailing $ is first
+ * taken as an ordinary character and then revised to be an anchor.
+ * The amount of lookahead needed to avoid this kludge is excessive.
+ */
+static void
+p_re(struct parse *p,
+ int end1, /* first terminating character */
+ int end2) /* second terminating character; ignored for EREs */
+{
+ struct branchc bc;
- (void)REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
+ bc.nbranch = 0;
+ if (end1 == OUT && end2 == OUT)
+ bc.outer = true;
+ else
+ bc.outer = false;
+#define SEEEND() (!p->bre ? SEE(end1) : SEETWO(end1, end2))
+ for (;;) {
+ bc.start = HERE();
+ bc.nchain = 0;
+ bc.terminate = false;
+ if (p->pre_parse != NULL)
+ p->pre_parse(p, &bc);
+ while (MORE() && (!p->allowbranch || !SEESPEC('|')) && !SEEEND()) {
+ bc.terminate = p->parse_expr(p, &bc);
+ ++bc.nchain;
+ }
+ if (p->post_parse != NULL)
+ p->post_parse(p, &bc);
+ (void) REQUIRE(HERE() != bc.start, REG_EMPTY);
+ if (!p->allowbranch)
+ break;
+ /*
+ * p_branch_do's return value indicates whether we should
+ * continue parsing or not. This is both for correctness and
+ * a slight optimization, because it will check if we've
+ * encountered an empty branch or the end of the string
+ * immediately following a branch delimiter.
+ */
+ if (!p_branch_do(p, &bc))
+ break;
+ }
+#undef SEE_END
+ if (p->allowbranch)
+ p_branch_fix_tail(p, &bc);
+ assert(!MORE() || SEE(end1));
}
/*
- p_simp_re - parse a simple RE, an atom possibly followed by a repetition
- == static int p_simp_re(struct parse *p, int starordinary);
+ == static bool p_simp_re(struct parse *p, struct branchc *bc);
*/
-static int /* was the simple RE an unbackslashed $? */
-p_simp_re(struct parse *p,
- int starordinary) /* is a leading * an ordinary character? */
+static bool /* was the simple RE an unbackslashed $? */
+p_simp_re(struct parse *p, struct branchc *bc)
{
int c;
int count;
@@ -614,7 +748,7 @@ p_simp_re(struct parse *p,
EMIT(OLPAREN, subno);
/* the MORE here is an error heuristic */
if (MORE() && !SEETWO('\\', ')'))
- p_bre(p, '\\', ')');
+ p_re(p, '\\', ')');
if (subno < NPAREN) {
p->pend[subno] = HERE();
assert(p->pend[subno] != 0);
@@ -650,11 +784,16 @@ p_simp_re(struct parse *p,
p->g->backrefs = 1;
break;
case '*':
- (void)REQUIRE(starordinary, REG_BADRPT);
+ /*
+ * Ordinary if used as the first character beyond BOL anchor of
+ * a (sub-)expression, counts as a bad repetition operator if it
+ * appears otherwise.
+ */
+ (void)REQUIRE(bc->nchain == 0, REG_BADRPT);
/* FALLTHROUGH */
default:
if (p->error != 0)
- return(0); /* Definitely not $... */
+ return (false); /* Definitely not $... */
p->next--;
wc = WGETNEXT();
ordinary(p, wc);
@@ -685,9 +824,9 @@ p_simp_re(struct parse *p,
SETERROR(REG_BADBR);
}
} else if (c == '$') /* $ (but not \$) ends it */
- return(1);
+ return (true);
- return(0);
+ return (false);
}
/*
diff --git a/lib/libc/regex/regex2.h b/lib/libc/regex/regex2.h
index 813bbd3..672c824 100644
--- a/lib/libc/regex/regex2.h
+++ b/lib/libc/regex/regex2.h
@@ -189,4 +189,5 @@ struct re_guts {
/* misc utilities */
#define OUT (CHAR_MIN - 1) /* a non-character value */
+#define IGN (CHAR_MIN - 2)
#define ISWORD(c) (iswalnum((uch)(c)) || (c) == '_')
OpenPOWER on IntegriCloud