diff options
Diffstat (limited to 'contrib/awk/re.c')
-rw-r--r-- | contrib/awk/re.c | 310 |
1 files changed, 310 insertions, 0 deletions
diff --git a/contrib/awk/re.c b/contrib/awk/re.c new file mode 100644 index 0000000..995fbb9 --- /dev/null +++ b/contrib/awk/re.c @@ -0,0 +1,310 @@ +/* + * re.c - compile regular expressions. + */ + +/* + * Copyright (C) 1991-1996 the Free Software Foundation, Inc. + * + * This file is part of GAWK, the GNU implementation of the + * AWK Programming Language. + * + * GAWK is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * GAWK is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + */ + +#include "awk.h" + +static reg_syntax_t syn; + +/* make_regexp --- generate compiled regular expressions */ + +Regexp * +make_regexp(s, len, ignorecase, dfa) +char *s; +size_t len; +int ignorecase; +int dfa; +{ + Regexp *rp; + const char *rerr; + char *src = s; + char *temp; + char *end = s + len; + register char *dest; + register int c, c2; + + /* Handle escaped characters first. */ + + /* + * Build a copy of the string (in dest) with the + * escaped characters translated, and generate the regex + * from that. + */ + emalloc(dest, char *, len + 2, "make_regexp"); + temp = dest; + + while (src < end) { + if (*src == '\\') { + c = *++src; + switch (c) { + case 'a': + case 'b': + case 'f': + case 'n': + case 'r': + case 't': + case 'v': + case 'x': + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + c2 = parse_escape(&src); + if (c2 < 0) + cant_happen(); + /* + * Unix awk treats octal (and hex?) chars + * literally in re's, so escape regexp + * metacharacters. + */ + if (do_traditional && ! do_posix && (isdigit(c) || c == 'x') + && strchr("()|*+?.^$\\[]", c2) != NULL) + *dest++ = '\\'; + *dest++ = (char) c2; + break; + case '8': + case '9': /* a\9b not valid */ + *dest++ = c; + src++; + break; + case 'y': /* normally \b */ + /* gnu regex op */ + if (! do_traditional) { + *dest++ = '\\'; + *dest++ = 'b'; + src++; + break; + } + /* else, fall through */ + default: + *dest++ = '\\'; + *dest++ = (char) c; + src++; + break; + } /* switch */ + } else + *dest++ = *src++; /* not '\\' */ + } /* for */ + + *dest = '\0' ; /* Only necessary if we print dest ? */ + emalloc(rp, Regexp *, sizeof(*rp), "make_regexp"); + memset((char *) rp, 0, sizeof(*rp)); + rp->pat.allocated = 0; /* regex will allocate the buffer */ + emalloc(rp->pat.fastmap, char *, 256, "make_regexp"); + + if (ignorecase) + rp->pat.translate = casetable; + else + rp->pat.translate = NULL; + len = dest - temp; + if ((rerr = re_compile_pattern(temp, len, &(rp->pat))) != NULL) + fatal("%s: /%s/", rerr, temp); + + /* gack. this must be done *after* re_compile_pattern */ + rp->pat.newline_anchor = FALSE; /* don't get \n in middle of string */ + if (dfa && ! ignorecase) { + dfacomp(temp, len, &(rp->dfareg), TRUE); + rp->dfa = TRUE; + } else + rp->dfa = FALSE; + + free(temp); + return rp; +} + +/* research --- do a regexp search. use dfa if possible */ + +int +research(rp, str, start, len, need_start) +Regexp *rp; +register char *str; +int start; +register size_t len; +int need_start; +{ + char *ret = str; + int try_backref; + + /* + * Always do dfa search if can; if it fails, then even if + * need_start is true, we won't bother with the regex search. + */ + if (rp->dfa) { + char save; + int count = 0; + + /* + * dfa likes to stick a '\n' right after the matched + * text. So we just save and restore the character. + */ + save = str[start+len]; + ret = dfaexec(&(rp->dfareg), str+start, str+start+len, TRUE, + &count, &try_backref); + str[start+len] = save; + } + if (ret) { + if (need_start || rp->dfa == FALSE || try_backref) { + int result = re_search(&(rp->pat), str, start+len, + start, len, &(rp->regs)); + /* recover any space from C based alloca */ +#ifdef C_ALLOCA + (void) alloca(0); +#endif + return result; + } else + return 1; + } else + return -1; +} + +/* refree --- free up the dynamic memory used by a compiled regexp */ + +void +refree(rp) +Regexp *rp; +{ + free(rp->pat.buffer); + free(rp->pat.fastmap); + if (rp->regs.start) + free(rp->regs.start); + if (rp->regs.end) + free(rp->regs.end); + if (rp->dfa) + dfafree(&(rp->dfareg)); + free(rp); +} + +/* dfaerror --- print an error message for the dfa routines */ + +void +dfaerror(s) +const char *s; +{ + fatal("%s", s); +} + +/* re_update --- recompile a dynamic regexp */ + +Regexp * +re_update(t) +NODE *t; +{ + NODE *t1; + +/* # define CASE 1 */ + if ((t->re_flags & CASE) == IGNORECASE) { + if ((t->re_flags & CONST) != 0) + return t->re_reg; + t1 = force_string(tree_eval(t->re_exp)); + if (t->re_text != NULL) { + if (cmp_nodes(t->re_text, t1) == 0) { + free_temp(t1); + return t->re_reg; + } + unref(t->re_text); + } + t->re_text = dupnode(t1); + free_temp(t1); + } + if (t->re_reg != NULL) + refree(t->re_reg); + if (t->re_cnt > 0) + t->re_cnt++; + if (t->re_cnt > 10) + t->re_cnt = 0; + if (t->re_text == NULL) { + t1 = force_string(tree_eval(t->re_exp)); + t->re_text = dupnode(t1); + free_temp(t1); + } + t->re_reg = make_regexp(t->re_text->stptr, t->re_text->stlen, + IGNORECASE, t->re_cnt); + t->re_flags &= ~CASE; + t->re_flags |= IGNORECASE; + return t->re_reg; +} + +/* resetup --- choose what kind of regexps we match */ + +void +resetup() +{ + if (do_posix) + syn = RE_SYNTAX_POSIX_AWK; /* strict POSIX re's */ + else if (do_traditional) + syn = RE_SYNTAX_AWK; /* traditional Unix awk re's */ + else + syn = RE_SYNTAX_GNU_AWK; /* POSIX re's + GNU ops */ + + /* + * Interval expressions are off by default, since it's likely to + * break too many old programs to have them on. + */ + if (do_intervals) + syn |= RE_INTERVALS; + + (void) re_set_syntax(syn); + dfasyntax(syn, FALSE); +} + +/* avoid_dfa --- FIXME: temporary kludge function until we have a new dfa.c */ + +int +avoid_dfa(re, str, len) +NODE *re; +char *str; +size_t len; +{ + char *restr; + int relen; + int anchor, i; + char *end; + + if ((re->re_flags & CONST) != 0) { + restr = re->re_exp->stptr; + relen = re->re_exp->stlen; + } else { + restr = re->re_text->stptr; + relen = re->re_text->stlen; + } + + for (anchor = FALSE, i = 0; i < relen; i++) { + if (restr[i] == '^' || restr[i] == '$') { + anchor = TRUE; + break; + } + } + if (! anchor) + return FALSE; + + for (end = str + len; str < end; str++) + if (*str == '\n') + return TRUE; + + return FALSE; +} |