summaryrefslogtreecommitdiffstats
path: root/gnu/usr.bin/grep/dfa.h
diff options
context:
space:
mode:
authornate <nate@FreeBSD.org>1993-06-29 06:04:45 +0000
committernate <nate@FreeBSD.org>1993-06-29 06:04:45 +0000
commitecf0e2f6a5309f514ea6ded12ffbc7d23044d191 (patch)
tree464aaed7bcd6c9e54cc018bb4fd37093c1980e2f /gnu/usr.bin/grep/dfa.h
downloadFreeBSD-src-ecf0e2f6a5309f514ea6ded12ffbc7d23044d191.zip
FreeBSD-src-ecf0e2f6a5309f514ea6ded12ffbc7d23044d191.tar.gz
Gnu e?grep 1.6
Diffstat (limited to 'gnu/usr.bin/grep/dfa.h')
-rw-r--r--gnu/usr.bin/grep/dfa.h450
1 files changed, 450 insertions, 0 deletions
diff --git a/gnu/usr.bin/grep/dfa.h b/gnu/usr.bin/grep/dfa.h
new file mode 100644
index 0000000..33e4bf1
--- /dev/null
+++ b/gnu/usr.bin/grep/dfa.h
@@ -0,0 +1,450 @@
+/* dfa.h - declarations for GNU deterministic regexp compiler
+ Copyright (C) 1988 Free Software Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
+
+/* Written June, 1988 by Mike Haertel */
+
+#ifdef STDC_HEADERS
+
+#include <stddef.h>
+#include <stdlib.h>
+
+#else /* !STDC_HEADERS */
+
+#define const
+#include <sys/types.h> /* For size_t. */
+extern char *calloc(), *malloc(), *realloc();
+extern void free();
+
+#ifndef NULL
+#define NULL 0
+#endif
+
+#endif /* ! STDC_HEADERS */
+
+#include <ctype.h>
+#ifndef isascii
+#define ISALNUM(c) isalnum(c)
+#define ISALPHA(c) isalpha(c)
+#define ISUPPER(c) isupper(c)
+#define ISLOWER(c) islower(c)
+#else
+#define ISALNUM(c) (isascii(c) && isalnum(c))
+#define ISALPHA(c) (isascii(c) && isalpha(c))
+#define ISUPPER(c) (isascii(c) && isupper(c))
+#define ISLOWER(c) (isascii(c) && islower(c))
+#endif
+
+/* 1 means plain parentheses serve as grouping, and backslash
+ parentheses are needed for literal searching.
+ 0 means backslash-parentheses are grouping, and plain parentheses
+ are for literal searching. */
+#define RE_NO_BK_PARENS 1
+
+/* 1 means plain | serves as the "or"-operator, and \| is a literal.
+ 0 means \| serves as the "or"-operator, and | is a literal. */
+#define RE_NO_BK_VBAR 2
+
+/* 0 means plain + or ? serves as an operator, and \+, \? are literals.
+ 1 means \+, \? are operators and plain +, ? are literals. */
+#define RE_BK_PLUS_QM 4
+
+/* 1 means | binds tighter than ^ or $.
+ 0 means the contrary. */
+#define RE_TIGHT_VBAR 8
+
+/* 1 means treat \n as an _OR operator
+ 0 means treat it as a normal character */
+#define RE_NEWLINE_OR 16
+
+/* 0 means that a special characters (such as *, ^, and $) always have
+ their special meaning regardless of the surrounding context.
+ 1 means that special characters may act as normal characters in some
+ contexts. Specifically, this applies to:
+ ^ - only special at the beginning, or after ( or |
+ $ - only special at the end, or before ) or |
+ *, +, ? - only special when not after the beginning, (, or | */
+#define RE_CONTEXT_INDEP_OPS 32
+
+/* Now define combinations of bits for the standard possibilities. */
+#define RE_SYNTAX_AWK (RE_NO_BK_PARENS | RE_NO_BK_VBAR | RE_CONTEXT_INDEP_OPS)
+#define RE_SYNTAX_EGREP (RE_SYNTAX_AWK | RE_NEWLINE_OR)
+#define RE_SYNTAX_GREP (RE_BK_PLUS_QM | RE_NEWLINE_OR)
+#define RE_SYNTAX_EMACS 0
+
+/* Number of bits in an unsigned char. */
+#define CHARBITS 8
+
+/* First integer value that is greater than any character code. */
+#define _NOTCHAR (1 << CHARBITS)
+
+/* INTBITS need not be exact, just a lower bound. */
+#define INTBITS (CHARBITS * sizeof (int))
+
+/* Number of ints required to hold a bit for every character. */
+#define _CHARSET_INTS ((_NOTCHAR + INTBITS - 1) / INTBITS)
+
+/* Sets of unsigned characters are stored as bit vectors in arrays of ints. */
+typedef int _charset[_CHARSET_INTS];
+
+/* The regexp is parsed into an array of tokens in postfix form. Some tokens
+ are operators and others are terminal symbols. Most (but not all) of these
+ codes are returned by the lexical analyzer. */
+#if __STDC__
+
+typedef enum
+{
+ _END = -1, /* _END is a terminal symbol that matches the
+ end of input; any value of _END or less in
+ the parse tree is such a symbol. Accepting
+ states of the DFA are those that would have
+ a transition on _END. */
+
+ /* Ordinary character values are terminal symbols that match themselves. */
+
+ _EMPTY = _NOTCHAR, /* _EMPTY is a terminal symbol that matches
+ the empty string. */
+
+ _BACKREF, /* _BACKREF is generated by \<digit>; it
+ it not completely handled. If the scanner
+ detects a transition on backref, it returns
+ a kind of "semi-success" indicating that
+ the match will have to be verified with
+ a backtracking matcher. */
+
+ _BEGLINE, /* _BEGLINE is a terminal symbol that matches
+ the empty string if it is at the beginning
+ of a line. */
+
+ _ALLBEGLINE, /* _ALLBEGLINE is a terminal symbol that
+ matches the empty string if it is at the
+ beginning of a line; _ALLBEGLINE applies
+ to the entire regexp and can only occur
+ as the first token thereof. _ALLBEGLINE
+ never appears in the parse tree; a _BEGLINE
+ is prepended with _CAT to the entire
+ regexp instead. */
+
+ _ENDLINE, /* _ENDLINE is a terminal symbol that matches
+ the empty string if it is at the end of
+ a line. */
+
+ _ALLENDLINE, /* _ALLENDLINE is to _ENDLINE as _ALLBEGLINE
+ is to _BEGLINE. */
+
+ _BEGWORD, /* _BEGWORD is a terminal symbol that matches
+ the empty string if it is at the beginning
+ of a word. */
+
+ _ENDWORD, /* _ENDWORD is a terminal symbol that matches
+ the empty string if it is at the end of
+ a word. */
+
+ _LIMWORD, /* _LIMWORD is a terminal symbol that matches
+ the empty string if it is at the beginning
+ or the end of a word. */
+
+ _NOTLIMWORD, /* _NOTLIMWORD is a terminal symbol that
+ matches the empty string if it is not at
+ the beginning or end of a word. */
+
+ _QMARK, /* _QMARK is an operator of one argument that
+ matches zero or one occurences of its
+ argument. */
+
+ _STAR, /* _STAR is an operator of one argument that
+ matches the Kleene closure (zero or more
+ occurrences) of its argument. */
+
+ _PLUS, /* _PLUS is an operator of one argument that
+ matches the positive closure (one or more
+ occurrences) of its argument. */
+
+ _CAT, /* _CAT is an operator of two arguments that
+ matches the concatenation of its
+ arguments. _CAT is never returned by the
+ lexical analyzer. */
+
+ _OR, /* _OR is an operator of two arguments that
+ matches either of its arguments. */
+
+ _LPAREN, /* _LPAREN never appears in the parse tree,
+ it is only a lexeme. */
+
+ _RPAREN, /* _RPAREN never appears in the parse tree. */
+
+ _SET /* _SET and (and any value greater) is a
+ terminal symbol that matches any of a
+ class of characters. */
+} _token;
+
+#else /* ! __STDC__ */
+
+typedef short _token;
+
+#define _END -1
+#define _EMPTY _NOTCHAR
+#define _BACKREF (_EMPTY + 1)
+#define _BEGLINE (_EMPTY + 2)
+#define _ALLBEGLINE (_EMPTY + 3)
+#define _ENDLINE (_EMPTY + 4)
+#define _ALLENDLINE (_EMPTY + 5)
+#define _BEGWORD (_EMPTY + 6)
+#define _ENDWORD (_EMPTY + 7)
+#define _LIMWORD (_EMPTY + 8)
+#define _NOTLIMWORD (_EMPTY + 9)
+#define _QMARK (_EMPTY + 10)
+#define _STAR (_EMPTY + 11)
+#define _PLUS (_EMPTY + 12)
+#define _CAT (_EMPTY + 13)
+#define _OR (_EMPTY + 14)
+#define _LPAREN (_EMPTY + 15)
+#define _RPAREN (_EMPTY + 16)
+#define _SET (_EMPTY + 17)
+
+#endif /* ! __STDC__ */
+
+/* Sets are stored in an array in the compiled regexp; the index of the
+ array corresponding to a given set token is given by _SET_INDEX(t). */
+#define _SET_INDEX(t) ((t) - _SET)
+
+/* Sometimes characters can only be matched depending on the surrounding
+ context. Such context decisions depend on what the previous character
+ was, and the value of the current (lookahead) character. Context
+ dependent constraints are encoded as 8 bit integers. Each bit that
+ is set indicates that the constraint succeeds in the corresponding
+ context.
+
+ bit 7 - previous and current are newlines
+ bit 6 - previous was newline, current isn't
+ bit 5 - previous wasn't newline, current is
+ bit 4 - neither previous nor current is a newline
+ bit 3 - previous and current are word-constituents
+ bit 2 - previous was word-constituent, current isn't
+ bit 1 - previous wasn't word-constituent, current is
+ bit 0 - neither previous nor current is word-constituent
+
+ Word-constituent characters are those that satisfy isalnum().
+
+ The macro _SUCCEEDS_IN_CONTEXT determines whether a a given constraint
+ succeeds in a particular context. Prevn is true if the previous character
+ was a newline, currn is true if the lookahead character is a newline.
+ Prevl and currl similarly depend upon whether the previous and current
+ characters are word-constituent letters. */
+#define _MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn) \
+ ((constraint) & 1 << ((prevn) ? 2 : 0) + ((currn) ? 1 : 0) + 4)
+#define _MATCHES_LETTER_CONTEXT(constraint, prevl, currl) \
+ ((constraint) & 1 << ((prevl) ? 2 : 0) + ((currl) ? 1 : 0))
+#define _SUCCEEDS_IN_CONTEXT(constraint, prevn, currn, prevl, currl) \
+ (_MATCHES_NEWLINE_CONTEXT(constraint, prevn, currn) \
+ && _MATCHES_LETTER_CONTEXT(constraint, prevl, currl))
+
+/* The following macros give information about what a constraint depends on. */
+#define _PREV_NEWLINE_DEPENDENT(constraint) \
+ (((constraint) & 0xc0) >> 2 != ((constraint) & 0x30))
+#define _PREV_LETTER_DEPENDENT(constraint) \
+ (((constraint) & 0x0c) >> 2 != ((constraint) & 0x03))
+
+/* Tokens that match the empty string subject to some constraint actually
+ work by applying that constraint to determine what may follow them,
+ taking into account what has gone before. The following values are
+ the constraints corresponding to the special tokens previously defined. */
+#define _NO_CONSTRAINT 0xff
+#define _BEGLINE_CONSTRAINT 0xcf
+#define _ENDLINE_CONSTRAINT 0xaf
+#define _BEGWORD_CONSTRAINT 0xf2
+#define _ENDWORD_CONSTRAINT 0xf4
+#define _LIMWORD_CONSTRAINT 0xf6
+#define _NOTLIMWORD_CONSTRAINT 0xf9
+
+/* States of the recognizer correspond to sets of positions in the parse
+ tree, together with the constraints under which they may be matched.
+ So a position is encoded as an index into the parse tree together with
+ a constraint. */
+typedef struct
+{
+ unsigned index; /* Index into the parse array. */
+ unsigned constraint; /* Constraint for matching this position. */
+} _position;
+
+/* Sets of positions are stored as arrays. */
+typedef struct
+{
+ _position *elems; /* Elements of this position set. */
+ int nelem; /* Number of elements in this set. */
+} _position_set;
+
+/* A state of the regexp consists of a set of positions, some flags,
+ and the token value of the lowest-numbered position of the state that
+ contains an _END token. */
+typedef struct
+{
+ int hash; /* Hash of the positions of this state. */
+ _position_set elems; /* Positions this state could match. */
+ char newline; /* True if previous state matched newline. */
+ char letter; /* True if previous state matched a letter. */
+ char backref; /* True if this state matches a \<digit>. */
+ unsigned char constraint; /* Constraint for this state to accept. */
+ int first_end; /* Token value of the first _END in elems. */
+} _dfa_state;
+
+/* If an r.e. is at most MUST_MAX characters long, we look for a string which
+ must appear in it; whatever's found is dropped into the struct reg. */
+
+#define MUST_MAX 50
+
+/* A compiled regular expression. */
+struct regexp
+{
+ /* Stuff built by the scanner. */
+ _charset *charsets; /* Array of character sets for _SET tokens. */
+ int cindex; /* Index for adding new charsets. */
+ int calloc; /* Number of charsets currently allocated. */
+
+ /* Stuff built by the parser. */
+ _token *tokens; /* Postfix parse array. */
+ int tindex; /* Index for adding new tokens. */
+ int talloc; /* Number of tokens currently allocated. */
+ int depth; /* Depth required of an evaluation stack
+ used for depth-first traversal of the
+ parse tree. */
+ int nleaves; /* Number of leaves on the parse tree. */
+ int nregexps; /* Count of parallel regexps being built
+ with regparse(). */
+
+ /* Stuff owned by the state builder. */
+ _dfa_state *states; /* States of the regexp. */
+ int sindex; /* Index for adding new states. */
+ int salloc; /* Number of states currently allocated. */
+
+ /* Stuff built by the structure analyzer. */
+ _position_set *follows; /* Array of follow sets, indexed by position
+ index. The follow of a position is the set
+ of positions containing characters that
+ could conceivably follow a character
+ matching the given position in a string
+ matching the regexp. Allocated to the
+ maximum possible position index. */
+ int searchflag; /* True if we are supposed to build a searching
+ as opposed to an exact matcher. A searching
+ matcher finds the first and shortest string
+ matching a regexp anywhere in the buffer,
+ whereas an exact matcher finds the longest
+ string matching, but anchored to the
+ beginning of the buffer. */
+
+ /* Stuff owned by the executor. */
+ int tralloc; /* Number of transition tables that have
+ slots so far. */
+ int trcount; /* Number of transition tables that have
+ actually been built. */
+ int **trans; /* Transition tables for states that can
+ never accept. If the transitions for a
+ state have not yet been computed, or the
+ state could possibly accept, its entry in
+ this table is NULL. */
+ int **realtrans; /* Trans always points to realtrans + 1; this
+ is so trans[-1] can contain NULL. */
+ int **fails; /* Transition tables after failing to accept
+ on a state that potentially could do so. */
+ int *success; /* Table of acceptance conditions used in
+ regexecute and computed in build_state. */
+ int *newlines; /* Transitions on newlines. The entry for a
+ newline in any transition table is always
+ -1 so we can count lines without wasting
+ too many cycles. The transition for a
+ newline is stored separately and handled
+ as a special case. Newline is also used
+ as a sentinel at the end of the buffer. */
+ char must[MUST_MAX];
+ int mustn;
+};
+
+/* Some macros for user access to regexp internals. */
+
+/* ACCEPTING returns true if s could possibly be an accepting state of r. */
+#define ACCEPTING(s, r) ((r).states[s].constraint)
+
+/* ACCEPTS_IN_CONTEXT returns true if the given state accepts in the
+ specified context. */
+#define ACCEPTS_IN_CONTEXT(prevn, currn, prevl, currl, state, reg) \
+ _SUCCEEDS_IN_CONTEXT((reg).states[state].constraint, \
+ prevn, currn, prevl, currl)
+
+/* FIRST_MATCHING_REGEXP returns the index number of the first of parallel
+ regexps that a given state could accept. Parallel regexps are numbered
+ starting at 1. */
+#define FIRST_MATCHING_REGEXP(state, reg) (-(reg).states[state].first_end)
+
+/* Entry points. */
+
+#if __STDC__
+
+/* Regsyntax() takes two arguments; the first sets the syntax bits described
+ earlier in this file, and the second sets the case-folding flag. */
+extern void regsyntax(int, int);
+
+/* Compile the given string of the given length into the given struct regexp.
+ Final argument is a flag specifying whether to build a searching or an
+ exact matcher. */
+extern void regcompile(const char *, size_t, struct regexp *, int);
+
+/* Execute the given struct regexp on the buffer of characters. The
+ first char * points to the beginning, and the second points to the
+ first character after the end of the buffer, which must be a writable
+ place so a sentinel end-of-buffer marker can be stored there. The
+ second-to-last argument is a flag telling whether to allow newlines to
+ be part of a string matching the regexp. The next-to-last argument,
+ if non-NULL, points to a place to increment every time we see a
+ newline. The final argument, if non-NULL, points to a flag that will
+ be set if further examination by a backtracking matcher is needed in
+ order to verify backreferencing; otherwise the flag will be cleared.
+ Returns NULL if no match is found, or a pointer to the first
+ character after the first & shortest matching string in the buffer. */
+extern char *regexecute(struct regexp *, char *, char *, int, int *, int *);
+
+/* Free the storage held by the components of a struct regexp. */
+extern void regfree(struct regexp *);
+
+/* Entry points for people who know what they're doing. */
+
+/* Initialize the components of a struct regexp. */
+extern void reginit(struct regexp *);
+
+/* Incrementally parse a string of given length into a struct regexp. */
+extern void regparse(const char *, size_t, struct regexp *);
+
+/* Analyze a parsed regexp; second argument tells whether to build a searching
+ or an exact matcher. */
+extern void reganalyze(struct regexp *, int);
+
+/* Compute, for each possible character, the transitions out of a given
+ state, storing them in an array of integers. */
+extern void regstate(int, struct regexp *, int []);
+
+/* Error handling. */
+
+/* Regerror() is called by the regexp routines whenever an error occurs. It
+ takes a single argument, a NUL-terminated string describing the error.
+ The default regerror() prints the error message to stderr and exits.
+ The user can provide a different regfree() if so desired. */
+extern void regerror(const char *);
+
+#else /* ! __STDC__ */
+extern void regsyntax(), regcompile(), regfree(), reginit(), regparse();
+extern void reganalyze(), regstate(), regerror();
+extern char *regexecute();
+#endif /* ! __STDC__ */
OpenPOWER on IntegriCloud