diff options
author | peter <peter@FreeBSD.org> | 1999-11-20 09:40:28 +0000 |
---|---|---|
committer | peter <peter@FreeBSD.org> | 1999-11-20 09:40:28 +0000 |
commit | 5354776cb2ed414ffde1c1fe64dec67336140a07 (patch) | |
tree | d0c72a4c9036fad6c80fbeabcec870b89ced46dd /gnu/usr.bin/grep/search.c | |
parent | b17c96564d10656e0e30a9637f98871157255e28 (diff) | |
download | FreeBSD-src-5354776cb2ed414ffde1c1fe64dec67336140a07.zip FreeBSD-src-5354776cb2ed414ffde1c1fe64dec67336140a07.tar.gz |
Back out the botched attempt to update to gnu grep 2.3 (lots of history
was lost). Restore original version to try and avoid breaking the build
while David O'brien does a proper set of imports and merges.
Requested by: obrien
Diffstat (limited to 'gnu/usr.bin/grep/search.c')
-rw-r--r-- | gnu/usr.bin/grep/search.c | 481 |
1 files changed, 481 insertions, 0 deletions
diff --git a/gnu/usr.bin/grep/search.c b/gnu/usr.bin/grep/search.c new file mode 100644 index 0000000..f0e3d5c --- /dev/null +++ b/gnu/usr.bin/grep/search.c @@ -0,0 +1,481 @@ +/* search.c - searching subroutines using dfa, kwset and regex for grep. + Copyright (C) 1992 Free Software Foundation, Inc. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + Written August 1992 by Mike Haertel. */ + +#include <ctype.h> + +#ifdef STDC_HEADERS +#include <limits.h> +#include <stdlib.h> +#else +#define UCHAR_MAX 255 +#include <sys/types.h> +extern char *malloc(); +#endif + +#ifdef HAVE_MEMCHR +#include <string.h> +#ifdef NEED_MEMORY_H +#include <memory.h> +#endif +#else +#ifdef __STDC__ +extern void *memchr(); +#else +extern char *memchr(); +#endif +#endif + +#if defined(HAVE_STRING_H) || defined(STDC_HEADERS) +#undef bcopy +#define bcopy(s, d, n) memcpy((d), (s), (n)) +#endif + +#if defined(isascii) && !defined(__FreeBSD__) +#define ISALNUM(C) (isascii(C) && isalnum(C)) +#define ISUPPER(C) (isascii(C) && isupper(C)) +#else +#define ISALNUM(C) isalnum((unsigned char)C) +#define ISUPPER(C) isupper((unsigned char)C) +#endif + +#define TOLOWER(C) (ISUPPER(C) ? tolower((unsigned char)C) : (C)) + +#include "grep.h" +#include "dfa.h" +#include "kwset.h" +#include "gnuregex.h" + +#define NCHAR (UCHAR_MAX + 1) + +#if __STDC__ +static void Gcompile(char *, size_t); +static void Ecompile(char *, size_t); +static char *EGexecute(char *, size_t, char **); +static void Fcompile(char *, size_t); +static char *Fexecute(char *, size_t, char **); +#else +static void Gcompile(); +static void Ecompile(); +static char *EGexecute(); +static void Fcompile(); +static char *Fexecute(); +#endif + +/* Here is the matchers vector for the main program. */ +struct matcher matchers[] = { + { "default", Gcompile, EGexecute }, + { "grep", Gcompile, EGexecute }, + { "ggrep", Gcompile, EGexecute }, + { "egrep", Ecompile, EGexecute }, + { "posix-egrep", Ecompile, EGexecute }, + { "gegrep", Ecompile, EGexecute }, + { "fgrep", Fcompile, Fexecute }, + { "gfgrep", Fcompile, Fexecute }, + { 0, 0, 0 }, +}; + +/* For -w, we also consider _ to be word constituent. */ +#define WCHAR(C) (ISALNUM(C) || (C) == '_') + +/* DFA compiled regexp. */ +static struct dfa dfa; + +/* Regex compiled regexp. */ +static struct re_pattern_buffer regex; + +/* KWset compiled pattern. For Ecompile and Gcompile, we compile + a list of strings, at least one of which is known to occur in + any string matching the regexp. */ +static kwset_t kwset; + +/* Last compiled fixed string known to exactly match the regexp. + If kwsexec() returns < lastexact, then we don't need to + call the regexp matcher at all. */ +static int lastexact; + +void +dfaerror(mesg) + char *mesg; +{ + fatal(mesg, 0); +} + +static void +kwsinit() +{ + static char trans[NCHAR]; + int i; + + if (match_icase) + for (i = 0; i < NCHAR; ++i) + trans[i] = TOLOWER(i); + + if (!(kwset = kwsalloc(match_icase ? trans : (char *) 0))) + fatal("memory exhausted", 0); +} + +/* If the DFA turns out to have some set of fixed strings one of + which must occur in the match, then we build a kwset matcher + to find those strings, and thus quickly filter out impossible + matches. */ +static void +kwsmusts() +{ + struct dfamust *dm; + char *err; + + if (dfa.musts) + { + kwsinit(); + /* First, we compile in the substrings known to be exact + matches. The kwset matcher will return the index + of the matching string that it chooses. */ + for (dm = dfa.musts; dm; dm = dm->next) + { + if (!dm->exact) + continue; + ++lastexact; + if ((err = kwsincr(kwset, dm->must, strlen(dm->must))) != 0) + fatal(err, 0); + } + /* Now, we compile the substrings that will require + the use of the regexp matcher. */ + for (dm = dfa.musts; dm; dm = dm->next) + { + if (dm->exact) + continue; + if ((err = kwsincr(kwset, dm->must, strlen(dm->must))) != 0) + fatal(err, 0); + } + if ((err = kwsprep(kwset)) != 0) + fatal(err, 0); + } +} + +static void +Gcompile(pattern, size) + char *pattern; + size_t size; +{ +#ifdef __STDC__ + const +#endif + char *err; + + re_set_syntax(RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE); + dfasyntax(RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase); + + if ((err = re_compile_pattern(pattern, size, ®ex)) != 0) + fatal(err, 0); + + dfainit(&dfa); + + /* In the match_words and match_lines cases, we use a different pattern + for the DFA matcher that will quickly throw out cases that won't work. + Then if DFA succeeds we do some hairy stuff using the regex matcher + to decide whether the match should really count. */ + if (match_words || match_lines) + { + /* In the whole-word case, we use the pattern: + (^|[^A-Za-z_])(userpattern)([^A-Za-z_]|$). + In the whole-line case, we use the pattern: + ^(userpattern)$. + BUG: Using [A-Za-z_] is locale-dependent! */ + + char *n = malloc(size + 50); + int i = 0; + + strcpy(n, ""); + + if (match_lines) + strcpy(n, "^\\("); + if (match_words) + strcpy(n, "\\(^\\|[^0-9A-Za-z_]\\)\\("); + + i = strlen(n); + bcopy(pattern, n + i, size); + i += size; + + if (match_words) + strcpy(n + i, "\\)\\([^0-9A-Za-z_]\\|$\\)"); + if (match_lines) + strcpy(n + i, "\\)$"); + + i += strlen(n + i); + dfacomp(n, i, &dfa, 1); + } + else + dfacomp(pattern, size, &dfa, 1); + + kwsmusts(); +} + +static void +Ecompile(pattern, size) + char *pattern; + size_t size; +{ +#ifdef __STDC__ + const +#endif + char *err; + + if (strcmp(matcher, "posix-egrep") == 0) + { + re_set_syntax(RE_SYNTAX_POSIX_EGREP); + dfasyntax(RE_SYNTAX_POSIX_EGREP, match_icase); + } + else + { + re_set_syntax(RE_SYNTAX_EGREP); + dfasyntax(RE_SYNTAX_EGREP, match_icase); + } + + if ((err = re_compile_pattern(pattern, size, ®ex)) != 0) + fatal(err, 0); + + dfainit(&dfa); + + /* In the match_words and match_lines cases, we use a different pattern + for the DFA matcher that will quickly throw out cases that won't work. + Then if DFA succeeds we do some hairy stuff using the regex matcher + to decide whether the match should really count. */ + if (match_words || match_lines) + { + /* In the whole-word case, we use the pattern: + (^|[^A-Za-z_])(userpattern)([^A-Za-z_]|$). + In the whole-line case, we use the pattern: + ^(userpattern)$. + BUG: Using [A-Za-z_] is locale-dependent! */ + + char *n = malloc(size + 50); + int i = 0; + + strcpy(n, ""); + + if (match_lines) + strcpy(n, "^("); + if (match_words) + strcpy(n, "(^|[^0-9A-Za-z_])("); + + i = strlen(n); + bcopy(pattern, n + i, size); + i += size; + + if (match_words) + strcpy(n + i, ")([^0-9A-Za-z_]|$)"); + if (match_lines) + strcpy(n + i, ")$"); + + i += strlen(n + i); + dfacomp(n, i, &dfa, 1); + } + else + dfacomp(pattern, size, &dfa, 1); + + kwsmusts(); +} + +static char * +EGexecute(buf, size, endp) + char *buf; + size_t size; + char **endp; +{ + register char *buflim, *beg, *end, save; + int backref, start, len; + struct kwsmatch kwsm; + static struct re_registers regs; /* This is static on account of a BRAIN-DEAD + Q@#%!# library interface in regex.c. */ + + buflim = buf + size; + + for (beg = end = buf; end < buflim; beg = end + 1) + { + if (kwset) + { + /* Find a possible match using the KWset matcher. */ + beg = kwsexec(kwset, beg, buflim - beg, &kwsm); + if (!beg) + goto failure; + /* Narrow down to the line containing the candidate, and + run it through DFA. */ + end = memchr(beg, '\n', buflim - beg); + if (!end) + end = buflim; + while (beg > buf && beg[-1] != '\n') + --beg; + save = *end; + if (kwsm.index < lastexact) + goto success; + if (!dfaexec(&dfa, beg, end, 0, (int *) 0, &backref)) + { + *end = save; + continue; + } + *end = save; + /* Successful, no backreferences encountered. */ + if (!backref) + goto success; + } + else + { + /* No good fixed strings; start with DFA. */ + save = *buflim; + beg = dfaexec(&dfa, beg, buflim, 0, (int *) 0, &backref); + *buflim = save; + if (!beg) + goto failure; + /* Narrow down to the line we've found. */ + end = memchr(beg, '\n', buflim - beg); + if (!end) + end = buflim; + while (beg > buf && beg[-1] != '\n') + --beg; + /* Successful, no backreferences encountered! */ + if (!backref) + goto success; + } + /* If we've made it to this point, this means DFA has seen + a probable match, and we need to run it through Regex. */ + regex.not_eol = 0; + if ((start = re_search(®ex, beg, end - beg, 0, end - beg, ®s)) >= 0) + { + len = regs.end[0] - start; + if (!match_lines && !match_words || match_lines && len == end - beg) + goto success; + /* If -w, check if the match aligns with word boundaries. + We do this iteratively because: + (a) the line may contain more than one occurence of the pattern, and + (b) Several alternatives in the pattern might be valid at a given + point, and we may need to consider a shorter one to find a word + boundary. */ + if (match_words) + while (start >= 0) + { + if ((start == 0 || !WCHAR(beg[start - 1])) + && (len == end - beg || !WCHAR(beg[start + len]))) + goto success; + if (len > 0) + { + /* Try a shorter length anchored at the same place. */ + --len; + regex.not_eol = 1; + len = re_match(®ex, beg, start + len, start, ®s); + } + if (len <= 0) + { + /* Try looking further on. */ + if (start == end - beg) + break; + ++start; + regex.not_eol = 0; + start = re_search(®ex, beg, end - beg, + start, end - beg - start, ®s); + len = regs.end[0] - start; + } + } + } + } + + failure: + return 0; + + success: + *endp = end < buflim ? end + 1 : end; + return beg; +} + +static void +Fcompile(pattern, size) + char *pattern; + size_t size; +{ + char *beg, *lim, *err; + + kwsinit(); + beg = pattern; + do + { + for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim) + ; + if ((err = kwsincr(kwset, beg, lim - beg)) != 0) + fatal(err, 0); + if (lim < pattern + size) + ++lim; + beg = lim; + } + while (beg < pattern + size); + + if ((err = kwsprep(kwset)) != 0) + fatal(err, 0); +} + +static char * +Fexecute(buf, size, endp) + char *buf; + size_t size; + char **endp; +{ + register char *beg, *try, *end; + register size_t len; + struct kwsmatch kwsmatch; + + for (beg = buf; beg <= buf + size; ++beg) + { + if (!(beg = kwsexec(kwset, beg, buf + size - beg, &kwsmatch))) + return 0; + len = kwsmatch.size[0]; + if (match_lines) + { + if (beg > buf && beg[-1] != '\n') + continue; + if (beg + len < buf + size && beg[len] != '\n') + continue; + goto success; + } + else if (match_words) + for (try = beg; len && try;) + { + if (try > buf && WCHAR((unsigned char) try[-1])) + break; + if (try + len < buf + size && WCHAR((unsigned char) try[len])) + { + try = kwsexec(kwset, beg, --len, &kwsmatch); + len = kwsmatch.size[0]; + } + else + goto success; + } + else + goto success; + } + + return 0; + + success: + if ((end = memchr(beg + len, '\n', (buf + size) - (beg + len))) != 0) + ++end; + else + end = buf + size; + *endp = end; + while (beg > buf && beg[-1] != '\n') + --beg; + return beg; +} |