diff options
author | tjr <tjr@FreeBSD.org> | 2004-07-04 10:02:03 +0000 |
---|---|---|
committer | tjr <tjr@FreeBSD.org> | 2004-07-04 10:02:03 +0000 |
commit | 82d1ca30699cb272779d30c895a96d430ca8c806 (patch) | |
tree | 1e4e4b75976c4d7d9acbe2f02a75aa760e569603 /gnu | |
parent | 7bb57b0d67aa634a010e91a2f147ebc35d105f9b (diff) | |
download | FreeBSD-src-82d1ca30699cb272779d30c895a96d430ca8c806.zip FreeBSD-src-82d1ca30699cb272779d30c895a96d430ca8c806.tar.gz |
Merge local changes.
Diffstat (limited to 'gnu')
-rw-r--r-- | gnu/usr.bin/grep/dfa.c | 1331 | ||||
-rw-r--r-- | gnu/usr.bin/grep/dfa.h | 108 | ||||
-rw-r--r-- | gnu/usr.bin/grep/getpagesize.h | 5 | ||||
-rw-r--r-- | gnu/usr.bin/grep/grep.1 | 213 | ||||
-rw-r--r-- | gnu/usr.bin/grep/grep.c | 1053 | ||||
-rw-r--r-- | gnu/usr.bin/grep/grep.h | 16 | ||||
-rw-r--r-- | gnu/usr.bin/grep/kwset.c | 98 | ||||
-rw-r--r-- | gnu/usr.bin/grep/kwset.h | 8 | ||||
-rw-r--r-- | gnu/usr.bin/grep/search.c | 723 |
9 files changed, 2674 insertions, 881 deletions
diff --git a/gnu/usr.bin/grep/dfa.c b/gnu/usr.bin/grep/dfa.c index df6880e..e823d84 100644 --- a/gnu/usr.bin/grep/dfa.c +++ b/gnu/usr.bin/grep/dfa.c @@ -38,12 +38,24 @@ extern void free(); #if defined(HAVE_STRING_H) || defined(STDC_HEADERS) #include <string.h> -#undef index -#define index strchr #else #include <strings.h> #endif +#if HAVE_SETLOCALE +# include <locale.h> +#endif + +#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC +/* We can handle multibyte string. */ +# define MBS_SUPPORT +#endif + +#ifdef MBS_SUPPORT +# include <wchar.h> +# include <wctype.h> +#endif + #ifndef DEBUG /* use the same approach as regex.c */ #undef assert #define assert(e) @@ -104,6 +116,7 @@ extern void free(); #include "regex.h" #include "dfa.h" +#include "hard-locale.h" /* HPUX, define those as macros in sys/param.h */ #ifdef setbit @@ -114,47 +127,7 @@ extern void free(); #endif static void dfamust PARAMS ((struct dfa *dfa)); - -static ptr_t xcalloc PARAMS ((size_t n, size_t s)); -static ptr_t xmalloc PARAMS ((size_t n)); -static ptr_t xrealloc PARAMS ((ptr_t p, size_t n)); -#ifdef DEBUG -static void prtok PARAMS ((token t)); -#endif -static int tstbit PARAMS ((int b, charclass c)); -static void setbit PARAMS ((int b, charclass c)); -static void clrbit PARAMS ((int b, charclass c)); -static void copyset PARAMS ((charclass src, charclass dst)); -static void zeroset PARAMS ((charclass s)); -static void notset PARAMS ((charclass s)); -static int equal PARAMS ((charclass s1, charclass s2)); -static int charclass_index PARAMS ((charclass s)); -static int looking_at PARAMS ((const char *s)); -static token lex PARAMS ((void)); -static void addtok PARAMS ((token t)); -static void atom PARAMS ((void)); -static int nsubtoks PARAMS ((int tindex)); -static void copytoks PARAMS ((int tindex, int ntokens)); -static void closure PARAMS ((void)); -static void branch PARAMS ((void)); static void regexp PARAMS ((int toplevel)); -static void copy PARAMS ((position_set *src, position_set *dst)); -static void insert PARAMS ((position p, position_set *s)); -static void merge PARAMS ((position_set *s1, position_set *s2, position_set *m)); -static void delete PARAMS ((position p, position_set *s)); -static int state_index PARAMS ((struct dfa *d, position_set *s, - int newline, int letter)); -static void build_state PARAMS ((int s, struct dfa *d)); -static void build_state_zero PARAMS ((struct dfa *d)); -static char *icatalloc PARAMS ((char *old, char *new)); -static char *icpyalloc PARAMS ((char *string)); -static char *istrstr PARAMS ((char *lookin, char *lookfor)); -static void ifree PARAMS ((char *cp)); -static void freelist PARAMS ((char **cpp)); -static char **enlist PARAMS ((char **cpp, char *new, size_t len)); -static char **comsubs PARAMS ((char *left, char *right)); -static char **addlists PARAMS ((char **old, char **new)); -static char **inboth PARAMS ((char **left, char **right)); static ptr_t xcalloc (size_t n, size_t s) @@ -196,8 +169,9 @@ xrealloc (ptr_t p, size_t n) #define REALLOC_IF_NECESSARY(p, t, nalloc, index) \ if ((index) >= (nalloc)) \ { \ - while ((index) >= (nalloc)) \ + do \ (nalloc) *= 2; \ + while ((index) >= (nalloc)); \ REALLOC(p, t, nalloc); \ } @@ -206,7 +180,7 @@ xrealloc (ptr_t p, size_t n) static void prtok (token t) { - char *s; + char const *s; if (t < 0) fprintf(stderr, "END"); @@ -232,6 +206,11 @@ prtok (token t) case ORTOP: s = "ORTOP"; break; case LPAREN: s = "LPAREN"; break; case RPAREN: s = "RPAREN"; break; + case CRANGE: s = "CRANGE"; break; +#ifdef MBS_SUPPORT + case ANYCHAR: s = "ANYCHAR"; break; + case MBCSET: s = "MBCSET"; break; +#endif /* MBS_SUPPORT */ default: s = "CSET"; break; } fprintf(stderr, "%s", s); @@ -242,19 +221,19 @@ prtok (token t) /* Stuff pertaining to charclasses. */ static int -tstbit (int b, charclass c) +tstbit (unsigned b, charclass c) { return c[b / INTBITS] & 1 << b % INTBITS; } static void -setbit (int b, charclass c) +setbit (unsigned b, charclass c) { c[b / INTBITS] |= 1 << b % INTBITS; } static void -clrbit (int b, charclass c) +clrbit (unsigned b, charclass c) { c[b / INTBITS] &= ~(1 << b % INTBITS); } @@ -262,19 +241,13 @@ clrbit (int b, charclass c) static void copyset (charclass src, charclass dst) { - int i; - - for (i = 0; i < CHARCLASS_INTS; ++i) - dst[i] = src[i]; + memcpy (dst, src, sizeof (charclass)); } static void zeroset (charclass s) { - int i; - - for (i = 0; i < CHARCLASS_INTS; ++i) - s[i] = 0; + memset (s, 0, sizeof (charclass)); } static void @@ -289,12 +262,7 @@ notset (charclass s) static int equal (charclass s1, charclass s2) { - int i; - - for (i = 0; i < CHARCLASS_INTS; ++i) - if (s1[i] != s2[i]) - return 0; - return 1; + return memcmp (s1, s2, sizeof (charclass)) == 0; } /* A pointer to the current dfa is kept here during parsing. */ @@ -326,7 +294,7 @@ static unsigned char eolbyte; /* Entry point to set syntax options. */ void -dfasyntax (reg_syntax_t bits, int fold, int eol) +dfasyntax (reg_syntax_t bits, int fold, unsigned char eol) { syntax_bits_set = 1; syntax_bits = bits; @@ -334,22 +302,145 @@ dfasyntax (reg_syntax_t bits, int fold, int eol) eolbyte = eol; } +/* Like setbit, but if case is folded, set both cases of a letter. */ +static void +setbit_case_fold (unsigned b, charclass c) +{ + setbit (b, c); + if (case_fold) + { + if (ISUPPER (b)) + setbit (tolower (b), c); + else if (ISLOWER (b)) + setbit (toupper (b), c); + } +} + /* Lexical analyzer. All the dross that deals with the obnoxious GNU Regex syntax bits is located here. The poor, suffering reader is referred to the GNU Regex documentation for the meaning of the @#%!@#%^!@ syntax bits. */ -static char *lexstart; /* Pointer to beginning of input string. */ -static char *lexptr; /* Pointer to next input character. */ +static char const *lexstart; /* Pointer to beginning of input string. */ +static char const *lexptr; /* Pointer to next input character. */ static int lexleft; /* Number of characters remaining. */ static token lasttok; /* Previous token returned; initially END. */ static int laststart; /* True if we're separated from beginning or (, | only by zero-width characters. */ static int parens; /* Count of outstanding left parens. */ static int minrep, maxrep; /* Repeat counts for {m,n}. */ +static int hard_LC_COLLATE; /* Nonzero if LC_COLLATE is hard. */ + +#ifdef MBS_SUPPORT +/* These variables are used only if (MB_CUR_MAX > 1). */ +static mbstate_t mbs; /* Mbstate for mbrlen(). */ +static int cur_mb_len; /* Byte length of the current scanning + multibyte character. */ +static int cur_mb_index; /* Byte index of the current scanning multibyte + character. + + singlebyte character : cur_mb_index = 0 + multibyte character + 1st byte : cur_mb_index = 1 + 2nd byte : cur_mb_index = 2 + ... + nth byte : cur_mb_index = n */ +static unsigned char *mblen_buf;/* Correspond to the input buffer in dfaexec(). + Each element store the amount of remain + byte of corresponding multibyte character + in the input string. A element's value + is 0 if corresponding character is a + singlebyte chracter. + e.g. input : 'a', <mb(0)>, <mb(1)>, <mb(2)> + mblen_buf : 0, 3, 2, 1 + */ +static wchar_t *inputwcs; /* Wide character representation of input + string in dfaexec(). + The length of this array is same as + the length of input string(char array). + inputstring[i] is a single-byte char, + or 1st byte of a multibyte char. + And inputwcs[i] is the codepoint. */ +static unsigned char const *buf_begin;/* refference to begin in dfaexec(). */ +static unsigned char const *buf_end; /* refference to end in dfaexec(). */ +#endif /* MBS_SUPPORT */ + +#ifdef MBS_SUPPORT +/* This function update cur_mb_len, and cur_mb_index. + p points current lexptr, len is the remaining buffer length. */ +static void +update_mb_len_index (unsigned char const *p, int len) +{ + /* If last character is a part of a multibyte character, + we update cur_mb_index. */ + if (cur_mb_index) + cur_mb_index = (cur_mb_index >= cur_mb_len)? 0 + : cur_mb_index + 1; + + /* If last character is a single byte character, or the + last portion of a multibyte character, we check whether + next character is a multibyte character or not. */ + if (! cur_mb_index) + { + cur_mb_len = mbrlen(p, len, &mbs); + if (cur_mb_len > 1) + /* It is a multibyte character. + cur_mb_len was already set by mbrlen(). */ + cur_mb_index = 1; + else if (cur_mb_len < 1) + /* Invalid sequence. We treat it as a singlebyte character. + cur_mb_index is aleady 0. */ + cur_mb_len = 1; + /* Otherwise, cur_mb_len == 1, it is a singlebyte character. + cur_mb_index is aleady 0. */ + } +} +#endif /* MBS_SUPPORT */ +#ifdef MBS_SUPPORT /* Note that characters become unsigned here. */ -#define FETCH(c, eoferr) \ +# define FETCH(c, eoferr) \ + { \ + if (! lexleft) \ + { \ + if (eoferr != 0) \ + dfaerror (eoferr); \ + else \ + return lasttok = END; \ + } \ + if (MB_CUR_MAX > 1) \ + update_mb_len_index(lexptr, lexleft); \ + (c) = (unsigned char) *lexptr++; \ + --lexleft; \ + } + +/* This function fetch a wide character, and update cur_mb_len, + used only if the current locale is a multibyte environment. */ +static wchar_t +fetch_wc (char const *eoferr) +{ + wchar_t wc; + if (! lexleft) + { + if (eoferr != 0) + dfaerror (eoferr); + else + return -1; + } + + cur_mb_len = mbrtowc(&wc, lexptr, lexleft, &mbs); + if (cur_mb_len <= 0) + { + cur_mb_len = 1; + wc = *lexptr; + } + lexptr += cur_mb_len; + lexleft -= cur_mb_len; + return wc; +} +#else +/* Note that characters become unsigned here. */ +# define FETCH(c, eoferr) \ { \ if (! lexleft) \ { \ @@ -361,6 +452,202 @@ static int minrep, maxrep; /* Repeat counts for {m,n}. */ (c) = (unsigned char) *lexptr++; \ --lexleft; \ } +#endif /* MBS_SUPPORT */ + +#ifdef MBS_SUPPORT +/* Multibyte character handling sub-routin for lex. + This function parse a bracket expression and build a struct + mb_char_classes. */ +static void +parse_bracket_exp_mb () +{ + wchar_t wc, wc1, wc2; + + /* Work area to build a mb_char_classes. */ + struct mb_char_classes *work_mbc; + int chars_al, range_sts_al, range_ends_al, ch_classes_al, + equivs_al, coll_elems_al; + + REALLOC_IF_NECESSARY(dfa->mbcsets, struct mb_char_classes, + dfa->mbcsets_alloc, dfa->nmbcsets + 1); + /* dfa->multibyte_prop[] hold the index of dfa->mbcsets. + We will update dfa->multibyte_prop in addtok(), because we can't + decide the index in dfa->tokens[]. */ + + /* Initialize work are */ + work_mbc = &(dfa->mbcsets[dfa->nmbcsets++]); + + chars_al = 1; + range_sts_al = range_ends_al = 0; + ch_classes_al = equivs_al = coll_elems_al = 0; + MALLOC(work_mbc->chars, wchar_t, chars_al); + + work_mbc->nchars = work_mbc->nranges = work_mbc->nch_classes = 0; + work_mbc->nequivs = work_mbc->ncoll_elems = 0; + work_mbc->chars = work_mbc->ch_classes = NULL; + work_mbc->range_sts = work_mbc->range_ends = NULL; + work_mbc->equivs = work_mbc->coll_elems = NULL; + + wc = fetch_wc(_("Unbalanced [")); + if (wc == L'^') + { + wc = fetch_wc(_("Unbalanced [")); + work_mbc->invert = 1; + } + else + work_mbc->invert = 0; + do + { + wc1 = -1; /* mark wc1 is not initialized". */ + + /* Note that if we're looking at some other [:...:] construct, + we just treat it as a bunch of ordinary characters. We can do + this because we assume regex has checked for syntax errors before + dfa is ever called. */ + if (wc == L'[' && (syntax_bits & RE_CHAR_CLASSES)) + { +#define BRACKET_BUFFER_SIZE 128 + char str[BRACKET_BUFFER_SIZE]; + wc1 = wc; + wc = fetch_wc(_("Unbalanced [")); + + /* If pattern contains `[[:', `[[.', or `[[='. */ + if (cur_mb_len == 1 && (wc == L':' || wc == L'.' || wc == L'=')) + { + unsigned char c; + unsigned char delim = (unsigned char)wc; + int len = 0; + for (;;) + { + if (! lexleft) + dfaerror (_("Unbalanced [")); + c = (unsigned char) *lexptr++; + --lexleft; + + if ((c == delim && *lexptr == ']') || lexleft == 0) + break; + if (len < BRACKET_BUFFER_SIZE) + str[len++] = c; + else + /* This is in any case an invalid class name. */ + str[0] = '\0'; + } + str[len] = '\0'; + + if (lexleft == 0) + { + REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, + work_mbc->nchars + 2); + work_mbc->chars[work_mbc->nchars++] = L'['; + work_mbc->chars[work_mbc->nchars++] = delim; + break; + } + + if (--lexleft, *lexptr++ != ']') + dfaerror (_("Unbalanced [")); + if (delim == ':') + /* build character class. */ + { + wctype_t wt; + /* Query the character class as wctype_t. */ + wt = wctype (str); + + if (ch_classes_al == 0) + MALLOC(work_mbc->ch_classes, wchar_t, ++ch_classes_al); + REALLOC_IF_NECESSARY(work_mbc->ch_classes, wctype_t, + ch_classes_al, + work_mbc->nch_classes + 1); + work_mbc->ch_classes[work_mbc->nch_classes++] = wt; + + } + else if (delim == '=' || delim == '.') + { + char *elem; + MALLOC(elem, char, len + 1); + strncpy(elem, str, len + 1); + + if (delim == '=') + /* build equivalent class. */ + { + if (equivs_al == 0) + MALLOC(work_mbc->equivs, char*, ++equivs_al); + REALLOC_IF_NECESSARY(work_mbc->equivs, char*, + equivs_al, + work_mbc->nequivs + 1); + work_mbc->equivs[work_mbc->nequivs++] = elem; + } + + if (delim == '.') + /* build collating element. */ + { + if (coll_elems_al == 0) + MALLOC(work_mbc->coll_elems, char*, ++coll_elems_al); + REALLOC_IF_NECESSARY(work_mbc->coll_elems, char*, + coll_elems_al, + work_mbc->ncoll_elems + 1); + work_mbc->coll_elems[work_mbc->ncoll_elems++] = elem; + } + } + wc = -1; + } + else + /* We treat '[' as a normal character here. */ + { + wc2 = wc1; wc1 = wc; wc = wc2; /* swap */ + } + } + else + { + if (wc == L'\\' && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) + wc = fetch_wc(("Unbalanced [")); + } + + if (wc1 == -1) + wc1 = fetch_wc(_("Unbalanced [")); + + if (wc1 == L'-') + /* build range characters. */ + { + wc2 = fetch_wc(_("Unbalanced [")); + if (wc2 == L']') + { + /* In the case [x-], the - is an ordinary hyphen, + which is left in c1, the lookahead character. */ + lexptr -= cur_mb_len; + lexleft += cur_mb_len; + wc2 = wc; + } + else + { + if (wc2 == L'\\' + && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) + wc2 = fetch_wc(_("Unbalanced [")); + wc1 = fetch_wc(_("Unbalanced [")); + } + + if (range_sts_al == 0) + { + MALLOC(work_mbc->range_sts, wchar_t, ++range_sts_al); + MALLOC(work_mbc->range_ends, wchar_t, ++range_ends_al); + } + REALLOC_IF_NECESSARY(work_mbc->range_sts, wchar_t, + range_sts_al, work_mbc->nranges + 1); + work_mbc->range_sts[work_mbc->nranges] = wc; + REALLOC_IF_NECESSARY(work_mbc->range_ends, wchar_t, + range_ends_al, work_mbc->nranges + 1); + work_mbc->range_ends[work_mbc->nranges++] = wc2; + } + else if (wc != -1) + /* build normal characters. */ + { + REALLOC_IF_NECESSARY(work_mbc->chars, wchar_t, chars_al, + work_mbc->nchars + 1); + work_mbc->chars[work_mbc->nchars++] = wc; + } + } + while ((wc = wc1) != L']'); +} +#endif /* MBS_SUPPORT */ #ifdef __STDC__ #define FUNC(F, P) static int F(int c) { return P(c); } @@ -392,7 +679,7 @@ is_blank (int c) static struct { const char *name; int (*pred) PARAMS ((int)); -} prednames[] = { +} const prednames[] = { { ":alpha:]", is_alpha }, { ":upper:]", is_upper }, { ":lower:]", is_lower }, @@ -425,12 +712,10 @@ looking_at (char const *s) static token lex (void) { - token c, c1, c2; + unsigned c, c1, c2; int backslash = 0, invert; charclass ccl; int i; - char lo[2]; - char hi[2]; /* Basic plan: We fetch a character. If it's a backslash, we set the backslash flag and go through the loop again. @@ -441,6 +726,14 @@ lex (void) for (i = 0; i < 2; ++i) { FETCH(c, 0); +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1 && cur_mb_index) + /* If this is a part of a multi-byte character, we must treat + this byte data as a normal character. + e.g. In case of SJIS encoding, some character contains '\', + but they must not be backslash. */ + goto normal_char; +#endif /* MBS_SUPPORT */ switch (c) { case '\\': @@ -661,6 +954,15 @@ lex (void) case '.': if (backslash) goto normal_char; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + /* In multibyte environment period must match with a single + character not a byte. So we use ANYCHAR. */ + laststart = 0; + return lasttok = ANYCHAR; + } +#endif /* MBS_SUPPORT */ zeroset(ccl); notset(ccl); if (!(syntax_bits & RE_DOT_NEWLINE)) @@ -686,6 +988,17 @@ lex (void) case '[': if (backslash) goto normal_char; + laststart = 0; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + /* In multibyte environment a bracket expression may contain + multibyte characters, which must be treated as characters + (not bytes). So we parse it by parse_bracket_exp_mb(). */ + parse_bracket_exp_mb(); + return lasttok = MBCSET; + } +#endif zeroset(ccl); FETCH(c, _("Unbalanced [")); if (c == '^') @@ -707,14 +1020,11 @@ lex (void) for (c1 = 0; prednames[c1].name; ++c1) if (looking_at(prednames[c1].name)) { - int (*pred)() = prednames[c1].pred; - if (case_fold - && (pred == is_upper || pred == is_lower)) - pred = is_alpha; + int (*pred) PARAMS ((int)) = prednames[c1].pred; for (c2 = 0; c2 < NOTCHAR; ++c2) if ((*pred)(c2)) - setbit(c2, ccl); + setbit_case_fold (c2, ccl); lexptr += strlen(prednames[c1].name); lexleft -= strlen(prednames[c1].name); FETCH(c1, _("Unbalanced [")); @@ -732,7 +1042,6 @@ lex (void) which is left in c1, the lookahead character. */ --lexptr; ++lexleft; - c2 = c; } else { @@ -740,30 +1049,30 @@ lex (void) && (syntax_bits & RE_BACKSLASH_ESCAPE_IN_LISTS)) FETCH(c2, _("Unbalanced [")); FETCH(c1, _("Unbalanced [")); - } - } - else - c2 = c; - - lo[0] = c; lo[1] = '\0'; - hi[0] = c2; hi[1] = '\0'; - for (c = 0; c < NOTCHAR; c++) - { - char ch[2]; - ch[0] = c; ch[1] = '\0'; - if (strcoll (lo, ch) <= 0 && strcoll (ch, hi) <= 0) - { - setbit (c, ccl); - if (case_fold) - { - if (ISUPPER (c)) - setbit (tolower (c), ccl); - else if (ISLOWER (c)) - setbit (toupper (c), ccl); + if (!hard_LC_COLLATE) { + for (; c <= c2; c++) + setbit_case_fold (c, ccl); + } else { + /* POSIX locales are painful - leave the decision to libc */ + char expr[6] = { '[', c, '-', c2, ']', '\0' }; + regex_t re; + if (regcomp (&re, expr, case_fold ? REG_ICASE : 0) == REG_NOERROR) { + for (c = 0; c < NOTCHAR; ++c) { + char buf[2] = { c, '\0' }; + regmatch_t mat; + if (regexec (&re, buf, 1, &mat, 0) == REG_NOERROR + && mat.rm_so == 0 && mat.rm_eo == 1) + setbit_case_fold (c, ccl); + } + regfree (&re); } + } + continue; } } + setbit_case_fold (c, ccl); + skip: ; } @@ -774,7 +1083,6 @@ lex (void) if (syntax_bits & RE_HAT_LISTS_NOT_NEWLINE) clrbit(eolbyte, ccl); } - laststart = 0; return lasttok = CSET + charclass_index(ccl); default: @@ -783,11 +1091,7 @@ lex (void) if (case_fold && ISALPHA(c)) { zeroset(ccl); - setbit(c, ccl); - if (isupper(c)) - setbit(tolower(c), ccl); - else - setbit(toupper(c), ccl); + setbit_case_fold (c, ccl); return lasttok = CSET + charclass_index(ccl); } return c; @@ -814,6 +1118,26 @@ static int depth; /* Current depth of a hypothetical stack static void addtok (token t) { +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + REALLOC_IF_NECESSARY(dfa->multibyte_prop, int, dfa->nmultibyte_prop, + dfa->tindex); + /* Set dfa->multibyte_prop. See struct dfa in dfa.h. */ + if (t == MBCSET) + dfa->multibyte_prop[dfa->tindex] = ((dfa->nmbcsets - 1) << 2) + 3; + else if (t < NOTCHAR) + dfa->multibyte_prop[dfa->tindex] + = (cur_mb_len == 1)? 3 /* single-byte char */ + : (((cur_mb_index == 1)? 1 : 0) /* 1st-byte of multibyte char */ + + ((cur_mb_index == cur_mb_len)? 2 : 0)); /* last-byte */ + else + /* It may be unnecesssary, but it is safer to treat other + symbols as singlebyte characters. */ + dfa->multibyte_prop[dfa->tindex] = 3; + } +#endif + REALLOC_IF_NECESSARY(dfa->tokens, token, dfa->talloc, dfa->tindex); dfa->tokens[dfa->tindex++] = t; @@ -854,10 +1178,14 @@ addtok (token t) closure QMARK closure STAR closure PLUS + closure REPMN atom atom: <normal character> + <multibyte character> + ANYCHAR + MBCSET CSET BACKREF BEGLINE @@ -866,6 +1194,8 @@ addtok (token t) ENDWORD LIMWORD NOTLIMWORD + CRANGE + LPAREN regexp RPAREN <empty> The parser builds a parse tree in postfix form in an array of tokens. */ @@ -875,10 +1205,47 @@ atom (void) { if ((tok >= 0 && tok < NOTCHAR) || tok >= CSET || tok == BACKREF || tok == BEGLINE || tok == ENDLINE || tok == BEGWORD +#ifdef MBS_SUPPORT + || tok == ANYCHAR || tok == MBCSET /* MB_CUR_MAX > 1 */ +#endif /* MBS_SUPPORT */ || tok == ENDWORD || tok == LIMWORD || tok == NOTLIMWORD) { addtok(tok); tok = lex(); +#ifdef MBS_SUPPORT + /* We treat a multibyte character as a single atom, so that DFA + can treat a multibyte character as a single expression. + + e.g. We construct following tree from "<mb1><mb2>". + <mb1(1st-byte)><mb1(2nd-byte)><CAT><mb1(3rd-byte)><CAT> + <mb2(1st-byte)><mb2(2nd-byte)><CAT><mb2(3rd-byte)><CAT><CAT> + */ + if (MB_CUR_MAX > 1) + { + while (cur_mb_index > 1 && tok >= 0 && tok < NOTCHAR) + { + addtok(tok); + addtok(CAT); + tok = lex(); + } + } +#endif /* MBS_SUPPORT */ + } + else if (tok == CRANGE) + { + /* A character range like "[a-z]" in a locale other than "C" or + "POSIX". This range might any sequence of one or more + characters. Unfortunately the POSIX locale primitives give + us no practical way to find what character sequences might be + matched. Treat this approximately like "(.\1)" -- i.e. match + one character, and then punt to the full matcher. */ + charclass ccl; + zeroset (ccl); + notset (ccl); + addtok (CSET + charclass_index (ccl)); + addtok (BACKREF); + addtok (CAT); + tok = lex (); } else if (tok == LPAREN) { @@ -989,7 +1356,7 @@ regexp (int toplevel) length of the string, so s can include NUL characters. D is a pointer to the struct dfa to parse into. */ void -dfaparse (char *s, size_t len, struct dfa *d) +dfaparse (char const *s, size_t len, struct dfa *d) { dfa = d; lexstart = lexptr = s; @@ -997,6 +1364,17 @@ dfaparse (char *s, size_t len, struct dfa *d) lasttok = END; laststart = 1; parens = 0; +#if ENABLE_NLS + hard_LC_COLLATE = hard_locale (LC_COLLATE); +#endif +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + cur_mb_index = 0; + cur_mb_len = 0; + memset(&mbs, 0, sizeof(mbstate_t)); + } +#endif /* MBS_SUPPORT */ if (! syntax_bits_set) dfaerror(_("No syntax specified")); @@ -1022,7 +1400,7 @@ dfaparse (char *s, size_t len, struct dfa *d) /* Copy one set to another; the destination must be large enough. */ static void -copy (position_set *src, position_set *dst) +copy (position_set const *src, position_set *dst) { int i; @@ -1061,7 +1439,7 @@ insert (position p, position_set *s) /* Merge two sets of positions into a third. The result is exactly as if the positions of both sets were inserted into an initially empty set. */ static void -merge (position_set *s1, position_set *s2, position_set *m) +merge (position_set const *s1, position_set const *s2, position_set *m) { int i = 0, j = 0; @@ -1101,7 +1479,7 @@ delete (position p, position_set *s) state. Newline and letter tell whether we got here on a newline or letter, respectively. */ static int -state_index (struct dfa *d, position_set *s, int newline, int letter) +state_index (struct dfa *d, position_set const *s, int newline, int letter) { int hash = 0; int constraint; @@ -1138,6 +1516,10 @@ state_index (struct dfa *d, position_set *s, int newline, int letter) d->states[i].backref = 0; d->states[i].constraint = 0; d->states[i].first_end = 0; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + d->states[i].mbps.nelem = 0; +#endif for (j = 0; j < s->nelem; ++j) if (d->tokens[s->elems[j].index] < 0) { @@ -1167,7 +1549,7 @@ state_index (struct dfa *d, position_set *s, int newline, int letter) constraint. Repeat exhaustively until no funny positions are left. S->elems must be large enough to hold the result. */ static void -epsclosure (position_set *s, struct dfa *d) +epsclosure (position_set *s, struct dfa const *d) { int i, j; int *visited; @@ -1180,6 +1562,10 @@ epsclosure (position_set *s, struct dfa *d) for (i = 0; i < s->nelem; ++i) if (d->tokens[s->elems[i].index] >= NOTCHAR && d->tokens[s->elems[i].index] != BACKREF +#ifdef MBS_SUPPORT + && d->tokens[s->elems[i].index] != ANYCHAR + && d->tokens[s->elems[i].index] != MBCSET +#endif && d->tokens[s->elems[i].index] < CSET) { old = s->elems[i]; @@ -1461,6 +1847,10 @@ dfaanalyze (struct dfa *d, int searchflag) it with its epsilon closure. */ for (i = 0; i < d->tindex; ++i) if (d->tokens[i] < NOTCHAR || d->tokens[i] == BACKREF +#ifdef MBS_SUPPORT + || d->tokens[i] == ANYCHAR + || d->tokens[i] == MBCSET +#endif || d->tokens[i] >= CSET) { #ifdef DEBUG @@ -1562,6 +1952,9 @@ dfastate (int s, struct dfa *d, int trans[]) int wants_letter; /* New state wants to know letter context. */ int state_letter; /* New state on a letter transition. */ static int initialized; /* Flag for static initialization. */ +#ifdef MBS_SUPPORT + int next_isnt_1st_byte = 0; /* Flag If we can't add state0. */ +#endif int i, j, k; /* Initialize the set of letters, if necessary. */ @@ -1583,6 +1976,23 @@ dfastate (int s, struct dfa *d, int trans[]) setbit(d->tokens[pos.index], matches); else if (d->tokens[pos.index] >= CSET) copyset(d->charclasses[d->tokens[pos.index] - CSET], matches); +#ifdef MBS_SUPPORT + else if (d->tokens[pos.index] == ANYCHAR + || d->tokens[pos.index] == MBCSET) + /* MB_CUR_MAX > 1 */ + { + /* ANYCHAR and MBCSET must match with a single character, so we + must put it to d->states[s].mbps, which contains the positions + which can match with a single character not a byte. */ + if (d->states[s].mbps.nelem == 0) + { + MALLOC(d->states[s].mbps.elems, position, + d->states[s].elems.nelem); + } + insert(pos, &(d->states[s].mbps)); + continue; + } +#endif /* MBS_SUPPORT */ else continue; @@ -1719,9 +2129,46 @@ dfastate (int s, struct dfa *d, int trans[]) for (k = 0; k < d->follows[grps[i].elems[j].index].nelem; ++k) insert(d->follows[grps[i].elems[j].index].elems[k], &follows); +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + /* If a token in follows.elems is not 1st byte of a multibyte + character, or the states of follows must accept the bytes + which are not 1st byte of the multibyte character. + Then, if a state of follows encounter a byte, it must not be + a 1st byte of a multibyte character nor singlebyte character. + We cansel to add state[0].follows to next state, because + state[0] must accept 1st-byte + + For example, we assume <sb a> is a certain singlebyte + character, <mb A> is a certain multibyte character, and the + codepoint of <sb a> equals the 2nd byte of the codepoint of + <mb A>. + When state[0] accepts <sb a>, state[i] transit to state[i+1] + by accepting accepts 1st byte of <mb A>, and state[i+1] + accepts 2nd byte of <mb A>, if state[i+1] encounter the + codepoint of <sb a>, it must not be <sb a> but 2nd byte of + <mb A>, so we can not add state[0]. */ + + next_isnt_1st_byte = 0; + for (j = 0; j < follows.nelem; ++j) + { + if (!(d->multibyte_prop[follows.elems[j].index] & 1)) + { + next_isnt_1st_byte = 1; + break; + } + } + } +#endif + /* If we are building a searching matcher, throw in the positions of state 0 as well. */ +#ifdef MBS_SUPPORT + if (d->searchflag && (MB_CUR_MAX == 1 || !next_isnt_1st_byte)) +#else if (d->searchflag) +#endif for (j = 0; j < d->states[0].elems.nelem; ++j) insert(d->states[0].elems.elems[j], &follows); @@ -1838,7 +2285,6 @@ build_state (int s, struct dfa *d) d->trans = d->realtrans + 1; REALLOC(d->fails, int *, d->tralloc); REALLOC(d->success, int, d->tralloc); - REALLOC(d->newlines, int, d->tralloc); while (oldalloc < d->tralloc) { d->trans[oldalloc] = NULL; @@ -1846,9 +2292,7 @@ build_state (int s, struct dfa *d) } } - /* Keep the newline transition in a special place so we can use it as - a sentinel. */ - d->newlines[s] = trans[eolbyte]; + /* Newline is a sentinel. */ trans[eolbyte] = -1; if (ACCEPTING(s, *d)) @@ -1866,29 +2310,450 @@ build_state_zero (struct dfa *d) d->trans = d->realtrans + 1; CALLOC(d->fails, int *, d->tralloc); MALLOC(d->success, int, d->tralloc); - MALLOC(d->newlines, int, d->tralloc); build_state(0, d); } +#ifdef MBS_SUPPORT +/* Multibyte character handling sub-routins for dfaexec. */ + +/* Initial state may encounter the byte which is not a singlebyte character + nor 1st byte of a multibyte character. But it is incorrect for initial + state to accept such a byte. + For example, in sjis encoding the regular expression like "\\" accepts + the codepoint 0x5c, but should not accept the 2nd byte of the codepoint + 0x815c. Then Initial state must skip the bytes which are not a singlebyte + character nor 1st byte of a multibyte character. */ +#define SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p) \ + if (s == 0) \ + { \ + while (inputwcs[p - buf_begin] == 0 \ + && mblen_buf[p - buf_begin] > 0 \ + && p < buf_end) \ + ++p; \ + if (p >= end) \ + { \ + free(mblen_buf); \ + free(inputwcs); \ + return (size_t) -1; \ + } \ + } + +static void +realloc_trans_if_necessary(struct dfa *d, int new_state) +{ + /* Make sure that the trans and fail arrays are allocated large enough + to hold a pointer for the new state. */ + if (new_state >= d->tralloc) + { + int oldalloc = d->tralloc; + + while (new_state >= d->tralloc) + d->tralloc *= 2; + REALLOC(d->realtrans, int *, d->tralloc + 1); + d->trans = d->realtrans + 1; + REALLOC(d->fails, int *, d->tralloc); + REALLOC(d->success, int, d->tralloc); + while (oldalloc < d->tralloc) + { + d->trans[oldalloc] = NULL; + d->fails[oldalloc++] = NULL; + } + } +} + +/* Return values of transit_state_singlebyte(), and + transit_state_consume_1char. */ +typedef enum +{ + TRANSIT_STATE_IN_PROGRESS, /* State transition has not finished. */ + TRANSIT_STATE_DONE, /* State transition has finished. */ + TRANSIT_STATE_END_BUFFER /* Reach the end of the buffer. */ +} status_transit_state; + +/* Consume a single byte and transit state from 's' to '*next_state'. + This function is almost same as the state transition routin in dfaexec(). + But state transition is done just once, otherwise matching succeed or + reach the end of the buffer. */ +static status_transit_state +transit_state_singlebyte (struct dfa *d, int s, unsigned char const *p, + int *next_state) +{ + int *t; + int works = s; + + status_transit_state rval = TRANSIT_STATE_IN_PROGRESS; + + while (rval == TRANSIT_STATE_IN_PROGRESS) + { + if ((t = d->trans[works]) != NULL) + { + works = t[*p]; + rval = TRANSIT_STATE_DONE; + if (works < 0) + works = 0; + } + else if (works < 0) + { + if (p == buf_end) + /* At the moment, it must not happen. */ + return TRANSIT_STATE_END_BUFFER; + works = 0; + } + else if (d->fails[works]) + { + works = d->fails[works][*p]; + rval = TRANSIT_STATE_DONE; + } + else + { + build_state(works, d); + } + } + *next_state = works; + return rval; +} + +/* Check whether period can match or not in the current context. If it can, + return the amount of the bytes with which period can match, otherwise + return 0. + `pos' is the position of the period. `index' is the index from the + buf_begin, and it is the current position in the buffer. */ +static int +match_anychar (struct dfa *d, int s, position pos, int index) +{ + int newline = 0; + int letter = 0; + wchar_t wc; + int mbclen; + + wc = inputwcs[index]; + mbclen = (mblen_buf[index] == 0)? 1 : mblen_buf[index]; + + /* Check context. */ + if (wc == (wchar_t)eolbyte) + { + if (!(syntax_bits & RE_DOT_NEWLINE)) + return 0; + newline = 1; + } + else if (wc == (wchar_t)'\0') + { + if (syntax_bits & RE_DOT_NOT_NULL) + return 0; + newline = 1; + } + + if (iswalnum(wc) || wc == L'_') + letter = 1; + + if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline, + newline, d->states[s].letter, letter)) + return 0; + + return mbclen; +} + +/* Check whether bracket expression can match or not in the current context. + If it can, return the amount of the bytes with which expression can match, + otherwise return 0. + `pos' is the position of the bracket expression. `index' is the index + from the buf_begin, and it is the current position in the buffer. */ +int +match_mb_charset (struct dfa *d, int s, position pos, int index) +{ + int i; + int match; /* Flag which represent that matching succeed. */ + int match_len; /* Length of the character (or collating element) + with which this operator match. */ + int op_len; /* Length of the operator. */ + char buffer[128]; + wchar_t wcbuf[6]; + + /* Pointer to the structure to which we are currently reffering. */ + struct mb_char_classes *work_mbc; + + int newline = 0; + int letter = 0; + wchar_t wc; /* Current reffering character. */ + + wc = inputwcs[index]; + + /* Check context. */ + if (wc == (wchar_t)eolbyte) + { + if (!(syntax_bits & RE_DOT_NEWLINE)) + return 0; + newline = 1; + } + else if (wc == (wchar_t)'\0') + { + if (syntax_bits & RE_DOT_NOT_NULL) + return 0; + newline = 1; + } + if (iswalnum(wc) || wc == L'_') + letter = 1; + if (!SUCCEEDS_IN_CONTEXT(pos.constraint, d->states[s].newline, + newline, d->states[s].letter, letter)) + return 0; + + /* Assign the current reffering operator to work_mbc. */ + work_mbc = &(d->mbcsets[(d->multibyte_prop[pos.index]) >> 2]); + match = !work_mbc->invert; + match_len = (mblen_buf[index] == 0)? 1 : mblen_buf[index]; + + /* match with a character class? */ + for (i = 0; i<work_mbc->nch_classes; i++) + { + if (iswctype((wint_t)wc, work_mbc->ch_classes[i])) + goto charset_matched; + } + + strncpy(buffer, buf_begin + index, match_len); + buffer[match_len] = '\0'; + + /* match with an equivalent class? */ + for (i = 0; i<work_mbc->nequivs; i++) + { + op_len = strlen(work_mbc->equivs[i]); + strncpy(buffer, buf_begin + index, op_len); + buffer[op_len] = '\0'; + if (strcoll(work_mbc->equivs[i], buffer) == 0) + { + match_len = op_len; + goto charset_matched; + } + } + + /* match with a collating element? */ + for (i = 0; i<work_mbc->ncoll_elems; i++) + { + op_len = strlen(work_mbc->coll_elems[i]); + strncpy(buffer, buf_begin + index, op_len); + buffer[op_len] = '\0'; + + if (strcoll(work_mbc->coll_elems[i], buffer) == 0) + { + match_len = op_len; + goto charset_matched; + } + } + + wcbuf[0] = wc; + wcbuf[1] = wcbuf[3] = wcbuf[5] = '\0'; + + /* match with a range? */ + for (i = 0; i<work_mbc->nranges; i++) + { + wcbuf[2] = work_mbc->range_sts[i]; + wcbuf[4] = work_mbc->range_ends[i]; + + if (wcscoll(wcbuf, wcbuf+2) >= 0 && + wcscoll(wcbuf+4, wcbuf) >= 0) + goto charset_matched; + } + + /* match with a character? */ + for (i = 0; i<work_mbc->nchars; i++) + { + if (wc == work_mbc->chars[i]) + goto charset_matched; + } + + match = !match; + + charset_matched: + return match ? match_len : 0; +} + +/* Check each of `d->states[s].mbps.elem' can match or not. Then return the + array which corresponds to `d->states[s].mbps.elem' and each element of + the array contains the amount of the bytes with which the element can + match. + `index' is the index from the buf_begin, and it is the current position + in the buffer. + Caller MUST free the array which this function return. */ +static int* +check_matching_with_multibyte_ops (struct dfa *d, int s, int index) +{ + int i; + int* rarray; + + MALLOC(rarray, int, d->states[s].mbps.nelem); + for (i = 0; i < d->states[s].mbps.nelem; ++i) + { + position pos = d->states[s].mbps.elems[i]; + switch(d->tokens[pos.index]) + { + case ANYCHAR: + rarray[i] = match_anychar(d, s, pos, index); + break; + case MBCSET: + rarray[i] = match_mb_charset(d, s, pos, index); + break; + default: + break; /* can not happen. */ + } + } + return rarray; +} + +/* Consume a single character and enumerate all of the positions which can + be next position from the state `s'. + `match_lens' is the input. It can be NULL, but it can also be the output + of check_matching_with_multibyte_ops() for optimization. + `mbclen' and `pps' are the output. `mbclen' is the length of the + character consumed, and `pps' is the set this function enumerate. */ +static status_transit_state +transit_state_consume_1char (struct dfa *d, int s, unsigned char const **pp, + int *match_lens, int *mbclen, position_set *pps) +{ + int i, j; + int s1, s2; + int* work_mbls; + status_transit_state rs = TRANSIT_STATE_DONE; + + /* Calculate the length of the (single/multi byte) character + to which p points. */ + *mbclen = (mblen_buf[*pp - buf_begin] == 0)? 1 + : mblen_buf[*pp - buf_begin]; + + /* Calculate the state which can be reached from the state `s' by + consuming `*mbclen' single bytes from the buffer. */ + s1 = s; + for (i = 0; i < *mbclen; i++) + { + s2 = s1; + rs = transit_state_singlebyte(d, s2, (*pp)++, &s1); + } + /* Copy the positions contained by `s1' to the set `pps'. */ + copy(&(d->states[s1].elems), pps); + + /* Check (inputed)match_lens, and initialize if it is NULL. */ + if (match_lens == NULL && d->states[s].mbps.nelem != 0) + work_mbls = check_matching_with_multibyte_ops(d, s, *pp - buf_begin); + else + work_mbls = match_lens; + + /* Add all of the positions which can be reached from `s' by consuming + a single character. */ + for (i = 0; i < d->states[s].mbps.nelem ; i++) + { + if (work_mbls[i] == *mbclen) + for (j = 0; j < d->follows[d->states[s].mbps.elems[i].index].nelem; + j++) + insert(d->follows[d->states[s].mbps.elems[i].index].elems[j], + pps); + } + + if (match_lens == NULL && work_mbls != NULL) + free(work_mbls); + return rs; +} + +/* Transit state from s, then return new state and update the pointer of the + buffer. This function is for some operator which can match with a multi- + byte character or a collating element(which may be multi characters). */ +static int +transit_state (struct dfa *d, int s, unsigned char const **pp) +{ + int s1; + int mbclen; /* The length of current input multibyte character. */ + int maxlen = 0; + int i, j; + int *match_lens = NULL; + int nelem = d->states[s].mbps.nelem; /* Just a alias. */ + position_set follows; + unsigned char const *p1 = *pp; + status_transit_state rs; + wchar_t wc; + + if (nelem > 0) + /* This state has (a) multibyte operator(s). + We check whether each of them can match or not. */ + { + /* Note: caller must free the return value of this function. */ + match_lens = check_matching_with_multibyte_ops(d, s, *pp - buf_begin); + + for (i = 0; i < nelem; i++) + /* Search the operator which match the longest string, + in this state. */ + { + if (match_lens[i] > maxlen) + maxlen = match_lens[i]; + } + } + + if (nelem == 0 || maxlen == 0) + /* This state has no multibyte operator which can match. + We need to check only one singlebyte character. */ + { + status_transit_state rs; + rs = transit_state_singlebyte(d, s, *pp, &s1); + + /* We must update the pointer if state transition succeeded. */ + if (rs == TRANSIT_STATE_DONE) + ++*pp; + + if (match_lens != NULL) + free(match_lens); + return s1; + } + + /* This state has some operators which can match a multibyte character. */ + follows.nelem = 0; + MALLOC(follows.elems, position, d->nleaves); + + /* `maxlen' may be longer than the length of a character, because it may + not be a character but a (multi character) collating element. + We enumerate all of the positions which `s' can reach by consuming + `maxlen' bytes. */ + rs = transit_state_consume_1char(d, s, pp, match_lens, &mbclen, &follows); + + wc = inputwcs[*pp - mbclen - buf_begin]; + s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc)); + realloc_trans_if_necessary(d, s1); + + while (*pp - p1 < maxlen) + { + follows.nelem = 0; + rs = transit_state_consume_1char(d, s1, pp, NULL, &mbclen, &follows); + + for (i = 0; i < nelem ; i++) + { + if (match_lens[i] == *pp - p1) + for (j = 0; + j < d->follows[d->states[s1].mbps.elems[i].index].nelem; j++) + insert(d->follows[d->states[s1].mbps.elems[i].index].elems[j], + &follows); + } + + wc = inputwcs[*pp - mbclen - buf_begin]; + s1 = state_index(d, &follows, wc == L'\n', iswalnum(wc)); + realloc_trans_if_necessary(d, s1); + } + free(match_lens); + free(follows.elems); + return s1; +} + +#endif + /* Search through a buffer looking for a match to the given struct dfa. Find the first occurrence of a string matching the regexp in the buffer, - and the shortest possible version thereof. Return a pointer to the first - character after the match, or NULL if none is found. Begin points to - the beginning of the buffer, and end points to the first character after - its end. We store a newline in *end to act as a sentinel, so end had - better point somewhere valid. Newline is a flag indicating whether to - allow newlines to be in the matching string. If count is non- - NULL it points to a place we're supposed to increment every time we - see a newline. Finally, if backref is non-NULL it points to a place + and the shortest possible version thereof. Return the offset of the first + character after the match, or (size_t) -1 if none is found. BEGIN points to + the beginning of the buffer, and SIZE is the size of the buffer. If SIZE + is nonzero, BEGIN[SIZE - 1] must be a newline. BACKREF points to a place where we're supposed to store a 1 if backreferencing happened and the match needs to be verified by a backtracking matcher. Otherwise we store a 0 in *backref. */ -char * -dfaexec (struct dfa *d, char *begin, char *end, - int newline, int *count, int *backref) +size_t +dfaexec (struct dfa *d, char const *begin, size_t size, int *backref) { - register int s, s1, tmp; /* Current state. */ - register unsigned char *p; /* Current input character. */ + register int s; /* Current state. */ + register unsigned char const *p; /* Current input character. */ + register unsigned char const *end; /* One past the last input character. */ register int **trans, *t; /* Copy of d->trans so it can be optimized into a register. */ register unsigned char eol = eolbyte; /* Likewise for eolbyte. */ @@ -1908,58 +2773,145 @@ dfaexec (struct dfa *d, char *begin, char *end, if (! d->tralloc) build_state_zero(d); - s = s1 = 0; - p = (unsigned char *) begin; + s = 0; + p = (unsigned char const *) begin; + end = p + size; trans = d->trans; - *end = eol; + +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + int remain_bytes, i; + buf_begin = begin; + buf_end = end; + + /* initialize mblen_buf, and inputwcs. */ + MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2); + MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2); + memset(&mbs, 0, sizeof(mbstate_t)); + remain_bytes = 0; + for (i = 0; i < end - (unsigned char const *)begin + 1; i++) + { + if (remain_bytes == 0) + { + remain_bytes + = mbrtowc(inputwcs + i, begin + i, + end - (unsigned char const *)begin - i + 1, &mbs); + if (remain_bytes <= 1) + { + remain_bytes = 0; + inputwcs[i] = (wchar_t)begin[i]; + mblen_buf[i] = 0; + } + else + { + mblen_buf[i] = remain_bytes; + remain_bytes--; + } + } + else + { + mblen_buf[i] = remain_bytes; + inputwcs[i] = 0; + remain_bytes--; + } + } + mblen_buf[i] = 0; + inputwcs[i] = 0; /* sentinel */ + } +#endif /* MBS_SUPPORT */ for (;;) { - while ((t = trans[s]) != 0) { /* hand-optimized loop */ - s1 = t[*p++]; - if ((t = trans[s1]) == 0) { - tmp = s ; s = s1 ; s1 = tmp ; /* swap */ - break; - } - s = t[*p++]; - } +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + while ((t = trans[s])) + { + if (d->states[s].mbps.nelem != 0) + { + /* Can match with a multibyte character( and multi character + collating element). */ + unsigned char const *nextp; + + SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); - if (s >= 0 && p <= (unsigned char *) end && d->fails[s]) + nextp = p; + s = transit_state(d, s, &nextp); + p = nextp; + + /* Trans table might be updated. */ + trans = d->trans; + } + else + { + SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); + s = t[*p++]; + } + } + else +#endif /* MBS_SUPPORT */ + while ((t = trans[s])) + s = t[*p++]; + + if (s < 0) + { + if (p == end) + { +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + free(mblen_buf); + free(inputwcs); + } +#endif /* MBS_SUPPORT */ + return (size_t) -1; + } + s = 0; + } + else if ((t = d->fails[s])) { if (d->success[s] & sbit[*p]) { if (backref) *backref = (d->states[s].backref != 0); - return (char *) p; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + free(mblen_buf); + free(inputwcs); + } +#endif /* MBS_SUPPORT */ + return (char const *) p - begin; } - s1 = s; - s = d->fails[s][*p++]; - continue; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + SKIP_REMAINS_MB_IF_INITIAL_STATE(s, p); + if (d->states[s].mbps.nelem != 0) + { + /* Can match with a multibyte character( and multi + character collating element). */ + unsigned char const *nextp; + nextp = p; + s = transit_state(d, s, &nextp); + p = nextp; + + /* Trans table might be updated. */ + trans = d->trans; + } + else + s = t[*p++]; + } + else +#endif /* MBS_SUPPORT */ + s = t[*p++]; } - - /* If the previous character was a newline, count it. */ - if (count && (char *) p <= end && p[-1] == eol) - ++*count; - - /* Check if we've run off the end of the buffer. */ - if ((char *) p > end) - return NULL; - - if (s >= 0) + else { build_state(s, d); trans = d->trans; - continue; - } - - if (p[-1] == eol && newline) - { - s = d->newlines[s1]; - continue; } - - s = 0; } } @@ -1975,6 +2927,16 @@ dfainit (struct dfa *d) d->talloc = 1; MALLOC(d->tokens, token, d->talloc); d->tindex = d->depth = d->nleaves = d->nregexps = 0; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + d->nmultibyte_prop = 1; + MALLOC(d->multibyte_prop, int, d->nmultibyte_prop); + d->nmbcsets = 0; + d->mbcsets_alloc = 1; + MALLOC(d->mbcsets, struct mb_char_classes, d->mbcsets_alloc); + } +#endif d->searchflag = 0; d->tralloc = 0; @@ -1984,7 +2946,7 @@ dfainit (struct dfa *d) /* Parse and analyze a single string of the given length. */ void -dfacomp (char *s, size_t len, struct dfa *d, int searchflag) +dfacomp (char const *s, size_t len, struct dfa *d, int searchflag) { if (case_fold) /* dummy folding in service of dfamust() */ { @@ -2030,6 +2992,38 @@ dfafree (struct dfa *d) free((ptr_t) d->charclasses); free((ptr_t) d->tokens); + +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + { + free((ptr_t) d->multibyte_prop); + for (i = 0; i < d->nmbcsets; ++i) + { + int j; + struct mb_char_classes *p = &(d->mbcsets[i]); + if (p->chars != NULL) + free(p->chars); + if (p->ch_classes != NULL) + free(p->ch_classes); + if (p->range_sts != NULL) + free(p->range_sts); + if (p->range_ends != NULL) + free(p->range_ends); + + for (j = 0; j < p->nequivs; ++j) + free(p->equivs[j]); + if (p->equivs != NULL) + free(p->equivs); + + for (j = 0; j < p->ncoll_elems; ++j) + free(p->coll_elems[j]); + if (p->coll_elems != NULL) + free(p->coll_elems); + } + free((ptr_t) d->mbcsets); + } +#endif /* MBS_SUPPORT */ + for (i = 0; i < d->sindex; ++i) free((ptr_t) d->states[i].elems.elems); free((ptr_t) d->states); @@ -2044,7 +3038,6 @@ dfafree (struct dfa *d) free((ptr_t) d->fails[i]); if (d->realtrans) free((ptr_t) d->realtrans); if (d->fails) free((ptr_t) d->fails); - if (d->newlines) free((ptr_t) d->newlines); if (d->success) free((ptr_t) d->success); for (dm = d->musts; dm; dm = ndm) { @@ -2085,6 +3078,10 @@ dfafree (struct dfa *d) ---- ---- ----- -- -- char c # c # c # c # c + ANYCHAR ZERO ZERO ZERO ZERO + + MBCSET ZERO ZERO ZERO ZERO + CSET ZERO ZERO ZERO ZERO STAR ZERO ZERO ZERO ZERO @@ -2257,14 +3254,14 @@ comsubs (char *left, char *right) for (lcp = left; *lcp != '\0'; ++lcp) { len = 0; - rcp = index(right, *lcp); + rcp = strchr (right, *lcp); while (rcp != NULL) { for (i = 1; lcp[i] != '\0' && lcp[i] == rcp[i]; ++i) continue; if (i > len) len = i; - rcp = index(rcp + 1, *lcp); + rcp = strchr (rcp + 1, *lcp); } if (len == 0) continue; @@ -2530,7 +3527,12 @@ dfamust (struct dfa *dfa) /* not on *my* shift */ goto done; } - else if (t >= CSET) + else if (t >= CSET +#ifdef MBS_SUPPORT + || t == ANYCHAR + || t == MBCSET +#endif /* MBS_SUPPORT */ + ) { /* easy enough */ resetmust(mp); @@ -2580,3 +3582,4 @@ dfamust (struct dfa *dfa) } free((char *) mp); } +/* vim:set shiftwidth=2: */ diff --git a/gnu/usr.bin/grep/dfa.h b/gnu/usr.bin/grep/dfa.h index 2a7fed2..4cdbe7a 100644 --- a/gnu/usr.bin/grep/dfa.h +++ b/gnu/usr.bin/grep/dfa.h @@ -24,18 +24,24 @@ In addition to clobbering modularity, we eat up valuable name space. */ -# undef PARAMS -#if __STDC__ +#ifdef __STDC__ # ifndef _PTR_T # define _PTR_T typedef void * ptr_t; # endif -# define PARAMS(x) x #else # ifndef _PTR_T # define _PTR_T typedef char * ptr_t; # endif +#endif + +#ifdef PARAMS +# undef PARAMS +#endif +#if PROTOTYPES +# define PARAMS(x) x +#else # define PARAMS(x) () #endif @@ -138,6 +144,21 @@ typedef enum RPAREN, /* RPAREN never appears in the parse tree. */ + CRANGE, /* CRANGE never appears in the parse tree. + It stands for a character range that can + match a string of one or more characters. + For example, [a-z] can match "ch" in + a Spanish locale. */ + +#ifdef MBS_SUPPORT + ANYCHAR, /* ANYCHAR is a terminal symbol that matches + any multibyte(or singlebyte) characters. + It is used only if MB_CUR_MAX > 1. */ + + MBCSET, /* MBCSET is similar to CSET, but for + multibyte characters. */ +#endif /* MBS_SUPPORT */ + CSET /* CSET and (and any value greater) is a terminal symbol that matches any of a class of characters. */ @@ -225,6 +246,12 @@ typedef struct char backref; /* True if this state matches a \<digit>. */ unsigned char constraint; /* Constraint for this state to accept. */ int first_end; /* Token value of the first END in elems. */ +#ifdef MBS_SUPPORT + position_set mbps; /* Positions which can match multibyte + characters. e.g. period. + These staff are used only if + MB_CUR_MAX > 1. */ +#endif } dfa_state; /* Element of a list of strings, at least one of which is known to @@ -236,6 +263,26 @@ struct dfamust struct dfamust *next; }; +#ifdef MBS_SUPPORT +/* A bracket operator. + e.g. [a-c], [[:alpha:]], etc. */ +struct mb_char_classes +{ + int invert; + wchar_t *chars; /* Normal characters. */ + int nchars; + wctype_t *ch_classes; /* Character classes. */ + int nch_classes; + wchar_t *range_sts; /* Range characters (start of the range). */ + wchar_t *range_ends; /* Range characters (end of the range). */ + int nranges; + char **equivs; /* Equivalent classes. */ + int nequivs; + char **coll_elems; + int ncoll_elems; /* Collating elements. */ +}; +#endif + /* A compiled regular expression. */ struct dfa { @@ -254,6 +301,32 @@ struct dfa int nleaves; /* Number of leaves on the parse tree. */ int nregexps; /* Count of parallel regexps being built with dfaparse(). */ +#ifdef MBS_SUPPORT + /* These stuff are used only if MB_CUR_MAX > 1 or multibyte environments. */ + int nmultibyte_prop; + int *multibyte_prop; + /* The value of multibyte_prop[i] is defined by following rule. + if tokens[i] < NOTCHAR + bit 1 : tokens[i] is a singlebyte character, or the last-byte of + a multibyte character. + bit 0 : tokens[i] is a singlebyte character, or the 1st-byte of + a multibyte character. + if tokens[i] = MBCSET + ("the index of mbcsets correspnd to this operator" << 2) + 3 + + e.g. + tokens + = 'single_byte_a', 'multi_byte_A', single_byte_b' + = 'sb_a', 'mb_A(1st byte)', 'mb_A(2nd byte)', 'mb_A(3rd byte)', 'sb_b' + multibyte_prop + = 3 , 1 , 0 , 2 , 3 + */ + + /* Array of the bracket expressoin in the DFA. */ + struct mb_char_classes *mbcsets; + int nmbcsets; + int mbcsets_alloc; +#endif /* Stuff owned by the state builder. */ dfa_state *states; /* States of the dfa. */ @@ -292,13 +365,6 @@ struct dfa on a state that potentially could do so. */ int *success; /* Table of acceptance conditions used in dfaexec and computed in build_state. */ - int *newlines; /* Transitions on newlines. The entry for a - newline in any transition table is always - -1 so we can count lines without wasting - too many cycles. The transition for a - newline is stored separately and handled - as a special case. Newline is also used - as a sentinel at the end of the buffer. */ struct dfamust *musts; /* List of strings, at least one of which is known to appear in any r.e. matching the dfa. */ @@ -325,26 +391,21 @@ struct dfa /* dfasyntax() takes three arguments; the first sets the syntax bits described earlier in this file, the second sets the case-folding flag, and the third specifies the line terminator. */ -extern void dfasyntax PARAMS ((reg_syntax_t, int, int)); +extern void dfasyntax PARAMS ((reg_syntax_t, int, unsigned char)); /* Compile the given string of the given length into the given struct dfa. Final argument is a flag specifying whether to build a searching or an exact matcher. */ -extern void dfacomp PARAMS ((char *, size_t, struct dfa *, int)); +extern void dfacomp PARAMS ((char const *, size_t, struct dfa *, int)); /* Execute the given struct dfa on the buffer of characters. The - first char * points to the beginning, and the second points to the - first character after the end of the buffer, which must be a writable - place so a sentinel end-of-buffer marker can be stored there. The - second-to-last argument is a flag telling whether to allow newlines to - be part of a string matching the regexp. The next-to-last argument, - if non-NULL, points to a place to increment every time we see a - newline. The final argument, if non-NULL, points to a flag that will + last byte of the buffer must equal the end-of-line byte. + The final argument points to a flag that will be set if further examination by a backtracking matcher is needed in order to verify backreferencing; otherwise the flag will be cleared. - Returns NULL if no match is found, or a pointer to the first + Returns (size_t) -1 if no match is found, or the offset of the first character after the first & shortest matching string in the buffer. */ -extern char *dfaexec PARAMS ((struct dfa *, char *, char *, int, int *, int *)); +extern size_t dfaexec PARAMS ((struct dfa *, char const *, size_t, int *)); /* Free the storage held by the components of a struct dfa. */ extern void dfafree PARAMS ((struct dfa *)); @@ -355,7 +416,7 @@ extern void dfafree PARAMS ((struct dfa *)); extern void dfainit PARAMS ((struct dfa *)); /* Incrementally parse a string of given length into a struct dfa. */ -extern void dfaparse PARAMS ((char *, size_t, struct dfa *)); +extern void dfaparse PARAMS ((char const *, size_t, struct dfa *)); /* Analyze a parsed regexp; second argument tells whether to build a searching or an exact matcher. */ @@ -369,6 +430,5 @@ extern void dfastate PARAMS ((int, struct dfa *, int [])); /* dfaerror() is called by the regexp routines whenever an error occurs. It takes a single argument, a NUL-terminated string describing the error. - The default dfaerror() prints the error message to stderr and exits. - The user can provide a different dfafree() if so desired. */ + The user must supply a dfaerror. */ extern void dfaerror PARAMS ((const char *)); diff --git a/gnu/usr.bin/grep/getpagesize.h b/gnu/usr.bin/grep/getpagesize.h index daf7f81..7b3e376 100644 --- a/gnu/usr.bin/grep/getpagesize.h +++ b/gnu/usr.bin/grep/getpagesize.h @@ -4,6 +4,11 @@ #ifndef HAVE_GETPAGESIZE +#if !defined getpagesize && defined __BEOS__ +# include <OS.h> +# define getpagesize() B_PAGE_SIZE +#endif + #ifdef HAVE_UNISTD_H # include <unistd.h> #endif diff --git a/gnu/usr.bin/grep/grep.1 b/gnu/usr.bin/grep/grep.1 index 6d8b17e..425b393 100644 --- a/gnu/usr.bin/grep/grep.1 +++ b/gnu/usr.bin/grep/grep.1 @@ -13,7 +13,7 @@ .de Id .ds Dt \\$4 .. -.Id $Id: grep.1,v 1.11 2000/02/26 03:18:40 alainm Exp $ +.Id $Id: grep.1,v 1.23 2002/01/22 13:20:04 bero Exp $ .TH GREP 1 \*(Dt "GNU Project" .SH NAME grep, egrep, fgrep, zgrep, zegrep, zfgrep, @@ -72,6 +72,9 @@ is the same as Print .I NUM lines of trailing context after matching lines. +Places a line containing +.B \-\^\- +between contiguous groups of matches. .TP .BR \-a ", " \-\^\-text Process a binary file as if it were text; this is equivalent to the @@ -82,11 +85,17 @@ option. Print .I NUM lines of leading context before matching lines. +Places a line containing +.B \-\^\- +between contiguous groups of matches. .TP -\fB\-C\fP [\fINUM\fP], \fB\-\fP\fINUM\fP, \fB\-\^\-context\fP[\fB=\fP\fINUM\fP] +.BI \-C " NUM" "\fR,\fP \-\^\-context=" NUM Print .I NUM -lines (default 2) of output context. +lines of output context. +Places a line containing +.B \-\^\- +between contiguous groups of matches. .TP .BR \-b ", " \-\^\-byte-offset Print the byte offset within the input file before @@ -127,6 +136,11 @@ might output binary garbage, which can have nasty side effects if the output is a terminal and if the terminal driver interprets some of it as commands. .TP +.BI \-\^\-colour[=\fIWHEN\fR] ", " \-\^\-color[=\fIWHEN\fR] +Surround the matching string with the marker find in +.B GREP_COLOR +environment variable. WHEN may be `never', `always', or `auto' +.TP .BR \-c ", " \-\^\-count Suppress normal output; instead print a count of matching lines for each input file. @@ -134,6 +148,20 @@ With the .BR \-v ", " \-\^\-invert-match option (see below), count non-matching lines. .TP +.BI \-D " ACTION" "\fR,\fP \-\^\-devices=" ACTION +If an input file is a device, FIFO or socket, use +.I ACTION +to process it. By default, +.I ACTION +is +.BR read , +which means that devices are read just as if they were ordinary files. +If +.I ACTION +is +.BR skip , +devices are silently skipped. +.TP .BI \-d " ACTION" "\fR,\fP \-\^\-directories=" ACTION If an input file is a directory, use .I ACTION @@ -173,6 +201,10 @@ Interpret .I PATTERN as a list of fixed strings, separated by newlines, any of which is to be matched. +.BR \-P ", " \-\^\-perl-regexp +Interpret +.I PATTERN +as a Perl regular expression. .TP .BI \-f " FILE" "\fR,\fP \-\^\-file=" FILE Obtain patterns from @@ -218,6 +250,39 @@ the name of each input file from which output would normally have been printed. The scanning will stop on the first match. .TP +.BI \-m " NUM" "\fR,\fP \-\^\-max-count=" NUM +Stop reading a file after +.I NUM +matching lines. If the input is standard input from a regular file, +and +.I NUM +matching lines are output, +.B grep +ensures that the standard input is positioned to just after the last +matching line before exiting, regardless of the presence of trailing +context lines. This enables a calling process to resume a search. +When +.B grep +stops after +.I NUM +matching lines, it outputs any trailing context lines. When the +.B \-c +or +.B \-\^\-count +option is also used, +.B grep +does not output a count greater than +.IR NUM . +When the +.B \-v +or +.B \-\^\-invert-match +option is also used, +.B grep +stops after outputting +.I NUM +non-matching lines. +.TP .B \-\^\-mmap If possible, use the .BR mmap (2) @@ -237,21 +302,43 @@ is operating, or if an I/O error occurs. Prefix each line of output with the line number within its input file. .TP +.BR \-o ", " \-\^\-only-matching +Show only the part of a matching line that matches +.I PATTERN. +.TP +.BI \-\^\-label= LABEL +Displays input actually coming from standard input as input coming from file +.I LABEL. +This is especially useful for tools like zgrep, e.g. +.B "gzip -cd foo.gz |grep --label=foo something" +.TP +.BR \-\^\-line-buffering +Use line buffering, it can be a performance penality. +.TP .BR \-q ", " \-\^\-quiet ", " \-\^\-silent -Quiet; suppress normal output. The scanning will stop -on the first match. +Quiet; do not write anything to standard output. +Exit immediately with zero status if any match is found, +even if an error was detected. Also see the .B \-s or .B \-\^\-no-messages -option below. +option. .TP -.BR \-r ", " \-\^\-recursive +.BR \-R ", " \-r ", " \-\^\-recursive Read all files under each directory, recursively; this is equivalent to the .B "\-d recurse" option. .TP +.BR "\fR \fP \-\^\-include=" PATTERN +Recurse in directories only searching file matching +.I PATTERN. +.TP +.BR "\fR \fP \-\^\-exclude=" PATTERN +Recurse in directories skip file matching +.I PATTERN. +.TP .BR \-s ", " \-\^\-no-messages Suppress error messages about nonexistent or unreadable files. Portability note: unlike \s-1GNU\s0 @@ -378,11 +465,13 @@ a single character. Most characters, including all letters and digits, are regular expressions that match themselves. Any metacharacter with special meaning may be quoted by preceding it with a backslash. .PP -A list of characters enclosed by +A +.I "bracket expression" +is a list of characters enclosed by .B [ and -.B ] -matches any single +.BR ] . +It matches any single character in that list; if the first character of the list is the caret .B ^ @@ -391,10 +480,32 @@ then it matches any character in the list. For example, the regular expression .B [0123456789] -matches any single digit. A range of characters -may be specified by giving the first and last characters, separated -by a hyphen. -Finally, certain named classes of characters are predefined. +matches any single digit. +.PP +Within a bracket expression, a +.I "range expression" +consists of two characters separated by a hyphen. +It matches any single character that sorts between the two characters, +inclusive, using the locale's collating sequence and character set. +For example, in the default C locale, +.B [a\-d] +is equivalent to +.BR [abcd] . +Many locales sort characters in dictionary order, and in these locales +.B [a\-d] +is typically not equivalent to +.BR [abcd] ; +it might be equivalent to +.BR [aBbCcDd] , +for example. +To obtain the traditional interpretation of bracket expressions, +you can use the C locale by setting the +.B LC_ALL +environment variable to the value +.BR C . +.PP +Finally, certain named classes of characters are predefined within +bracket expressions, as follows. Their names are self explanatory, and they are .BR [:alnum:] , .BR [:alpha:] , @@ -411,8 +522,8 @@ and For example, .B [[:alnum:]] means -.BR [0-9A-Za-z] , -except the latter form depends upon the \s-1POSIX\s0 locale and the +.BR [0\-9A\-Za\-z] , +except the latter form depends upon the C locale and the \s-1ASCII\s0 character encoding, whereas the former is independent of locale and character set. (Note that the brackets in these class names are part of the symbolic @@ -559,6 +670,29 @@ instead of reporting a syntax error in the regular expression. \s-1POSIX.2\s0 allows this behavior as an extension, but portable scripts should avoid it. .SH "ENVIRONMENT VARIABLES" +Grep's behavior is affected by the following environment variables. +.PP +A locale +.BI LC_ foo +is specified by examining the three environment variables +.BR LC_ALL , +.BR LC_\fIfoo\fP , +.BR LANG , +in that order. +The first of these variables that is set specifies the locale. +For example, if +.B LC_ALL +is not set, but +.B LC_MESSAGES +is set to +.BR pt_BR , +then Brazilian Portuguese is used for the +.B LC_MESSAGES +locale. +The C locale is used if none of these environment variables are set, +or if the locale catalog is not installed, or if +.B grep +was not compiled with national language support (\s-1NLS\s0). .TP .B GREP_OPTIONS This variable specifies default options to be placed in front of any @@ -576,28 +710,29 @@ Option specifications are separated by whitespace. A backslash escapes the next character, so it can be used to specify an option containing whitespace or a backslash. .TP -\fBLC_ALL\fP, \fBLC_MESSAGES\fP, \fBLANG\fP +.B GREP_COLOR +Specifies the marker for highlighting. +.TP +\fBLC_ALL\fP, \fBLC_COLLATE\fP, \fBLANG\fP These variables specify the -.B LC_MESSAGES -locale, which determines the language that -.B grep -uses for messages. -The locale is determined by the first of these variables that is set. -American English is used if none of these environment variables are set, -or if the message catalog is not installed, or if -.B grep -was not compiled with national language support (\s-1NLS\s0). +.B LC_COLLATE +locale, which determines the collating sequence used to interpret +range expressions like +.BR [a\-z] . .TP \fBLC_ALL\fP, \fBLC_CTYPE\fP, \fBLANG\fP These variables specify the .B LC_CTYPE locale, which determines the type of characters, e.g., which characters are whitespace. -The locale is determined by the first of these variables that is set. -The \s-1POSIX\s0 locale is used if none of these environment variables -are set, or if the locale catalog is not installed, or if +.TP +\fBLC_ALL\fP, \fBLC_MESSAGES\fP, \fBLANG\fP +These variables specify the +.B LC_MESSAGES +locale, which determines the language that .B grep -was not compiled with national language support (\s-1NLS\s0). +uses for messages. +The default C locale uses American English messages. .TP .B POSIXLY_CORRECT If set, @@ -612,13 +747,15 @@ Also, \s-1POSIX.2\s0 requires that unrecognized options be diagnosed as \*(lqillegal\*(rq, but since they are not really against the law the default is to diagnose them as \*(lqinvalid\*(rq. .SH DIAGNOSTICS -Normally, exit status is 0 if matches were found, -and 1 if no matches were found. (The -.B \-v -option inverts the sense of the exit status.) -Exit status is 2 if there were syntax errors -in the pattern, inaccessible input files, or -other system errors. +.PP +Normally, exit status is 0 if selected lines are found and 1 otherwise. +But the exit status is 2 if an error occurred, unless the +.B \-q +or +.B \-\^\-quiet +or +.B \-\^\-silent +option is used and a selected line is found. .SH BUGS Email bug reports to .BR bug-gnu-utils@gnu.org . @@ -626,7 +763,7 @@ Be sure to include the word \*(lqgrep\*(rq somewhere in the \*(lqSubject:\*(rq field. .PP Large repetition counts in the -.BI { m , n } +.BI { n , m } construct may cause grep to use lots of memory. In addition, certain other obscure regular expressions require exponential time diff --git a/gnu/usr.bin/grep/grep.c b/gnu/usr.bin/grep/grep.c index bee3b88..c831537 100644 --- a/gnu/usr.bin/grep/grep.c +++ b/gnu/usr.bin/grep/grep.c @@ -39,13 +39,18 @@ #include "getpagesize.h" #include "grep.h" #include "savedir.h" +#include "xstrtol.h" +#include "xalloc.h" +#include "error.h" +#include "exclude.h" +#include "closeout.h" #undef MAX #define MAX(A,B) ((A) > (B) ? (A) : (B)) struct stats { - struct stats *parent; + struct stats const *parent; struct stat stat; }; @@ -58,43 +63,73 @@ static int show_help; /* If non-zero, print the version on standard output and exit. */ static int show_version; +/* If nonzero, suppress diagnostics for nonexistent or unreadable files. */ +static int suppress_errors; + /* If nonzero, use mmap if possible. */ static int mmap_option; /* If zero, output nulls after filenames. */ static int filename_mask; +/* If nonzero, use grep_color marker. */ +static int color_option; + +/* If nonzero, show only the part of a line matching the expression. */ +static int only_matching; + +/* The color string used. The user can overwrite it using the environment + variable GREP_COLOR. The default is to print red. */ +static const char *grep_color = "01;31"; + +static struct exclude *excluded_patterns; +static struct exclude *included_patterns; /* Short options. */ static char const short_options[] = -"0123456789A:B:C::EFGHIJRUVX:abcd:e:f:hiLlnqrsuvwxyZz"; +"0123456789A:B:C:D:EFGHIJPUVX:abcd:e:f:hiKLlm:noqRrsuvwxyZz"; /* Non-boolean long options that have no corresponding short equivalents. */ enum { - BINARY_FILES_OPTION = CHAR_MAX + 1 + BINARY_FILES_OPTION = CHAR_MAX + 1, + COLOR_OPTION, + INCLUDE_OPTION, + EXCLUDE_OPTION, + EXCLUDE_FROM_OPTION, + LINE_BUFFERED_OPTION, + LABEL_OPTION }; /* Long options equivalences. */ -static struct option long_options[] = +static struct option const long_options[] = { {"after-context", required_argument, NULL, 'A'}, {"basic-regexp", no_argument, NULL, 'G'}, {"before-context", required_argument, NULL, 'B'}, {"binary-files", required_argument, NULL, BINARY_FILES_OPTION}, {"byte-offset", no_argument, NULL, 'b'}, - {"context", optional_argument, NULL, 'C'}, + {"context", required_argument, NULL, 'C'}, + {"color", optional_argument, NULL, COLOR_OPTION}, + {"colour", optional_argument, NULL, COLOR_OPTION}, {"count", no_argument, NULL, 'c'}, + {"devices", required_argument, NULL, 'D'}, {"directories", required_argument, NULL, 'd'}, {"extended-regexp", no_argument, NULL, 'E'}, + {"exclude", required_argument, NULL, EXCLUDE_OPTION}, + {"exclude-from", required_argument, NULL, EXCLUDE_FROM_OPTION}, {"file", required_argument, NULL, 'f'}, {"files-with-matches", no_argument, NULL, 'l'}, {"files-without-match", no_argument, NULL, 'L'}, {"fixed-regexp", no_argument, NULL, 'F'}, {"fixed-strings", no_argument, NULL, 'F'}, {"help", no_argument, &show_help, 1}, + {"include", required_argument, NULL, INCLUDE_OPTION}, {"ignore-case", no_argument, NULL, 'i'}, + {"label", required_argument, NULL, LABEL_OPTION}, + {"line-buffered", no_argument, NULL, LINE_BUFFERED_OPTION}, {"line-number", no_argument, NULL, 'n'}, {"line-regexp", no_argument, NULL, 'x'}, + {"max-count", required_argument, NULL, 'm'}, {"mmap", no_argument, &mmap_option, 1}, {"no-filename", no_argument, NULL, 'h'}, {"no-messages", no_argument, NULL, 's'}, @@ -106,8 +141,11 @@ static struct option long_options[] = {"null", no_argument, NULL, 'Z'}, #endif {"null-data", no_argument, NULL, 'z'}, + {"only-matching", no_argument, NULL, 'o'}, + {"perl-regexp", no_argument, NULL, 'P'}, {"quiet", no_argument, NULL, 'q'}, {"recursive", no_argument, NULL, 'r'}, + {"recursive", no_argument, NULL, 'R'}, {"regexp", required_argument, NULL, 'e'}, {"invert-match", no_argument, NULL, 'v'}, {"silent", no_argument, NULL, 'q'}, @@ -127,7 +165,8 @@ int match_lines; unsigned char eolbyte; /* For error messages. */ -static char *prog; +/* The name the program was run with, stripped of any leading path. */ +char *program_name; static char const *filename; static int errseen; @@ -137,115 +176,70 @@ static enum READ_DIRECTORIES, RECURSE_DIRECTORIES, SKIP_DIRECTORIES - } directories; - -static int ck_atoi PARAMS ((char const *, int *)); -static void usage PARAMS ((int)) __attribute__((noreturn)); -static void error PARAMS ((const char *, int)); -static void setmatcher PARAMS ((char const *)); -static int install_matcher PARAMS ((char const *)); -static int prepend_args PARAMS ((char const *, char *, char **)); -static void prepend_default_options PARAMS ((char const *, int *, char ***)); -static char *page_alloc PARAMS ((size_t, char **)); -static int reset PARAMS ((int, char const *, struct stats *)); -static int fillbuf PARAMS ((size_t, struct stats *)); -static int grepbuf PARAMS ((char *, char *)); -static void prtext PARAMS ((char *, char *, int *)); -static void prpending PARAMS ((char *)); -static void prline PARAMS ((char *, char *, int)); -static void print_offset_sep PARAMS ((off_t, int)); -static void nlscan PARAMS ((char *)); -static int grep PARAMS ((int, char const *, struct stats *)); -static int grepdir PARAMS ((char const *, struct stats *)); -static int grepfile PARAMS ((char const *, struct stats *)); -#if O_BINARY + } directories = READ_DIRECTORIES; + +/* How to handle devices. */ +static enum + { + READ_DEVICES, + SKIP_DEVICES + } devices = READ_DEVICES; + +static int grepdir PARAMS ((char const *, struct stats const *)); +#if defined(HAVE_DOS_FILE_CONTENTS) static inline int undossify_input PARAMS ((register char *, size_t)); #endif /* Functions we'll use to search. */ -static void (*compile) PARAMS ((char *, size_t)); -static char *(*execute) PARAMS ((char *, size_t, char **)); +static void (*compile) PARAMS ((char const *, size_t)); +static size_t (*execute) PARAMS ((char const *, size_t, size_t *, int)); -/* Print a message and possibly an error string. Remember - that something awful happened. */ +/* Like error, but suppress the diagnostic if requested. */ static void -error (const char *mesg, int errnum) +suppressible_error (char const *mesg, int errnum) { - if (errnum) - fprintf (stderr, "%s: %s: %s\n", prog, mesg, strerror (errnum)); - else - fprintf (stderr, "%s: %s\n", prog, mesg); + if (! suppress_errors) + error (0, errnum, "%s", mesg); errseen = 1; } -/* Like error (), but die horribly after printing. */ -void -fatal (const char *mesg, int errnum) -{ - error (mesg, errnum); - exit (2); -} - -/* Interface to handle errors and fix library lossage. */ -char * -xmalloc (size_t size) -{ - char *result; - - result = malloc (size); - if (size && !result) - fatal (_("memory exhausted"), 0); - return result; -} - -/* Interface to handle errors and fix some library lossage. */ -char * -xrealloc (char *ptr, size_t size) -{ - char *result; - - if (ptr) - result = realloc (ptr, size); - else - result = malloc (size); - if (size && !result) - fatal (_("memory exhausted"), 0); - return result; -} - /* Convert STR to a positive integer, storing the result in *OUT. - If STR is not a valid integer, return -1 (otherwise 0). */ -static int -ck_atoi (char const *str, int *out) + STR must be a valid context length argument; report an error if it + isn't. */ +static void +context_length_arg (char const *str, int *out) { - char const *p; - for (p = str; *p; p++) - if (*p < '0' || *p > '9') - return -1; - - *out = atoi (optarg); - return 0; + uintmax_t value; + if (! (xstrtoumax (str, 0, 10, &value, "") == LONGINT_OK + && 0 <= (*out = value) + && *out == value)) + { + error (2, 0, "%s: %s\n", str, _("invalid context length argument")); + } } /* Hairy buffering mechanism for grep. The intent is to keep all reads aligned on a page boundary and multiples of the - page size. */ + page size, unless a read yields a partial page. */ -static char *ubuffer; /* Unaligned base of buffer. */ static char *buffer; /* Base of buffer. */ -static size_t bufsalloc; /* Allocated size of buffer save region. */ -static size_t bufalloc; /* Total buffer size. */ -#define PREFERRED_SAVE_FACTOR 5 /* Preferred value of bufalloc / bufsalloc. */ +static size_t bufalloc; /* Allocated buffer size, counting slop. */ +#define INITIAL_BUFSIZE 32768 /* Initial buffer size, not counting slop. */ static int bufdesc; /* File descriptor. */ static char *bufbeg; /* Beginning of user-visible stuff. */ static char *buflim; /* Limit of user-visible stuff. */ static size_t pagesize; /* alignment of memory pages */ static off_t bufoffset; /* Read offset; defined on regular files. */ +static off_t after_last_match; /* Pointer after last matching line that + would have been output if we were + outputting characters. */ #if defined(HAVE_MMAP) static int bufmapped; /* True if buffer is memory-mapped. */ static off_t initial_bufoffset; /* Initial value of bufoffset. */ +#else +# define bufmapped 0 #endif #include <bzlib.h> @@ -264,80 +258,51 @@ static int Zflag; /* uncompress before searching. */ ? (val) \ : (val) + ((alignment) - (size_t) (val) % (alignment))) -/* Return the address of a page-aligned buffer of size SIZE, - reallocating it from *UP. Set *UP to the newly allocated (but - possibly unaligned) buffer used to build the aligned buffer. To - free the buffer, free (*UP). */ -static char * -page_alloc (size_t size, char **up) -{ - size_t asize = size + pagesize - 1; - if (size <= asize) - { - char *p = *up ? realloc (*up, asize) : malloc (asize); - if (p) - { - *up = p; - return ALIGN_TO (p, pagesize); - } - } - return NULL; -} - /* Reset the buffer for a new file, returning zero if we should skip it. Initialize on the first time through. */ static int reset (int fd, char const *file, struct stats *stats) { - if (pagesize) - bufsalloc = ALIGN_TO (bufalloc / PREFERRED_SAVE_FACTOR, pagesize); - else + if (! pagesize) { - size_t ubufsalloc; pagesize = getpagesize (); - if (pagesize == 0) + if (pagesize == 0 || 2 * pagesize + 1 <= pagesize) abort (); -#ifndef BUFSALLOC - ubufsalloc = MAX (8192, pagesize); -#else - ubufsalloc = BUFSALLOC; -#endif - bufsalloc = ALIGN_TO (ubufsalloc, pagesize); - bufalloc = PREFERRED_SAVE_FACTOR * bufsalloc; - /* The 1 byte of overflow is a kludge for dfaexec(), which - inserts a sentinel newline at the end of the buffer - being searched. There's gotta be a better way... */ - if (bufsalloc < ubufsalloc - || bufalloc / PREFERRED_SAVE_FACTOR != bufsalloc - || bufalloc + 1 < bufalloc - || ! (buffer = page_alloc (bufalloc + 1, &ubuffer))) - fatal (_("memory exhausted"), 0); + bufalloc = ALIGN_TO (INITIAL_BUFSIZE, pagesize) + pagesize + 1; + buffer = xmalloc (bufalloc); } if (BZflag) { bzbufdesc = BZ2_bzdopen(fd, "r"); if (bzbufdesc == NULL) - fatal(_("memory exhausted"), 0); + error(2, 0, _("memory exhausted")); } #if HAVE_LIBZ > 0 if (Zflag) { gzbufdesc = gzdopen(fd, "r"); if (gzbufdesc == NULL) - fatal(_("memory exhausted"), 0); + error(2, 0, _("memory exhausted")); } #endif - buflim = buffer; + bufbeg = buflim = ALIGN_TO (buffer + 1, pagesize); + bufbeg[-1] = eolbyte; bufdesc = fd; if (fstat (fd, &stats->stat) != 0) { - error ("fstat", errno); + error (0, errno, "fstat"); return 0; } if (directories == SKIP_DIRECTORIES && S_ISDIR (stats->stat.st_mode)) return 0; +#ifndef DJGPP + if (devices == SKIP_DEVICES && (S_ISCHR(stats->stat.st_mode) || S_ISBLK(stats->stat.st_mode) || S_ISSOCK(stats->stat.st_mode))) +#else + if (devices == SKIP_DEVICES && (S_ISCHR(stats->stat.st_mode) || S_ISBLK(stats->stat.st_mode))) +#endif + return 0; if ( BZflag || #if HAVE_LIBZ > 0 @@ -352,18 +317,18 @@ reset (int fd, char const *file, struct stats *stats) bufoffset = lseek (fd, 0, SEEK_CUR); if (bufoffset < 0) { - error ("lseek", errno); + error (0, errno, "lseek"); return 0; } } -#ifdef HAVE_MMAP +#if defined(HAVE_MMAP) initial_bufoffset = bufoffset; bufmapped = mmap_option && bufoffset % pagesize == 0; #endif } else { -#ifdef HAVE_MMAP +#if defined(HAVE_MMAP) bufmapped = 0; #endif } @@ -375,73 +340,68 @@ reset (int fd, char const *file, struct stats *stats) to the beginning of the buffer contents, and 'buflim' points just after the end. Return zero if there's an error. */ static int -fillbuf (size_t save, struct stats *stats) +fillbuf (size_t save, struct stats const *stats) { size_t fillsize = 0; int cc = 1; + char *readbuf; size_t readsize; - /* Offset from start of unaligned buffer to start of old stuff + /* Offset from start of buffer to start of old stuff that we want to save. */ - size_t saved_offset = buflim - ubuffer - save; + size_t saved_offset = buflim - save - buffer; - if (bufsalloc < save) + if (pagesize <= buffer + bufalloc - buflim) { - size_t aligned_save = ALIGN_TO (save, pagesize); - size_t maxalloc = (size_t) -1; + readbuf = buflim; + bufbeg = buflim - save; + } + else + { + size_t minsize = save + pagesize; + size_t newsize; size_t newalloc; - + char *newbuf; + + /* Grow newsize until it is at least as great as minsize. */ + for (newsize = bufalloc - pagesize - 1; newsize < minsize; newsize *= 2) + if (newsize * 2 < newsize || newsize * 2 + pagesize + 1 < newsize * 2) + xalloc_die (); + + /* Try not to allocate more memory than the file size indicates, + as that might cause unnecessary memory exhaustion if the file + is large. However, do not use the original file size as a + heuristic if we've already read past the file end, as most + likely the file is growing. */ if (S_ISREG (stats->stat.st_mode)) { - /* Calculate an upper bound on how much memory we should allocate. - We can't use ALIGN_TO here, since off_t might be longer than - size_t. Watch out for arithmetic overflow. */ off_t to_be_read = stats->stat.st_size - bufoffset; - size_t slop = to_be_read % pagesize; - off_t aligned_to_be_read = to_be_read + (slop ? pagesize - slop : 0); - off_t maxalloc_off = aligned_save + aligned_to_be_read; - if (0 <= maxalloc_off && maxalloc_off == (size_t) maxalloc_off) - maxalloc = maxalloc_off; + off_t maxsize_off = save + to_be_read; + if (0 <= to_be_read && to_be_read <= maxsize_off + && maxsize_off == (size_t) maxsize_off + && minsize <= (size_t) maxsize_off + && (size_t) maxsize_off < newsize) + newsize = maxsize_off; } - /* Grow bufsalloc until it is at least as great as `save'; but - if there is an overflow, just grow it to the next page boundary. */ - while (bufsalloc < save) - if (bufsalloc < bufsalloc * 2) - bufsalloc *= 2; - else - { - bufsalloc = aligned_save; - break; - } + /* Add enough room so that the buffer is aligned and has room + for byte sentinels fore and aft. */ + newalloc = newsize + pagesize + 1; - /* Grow the buffer size to be PREFERRED_SAVE_FACTOR times - bufsalloc.... */ - newalloc = PREFERRED_SAVE_FACTOR * bufsalloc; - if (maxalloc < newalloc) + newbuf = bufalloc < newalloc ? xmalloc (bufalloc = newalloc) : buffer; + readbuf = ALIGN_TO (newbuf + 1 + save, pagesize); + bufbeg = readbuf - save; + memmove (bufbeg, buffer + saved_offset, save); + bufbeg[-1] = eolbyte; + if (newbuf != buffer) { - /* ... except don't grow it more than a pagesize past the - file size, as that might cause unnecessary memory - exhaustion if the file is large. */ - newalloc = maxalloc; - bufsalloc = aligned_save; + free (buffer); + buffer = newbuf; } - - /* Check that the above calculations made progress, which might - not occur if there is arithmetic overflow. If there's no - progress, or if the new buffer size is larger than the old - and buffer reallocation fails, report memory exhaustion. */ - if (bufsalloc < save || newalloc < save - || (newalloc == save && newalloc != maxalloc) - || (bufalloc < newalloc - && ! (buffer - = page_alloc ((bufalloc = newalloc) + 1, &ubuffer)))) - fatal (_("memory exhausted"), 0); } - bufbeg = buffer + bufsalloc - save; - memmove (bufbeg, ubuffer + saved_offset, save); - readsize = bufalloc - bufsalloc; + readsize = buffer + bufalloc - readbuf; + readsize -= readsize % pagesize; #if defined(HAVE_MMAP) if (bufmapped) @@ -457,7 +417,7 @@ fillbuf (size_t save, struct stats *stats) } if (mmapsize - && (mmap ((caddr_t) (buffer + bufsalloc), mmapsize, + && (mmap ((caddr_t) readbuf, mmapsize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FIXED, bufdesc, bufoffset) != (caddr_t) -1)) @@ -478,7 +438,7 @@ fillbuf (size_t save, struct stats *stats) if (bufoffset != initial_bufoffset && lseek (bufdesc, bufoffset, SEEK_SET) < 0) { - error ("lseek", errno); + error (0, errno, "lseek"); cc = 0; } } @@ -492,7 +452,7 @@ fillbuf (size_t save, struct stats *stats) if (BZflag && bzbufdesc) { int bzerr; - bytesread = BZ2_bzRead (&bzerr, bzbufdesc, buffer + bufsalloc, readsize); + bytesread = BZ2_bzRead (&bzerr, bzbufdesc, readbuf, readsize); switch (bzerr) { @@ -503,7 +463,7 @@ fillbuf (size_t save, struct stats *stats) case BZ_DATA_ERROR_MAGIC: BZ2_bzReadClose (&bzerr, bzbufdesc); bzbufdesc = NULL; lseek (bufdesc, 0, SEEK_SET); - bytesread = read (bufdesc, buffer + bufsalloc, readsize); + bytesread = read (bufdesc, readbuf, readsize); break; default: bytesread = 0; @@ -513,10 +473,10 @@ fillbuf (size_t save, struct stats *stats) else #if HAVE_LIBZ > 0 if (Zflag) - bytesread = gzread (gzbufdesc, buffer + bufsalloc, readsize); + bytesread = gzread (gzbufdesc, readbuf, readsize); else #endif - bytesread = read (bufdesc, buffer + bufsalloc, readsize); + bytesread = read (bufdesc, readbuf, readsize); while (bytesread < 0 && errno == EINTR); if (bytesread < 0) cc = 0; @@ -525,21 +485,23 @@ fillbuf (size_t save, struct stats *stats) } bufoffset += fillsize; -#if O_BINARY +#if defined(HAVE_DOS_FILE_CONTENTS) if (fillsize) - fillsize = undossify_input (buffer + bufsalloc, fillsize); + fillsize = undossify_input (readbuf, fillsize); #endif - buflim = buffer + bufsalloc + fillsize; + buflim = readbuf + fillsize; return cc; } /* Flags controlling the style of output. */ static enum - { - BINARY_BINARY_FILES, - TEXT_BINARY_FILES, - WITHOUT_MATCH_BINARY_FILES - } binary_files; /* How to handle binary files. */ +{ + BINARY_BINARY_FILES, + TEXT_BINARY_FILES, + WITHOUT_MATCH_BINARY_FILES +} binary_files; /* How to handle binary files. */ + +static int filename_mask; /* If zero, output nulls after filenames. */ static int out_quiet; /* Suppress all normal output. */ static int out_invert; /* Print nonmatching stuff. */ static int out_file; /* Print filenames. */ @@ -550,36 +512,58 @@ static int out_after; /* Lines of trailing context. */ static int count_matches; /* Count matching lines. */ static int list_files; /* List matching files. */ static int no_filenames; /* Suppress file names. */ -static int suppress_errors; /* Suppress diagnostics. */ +static off_t max_count; /* Stop after outputting this many + lines from an input file. */ +static int line_buffered; /* If nonzero, use line buffering, i.e. + fflush everyline out. */ +static char *label = NULL; /* Fake filename for stdin */ + /* Internal variables to keep track of byte count, context, etc. */ -static off_t totalcc; /* Total character count before bufbeg. */ -static char *lastnl; /* Pointer after last newline counted. */ -static char *lastout; /* Pointer after last character output; +static uintmax_t totalcc; /* Total character count before bufbeg. */ +static char const *lastnl; /* Pointer after last newline counted. */ +static char const *lastout; /* Pointer after last character output; NULL if no character has been output or if it's conceptually before bufbeg. */ -static off_t totalnl; /* Total newline count before lastnl. */ -static int pending; /* Pending lines of output. */ -static int done_on_match; /* Stop scanning file on first match */ - -#if O_BINARY +static uintmax_t totalnl; /* Total newline count before lastnl. */ +static off_t outleft; /* Maximum number of lines to be output. */ +static int pending; /* Pending lines of output. + Always kept 0 if out_quiet is true. */ +static int done_on_match; /* Stop scanning file on first match. */ +static int exit_on_match; /* Exit on first match. */ + +#if defined(HAVE_DOS_FILE_CONTENTS) # include "dosbuf.c" #endif +/* Add two numbers that count input bytes or lines, and report an + error if the addition overflows. */ +static uintmax_t +add_count (uintmax_t a, uintmax_t b) +{ + uintmax_t sum = a + b; + if (sum < a) + error (2, 0, _("input is too large to count")); + return sum; +} + static void -nlscan (char *lim) +nlscan (char const *lim) { - char *beg; - for (beg = lastnl; (beg = memchr (beg, eolbyte, lim - beg)); beg++) - totalnl++; + size_t newlines = 0; + char const *beg; + for (beg = lastnl; beg != lim; beg = memchr (beg, eolbyte, lim - beg), beg++) + newlines++; + totalnl = add_count (totalnl, newlines); lastnl = lim; } +/* Print a byte offset, followed by a character separator. */ static void -print_offset_sep (off_t pos, int sep) +print_offset_sep (uintmax_t pos, char sep) { - /* Do not rely on printf to print pos, since off_t may be longer than long, - and long long is not portable. */ + /* Do not rely on printf to print pos, since uintmax_t may be longer + than long, and long long is not portable. */ char buf[sizeof pos * CHAR_BIT]; char *p = buf + sizeof buf - 1; @@ -593,56 +577,134 @@ print_offset_sep (off_t pos, int sep) } static void -prline (char *beg, char *lim, int sep) +prline (char const *beg, char const *lim, int sep) { if (out_file) printf ("%s%c", filename, sep & filename_mask); if (out_line) { nlscan (beg); - print_offset_sep (++totalnl, sep); + totalnl = add_count (totalnl, 1); + print_offset_sep (totalnl, sep); lastnl = lim; } if (out_byte) { - off_t pos = totalcc + (beg - bufbeg); -#if O_BINARY + uintmax_t pos = add_count (totalcc, beg - bufbeg); +#if defined(HAVE_DOS_FILE_CONTENTS) pos = dossified_pos (pos); #endif print_offset_sep (pos, sep); } + if (only_matching) + { + size_t match_size; + size_t match_offset; + while ((match_offset = (*execute) (beg, lim - beg, &match_size, 1)) + != (size_t) -1) + { + char const *b = beg + match_offset; + if (b == lim) + break; + if (match_size == 0) + break; + if(color_option) + printf("\33[%sm", grep_color); + fwrite(b, sizeof (char), match_size, stdout); + if(color_option) + fputs("\33[00m", stdout); + fputs("\n", stdout); + beg = b + match_size; + } + lastout = lim; + if(line_buffered) + fflush(stdout); + return; + } + if (color_option) + { + size_t match_size; + size_t match_offset; + if(match_icase) + { + /* Yuck, this is tricky */ + char *buf = (char*) xmalloc (lim - beg); + char *ibeg = buf; + char *ilim = ibeg + (lim - beg); + int i; + for (i = 0; i < lim - beg; i++) + ibeg[i] = tolower (beg[i]); + while ((match_offset = (*execute) (ibeg, ilim-ibeg, &match_size, 1)) + != (size_t) -1) + { + char const *b = beg + match_offset; + if (b == lim) + break; + fwrite (beg, sizeof (char), match_offset, stdout); + printf ("\33[%sm", grep_color); + fwrite (b, sizeof (char), match_size, stdout); + fputs ("\33[00m", stdout); + beg = b + match_size; + ibeg = ibeg + match_offset + match_size; + } + fwrite (beg, 1, lim - beg, stdout); + free (buf); + lastout = lim; + return; + } + while (lim-beg && (match_offset = (*execute) (beg, lim - beg, &match_size, 1)) + != (size_t) -1) + { + char const *b = beg + match_offset; + /* Avoid matching the empty line at the end of the buffer. */ + if (b == lim) + break; + /* Avoid hanging on grep --color "" foo */ + if (match_size == 0) + break; + fwrite (beg, sizeof (char), match_offset, stdout); + printf ("\33[%sm", grep_color); + fwrite (b, sizeof (char), match_size, stdout); + fputs ("\33[00m", stdout); + beg = b + match_size; + } + } fwrite (beg, 1, lim - beg, stdout); if (ferror (stdout)) - error (_("writing output"), errno); + error (0, errno, _("writing output")); lastout = lim; + if (line_buffered) + fflush (stdout); } -/* Print pending lines of trailing context prior to LIM. */ +/* Print pending lines of trailing context prior to LIM. Trailing context ends + at the next matching line when OUTLEFT is 0. */ static void -prpending (char *lim) +prpending (char const *lim) { - char *nl; - if (!lastout) lastout = bufbeg; while (pending > 0 && lastout < lim) { + char const *nl = memchr (lastout, eolbyte, lim - lastout); + size_t match_size; --pending; - if ((nl = memchr (lastout, eolbyte, lim - lastout)) != 0) - ++nl; + if (outleft + || (((*execute) (lastout, nl - lastout, &match_size, 0) == (size_t) -1) + == !out_invert)) + prline (lastout, nl + 1, '-'); else - nl = lim; - prline (lastout, nl, '-'); + pending = 0; } } /* Print the lines between BEG and LIM. Deal with context crap. - If NLINESP is non-null, store a count of lines between BEG and LIM. */ + If NLINESP is non-null, store a count of lines between BEG and LIM. */ static void -prtext (char *beg, char *lim, int *nlinesp) +prtext (char const *beg, char const *lim, int *nlinesp) { static int used; /* avoid printing "--" before any output */ - char *bp, *p, *nl; + char const *bp, *p; char eol = eolbyte; int i, n; @@ -660,7 +722,7 @@ prtext (char *beg, char *lim, int *nlinesp) if (p > bp) do --p; - while (p > bp && p[-1] != eol); + while (p[-1] != eol); /* We only print the "--" separator if our output is discontiguous from the last output in the file. */ @@ -669,26 +731,28 @@ prtext (char *beg, char *lim, int *nlinesp) while (p < beg) { - nl = memchr (p, eol, beg - p); - prline (p, nl + 1, '-'); - p = nl + 1; + char const *nl = memchr (p, eol, beg - p); + nl++; + prline (p, nl, '-'); + p = nl; } } if (nlinesp) { /* Caller wants a line count. */ - for (n = 0; p < lim; ++n) + for (n = 0; p < lim && n < outleft; n++) { - if ((nl = memchr (p, eol, lim - p)) != 0) - ++nl; - else - nl = lim; + char const *nl = memchr (p, eol, lim - p); + nl++; if (!out_quiet) prline (p, nl, ':'); p = nl; } *nlinesp = n; + + /* relying on it that this function is never called when outleft = 0. */ + after_last_match = bufoffset - (buflim - p); } else if (!out_quiet) @@ -702,31 +766,42 @@ prtext (char *beg, char *lim, int *nlinesp) between matching lines if OUT_INVERT is true). Return a count of lines printed. */ static int -grepbuf (char *beg, char *lim) +grepbuf (char const *beg, char const *lim) { int nlines, n; - register char *p, *b; - char *endp; - char eol = eolbyte; + register char const *p; + size_t match_offset; + size_t match_size; nlines = 0; p = beg; - while ((b = (*execute)(p, lim - p, &endp)) != 0) + while ((match_offset = (*execute) (p, lim - p, &match_size, 0)) != (size_t) -1) { + char const *b = p + match_offset; + char const *endp = b + match_size; /* Avoid matching the empty line at the end of the buffer. */ - if (b == lim && ((b > beg && b[-1] == eol) || b == beg)) + if (b == lim) break; if (!out_invert) { prtext (b, endp, (int *) 0); - nlines += 1; - if (done_on_match) - return nlines; + nlines++; + outleft--; + if (!outleft || done_on_match) + { + if (exit_on_match) + exit (0); + after_last_match = bufoffset - (buflim - endp); + return nlines; + } } else if (p < b) { prtext (p, b, &n); nlines += n; + outleft -= n; + if (!outleft) + return nlines; } p = endp; } @@ -734,6 +809,7 @@ grepbuf (char *beg, char *lim) { prtext (p, lim, &n); nlines += n; + outleft -= n; } return nlines; } @@ -747,7 +823,9 @@ grep (int fd, char const *file, struct stats *stats) int nlines, i; int not_text; size_t residue, save; - char *beg, *lim; + char oldc; + char *beg; + char *lim; char eol = eolbyte; if (!reset (fd, file, stats)) @@ -767,13 +845,15 @@ grep (int fd, char const *file, struct stats *stats) else #endif if (close (fd) != 0) - error (file, errno); + error (0, errno, "%s", file); return grepdir (file, stats) - 2; } totalcc = 0; lastout = 0; totalnl = 0; + outleft = max_count; + after_last_match = 0; pending = 0; nlines = 0; @@ -782,8 +862,8 @@ grep (int fd, char const *file, struct stats *stats) if (! fillbuf (save, stats)) { - if (! (is_EISDIR (errno, file) && suppress_errors)) - error (filename, errno); + if (! is_EISDIR (errno, file)) + suppressible_error (filename, errno); return 0; } @@ -800,20 +880,38 @@ grep (int fd, char const *file, struct stats *stats) lastnl = bufbeg; if (lastout) lastout = bufbeg; - if (buflim - bufbeg == save) + + beg = bufbeg + save; + + /* no more data to scan (eof) except for maybe a residue -> break */ + if (beg == buflim) break; - beg = bufbeg + save - residue; - for (lim = buflim; lim > beg && lim[-1] != eol; --lim) - ; + + /* Determine new residue (the length of an incomplete line at the end of + the buffer, 0 means there is no incomplete last line). */ + oldc = beg[-1]; + beg[-1] = eol; + for (lim = buflim; lim[-1] != eol; lim--) + continue; + beg[-1] = oldc; + if (lim == beg) + lim = beg - residue; + beg -= residue; residue = buflim - lim; + if (beg < lim) { - nlines += grepbuf (beg, lim); + if (outleft) + nlines += grepbuf (beg, lim); if (pending) prpending (lim); - if (nlines && done_on_match && !out_invert) + if((!outleft && !pending) || (nlines && done_on_match && !out_invert)) goto finish_grep; } + + /* The last OUT_BEFORE lines at the end of the buffer will be needed as + leading context if there is a matching line at the begin of the + next data. Make beg point to their begin. */ i = 0; beg = lim; while (i < out_before && beg > bufbeg && beg != lastout) @@ -821,27 +919,33 @@ grep (int fd, char const *file, struct stats *stats) ++i; do --beg; - while (beg > bufbeg && beg[-1] != eol); + while (beg[-1] != eol); } + + /* detect if leading context is discontinuous from last printed line. */ if (beg != lastout) lastout = 0; + + /* Handle some details and read more data to scan. */ save = residue + lim - beg; - totalcc += buflim - bufbeg - save; + if (out_byte) + totalcc = add_count (totalcc, buflim - bufbeg - save); if (out_line) nlscan (beg); if (! fillbuf (save, stats)) { - if (! (is_EISDIR (errno, file) && suppress_errors)) - error (filename, errno); + if (! is_EISDIR (errno, file)) + suppressible_error (filename, errno); goto finish_grep; } } if (residue) { *buflim++ = eol; - nlines += grepbuf (bufbeg + save - residue, buflim); + if (outleft) + nlines += grepbuf (bufbeg + save - residue, buflim); if (pending) - prpending (buflim); + prpending (buflim); } finish_grep: @@ -862,7 +966,7 @@ grepfile (char const *file, struct stats *stats) if (! file) { desc = 0; - filename = _("(standard input)"); + filename = label ? label : _("(standard input)"); } else { @@ -872,46 +976,44 @@ grepfile (char const *file, struct stats *stats) if (desc < 0) { int e = errno; - + if (is_EISDIR (e, file) && directories == RECURSE_DIRECTORIES) { if (stat (file, &stats->stat) != 0) { - error (file, errno); + error (0, errno, "%s", file); return 1; } return grepdir (file, stats); } - + if (!suppress_errors) { if (directories == SKIP_DIRECTORIES) switch (e) { -#ifdef EISDIR +#if defined(EISDIR) case EISDIR: return 1; #endif case EACCES: /* When skipping directories, don't worry about directories that can't be opened. */ - if (stat (file, &stats->stat) == 0 - && S_ISDIR (stats->stat.st_mode)) + if (isdir (file)) return 1; break; } - - error (file, e); } + suppressible_error (file, e); return 1; } filename = file; } -#if O_BINARY +#if defined(SET_BINARY) /* Set input to binary mode. Pipes are simulated with files on DOS, so this includes the case of "foo | grep bar". */ if (!isatty (desc)) @@ -942,11 +1044,19 @@ grepfile (char const *file, struct stats *stats) gzclose(gzbufdesc); else #endif - if (file) + if (! file) + { + off_t required_offset = outleft ? bufoffset : after_last_match; + if ((bufmapped || required_offset != bufoffset) + && lseek (desc, required_offset, SEEK_SET) < 0 + && S_ISREG (stats->stat.st_mode)) + error (0, errno, "%s", filename); + } + else while (close (desc) != 0) if (errno != EINTR) { - error (file, errno); + error (0, errno, "%s", file); break; } } @@ -955,33 +1065,34 @@ grepfile (char const *file, struct stats *stats) } static int -grepdir (char const *dir, struct stats *stats) +grepdir (char const *dir, struct stats const *stats) { int status = 1; - struct stats *ancestor; + struct stats const *ancestor; char *name_space; - for (ancestor = stats; (ancestor = ancestor->parent) != 0; ) - if (ancestor->stat.st_ino == stats->stat.st_ino - && ancestor->stat.st_dev == stats->stat.st_dev) - { - if (!suppress_errors) - fprintf (stderr, _("%s: warning: %s: %s\n"), prog, dir, + /* Mingw32 does not support st_ino. No known working hosts use zero + for st_ino, so assume that the Mingw32 bug applies if it's zero. */ + if (stats->stat.st_ino) + for (ancestor = stats; (ancestor = ancestor->parent) != 0; ) + if (ancestor->stat.st_ino == stats->stat.st_ino + && ancestor->stat.st_dev == stats->stat.st_dev) + { + if (!suppress_errors) + error (0, 0, _("warning: %s: %s\n"), dir, _("recursive directory loop")); - return 1; - } + return 1; + } - name_space = savedir (dir, (unsigned) stats->stat.st_size); + name_space = savedir (dir, stats->stat.st_size, included_patterns, + excluded_patterns); if (! name_space) { if (errno) - { - if (!suppress_errors) - error (dir, errno); - } + suppressible_error (dir, errno); else - fatal (_("Memory exhausted"), 0); + xalloc_die (); } else { @@ -989,7 +1100,7 @@ grepdir (char const *dir, struct stats *stats) int needs_slash = ! (dirlen == FILESYSTEM_PREFIX_LEN (dir) || IS_SLASH (dir[dirlen - 1])); char *file = NULL; - char *namep = name_space; + char const *namep = name_space; struct stats child; child.parent = stats; out_file += !no_filenames; @@ -1017,21 +1128,24 @@ usage (int status) { if (status != 0) { - fprintf (stderr, _("Usage: %s [OPTION]... PATTERN [FILE]...\n"), prog); - fprintf (stderr, _("Try `%s --help' for more information.\n"), prog); + fprintf (stderr, _("Usage: %s [OPTION]... PATTERN [FILE]...\n"), + program_name); + fprintf (stderr, _("Try `%s --help' for more information.\n"), + program_name); } else { - printf (_("Usage: %s [OPTION]... PATTERN [FILE] ...\n"), prog); + printf (_("Usage: %s [OPTION]... PATTERN [FILE] ...\n"), program_name); printf (_("\ Search for PATTERN in each FILE or standard input.\n\ Example: %s -i 'hello world' menu.h main.c\n\ \n\ -Regexp selection and interpretation:\n"), prog); +Regexp selection and interpretation:\n"), program_name); printf (_("\ -E, --extended-regexp PATTERN is an extended regular expression\n\ -F, --fixed-strings PATTERN is a set of newline-separated strings\n\ - -G, --basic-regexp PATTERN is a basic regular expression\n")); + -G, --basic-regexp PATTERN is a basic regular expression\n\ + -P, --perl-regexp PATTERN is a Perl regular expression\n")); printf (_("\ -e, --regexp=PATTERN use PATTERN as a regular expression\n\ -f, --file=FILE obtain PATTERN from FILE\n\ @@ -1052,18 +1166,27 @@ Miscellaneous:\n\ printf (_("\ \n\ Output control:\n\ + -m, --max-count=NUM stop after NUM matches\n\ -b, --byte-offset print the byte offset with output lines\n\ -n, --line-number print line number with output lines\n\ + --line-buffered flush output on every line\n\ -H, --with-filename print the filename for each match\n\ -h, --no-filename suppress the prefixing filename on output\n\ + --label=LABEL print LABEL as filename for standard input\n\ + -o, --only-matching show only the part of a line matching PATTERN\n\ -q, --quiet, --silent suppress all normal output\n\ --binary-files=TYPE assume that binary files are TYPE\n\ - TYPE is 'binary', 'text', or 'without-match'.\n\ + TYPE is 'binary', 'text', or 'without-match'\n\ -a, --text equivalent to --binary-files=text\n\ -I equivalent to --binary-files=without-match\n\ -d, --directories=ACTION how to handle directories\n\ - ACTION is 'read', 'recurse', or 'skip'.\n\ - -r, --recursive equivalent to --directories=recurse.\n\ + ACTION is 'read', 'recurse', or 'skip'\n\ + -D, --devices=ACTION how to handle devices, FIFOs and sockets\n\ + ACTION is 'read' or 'skip'\n\ + -R, -r, --recursive equivalent to --directories=recurse\n\ + --include=PATTERN files that match PATTERN will be examined\n\ + --exclude=PATTERN files that match PATTERN will be skipped.\n\ + --exclude-from=FILE files that match PATTERN in FILE will be skipped.\n\ -L, --files-without-match only print FILE names containing no match\n\ -l, --files-with-matches only print FILE names containing matches\n\ -c, --count only print a count of matching lines per FILE\n\ @@ -1073,9 +1196,11 @@ Output control:\n\ Context control:\n\ -B, --before-context=NUM print NUM lines of leading context\n\ -A, --after-context=NUM print NUM lines of trailing context\n\ - -C, --context[=NUM] print NUM (default 2) lines of output context\n\ - unless overridden by -A or -B\n\ + -C, --context=NUM print NUM lines of output context\n\ -NUM same as --context=NUM\n\ + --color[=WHEN],\n\ + --colour[=WHEN] use markers to distinguish the matching string\n\ + WHEN may be `always', `never' or `auto'.\n\ -U, --binary do not strip CR characters at EOL (MSDOS)\n\ -u, --unix-byte-offsets report offsets as if CRs were not there (MSDOS)\n\ \n\ @@ -1093,7 +1218,7 @@ static void setmatcher (char const *m) { if (matcher && strcmp (matcher, m) != 0) - fatal (_("conflicting matchers specified"), 0); + error (2, 0, _("conflicting matchers specified")); matcher = m; } @@ -1103,16 +1228,16 @@ static int install_matcher (char const *name) { int i; -#ifdef HAVE_SETRLIMIT +#if defined(HAVE_SETRLIMIT) struct rlimit rlim; #endif - for (i = 0; matchers[i].name; ++i) + for (i = 0; matchers[i].compile; i++) if (strcmp (name, matchers[i].name) == 0) { compile = matchers[i].compile; execute = matchers[i].execute; -#if HAVE_SETRLIMIT && defined(RLIMIT_STACK) +#if defined(HAVE_SETRLIMIT) && defined(RLIMIT_STACK) /* I think every platform needs to do this, so that regex.c doesn't oveflow the stack. The default value of `re_max_failures' is too large for some platforms: it needs @@ -1132,9 +1257,10 @@ install_matcher (char const *name) re_max_failures = newlim / (2 * 20 * sizeof (char *)); } if (rlim.rlim_cur < newlim) - rlim.rlim_cur = newlim; - - setrlimit (RLIMIT_STACK, &rlim); + { + rlim.rlim_cur = newlim; + setrlimit (RLIMIT_STACK, &rlim); + } } #endif return 1; @@ -1195,6 +1321,47 @@ prepend_default_options (char const *options, int *pargc, char ***pargv) } } +/* Get the next non-digit option from ARGC and ARGV. + Return -1 if there are no more options. + Process any digit options that were encountered on the way, + and store the resulting integer into *DEFAULT_CONTEXT. */ +static int +get_nondigit_option (int argc, char *const *argv, int *default_context) +{ + int opt; + char buf[sizeof (uintmax_t) * CHAR_BIT + 4]; + char *p = buf; + + /* Set buf[0] to anything but '0', for the leading-zero test below. */ + buf[0] = '\0'; + + while (opt = getopt_long (argc, argv, short_options, long_options, NULL), + '0' <= opt && opt <= '9') + { + /* Suppress trivial leading zeros, to avoid incorrect + diagnostic on strings like 00000000000. */ + p -= buf[0] == '0'; + + *p++ = opt; + if (p == buf + sizeof buf - 4) + { + /* Too many digits. Append "..." to make context_length_arg + complain about "X...", where X contains the digits seen + so far. */ + strcpy (p, "..."); + p += 3; + break; + } + } + if (p != buf) + { + *p = '\0'; + context_length_arg (buf, default_context); + } + + return opt; +} + int main (int argc, char **argv) { @@ -1203,40 +1370,39 @@ main (int argc, char **argv) int with_filenames; int opt, cc, status; int default_context; - unsigned digit_args_val; FILE *fp; extern char *optarg; extern int optind; initialize_main (&argc, &argv); - prog = argv[0]; - if (prog && strrchr (prog, '/')) - prog = strrchr (prog, '/') + 1; + program_name = argv[0]; + if (program_name && strrchr (program_name, '/')) + program_name = strrchr (program_name, '/') + 1; #if HAVE_LIBZ > 0 - if (prog[0] == 'z') { + if (program_name[0] == 'z') { Zflag = 1; - ++prog; + ++program_name; } #endif - if (prog[0] == 'b') { + if (program_name[0] == 'b') { BZflag = 1; - ++prog; + ++program_name; } #if defined(__MSDOS__) || defined(_WIN32) /* DOS and MS-Windows use backslashes as directory separators, and usually have an .exe suffix. They also have case-insensitive filesystems. */ - if (prog) + if (program_name) { - char *p = prog; + char *p = program_name; char *bslash = strrchr (argv[0], '\\'); - if (bslash && bslash >= prog) /* for mixed forward/backslash case */ - prog = bslash + 1; - else if (prog == argv[0] + if (bslash && bslash >= program_name) /* for mixed forward/backslash case */ + program_name = bslash + 1; + else if (program_name == argv[0] && argv[0][0] && argv[0][1] == ':') /* "c:progname" */ - prog = argv[0] + 2; + program_name = argv[0] + 2; /* Collapse the letter-case, so `strcmp' could be used hence. */ for ( ; *p; p++) @@ -1244,7 +1410,7 @@ main (int argc, char **argv) *p += 'a' - 'A'; /* Remove the .exe extension, if any. */ - if ((p = strrchr (prog, '.')) && strcmp (p, ".exe") == 0) + if ((p = strrchr (program_name, '.')) && strcmp (p, ".exe") == 0) *p = '\0'; } #endif @@ -1255,79 +1421,74 @@ main (int argc, char **argv) eolbyte = '\n'; filename_mask = ~0; + max_count = TYPE_MAXIMUM (off_t); + /* The value -1 means to use DEFAULT_CONTEXT. */ out_after = out_before = -1; /* Default before/after context: chaged by -C/-NUM options */ default_context = 0; - /* Accumulated value of individual digits in a -NUM option */ - digit_args_val = 0; + /* Changed by -o option */ + only_matching = 0; - -/* Internationalization. */ -#if HAVE_SETLOCALE + /* Internationalization. */ +#if defined(HAVE_SETLOCALE) setlocale (LC_ALL, ""); #endif -#if ENABLE_NLS +#if defined(ENABLE_NLS) bindtextdomain (PACKAGE, LOCALEDIR); textdomain (PACKAGE); #endif + atexit (close_stdout); + prepend_default_options (getenv ("GREP_OPTIONS"), &argc, &argv); - while ((opt = getopt_long (argc, argv, short_options, long_options, NULL)) - != -1) + while ((opt = get_nondigit_option (argc, argv, &default_context)) != -1) switch (opt) { - case '0': - case '1': - case '2': - case '3': - case '4': - case '5': - case '6': - case '7': - case '8': - case '9': - digit_args_val = 10 * digit_args_val + opt - '0'; - default_context = digit_args_val; - break; case 'A': - if (optarg) - { - if (ck_atoi (optarg, &out_after)) - fatal (_("invalid context length argument"), 0); - } + context_length_arg (optarg, &out_after); break; + case 'B': - if (optarg) - { - if (ck_atoi (optarg, &out_before)) - fatal (_("invalid context length argument"), 0); - } + context_length_arg (optarg, &out_before); break; + case 'C': /* Set output match context, but let any explicit leading or trailing amount specified with -A or -B stand. */ - if (optarg) - { - if (ck_atoi (optarg, &default_context)) - fatal (_("invalid context length argument"), 0); - } + context_length_arg (optarg, &default_context); + break; + + case 'D': + if (strcmp (optarg, "read") == 0) + devices = READ_DEVICES; + else if (strcmp (optarg, "skip") == 0) + devices = SKIP_DEVICES; else - default_context = 2; + error (2, 0, _("unknown devices method")); break; + case 'E': setmatcher ("egrep"); break; + case 'F': setmatcher ("fgrep"); break; + + case 'P': + setmatcher ("perl"); + break; + case 'G': setmatcher ("grep"); break; + case 'H': with_filenames = 1; break; + case 'I': binary_files = WITHOUT_MATCH_BINARY_FILES; break; @@ -1339,32 +1500,39 @@ main (int argc, char **argv) } BZflag = 1; break; + case 'U': -#if O_BINARY +#if defined(HAVE_DOS_FILE_CONTENTS) dos_use_file_type = DOS_BINARY; #endif break; + case 'u': -#if O_BINARY +#if defined(HAVE_DOS_FILE_CONTENTS) dos_report_unix_offset = 1; #endif break; + case 'V': show_version = 1; break; + case 'X': setmatcher (optarg); break; + case 'a': binary_files = TEXT_BINARY_FILES; break; + case 'b': out_byte = 1; break; + case 'c': - out_quiet = 1; count_matches = 1; break; + case 'd': if (strcmp (optarg, "read") == 0) directories = READ_DIRECTORIES; @@ -1373,8 +1541,9 @@ main (int argc, char **argv) else if (strcmp (optarg, "recurse") == 0) directories = RECURSE_DIRECTORIES; else - fatal (_("unknown directories method"), 0); + error (2, 0, _("unknown directories method")); break; + case 'e': cc = strlen (optarg); keys = xrealloc (keys, keycc + cc + 1); @@ -1382,10 +1551,11 @@ main (int argc, char **argv) keycc += cc; keys[keycc++] = '\n'; break; + case 'f': fp = strcmp (optarg, "-") != 0 ? fopen (optarg, "r") : stdin; if (!fp) - fatal (optarg, errno); + error (2, errno, "%s", optarg); for (keyalloc = 1; keyalloc <= keycc + 1; keyalloc *= 2) ; keys = xrealloc (keys, keyalloc); @@ -1403,48 +1573,80 @@ main (int argc, char **argv) if (oldcc != keycc && keys[keycc - 1] != '\n') keys[keycc++] = '\n'; break; + case 'h': no_filenames = 1; break; + case 'i': case 'y': /* For old-timers . . . */ match_icase = 1; break; + case 'L': /* Like -l, except list files that don't contain matches. Inspired by the same option in Hume's gre. */ - out_quiet = 1; list_files = -1; - done_on_match = 1; break; + case 'l': - out_quiet = 1; list_files = 1; - done_on_match = 1; break; + + case 'm': + { + uintmax_t value; + switch (xstrtoumax (optarg, 0, 10, &value, "")) + { + case LONGINT_OK: + max_count = value; + if (0 <= max_count && max_count == value) + break; + /* Fall through. */ + case LONGINT_OVERFLOW: + max_count = TYPE_MAXIMUM (off_t); + break; + + default: + error (2, 0, _("invalid max count")); + } + } + break; + case 'n': out_line = 1; break; + + case 'o': + only_matching = 1; + break; + case 'q': - done_on_match = 1; - out_quiet = 1; + exit_on_match = 1; + close_stdout_set_status(0); break; + case 'R': case 'r': directories = RECURSE_DIRECTORIES; break; + case 's': suppress_errors = 1; break; + case 'v': out_invert = 1; break; + case 'w': match_words = 1; break; + case 'x': match_lines = 1; break; + case 'Z': #if HAVE_LIBZ > 0 if (BZflag) @@ -1457,9 +1659,11 @@ main (int argc, char **argv) filename_mask = 0; #endif break; + case 'z': eolbyte = '\0'; break; + case BINARY_FILES_OPTION: if (strcmp (optarg, "binary") == 0) binary_files = BINARY_BINARY_FILES; @@ -1468,30 +1672,105 @@ main (int argc, char **argv) else if (strcmp (optarg, "without-match") == 0) binary_files = WITHOUT_MATCH_BINARY_FILES; else - fatal (_("unknown binary-files type"), 0); + error (2, 0, _("unknown binary-files type")); + break; + + case COLOR_OPTION: + if(optarg) { + if(!strcasecmp(optarg, "always") || !strcasecmp(optarg, "yes") || + !strcasecmp(optarg, "force")) + color_option = 1; + else if(!strcasecmp(optarg, "never") || !strcasecmp(optarg, "no") || + !strcasecmp(optarg, "none")) + color_option = 0; + else if(!strcasecmp(optarg, "auto") || !strcasecmp(optarg, "tty") || + !strcasecmp(optarg, "if-tty")) + color_option = 2; + else + show_help = 1; + } else + color_option = 2; + if(color_option == 2) { + if(isatty(STDOUT_FILENO) && getenv("TERM") && + strcmp(getenv("TERM"), "dumb")) + color_option = 1; + else + color_option = 0; + } break; + + case EXCLUDE_OPTION: + if (!excluded_patterns) + excluded_patterns = new_exclude (); + add_exclude (excluded_patterns, optarg); + break; + + case EXCLUDE_FROM_OPTION: + if (!excluded_patterns) + excluded_patterns = new_exclude (); + if (add_exclude_file (add_exclude, excluded_patterns, optarg, '\n') + != 0) + { + error (2, errno, "%s", optarg); + } + break; + + case INCLUDE_OPTION: + if (!included_patterns) + included_patterns = new_exclude (); + add_exclude (included_patterns, optarg); + break; + + case LINE_BUFFERED_OPTION: + line_buffered = 1; + break; + + case LABEL_OPTION: + label = optarg; + break; + case 0: /* long options */ break; + default: usage (2); break; + } + /* POSIX.2 says that -q overrides -l, which in turn overrides the + other output options. */ + if (exit_on_match) + list_files = 0; + if (exit_on_match | list_files) + { + count_matches = 0; + done_on_match = 1; + } + out_quiet = count_matches | done_on_match; + if (out_after < 0) out_after = default_context; if (out_before < 0) out_before = default_context; + if (color_option) + { + char *userval = getenv ("GREP_COLOR"); + if (userval != NULL && *userval != '\0') + grep_color = userval; + } + if (! matcher) - matcher = prog; + matcher = program_name; if (show_version) { printf (_("%s (GNU grep) %s\n"), matcher, VERSION); printf ("\n"); printf (_("\ -Copyright 1988, 1992-1999, 2000 Free Software Foundation, Inc.\n")); +Copyright 1988, 1992-1999, 2000, 2001 Free Software Foundation, Inc.\n")); printf (_("\ This is free software; see the source for copying conditions. There is NO\n\ warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n")); @@ -1505,8 +1784,11 @@ warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n")) if (keys) { if (keycc == 0) - /* No keys were specified (e.g. -f /dev/null). Match nothing. */ - out_invert ^= 1; + { + /* No keys were specified (e.g. -f /dev/null). Match nothing. */ + out_invert ^= 1; + match_lines = match_words = 0; + } else /* Strip trailing newline. */ --keycc; @@ -1528,13 +1810,15 @@ warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n")) if ((argc - optind > 1 && !no_filenames) || with_filenames) out_file = 1; -#if O_BINARY +#ifdef SET_BINARY /* Output is set to binary mode because we shouldn't convert NL to CR-LF pairs, especially when grepping binary files. */ if (!isatty (1)) SET_BINARY (1); #endif + if (max_count == 0) + exit (1); if (optind < argc) { @@ -1542,6 +1826,16 @@ warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n")) do { char *file = argv[optind]; + if ((included_patterns || excluded_patterns) + && !isdir (file)) + { + if (included_patterns && + ! excluded_filename (included_patterns, file, 0)) + continue; + if (excluded_patterns && + excluded_filename (excluded_patterns, file, 0)) + continue; + } status &= grepfile (strcmp (file, "-") == 0 ? (char *) NULL : file, &stats_base); } @@ -1550,8 +1844,7 @@ warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n")) else status = grepfile ((char *) NULL, &stats_base); - if (fclose (stdout) == EOF) - error (_("writing output"), errno); - + /* We register via atexit() to test stdout. */ exit (errseen ? 2 : status); } +/* vim:set shiftwidth=2: */ diff --git a/gnu/usr.bin/grep/grep.h b/gnu/usr.bin/grep/grep.h index 264ad20..f4937e7 100644 --- a/gnu/usr.bin/grep/grep.h +++ b/gnu/usr.bin/grep/grep.h @@ -1,5 +1,5 @@ /* grep.h - interface to grep driver for searching subroutines. - Copyright (C) 1992, 1998 Free Software Foundation, Inc. + Copyright (C) 1992, 1998, 2001 Free Software Foundation, Inc. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -22,20 +22,16 @@ # define __attribute__(x) #endif -extern void fatal PARAMS ((const char *, int)) __attribute__((noreturn)); -extern char *xmalloc PARAMS ((size_t size)); -extern char *xrealloc PARAMS ((char *ptr, size_t size)); - /* Grep.c expects the matchers vector to be terminated - by an entry with a NULL name, and to contain at least + by an entry with a NULL compile, and to contain at least an entry named "default". */ extern struct matcher { - char *name; - void (*compile) PARAMS ((char *, size_t)); - char *(*execute) PARAMS ((char *, size_t, char **)); -} matchers[]; + char name[8]; + void (*compile) PARAMS ((char const *, size_t)); + size_t (*execute) PARAMS ((char const *, size_t, size_t *, int)); +} const matchers[]; /* Exported from fgrepmat.c, egrepmat.c, grepmat.c. */ extern char const *matcher; diff --git a/gnu/usr.bin/grep/kwset.c b/gnu/usr.bin/grep/kwset.c index 51deca4..7902539 100644 --- a/gnu/usr.bin/grep/kwset.c +++ b/gnu/usr.bin/grep/kwset.c @@ -83,22 +83,13 @@ struct kwset struct trie *next[NCHAR]; /* Table of children of the root. */ char *target; /* Target string if there's only one. */ int mind2; /* Used in Boyer-Moore search for one string. */ - char *trans; /* Character translation table. */ + char const *trans; /* Character translation table. */ }; -/* prototypes */ -static void enqueue PARAMS((struct tree *, struct trie **)); -static void treefails PARAMS((register struct tree *, struct trie *, struct trie *)); -static void treedelta PARAMS((register struct tree *,register unsigned int, unsigned char *)); -static int hasevery PARAMS((register struct tree *, register struct tree *)); -static void treenext PARAMS((struct tree *, struct trie **)); -static char * bmexec PARAMS((kwset_t, char *, size_t)); -static char * cwexec PARAMS((kwset_t, char *, size_t, struct kwsmatch *)); - /* Allocate and initialize a keyword set object, returning an opaque pointer to it. Return NULL if memory is not available. */ kwset_t -kwsalloc (char *trans) +kwsalloc (char const *trans) { struct kwset *kwset; @@ -133,7 +124,7 @@ kwsalloc (char *trans) /* Add the given string to the contents of the keyword set. Return NULL for success, an error message otherwise. */ char * -kwsincr (kwset_t kws, char *text, size_t len) +kwsincr (kwset_t kws, char const *text, size_t len) { struct kwset *kwset; register struct trie *trie; @@ -303,7 +294,8 @@ enqueue (struct tree *tree, struct trie **last) from the given tree, given the failure function for their parent as well as a last resort failure node. */ static void -treefails (register struct tree *tree, struct trie *fail, struct trie *recourse) +treefails (register struct tree const *tree, struct trie const *fail, + struct trie *recourse) { register struct tree *link; @@ -337,7 +329,7 @@ treefails (register struct tree *tree, struct trie *fail, struct trie *recourse) /* Set delta entries for the links of the given tree such that the preexisting delta value is larger than the current depth. */ static void -treedelta (register struct tree *tree, +treedelta (register struct tree const *tree, register unsigned int depth, unsigned char delta[]) { @@ -351,7 +343,7 @@ treedelta (register struct tree *tree, /* Return true if A has every label in B. */ static int -hasevery (register struct tree *a, register struct tree *b) +hasevery (register struct tree const *a, register struct tree const *b) { if (!b) return 1; @@ -370,7 +362,7 @@ hasevery (register struct tree *a, register struct tree *b) /* Compute a vector, indexed by character code, of the trie nodes referenced from the given tree. */ static void -treenext (struct tree *tree, struct trie *next[]) +treenext (struct tree const *tree, struct trie *next[]) { if (!tree) return; @@ -387,7 +379,7 @@ kwsprep (kwset_t kws) register struct kwset *kwset; register int i; register struct trie *curr, *fail; - register char *trans; + register char const *trans; unsigned char delta[NCHAR]; struct trie *last, *next[NCHAR]; @@ -499,23 +491,26 @@ kwsprep (kwset_t kws) #define U(C) ((unsigned char) (C)) /* Fast boyer-moore search. */ -static char * -bmexec (kwset_t kws, char *text, size_t size) +static size_t +bmexec (kwset_t kws, char const *text, size_t size) { - struct kwset *kwset; - register unsigned char *d1; - register char *ep, *sp, *tp; + struct kwset const *kwset; + register unsigned char const *d1; + register char const *ep, *sp, *tp; register int d, gc, i, len, md2; - kwset = (struct kwset *) kws; + kwset = (struct kwset const *) kws; len = kwset->mind; if (len == 0) - return text; - if (len > size) return 0; + if (len > size) + return -1; if (len == 1) - return memchr(text, kwset->target[0], size); + { + tp = memchr (text, kwset->target[0], size); + return tp ? tp - text : -1; + } d1 = kwset->delta; sp = kwset->target + len; @@ -554,7 +549,7 @@ bmexec (kwset_t kws, char *text, size_t size) for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i) ; if (i > len) - return tp - len; + return tp - len - text; } tp += md2; } @@ -573,26 +568,29 @@ bmexec (kwset_t kws, char *text, size_t size) for (i = 3; i <= len && U(tp[-i]) == U(sp[-i]); ++i) ; if (i > len) - return tp - len; + return tp - len - text; } d = md2; } - return 0; + return -1; } /* Hairy multiple string search. */ -static char * -cwexec (kwset_t kws, char *text, size_t len, struct kwsmatch *kwsmatch) +static size_t +cwexec (kwset_t kws, char const *text, size_t len, struct kwsmatch *kwsmatch) { - struct kwset *kwset; - struct trie **next, *trie, *accept; - char *beg, *lim, *mch, *lmch; - register unsigned char c, *delta; + struct kwset const *kwset; + struct trie * const *next; + struct trie const *trie; + struct trie const *accept; + char const *beg, *lim, *mch, *lmch; + register unsigned char c; + register unsigned char const *delta; register int d; - register char *end, *qlim; - register struct tree *tree; - register char *trans; + register char const *end, *qlim; + register struct tree const *tree; + register char const *trans; #ifdef lint accept = NULL; @@ -601,7 +599,7 @@ cwexec (kwset_t kws, char *text, size_t len, struct kwsmatch *kwsmatch) /* Initialize register copies and look for easy ways out. */ kwset = (struct kwset *) kws; if (len < kwset->mind) - return 0; + return -1; next = kwset->next; delta = kwset->delta; trans = kwset->trans; @@ -670,7 +668,7 @@ cwexec (kwset_t kws, char *text, size_t len, struct kwsmatch *kwsmatch) if (mch) goto match; } - return 0; + return -1; match: /* Given a known match, find the longest possible match anchored @@ -730,10 +728,10 @@ cwexec (kwset_t kws, char *text, size_t len, struct kwsmatch *kwsmatch) if (kwsmatch) { kwsmatch->index = accept->accepting / 2; - kwsmatch->beg[0] = mch; + kwsmatch->offset[0] = mch - text; kwsmatch->size[0] = accept->depth; } - return mch; + return mch - text; } /* Search through the given text for a match of any member of the @@ -743,20 +741,18 @@ cwexec (kwset_t kws, char *text, size_t len, struct kwsmatch *kwsmatch) matching substring. Similarly, if FOUNDIDX is non-NULL, store in the referenced location the index number of the particular keyword matched. */ -char * -kwsexec (kwset_t kws, char *text, size_t size, struct kwsmatch *kwsmatch) +size_t +kwsexec (kwset_t kws, char const *text, size_t size, + struct kwsmatch *kwsmatch) { - struct kwset *kwset; - char *ret; - - kwset = (struct kwset *) kws; + struct kwset const *kwset = (struct kwset *) kws; if (kwset->words == 1 && kwset->trans == 0) { - ret = bmexec(kws, text, size); - if (kwsmatch != 0 && ret != 0) + size_t ret = bmexec (kws, text, size); + if (kwsmatch != 0 && ret != (size_t) -1) { kwsmatch->index = 0; - kwsmatch->beg[0] = ret; + kwsmatch->offset[0] = ret; kwsmatch->size[0] = kwset->mind; } return ret; diff --git a/gnu/usr.bin/grep/kwset.h b/gnu/usr.bin/grep/kwset.h index f812b2e..1724f68 100644 --- a/gnu/usr.bin/grep/kwset.h +++ b/gnu/usr.bin/grep/kwset.h @@ -25,7 +25,7 @@ struct kwsmatch { int index; /* Index number of matching keyword. */ - char *beg[1]; /* Begin pointer for each submatch. */ + size_t offset[1]; /* Offset of each submatch. */ size_t size[1]; /* Length of each submatch. */ }; @@ -35,12 +35,12 @@ typedef ptr_t kwset_t; if enough memory cannot be obtained. The argument if non-NULL specifies a table of character translations to be applied to all pattern and search text. */ -extern kwset_t kwsalloc PARAMS((char *)); +extern kwset_t kwsalloc PARAMS((char const *)); /* Incrementally extend the keyword set to include the given string. Return NULL for success, or an error message. Remember an index number for each keyword included in the set. */ -extern char *kwsincr PARAMS((kwset_t, char *, size_t)); +extern char *kwsincr PARAMS((kwset_t, char const *, size_t)); /* When the keyword set has been completely built, prepare it for use. Return NULL for success, or an error message. */ @@ -52,7 +52,7 @@ extern char *kwsprep PARAMS((kwset_t)); the matching substring in the integer it points to. Similarly, if foundindex is non-NULL, store the index of the particular keyword found therein. */ -extern char *kwsexec PARAMS((kwset_t, char *, size_t, struct kwsmatch *)); +extern size_t kwsexec PARAMS((kwset_t, char const *, size_t, struct kwsmatch *)); /* Deallocate the given keyword set and all its associated storage. */ extern void kwsfree PARAMS((kwset_t)); diff --git a/gnu/usr.bin/grep/search.c b/gnu/usr.bin/grep/search.c index d73f6d4..ab4938f 100644 --- a/gnu/usr.bin/grep/search.c +++ b/gnu/usr.bin/grep/search.c @@ -24,54 +24,71 @@ # include <config.h> #endif #include <sys/types.h> +#if defined HAVE_WCTYPE_H && defined HAVE_WCHAR_H && defined HAVE_MBRTOWC +/* We can handle multibyte string. */ +# define MBS_SUPPORT +# include <wchar.h> +# include <wctype.h> +#endif + #include "system.h" #include "grep.h" #include "regex.h" #include "dfa.h" #include "kwset.h" +#include "error.h" +#include "xalloc.h" +#ifdef HAVE_LIBPCRE +# include <pcre.h> +#endif #define NCHAR (UCHAR_MAX + 1) -static void Gcompile PARAMS((char *, size_t)); -static void Ecompile PARAMS((char *, size_t)); -static char *EGexecute PARAMS((char *, size_t, char **)); -static void Fcompile PARAMS((char *, size_t)); -static char *Fexecute PARAMS((char *, size_t, char **)); -static void kwsinit PARAMS((void)); - -/* Here is the matchers vector for the main program. */ -struct matcher matchers[] = { - { "default", Gcompile, EGexecute }, - { "grep", Gcompile, EGexecute }, - { "egrep", Ecompile, EGexecute }, - { "awk", Ecompile, EGexecute }, - { "fgrep", Fcompile, Fexecute }, - { 0, 0, 0 }, -}; - /* For -w, we also consider _ to be word constituent. */ #define WCHAR(C) (ISALNUM(C) || (C) == '_') /* DFA compiled regexp. */ static struct dfa dfa; -/* Regex compiled regexp. */ -static struct re_pattern_buffer regexbuf; +/* The Regex compiled patterns. */ +static struct patterns +{ + /* Regex compiled regexp. */ + struct re_pattern_buffer regexbuf; + struct re_registers regs; /* This is here on account of a BRAIN-DEAD + Q@#%!# library interface in regex.c. */ +} patterns0; + +struct patterns *patterns; +size_t pcount; /* KWset compiled pattern. For Ecompile and Gcompile, we compile a list of strings, at least one of which is known to occur in any string matching the regexp. */ static kwset_t kwset; -/* Last compiled fixed string known to exactly match the regexp. - If kwsexec() returns < lastexact, then we don't need to +/* Number of compiled fixed strings known to exactly match the regexp. + If kwsexec returns < kwset_exact_matches, then we don't need to call the regexp matcher at all. */ -static int lastexact; +static int kwset_exact_matches; + +#if defined(MBS_SUPPORT) +static char* check_multibyte_string PARAMS ((char const *buf, size_t size)); +#endif +static void kwsinit PARAMS ((void)); +static void kwsmusts PARAMS ((void)); +static void Gcompile PARAMS ((char const *, size_t)); +static void Ecompile PARAMS ((char const *, size_t)); +static size_t EGexecute PARAMS ((char const *, size_t, size_t *, int )); +static void Fcompile PARAMS ((char const *, size_t)); +static size_t Fexecute PARAMS ((char const *, size_t, size_t *, int)); +static void Pcompile PARAMS ((char const *, size_t )); +static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int)); void dfaerror (char const *mesg) { - fatal(mesg, 0); + error (2, 0, mesg); } static void @@ -82,10 +99,10 @@ kwsinit (void) if (match_icase) for (i = 0; i < NCHAR; ++i) - trans[i] = TOLOWER(i); + trans[i] = TOLOWER (i); - if (!(kwset = kwsalloc(match_icase ? trans : (char *) 0))) - fatal("memory exhausted", 0); + if (!(kwset = kwsalloc (match_icase ? trans : (char *) 0))) + error (2, 0, _("memory exhausted")); } /* If the DFA turns out to have some set of fixed strings one of @@ -95,12 +112,12 @@ kwsinit (void) static void kwsmusts (void) { - struct dfamust *dm; - char *err; + struct dfamust const *dm; + char const *err; if (dfa.musts) { - kwsinit(); + kwsinit (); /* First, we compile in the substrings known to be exact matches. The kwset matcher will return the index of the matching string that it chooses. */ @@ -108,9 +125,9 @@ kwsmusts (void) { if (!dm->exact) continue; - ++lastexact; - if ((err = kwsincr(kwset, dm->must, strlen(dm->must))) != 0) - fatal(err, 0); + ++kwset_exact_matches; + if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0) + error (2, 0, err); } /* Now, we compile the substrings that will require the use of the regexp matcher. */ @@ -118,24 +135,90 @@ kwsmusts (void) { if (dm->exact) continue; - if ((err = kwsincr(kwset, dm->must, strlen(dm->must))) != 0) - fatal(err, 0); + if ((err = kwsincr (kwset, dm->must, strlen (dm->must))) != 0) + error (2, 0, err); + } + if ((err = kwsprep (kwset)) != 0) + error (2, 0, err); + } +} + +#ifdef MBS_SUPPORT +/* This function allocate the array which correspond to "buf". + Then this check multibyte string and mark on the positions which + are not singlebyte character nor the first byte of a multibyte + character. Caller must free the array. */ +static char* +check_multibyte_string(char const *buf, size_t size) +{ + char *mb_properties = malloc(size); + mbstate_t cur_state; + int i; + memset(&cur_state, 0, sizeof(mbstate_t)); + memset(mb_properties, 0, sizeof(char)*size); + for (i = 0; i < size ;) + { + size_t mbclen; + mbclen = mbrlen(buf + i, size - i, &cur_state); + + if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) + { + /* An invalid sequence, or a truncated multibyte character. + We treat it as a singlebyte character. */ + mbclen = 1; } - if ((err = kwsprep(kwset)) != 0) - fatal(err, 0); + mb_properties[i] = mbclen; + i += mbclen; } + + return mb_properties; } +#endif static void -Gcompile (char *pattern, size_t size) +Gcompile (char const *pattern, size_t size) { const char *err; + char const *sep; + size_t total = size; + char const *motif = pattern; + + re_set_syntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE); + dfasyntax (RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte); + + /* For GNU regex compiler we have to pass the patterns separately to detect + errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]" + GNU regex should have raise a syntax error. The same for backref, where + the backref should have been local to each pattern. */ + do + { + size_t len; + sep = memchr (motif, '\n', total); + if (sep) + { + len = sep - motif; + sep++; + total -= (len + 1); + } + else + { + len = total; + total = 0; + } + + patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns)); + if (patterns == NULL) + error (2, errno, _("memory exhausted")); + + patterns[pcount] = patterns0; - re_set_syntax(RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE); - dfasyntax(RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE, match_icase, eolbyte); + if ((err = re_compile_pattern (motif, len, + &(patterns[pcount].regexbuf))) != 0) + error (2, 0, err); + pcount++; - if ((err = re_compile_pattern(pattern, size, ®exbuf)) != 0) - fatal(err, 0); + motif = sep; + } while (sep && total != 0); /* In the match_words and match_lines cases, we use a different pattern for the DFA matcher that will quickly throw out cases that won't work. @@ -144,49 +227,42 @@ Gcompile (char *pattern, size_t size) if (match_words || match_lines) { /* In the whole-word case, we use the pattern: - (^|[^A-Za-z_])(userpattern)([^A-Za-z_]|$). + \(^\|[^[:alnum:]_]\)\(userpattern\)\([^[:alnum:]_]|$\). In the whole-line case, we use the pattern: - ^(userpattern)$. - BUG: Using [A-Za-z_] is locale-dependent! - So will use [:alnum:] */ - - char *n = malloc(size + 50); - int i = 0; - - strcpy(n, ""); - - if (match_lines) - strcpy(n, "^\\("); - if (match_words) - strcpy(n, "\\(^\\|[^[:alnum:]_]\\)\\("); - - i = strlen(n); - memcpy(n + i, pattern, size); + ^\(userpattern\)$. */ + + static char const line_beg[] = "^\\("; + static char const line_end[] = "\\)$"; + static char const word_beg[] = "\\(^\\|[^[:alnum:]_]\\)\\("; + static char const word_end[] = "\\)\\([^[:alnum:]_]\\|$\\)"; + char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end); + size_t i; + strcpy (n, match_lines ? line_beg : word_beg); + i = strlen (n); + memcpy (n + i, pattern, size); i += size; - - if (match_words) - strcpy(n + i, "\\)\\([^[:alnum:]_]\\|$\\)"); - if (match_lines) - strcpy(n + i, "\\)$"); - - i += strlen(n + i); - dfacomp(n, i, &dfa, 1); + strcpy (n + i, match_lines ? line_end : word_end); + i += strlen (n + i); + pattern = n; + size = i; } - else - dfacomp(pattern, size, &dfa, 1); - kwsmusts(); + dfacomp (pattern, size, &dfa, 1); + kwsmusts (); } static void -Ecompile (char *pattern, size_t size) +Ecompile (char const *pattern, size_t size) { const char *err; + const char *sep; + size_t total = size; + char const *motif = pattern; - if (strcmp(matcher, "awk") == 0) + if (strcmp (matcher, "awk") == 0) { - re_set_syntax(RE_SYNTAX_AWK); - dfasyntax(RE_SYNTAX_AWK, match_icase, eolbyte); + re_set_syntax (RE_SYNTAX_AWK); + dfasyntax (RE_SYNTAX_AWK, match_icase, eolbyte); } else { @@ -194,8 +270,38 @@ Ecompile (char *pattern, size_t size) dfasyntax (RE_SYNTAX_POSIX_EGREP, match_icase, eolbyte); } - if ((err = re_compile_pattern(pattern, size, ®exbuf)) != 0) - fatal(err, 0); + /* For GNU regex compiler we have to pass the patterns separately to detect + errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]" + GNU regex should have raise a syntax error. The same for backref, where + the backref should have been local to each pattern. */ + do + { + size_t len; + sep = memchr (motif, '\n', total); + if (sep) + { + len = sep - motif; + sep++; + total -= (len + 1); + } + else + { + len = total; + total = 0; + } + + patterns = realloc (patterns, (pcount + 1) * sizeof (*patterns)); + if (patterns == NULL) + error (2, errno, _("memory exhausted")); + patterns[pcount] = patterns0; + + if ((err = re_compile_pattern (motif, len, + &(patterns[pcount].regexbuf))) != 0) + error (2, 0, err); + pcount++; + + motif = sep; + } while (sep && total != 0); /* In the match_words and match_lines cases, we use a different pattern for the DFA matcher that will quickly throw out cases that won't work. @@ -204,186 +310,236 @@ Ecompile (char *pattern, size_t size) if (match_words || match_lines) { /* In the whole-word case, we use the pattern: - (^|[^A-Za-z_])(userpattern)([^A-Za-z_]|$). + (^|[^[:alnum:]_])(userpattern)([^[:alnum:]_]|$). In the whole-line case, we use the pattern: - ^(userpattern)$. - BUG: Using [A-Za-z_] is locale-dependent! - so will use the char class */ - - char *n = malloc(size + 50); - int i = 0; - - strcpy(n, ""); - - if (match_lines) - strcpy(n, "^("); - if (match_words) - strcpy(n, "(^|[^[:alnum:]_])("); - + ^(userpattern)$. */ + + static char const line_beg[] = "^("; + static char const line_end[] = ")$"; + static char const word_beg[] = "(^|[^[:alnum:]_])("; + static char const word_end[] = ")([^[:alnum:]_]|$)"; + char *n = malloc (sizeof word_beg - 1 + size + sizeof word_end); + size_t i; + strcpy (n, match_lines ? line_beg : word_beg); i = strlen(n); - memcpy(n + i, pattern, size); + memcpy (n + i, pattern, size); i += size; - - if (match_words) - strcpy(n + i, ")([^[:alnum:]_]|$)"); - if (match_lines) - strcpy(n + i, ")$"); - - i += strlen(n + i); - dfacomp(n, i, &dfa, 1); + strcpy (n + i, match_lines ? line_end : word_end); + i += strlen (n + i); + pattern = n; + size = i; } - else - dfacomp(pattern, size, &dfa, 1); - kwsmusts(); + dfacomp (pattern, size, &dfa, 1); + kwsmusts (); } -static char * -EGexecute (char *buf, size_t size, char **endp) +static size_t +EGexecute (char const *buf, size_t size, size_t *match_size, int exact) { - register char *buflim, *beg, *end, save; + register char const *buflim, *beg, *end; char eol = eolbyte; int backref, start, len; struct kwsmatch kwsm; - static struct re_registers regs; /* This is static on account of a BRAIN-DEAD - Q@#%!# library interface in regex.c. */ + size_t i; +#ifdef MBS_SUPPORT + char *mb_properties = NULL; +#endif /* MBS_SUPPORT */ + +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1 && kwset) + mb_properties = check_multibyte_string(buf, size); +#endif /* MBS_SUPPORT */ buflim = buf + size; - for (beg = end = buf; end < buflim; beg = end + 1) + for (beg = end = buf; end < buflim; beg = end) { - if (kwset) + if (!exact) { - /* Find a possible match using the KWset matcher. */ - beg = kwsexec(kwset, beg, buflim - beg, &kwsm); - if (!beg) - goto failure; - /* Narrow down to the line containing the candidate, and - run it through DFA. */ - end = memchr(beg, eol, buflim - beg); - if (!end) - end = buflim; - while (beg > buf && beg[-1] != eol) - --beg; - save = *end; - if (kwsm.index < lastexact) - goto success; - if (!dfaexec(&dfa, beg, end, 0, (int *) 0, &backref)) + if (kwset) { - *end = save; - continue; + /* Find a possible match using the KWset matcher. */ + size_t offset = kwsexec (kwset, beg, buflim - beg, &kwsm); + if (offset == (size_t) -1) + { +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + free(mb_properties); +#endif + return (size_t)-1; + } + beg += offset; + /* Narrow down to the line containing the candidate, and + run it through DFA. */ + end = memchr(beg, eol, buflim - beg); + end++; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1 && mb_properties[beg - buf] == 0) + continue; +#endif + while (beg > buf && beg[-1] != eol) + --beg; + if (kwsm.index < kwset_exact_matches) + goto success; + if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) + continue; + } + else + { + /* No good fixed strings; start with DFA. */ + size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref); + if (offset == (size_t) -1) + break; + /* Narrow down to the line we've found. */ + beg += offset; + end = memchr (beg, eol, buflim - beg); + end++; + while (beg > buf && beg[-1] != eol) + --beg; } - *end = save; - /* Successful, no backreferences encountered. */ - if (!backref) - goto success; - } - else - { - /* No good fixed strings; start with DFA. */ - save = *buflim; - beg = dfaexec(&dfa, beg, buflim, 0, (int *) 0, &backref); - *buflim = save; - if (!beg) - goto failure; - /* Narrow down to the line we've found. */ - end = memchr(beg, eol, buflim - beg); - if (!end) - end = buflim; - while (beg > buf && beg[-1] != eol) - --beg; /* Successful, no backreferences encountered! */ if (!backref) goto success; } + else + end = beg + size; + /* If we've made it to this point, this means DFA has seen a probable match, and we need to run it through Regex. */ - regexbuf.not_eol = 0; - if ((start = re_search(®exbuf, beg, end - beg, 0, end - beg, ®s)) >= 0) + for (i = 0; i < pcount; i++) { - len = regs.end[0] - start; - if ((!match_lines && !match_words) - || (match_lines && len == end - beg)) - goto success; - /* If -w, check if the match aligns with word boundaries. - We do this iteratively because: - (a) the line may contain more than one occurence of the pattern, and - (b) Several alternatives in the pattern might be valid at a given - point, and we may need to consider a shorter one to find a word - boundary. */ - if (match_words) - while (start >= 0) - { - if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1])) - && (len == end - beg - || !WCHAR ((unsigned char) beg[start + len]))) - goto success; - if (len > 0) - { - /* Try a shorter length anchored at the same place. */ - --len; - regexbuf.not_eol = 1; - len = re_match(®exbuf, beg, start + len, start, ®s); - } - if (len <= 0) + patterns[i].regexbuf.not_eol = 0; + if (0 <= (start = re_search (&(patterns[i].regexbuf), beg, + end - beg - 1, 0, + end - beg - 1, &(patterns[i].regs)))) + { + len = patterns[i].regs.end[0] - start; + if (exact) + { + *match_size = len; + return start; + } + if ((!match_lines && !match_words) + || (match_lines && len == end - beg - 1)) + goto success; + /* If -w, check if the match aligns with word boundaries. + We do this iteratively because: + (a) the line may contain more than one occurence of the + pattern, and + (b) Several alternatives in the pattern might be valid at a + given point, and we may need to consider a shorter one to + find a word boundary. */ + if (match_words) + while (start >= 0) { - /* Try looking further on. */ - if (start == end - beg) - break; - ++start; - regexbuf.not_eol = 0; - start = re_search(®exbuf, beg, end - beg, - start, end - beg - start, ®s); - len = regs.end[0] - start; + if ((start == 0 || !WCHAR ((unsigned char) beg[start - 1])) + && (len == end - beg - 1 + || !WCHAR ((unsigned char) beg[start + len]))) + goto success; + if (len > 0) + { + /* Try a shorter length anchored at the same place. */ + --len; + patterns[i].regexbuf.not_eol = 1; + len = re_match (&(patterns[i].regexbuf), beg, + start + len, start, + &(patterns[i].regs)); + } + if (len <= 0) + { + /* Try looking further on. */ + if (start == end - beg - 1) + break; + ++start; + patterns[i].regexbuf.not_eol = 0; + start = re_search (&(patterns[i].regexbuf), beg, + end - beg - 1, + start, end - beg - 1 - start, + &(patterns[i].regs)); + len = patterns[i].regs.end[0] - start; + } } - } - } - } - - failure: - return 0; + } + } /* for Regex patterns. */ + } /* for (beg = end ..) */ +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1 && mb_properties) + free (mb_properties); +#endif /* MBS_SUPPORT */ + return (size_t) -1; success: - *endp = end < buflim ? end + 1 : end; - return beg; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1 && mb_properties) + free (mb_properties); +#endif /* MBS_SUPPORT */ + *match_size = end - beg; + return beg - buf; } static void -Fcompile (char *pattern, size_t size) +Fcompile (char const *pattern, size_t size) { - char *beg, *lim, *err; + char const *beg, *lim, *err; - kwsinit(); + kwsinit (); beg = pattern; do { for (lim = beg; lim < pattern + size && *lim != '\n'; ++lim) ; - if ((err = kwsincr(kwset, beg, lim - beg)) != 0) - fatal(err, 0); + if ((err = kwsincr (kwset, beg, lim - beg)) != 0) + error (2, 0, err); if (lim < pattern + size) ++lim; beg = lim; } while (beg < pattern + size); - if ((err = kwsprep(kwset)) != 0) - fatal(err, 0); + if ((err = kwsprep (kwset)) != 0) + error (2, 0, err); } -static char * -Fexecute (char *buf, size_t size, char **endp) +static size_t +Fexecute (char const *buf, size_t size, size_t *match_size, int exact) { - register char *beg, *try, *end; + register char const *beg, *try, *end; register size_t len; char eol = eolbyte; struct kwsmatch kwsmatch; +#ifdef MBS_SUPPORT + char *mb_properties; + if (MB_CUR_MAX > 1) + mb_properties = check_multibyte_string (buf, size); +#endif /* MBS_SUPPORT */ for (beg = buf; beg <= buf + size; ++beg) { - if (!(beg = kwsexec(kwset, beg, buf + size - beg, &kwsmatch))) - return 0; + size_t offset = kwsexec (kwset, beg, buf + size - beg, &kwsmatch); + if (offset == (size_t) -1) + { +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + free(mb_properties); +#endif /* MBS_SUPPORT */ + return offset; + } +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1 && mb_properties[offset+beg-buf] == 0) + continue; /* It is a part of multibyte character. */ +#endif /* MBS_SUPPORT */ + beg += offset; len = kwsmatch.size[0]; + if (exact) + { + *match_size = len; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + free (mb_properties); +#endif /* MBS_SUPPORT */ + return beg - buf; + } if (match_lines) { if (beg > buf && beg[-1] != eol) @@ -393,13 +549,22 @@ Fexecute (char *buf, size_t size, char **endp) goto success; } else if (match_words) - for (try = beg; len && try;) + for (try = beg; len; ) { if (try > buf && WCHAR((unsigned char) try[-1])) break; if (try + len < buf + size && WCHAR((unsigned char) try[len])) { - try = kwsexec(kwset, beg, --len, &kwsmatch); + offset = kwsexec (kwset, beg, --len, &kwsmatch); + if (offset == (size_t) -1) + { +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + free (mb_properties); +#endif /* MBS_SUPPORT */ + return offset; + } + try = beg + offset; len = kwsmatch.size[0]; } else @@ -409,15 +574,153 @@ Fexecute (char *buf, size_t size, char **endp) goto success; } - return 0; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + free (mb_properties); +#endif /* MBS_SUPPORT */ + return -1; success: - if ((end = memchr(beg + len, eol, (buf + size) - (beg + len))) != 0) - ++end; - else - end = buf + size; - *endp = end; - while (beg > buf && beg[-1] != '\n') + end = memchr (beg + len, eol, (buf + size) - (beg + len)); + end++; + while (buf < beg && beg[-1] != eol) --beg; - return beg; + *match_size = end - beg; +#ifdef MBS_SUPPORT + if (MB_CUR_MAX > 1) + free (mb_properties); +#endif /* MBS_SUPPORT */ + return beg - buf; +} + +#if HAVE_LIBPCRE +/* Compiled internal form of a Perl regular expression. */ +static pcre *cre; + +/* Additional information about the pattern. */ +static pcre_extra *extra; +#endif + +static void +Pcompile (char const *pattern, size_t size) +{ +#if !HAVE_LIBPCRE + error (2, 0, _("The -P option is not supported")); +#else + int e; + char const *ep; + char *re = xmalloc (4 * size + 7); + int flags = PCRE_MULTILINE | (match_icase ? PCRE_CASELESS : 0); + char const *patlim = pattern + size; + char *n = re; + char const *p; + char const *pnul; + + /* FIXME: Remove this restriction. */ + if (eolbyte != '\n') + error (2, 0, _("The -P and -z options cannot be combined")); + + *n = '\0'; + if (match_lines) + strcpy (n, "^("); + if (match_words) + strcpy (n, "\\b("); + n += strlen (n); + + /* The PCRE interface doesn't allow NUL bytes in the pattern, so + replace each NUL byte in the pattern with the four characters + "\000", removing a preceding backslash if there are an odd + number of backslashes before the NUL. + + FIXME: This method does not work with some multibyte character + encodings, notably Shift-JIS, where a multibyte character can end + in a backslash byte. */ + for (p = pattern; (pnul = memchr (p, '\0', patlim - p)); p = pnul + 1) + { + memcpy (n, p, pnul - p); + n += pnul - p; + for (p = pnul; pattern < p && p[-1] == '\\'; p--) + continue; + n -= (pnul - p) & 1; + strcpy (n, "\\000"); + n += 4; + } + + memcpy (n, p, patlim - p); + n += patlim - p; + *n = '\0'; + if (match_words) + strcpy (n, ")\\b"); + if (match_lines) + strcpy (n, ")$"); + + cre = pcre_compile (re, flags, &ep, &e, pcre_maketables ()); + if (!cre) + error (2, 0, ep); + + extra = pcre_study (cre, 0, &ep); + if (ep) + error (2, 0, ep); + + free (re); +#endif +} + +static size_t +Pexecute (char const *buf, size_t size, size_t *match_size, int exact) +{ +#if !HAVE_LIBPCRE + abort (); + return -1; +#else + /* This array must have at least two elements; everything after that + is just for performance improvement in pcre_exec. */ + int sub[300]; + + int e = pcre_exec (cre, extra, buf, size, 0, 0, + sub, sizeof sub / sizeof *sub); + + if (e <= 0) + { + switch (e) + { + case PCRE_ERROR_NOMATCH: + return -1; + + case PCRE_ERROR_NOMEMORY: + error (2, 0, _("Memory exhausted")); + + default: + abort (); + } + } + else + { + /* Narrow down to the line we've found. */ + char const *beg = buf + sub[0]; + char const *end = buf + sub[1]; + char const *buflim = buf + size; + char eol = eolbyte; + if (!exact) + { + end = memchr (end, eol, buflim - end); + end++; + while (buf < beg && beg[-1] != eol) + --beg; + } + + *match_size = end - beg; + return beg - buf; + } +#endif } + +struct matcher const matchers[] = { + { "default", Gcompile, EGexecute }, + { "grep", Gcompile, EGexecute }, + { "egrep", Ecompile, EGexecute }, + { "awk", Ecompile, EGexecute }, + { "fgrep", Fcompile, Fexecute }, + { "perl", Pcompile, Pexecute }, + { "", 0, 0 }, +}; |