From ee5f93f56a2ef4f350b2806d8129fb2b520a0e4d Mon Sep 17 00:00:00 2001 From: tjr Date: Sun, 4 Jul 2004 16:16:59 +0000 Subject: Make grep run much (~10x) faster in multibyte locales by caching the wide character representation of input data across calls to dfaexec(), and by caching the lengths of character across calls to check_multibyte_string(). Obtained from: Fedora (Tim Waugh) --- gnu/usr.bin/grep/dfa.c | 83 +++++++++++++++++++++++++++++----------- gnu/usr.bin/grep/dfa.h | 5 ++- gnu/usr.bin/grep/grep.c | 63 +++++++++++++++++++----------- gnu/usr.bin/grep/grep.h | 5 ++- gnu/usr.bin/grep/mbcache.h | 11 ++++++ gnu/usr.bin/grep/search.c | 95 +++++++++++++++++++++++++++++++++------------- 6 files changed, 188 insertions(+), 74 deletions(-) create mode 100644 gnu/usr.bin/grep/mbcache.h diff --git a/gnu/usr.bin/grep/dfa.c b/gnu/usr.bin/grep/dfa.c index ffbb751..21329bc 100644 --- a/gnu/usr.bin/grep/dfa.c +++ b/gnu/usr.bin/grep/dfa.c @@ -2747,7 +2747,8 @@ transit_state (struct dfa *d, int s, unsigned char const **pp) match needs to be verified by a backtracking matcher. Otherwise we store a 0 in *backref. */ size_t -dfaexec (struct dfa *d, char const *begin, size_t size, int *backref) +dfaexec (struct dfa *d, char const *begin, size_t size, int *backref, + struct mb_cache *mb_cache) { register int s; /* Current state. */ register unsigned char const *p; /* Current input character. */ @@ -2779,43 +2780,79 @@ dfaexec (struct dfa *d, char const *begin, size_t size, int *backref) #ifdef MBS_SUPPORT if (MB_CUR_MAX > 1) { - int remain_bytes, i; buf_begin = begin; buf_end = end; - - /* initialize mblen_buf, and inputwcs. */ - MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2); - MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2); - memset(&mbs, 0, sizeof(mbstate_t)); - remain_bytes = 0; - for (i = 0; i < end - (unsigned char const *)begin + 1; i++) + if (mb_cache && mb_cache->mblen_buf && mb_cache->wcs_buf && + begin > mb_cache->orig_buf && + (begin + size) <= (mb_cache->orig_buf + mb_cache->len)) + { + /* The cache can help us. */ + MALLOC(mblen_buf, unsigned char, size + 2); + MALLOC(inputwcs, wchar_t, size + 2); + memcpy (mblen_buf, + mb_cache->mblen_buf + (begin - mb_cache->orig_buf), + (size + 2) * sizeof (unsigned char)); + memcpy (inputwcs, + mb_cache->wcs_buf + (begin - mb_cache->orig_buf), + (size + 2) * sizeof (wchar_t)); + mblen_buf[size + 1] = 0; + inputwcs[size + 1] = 0; + } + else { - if (remain_bytes == 0) + int remain_bytes, i; + + /* initialize mblen_buf, and inputwcs. */ + MALLOC(mblen_buf, unsigned char, end - (unsigned char const *)begin + 2); + MALLOC(inputwcs, wchar_t, end - (unsigned char const *)begin + 2); + memset(&mbs, 0, sizeof(mbstate_t)); + remain_bytes = 0; + for (i = 0; i < end - (unsigned char const *)begin + 1; i++) { - remain_bytes - = mbrtowc(inputwcs + i, begin + i, - end - (unsigned char const *)begin - i + 1, &mbs); - if (remain_bytes <= 1) + if (remain_bytes == 0) { - remain_bytes = 0; - inputwcs[i] = (wchar_t)begin[i]; - mblen_buf[i] = 0; + remain_bytes + = mbrtowc(inputwcs + i, begin + i, + end - (unsigned char const *)begin - i + 1, &mbs); + if (remain_bytes <= 1) + { + remain_bytes = 0; + inputwcs[i] = (wchar_t)begin[i]; + mblen_buf[i] = 0; + } + else + { + mblen_buf[i] = remain_bytes; + remain_bytes--; + } } else { mblen_buf[i] = remain_bytes; + inputwcs[i] = 0; remain_bytes--; } } - else + mblen_buf[i] = 0; + inputwcs[i] = 0; /* sentinel */ + + if (mb_cache) { - mblen_buf[i] = remain_bytes; - inputwcs[i] = 0; - remain_bytes--; + /* Populate the cache. */ + mb_cache->len = size; + mb_cache->orig_buf = begin; + if (mb_cache->mblen_buf) + free (mb_cache->mblen_buf); + if (mb_cache->wcs_buf) + free (mb_cache->wcs_buf); + MALLOC(mb_cache->mblen_buf, unsigned char, size + 2); + MALLOC(mb_cache->wcs_buf, wchar_t, size + 2); + memcpy (mb_cache->mblen_buf, mblen_buf, + (size + 2) * sizeof (unsigned char)); + memcpy (mb_cache->wcs_buf, inputwcs, + (size + 2) * sizeof (wchar_t)); } } - mblen_buf[i] = 0; - inputwcs[i] = 0; /* sentinel */ } #endif /* MBS_SUPPORT */ diff --git a/gnu/usr.bin/grep/dfa.h b/gnu/usr.bin/grep/dfa.h index 4cdbe7a..014f1b3 100644 --- a/gnu/usr.bin/grep/dfa.h +++ b/gnu/usr.bin/grep/dfa.h @@ -24,6 +24,8 @@ In addition to clobbering modularity, we eat up valuable name space. */ +#include "mbcache.h" + #ifdef __STDC__ # ifndef _PTR_T # define _PTR_T @@ -405,7 +407,8 @@ extern void dfacomp PARAMS ((char const *, size_t, struct dfa *, int)); order to verify backreferencing; otherwise the flag will be cleared. Returns (size_t) -1 if no match is found, or the offset of the first character after the first & shortest matching string in the buffer. */ -extern size_t dfaexec PARAMS ((struct dfa *, char const *, size_t, int *)); +extern size_t dfaexec PARAMS ((struct dfa *, char const *, size_t, int *, + struct mb_cache *)); /* Free the storage held by the components of a struct dfa. */ extern void dfafree PARAMS ((struct dfa *)); diff --git a/gnu/usr.bin/grep/grep.c b/gnu/usr.bin/grep/grep.c index ad1dd49..e1f4e41 100644 --- a/gnu/usr.bin/grep/grep.c +++ b/gnu/usr.bin/grep/grep.c @@ -198,7 +198,8 @@ static inline int undossify_input PARAMS ((register char *, size_t)); /* Functions we'll use to search. */ static void (*compile) PARAMS ((char const *, size_t)); -static size_t (*execute) PARAMS ((char const *, size_t, size_t *, int)); +static size_t (*execute) PARAMS ((char const *, size_t, struct mb_cache *, + size_t *, int)); /* Like error, but suppress the diagnostic if requested. */ static void @@ -583,7 +584,7 @@ print_offset_sep (uintmax_t pos, char sep) } static void -prline (char const *beg, char const *lim, int sep) +prline (char const *beg, char const *lim, int sep, struct mb_cache *mb_cache) { if (out_file) printf ("%s%c", filename, sep & filename_mask); @@ -606,7 +607,8 @@ prline (char const *beg, char const *lim, int sep) { size_t match_size; size_t match_offset; - while ((match_offset = (*execute) (beg, lim - beg, &match_size, 1)) + while ((match_offset = (*execute) (beg, lim - beg, mb_cache, + &match_size, 1)) != (size_t) -1) { char const *b = beg + match_offset; @@ -640,7 +642,8 @@ prline (char const *beg, char const *lim, int sep) int i; for (i = 0; i < lim - beg; i++) ibeg[i] = tolower (beg[i]); - while ((match_offset = (*execute) (ibeg, ilim-ibeg, &match_size, 1)) + while ((match_offset = (*execute) (ibeg, ilim-ibeg, mb_cache, + &match_size, 1)) != (size_t) -1) { char const *b = beg + match_offset; @@ -658,7 +661,8 @@ prline (char const *beg, char const *lim, int sep) lastout = lim; return; } - while (lim-beg && (match_offset = (*execute) (beg, lim - beg, &match_size, 1)) + while (lim-beg && (match_offset = (*execute) (beg, lim - beg, mb_cache, + &match_size, 1)) != (size_t) -1) { char const *b = beg + match_offset; @@ -686,7 +690,7 @@ prline (char const *beg, char const *lim, int sep) /* Print pending lines of trailing context prior to LIM. Trailing context ends at the next matching line when OUTLEFT is 0. */ static void -prpending (char const *lim) +prpending (char const *lim, struct mb_cache *mb_cache) { if (!lastout) lastout = bufbeg; @@ -696,9 +700,10 @@ prpending (char const *lim) size_t match_size; --pending; if (outleft - || (((*execute) (lastout, nl - lastout, &match_size, 0) == (size_t) -1) + || (((*execute) (lastout, nl - lastout, mb_cache, + &match_size, 0) == (size_t) -1) == !out_invert)) - prline (lastout, nl + 1, '-'); + prline (lastout, nl + 1, '-', mb_cache); else pending = 0; } @@ -707,7 +712,8 @@ prpending (char const *lim) /* Print the lines between BEG and LIM. Deal with context crap. If NLINESP is non-null, store a count of lines between BEG and LIM. */ static void -prtext (char const *beg, char const *lim, int *nlinesp) +prtext (char const *beg, char const *lim, int *nlinesp, + struct mb_cache *mb_cache) { static int used; /* avoid printing "--" before any output */ char const *bp, *p; @@ -715,7 +721,7 @@ prtext (char const *beg, char const *lim, int *nlinesp) int i, n; if (!out_quiet && pending > 0) - prpending (beg); + prpending (beg, mb_cache); p = beg; @@ -739,7 +745,7 @@ prtext (char const *beg, char const *lim, int *nlinesp) { char const *nl = memchr (p, eol, beg - p); nl++; - prline (p, nl, '-'); + prline (p, nl, '-', mb_cache); p = nl; } } @@ -752,7 +758,7 @@ prtext (char const *beg, char const *lim, int *nlinesp) char const *nl = memchr (p, eol, lim - p); nl++; if (!out_quiet) - prline (p, nl, ':'); + prline (p, nl, ':', mb_cache); p = nl; } *nlinesp = n; @@ -762,7 +768,7 @@ prtext (char const *beg, char const *lim, int *nlinesp) } else if (!out_quiet) - prline (beg, lim, ':'); + prline (beg, lim, ':', mb_cache); pending = out_quiet ? 0 : out_after; used = 1; @@ -772,7 +778,7 @@ prtext (char const *beg, char const *lim, int *nlinesp) between matching lines if OUT_INVERT is true). Return a count of lines printed. */ static int -grepbuf (char const *beg, char const *lim) +grepbuf (char const *beg, char const *lim, struct mb_cache *mb_cache) { int nlines, n; register char const *p; @@ -781,7 +787,8 @@ grepbuf (char const *beg, char const *lim) nlines = 0; p = beg; - while ((match_offset = (*execute) (p, lim - p, &match_size, 0)) != (size_t) -1) + while ((match_offset = (*execute) (p, lim - p, mb_cache, + &match_size, 0)) != (size_t) -1) { char const *b = p + match_offset; char const *endp = b + match_size; @@ -790,7 +797,7 @@ grepbuf (char const *beg, char const *lim) break; if (!out_invert) { - prtext (b, endp, (int *) 0); + prtext (b, endp, (int *) 0, mb_cache); nlines++; outleft--; if (!outleft || done_on_match) @@ -803,7 +810,7 @@ grepbuf (char const *beg, char const *lim) } else if (p < b) { - prtext (p, b, &n); + prtext (p, b, &n, mb_cache); nlines += n; outleft -= n; if (!outleft) @@ -813,7 +820,7 @@ grepbuf (char const *beg, char const *lim) } if (out_invert && p < lim) { - prtext (p, lim, &n); + prtext (p, lim, &n, mb_cache); nlines += n; outleft -= n; } @@ -833,7 +840,9 @@ grep (int fd, char const *file, struct stats *stats) char *beg; char *lim; char eol = eolbyte; + struct mb_cache mb_cache; + memset (&mb_cache, 0, sizeof (mb_cache)); if (!reset (fd, file, stats)) return 0; @@ -908,9 +917,9 @@ grep (int fd, char const *file, struct stats *stats) if (beg < lim) { if (outleft) - nlines += grepbuf (beg, lim); + nlines += grepbuf (beg, lim, &mb_cache); if (pending) - prpending (lim); + prpending (lim, &mb_cache); if((!outleft && !pending) || (nlines && done_on_match && !out_invert)) goto finish_grep; } @@ -938,6 +947,11 @@ grep (int fd, char const *file, struct stats *stats) totalcc = add_count (totalcc, buflim - bufbeg - save); if (out_line) nlscan (beg); + if (mb_cache.wcs_buf) + free (mb_cache.wcs_buf); + if (mb_cache.mblen_buf) + free (mb_cache.mblen_buf); + memset (&mb_cache, 0, sizeof (mb_cache)); if (! fillbuf (save, stats)) { if (! is_EISDIR (errno, file)) @@ -949,9 +963,9 @@ grep (int fd, char const *file, struct stats *stats) { *buflim++ = eol; if (outleft) - nlines += grepbuf (bufbeg + save - residue, buflim); + nlines += grepbuf (bufbeg + save - residue, buflim, &mb_cache); if (pending) - prpending (buflim); + prpending (buflim, &mb_cache); } finish_grep: @@ -959,6 +973,11 @@ grep (int fd, char const *file, struct stats *stats) out_quiet -= not_text; if ((not_text & ~out_quiet) && nlines != 0) printf (_("Binary file %s matches\n"), filename); + + if (mb_cache.wcs_buf) + free (mb_cache.wcs_buf); + if (mb_cache.mblen_buf) + free (mb_cache.mblen_buf); return nlines; } diff --git a/gnu/usr.bin/grep/grep.h b/gnu/usr.bin/grep/grep.h index f4937e7..8a10330 100644 --- a/gnu/usr.bin/grep/grep.h +++ b/gnu/usr.bin/grep/grep.h @@ -22,6 +22,8 @@ # define __attribute__(x) #endif +#include "mbcache.h" + /* Grep.c expects the matchers vector to be terminated by an entry with a NULL compile, and to contain at least an entry named "default". */ @@ -30,7 +32,8 @@ extern struct matcher { char name[8]; void (*compile) PARAMS ((char const *, size_t)); - size_t (*execute) PARAMS ((char const *, size_t, size_t *, int)); + size_t (*execute) PARAMS ((char const *, size_t, struct mb_cache *, + size_t *, int)); } const matchers[]; /* Exported from fgrepmat.c, egrepmat.c, grepmat.c. */ diff --git a/gnu/usr.bin/grep/mbcache.h b/gnu/usr.bin/grep/mbcache.h new file mode 100644 index 0000000..3b76e25 --- /dev/null +++ b/gnu/usr.bin/grep/mbcache.h @@ -0,0 +1,11 @@ +/* $FreeBSD$ */ +#ifndef MB_CACHE_DEFINED +#define MB_CACHE_DEFINED +struct mb_cache +{ + size_t len; + const char *orig_buf; /* not the only reference; do not free */ + wchar_t *wcs_buf; + unsigned char *mblen_buf; +}; +#endif diff --git a/gnu/usr.bin/grep/search.c b/gnu/usr.bin/grep/search.c index b2514a9..1ea4217 100644 --- a/gnu/usr.bin/grep/search.c +++ b/gnu/usr.bin/grep/search.c @@ -73,17 +73,22 @@ static kwset_t kwset; static int kwset_exact_matches; #if defined(MBS_SUPPORT) -static char* check_multibyte_string PARAMS ((char const *buf, size_t size)); +static char* check_multibyte_string PARAMS ((char const *buf, size_t size, + struct mb_cache *, + char const *orig_buf)); #endif static void kwsinit PARAMS ((void)); static void kwsmusts PARAMS ((void)); static void Gcompile PARAMS ((char const *, size_t)); static void Ecompile PARAMS ((char const *, size_t)); -static size_t EGexecute PARAMS ((char const *, size_t, size_t *, int )); +static size_t EGexecute PARAMS ((char const *, size_t, struct mb_cache *, + size_t *, int )); static void Fcompile PARAMS ((char const *, size_t)); -static size_t Fexecute PARAMS ((char const *, size_t, size_t *, int)); +static size_t Fexecute PARAMS ((char const *, size_t, struct mb_cache *, + size_t *, int)); static void Pcompile PARAMS ((char const *, size_t )); -static size_t Pexecute PARAMS ((char const *, size_t, size_t *, int)); +static size_t Pexecute PARAMS ((char const *, size_t, struct mb_cache *, + size_t *, int)); void dfaerror (char const *mesg) @@ -149,35 +154,66 @@ kwsmusts (void) are not singlebyte character nor the first byte of a multibyte character. Caller must free the array. */ static char* -check_multibyte_string(char const *buf, size_t size) +check_multibyte_string(char const *buf, size_t size, struct mb_cache *mb_cache, + char const *orig_buf) { char *mb_properties = xmalloc(size); mbstate_t cur_state; wchar_t wc; int i; memset(&cur_state, 0, sizeof(mbstate_t)); - memset(mb_properties, 0, sizeof(char)*size); - for (i = 0; i < size ;) - { - size_t mbclen; - mbclen = mbrtowc(&wc, buf + i, size - i, &cur_state); - if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) + if (mb_cache && mb_cache->mblen_buf && + orig_buf > mb_cache->orig_buf && + (orig_buf + size) <= (mb_cache->orig_buf + mb_cache->len)) + { + /* The cache can help us. */ + memcpy (mb_properties, + mb_cache->mblen_buf + (orig_buf - mb_cache->orig_buf), + size); + + } + else + { + memset(mb_properties, 0, sizeof(char)*size); + for (i = 0; i < size ;) { - /* An invalid sequence, or a truncated multibyte character. - We treat it as a singlebyte character. */ - mbclen = 1; + size_t mbclen; + mbclen = mbrtowc(&wc, buf + i, size - i, &cur_state); + + if (mbclen == (size_t) -1 || mbclen == (size_t) -2 || mbclen == 0) + { + /* An invalid sequence, or a truncated multibyte character. + We treat it as a singlebyte character. */ + mbclen = 1; + } + else if (match_icase) + { + if (iswupper((wint_t)wc)) + { + wc = towlower((wint_t)wc); + wcrtomb(buf + i, wc, &cur_state); + } + } + mb_properties[i] = mbclen; + i += mbclen; } - else if (match_icase) + + /* Now populate the cache. */ + if (mb_cache) { - if (iswupper((wint_t)wc)) + if (mb_cache->wcs_buf) { - wc = towlower((wint_t)wc); - wcrtomb(buf + i, wc, &cur_state); + free (mb_cache->wcs_buf); + mb_cache->wcs_buf = NULL; } + if (mb_cache->mblen_buf) + free (mb_cache->mblen_buf); + mb_cache->len = size; + mb_cache->orig_buf = orig_buf; + mb_cache->mblen_buf = xmalloc (size); + memcpy (mb_cache->mblen_buf, mb_properties, size); } - mb_properties[i] = mbclen; - i += mbclen; } return mb_properties; @@ -344,9 +380,11 @@ Ecompile (char const *pattern, size_t size) } static size_t -EGexecute (char const *buf, size_t size, size_t *match_size, int exact) +EGexecute (char const *buf, size_t size, struct mb_cache *mb_cache, + size_t *match_size, int exact) { register char const *buflim, *beg, *end; + char const *orig_buf = buf; char eol = eolbyte; int backref, start, len; struct kwsmatch kwsm; @@ -362,7 +400,7 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact) buf = case_buf; } if (kwset) - mb_properties = check_multibyte_string(buf, size); + mb_properties = check_multibyte_string(buf, size, mb_cache, orig_buf); } #endif /* MBS_SUPPORT */ @@ -391,13 +429,13 @@ EGexecute (char const *buf, size_t size, size_t *match_size, int exact) --beg; if (kwsm.index < kwset_exact_matches) goto success_in_beg_and_end; - if (dfaexec (&dfa, beg, end - beg, &backref) == (size_t) -1) + if (dfaexec (&dfa, beg, end - beg, &backref, mb_cache) == (size_t) -1) continue; } else { /* No good fixed strings; start with DFA. */ - size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref); + size_t offset = dfaexec (&dfa, beg, buflim - beg, &backref, mb_cache); if (offset == (size_t) -1) break; /* Narrow down to the line we've found. */ @@ -525,9 +563,11 @@ Fcompile (char const *pattern, size_t size) } static size_t -Fexecute (char const *buf, size_t size, size_t *match_size, int exact) +Fexecute (char const *buf, size_t size, struct mb_cache *mb_cache, + size_t *match_size, int exact) { register char const *beg, *try, *end; + char const *orig_buf = buf; register size_t len; char eol = eolbyte; struct kwsmatch kwsmatch; @@ -542,7 +582,7 @@ Fexecute (char const *buf, size_t size, size_t *match_size, int exact) memcpy(case_buf, buf, size); buf = case_buf; } - mb_properties = check_multibyte_string(buf, size); + mb_properties = check_multibyte_string(buf, size, mb_cache, orig_buf); } #endif /* MBS_SUPPORT */ @@ -707,7 +747,8 @@ Pcompile (char const *pattern, size_t size) } static size_t -Pexecute (char const *buf, size_t size, size_t *match_size, int exact) +Pexecute (char const *buf, size_t size, struct mb_cache *mb_cache, + size_t *match_size, int exact) { #if !HAVE_LIBPCRE abort (); -- cgit v1.1