diff options
author | gabor <gabor@FreeBSD.org> | 2010-08-18 17:40:10 +0000 |
---|---|---|
committer | gabor <gabor@FreeBSD.org> | 2010-08-18 17:40:10 +0000 |
commit | fb7d7246f68844fc4062c081c8cab78a7a9320db (patch) | |
tree | 31c9b1f57fe3aca149aa8365f6e17a9bb8f206fb /usr.bin/grep | |
parent | 82e4bb536ac304054c6d49d74651f89b7846106d (diff) | |
download | FreeBSD-src-fb7d7246f68844fc4062c081c8cab78a7a9320db.zip FreeBSD-src-fb7d7246f68844fc4062c081c8cab78a7a9320db.tar.gz |
- Refactor file reading code to use pure syscalls and an internal buffer
instead of stdio. This gives BSD grep a very big performance boost,
its speed is now almost comparable to GNU grep.
Submitted by: Dimitry Andric <dimitry@andric.com>
Approved by: delphij (mentor)
Diffstat (limited to 'usr.bin/grep')
-rw-r--r-- | usr.bin/grep/fastgrep.c | 2 | ||||
-rw-r--r-- | usr.bin/grep/file.c | 320 | ||||
-rw-r--r-- | usr.bin/grep/grep.h | 9 | ||||
-rw-r--r-- | usr.bin/grep/util.c | 4 |
4 files changed, 157 insertions, 178 deletions
diff --git a/usr.bin/grep/fastgrep.c b/usr.bin/grep/fastgrep.c index c66f0a7..30a2dc5 100644 --- a/usr.bin/grep/fastgrep.c +++ b/usr.bin/grep/fastgrep.c @@ -198,7 +198,7 @@ fastcomp(fastgrep_t *fg, const char *pat) } int -grep_search(fastgrep_t *fg, unsigned char *data, size_t len, regmatch_t *pmatch) +grep_search(fastgrep_t *fg, const unsigned char *data, size_t len, regmatch_t *pmatch) { unsigned int j; int ret = REG_NOMATCH; diff --git a/usr.bin/grep/file.c b/usr.bin/grep/file.c index 1872d0e..51be53d 100644 --- a/usr.bin/grep/file.c +++ b/usr.bin/grep/file.c @@ -2,7 +2,8 @@ /*- * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav - * Copyright (C) 2008-2009 Gabor Kovesdan <gabor@FreeBSD.org> + * Copyright (C) 2008-2010 Gabor Kovesdan <gabor@FreeBSD.org> + * Copyright (C) 2010 Dimitry Andric <dimitry@andric.com> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -37,7 +38,8 @@ __FBSDID("$FreeBSD$"); #include <bzlib.h> #include <err.h> #include <errno.h> -#include <stdio.h> +#include <fcntl.h> +#include <stddef.h> #include <stdlib.h> #include <string.h> #include <unistd.h> @@ -47,222 +49,204 @@ __FBSDID("$FreeBSD$"); #include "grep.h" -static char fname[MAXPATHLEN]; /* file name */ +#define MAXBUFSIZ (32 * 1024) +#define LNBUFBUMP 80 -#define MAXBUFSIZ (16 * 1024) -#define PREREAD_M 0.2 +static gzFile gzbufdesc; +static BZFILE* bzbufdesc; -/* Some global variables for the buffering and reading. */ -static char *lnbuf; -static size_t lnbuflen; -static unsigned char *binbuf; -static int binbufsiz; -unsigned char *binbufptr; -static int bzerr; +static unsigned char buffer[MAXBUFSIZ]; +static unsigned char *bufpos; +static size_t bufrem; -#define iswbinary(ch) (!iswspace((ch)) && iswcntrl((ch)) && \ - (ch != L'\b') && (ch != L'\0')) +static unsigned char *lnbuf; +static size_t lnbuflen; -/* - * Returns a single character according to the file type. - * Returns -1 on failure. - */ static inline int -grep_fgetc(struct file *f) +grep_refill(struct file *f) { - unsigned char c; - - switch (filebehave) { - case FILE_STDIO: - return (getc_unlocked(f->f)); - case FILE_GZIP: - return (gzgetc(f->gzf)); - case FILE_BZIP: - BZ2_bzRead(&bzerr, f->bzf, &c, 1); - if (bzerr == BZ_STREAM_END) - return (-1); - else if (bzerr != BZ_SEQUENCE_ERROR && bzerr != BZ_OK) - errx(2, "%s", getstr(2)); - return (c); - } - return (-1); + ssize_t nr; + int bzerr; + + bufpos = buffer; + bufrem = 0; + + if (filebehave == FILE_GZIP) + nr = gzread(gzbufdesc, buffer, MAXBUFSIZ); + else if (filebehave == FILE_BZIP && bzbufdesc != NULL) { + nr = BZ2_bzRead(&bzerr, bzbufdesc, buffer, MAXBUFSIZ); + switch (bzerr) { + case BZ_OK: + case BZ_STREAM_END: + /* No problem, nr will be okay */ + break; + case BZ_DATA_ERROR_MAGIC: + /* + * As opposed to gzread(), which simply returns the + * plain file data, if it is not in the correct + * compressed format, BZ2_bzRead() instead aborts. + * + * So, just restart at the beginning of the file again, + * and use plain reads from now on. + */ + BZ2_bzReadClose(&bzerr, bzbufdesc); + bzbufdesc = NULL; + if (lseek(f->fd, 0, SEEK_SET) == -1) + return (-1); + nr = read(f->fd, buffer, MAXBUFSIZ); + break; + default: + /* Make sure we exit with an error */ + nr = -1; + } + } else + nr = read(f->fd, buffer, MAXBUFSIZ); + + if (nr < 0) + return (-1); + + bufrem = nr; + return (0); } -/* - * Returns true if the file position is a EOF, returns false - * otherwise. - */ static inline int -grep_feof(struct file *f) +grep_lnbufgrow(size_t newlen) { - switch (filebehave) { - case FILE_STDIO: - return (feof_unlocked(f->f)); - case FILE_GZIP: - return (gzeof(f->gzf)); - case FILE_BZIP: - return (bzerr == BZ_STREAM_END); + if (lnbuflen < newlen) { + lnbuf = grep_realloc(lnbuf, newlen); + lnbuflen = newlen; } - return (1); + + return (0); } -/* - * At the first call, fills in an internal buffer and checks if the given - * file is a binary file and sets the binary flag accordingly. Then returns - * a single line and sets len to the length of the returned line. - * At any other call returns a single line either from the internal buffer - * or from the file if the buffer is exhausted and sets len to the length - * of the line. - */ char * -grep_fgetln(struct file *f, size_t *len) +grep_fgetln(struct file *f, size_t *lenp) { - struct stat st; - size_t bufsiz, i = 0; - int ch = 0; - - /* Fill in the buffer if it is empty. */ - if (binbufptr == NULL) { - - /* Only pre-read to the buffer if we need the binary check. */ - if (binbehave != BINFILE_TEXT) { - if (f->stdin) - st.st_size = MAXBUFSIZ; - else if (stat(fname, &st) != 0) - err(2, NULL); - /* no need to allocate buffer. */ - if (st.st_size == 0) - return (NULL); - - bufsiz = (MAXBUFSIZ > (st.st_size * PREREAD_M)) ? - (st.st_size / 2) : MAXBUFSIZ; - - binbuf = grep_malloc(sizeof(char) * bufsiz); - - while (i < bufsiz) { - ch = grep_fgetc(f); - if (ch == EOF) - break; - binbuf[i++] = ch; - if ((ch == '\n') && lbflag) - break; - } - - f->binary = memchr(binbuf, (filebehave != FILE_GZIP) ? - '\0' : '\200', i - 1) != NULL; - } - binbufsiz = i; - binbufptr = binbuf; + unsigned char *p; + char *ret; + size_t len; + size_t off; + ptrdiff_t diff; + + /* Fill the buffer, if necessary */ + if (bufrem == 0 && grep_refill(f) != 0) + goto error; + + if (bufrem == 0) { + /* Return zero length to indicate EOF */ + *lenp = 0; + return (bufpos); } - /* Read a line whether from the buffer or from the file itself. */ - for (i = 0; !(grep_feof(f) && - (binbufptr == &binbuf[binbufsiz])); i++) { - if (binbufptr == &binbuf[binbufsiz]) { - ch = grep_fgetc(f); - } else { - ch = binbufptr[0]; - binbufptr++; - } - if (i >= lnbuflen) { - lnbuflen *= 2; - lnbuf = grep_realloc(lnbuf, ++lnbuflen); - } - if ((ch == '\n') || (ch == EOF)) { - lnbuf[i] = '\0'; + /* Look for a newline in the remaining part of the buffer */ + if ((p = memchr(bufpos, '\n', bufrem)) != NULL) { + ++p; /* advance over newline */ + ret = bufpos; + len = p - bufpos; + bufrem -= len; + bufpos = p; + *lenp = len; + return (ret); + } + + /* We have to copy the current buffered data to the line buffer */ + for (len = bufrem, off = 0; ; len += bufrem) { + /* Make sure there is room for more data */ + if (grep_lnbufgrow(len + LNBUFBUMP)) + goto error; + memcpy(lnbuf + off, bufpos, len - off); + off = len; + if (grep_refill(f) != 0) + goto error; + if (bufrem == 0) + /* EOF: return partial line */ break; - } else - lnbuf[i] = ch; + if ((p = memchr(bufpos, '\n', bufrem)) == NULL) + continue; + /* got it: finish up the line (like code above) */ + ++p; + diff = p - bufpos; + len += diff; + if (grep_lnbufgrow(len)) + goto error; + memcpy(lnbuf + off, bufpos, diff); + bufrem -= diff; + bufpos = p; + break; } - if (grep_feof(f) && (i == 0) && (ch != '\n')) - return (NULL); - *len = i; + *lenp = len; return (lnbuf); + +error: + *lenp = 0; + return (NULL); } -/* - * Opens the standard input for processing. - */ -struct file * -grep_stdin_open(void) +static inline struct file * +grep_file_init(struct file *f) { - struct file *f; - /* Processing stdin implies --line-buffered for tail -f to work. */ - lbflag = true; + if (filebehave == FILE_GZIP && + (gzbufdesc = gzdopen(f->fd, "r")) == NULL) + goto error; - snprintf(fname, sizeof fname, "%s", getstr(1)); + if (filebehave == FILE_BZIP && + (bzbufdesc = BZ2_bzdopen(f->fd, "r")) == NULL) + goto error; - f = grep_malloc(sizeof *f); + /* Fill read buffer, also catches errors early */ + if (grep_refill(f) != 0) + goto error; - binbuf = NULL; - if ((f->f = fdopen(STDIN_FILENO, "r")) != NULL) { - flockfile(f->f); - f->stdin = true; - return (f); - } + /* Check for binary stuff, if necessary */ + if (binbehave != BINFILE_TEXT && memchr(bufpos, '\0', bufrem) != NULL) + f->binary = true; + return (f); +error: + close(f->fd); free(f); return (NULL); } /* - * Opens a normal, a gzipped or a bzip2 compressed file for processing. + * Opens a file for processing. */ struct file * grep_open(const char *path) { struct file *f; - snprintf(fname, sizeof fname, "%s", path); - f = grep_malloc(sizeof *f); - - binbuf = NULL; - f->stdin = false; - switch (filebehave) { - case FILE_STDIO: - if ((f->f = fopen(path, "r")) != NULL) { - flockfile(f->f); - return (f); - } - break; - case FILE_GZIP: - if ((f->gzf = gzopen(fname, "r")) != NULL) - return (f); - break; - case FILE_BZIP: - if ((f->bzf = BZ2_bzopen(fname, "r")) != NULL) - return (f); - break; + memset(f, 0, sizeof *f); + if (path == NULL) { + /* Processing stdin implies --line-buffered. */ + lbflag = true; + f->fd = STDIN_FILENO; + } else if ((f->fd = open(path, O_RDONLY)) == -1) { + free(f); + return (NULL); } - free(f); - return (NULL); + return (grep_file_init(f)); } /* - * Closes a normal, a gzipped or a bzip2 compressed file. + * Closes a file. */ void grep_close(struct file *f) { - switch (filebehave) { - case FILE_STDIO: - funlockfile(f->f); - fclose(f->f); - break; - case FILE_GZIP: - gzclose(f->gzf); - break; - case FILE_BZIP: - BZ2_bzclose(f->bzf); - break; - } + close(f->fd); + + /* Reset read buffer and line buffer */ + bufpos = buffer; + bufrem = 0; - /* Reset read buffer for the file we are closing */ - binbufptr = NULL; - free(binbuf); + free(lnbuf); + lnbuf = NULL; + lnbuflen = 0; } diff --git a/usr.bin/grep/grep.h b/usr.bin/grep/grep.h index 2f743e4..cf4443d 100644 --- a/usr.bin/grep/grep.h +++ b/usr.bin/grep/grep.h @@ -77,12 +77,8 @@ extern const char *errstr[]; #define MAX_LINE_MATCHES 32 struct file { - struct mmfile *mmf; - BZFILE *bzf; - FILE *f; - gzFile *gzf; + int fd; bool binary; - bool stdin; }; struct str { @@ -150,11 +146,10 @@ void clearqueue(void); /* file.c */ void grep_close(struct file *f); -struct file *grep_stdin_open(void); struct file *grep_open(const char *path); char *grep_fgetln(struct file *f, size_t *len); /* fastgrep.c */ int fastcomp(fastgrep_t *, const char *); void fgrepcomp(fastgrep_t *, const char *); -int grep_search(fastgrep_t *, unsigned char *, size_t, regmatch_t *); +int grep_search(fastgrep_t *, const unsigned char *, size_t, regmatch_t *); diff --git a/usr.bin/grep/util.c b/usr.bin/grep/util.c index f084c3d..7ebbb55 100644 --- a/usr.bin/grep/util.c +++ b/usr.bin/grep/util.c @@ -184,7 +184,7 @@ procfile(const char *fn) if (strcmp(fn, "-") == 0) { fn = label != NULL ? label : getstr(1); - f = grep_stdin_open(); + f = grep_open(NULL); } else { if (!stat(fn, &sb)) { /* Check if we need to process the file */ @@ -215,7 +215,7 @@ procfile(const char *fn) for (c = 0; c == 0 || !(lflag || qflag); ) { ln.off += ln.len + 1; - if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL) { + if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL || ln.len == 0) { if (ln.line_no == 0 && matchall) exit(0); else |