summaryrefslogtreecommitdiffstats
path: root/usr.bin/grep
diff options
context:
space:
mode:
authorgabor <gabor@FreeBSD.org>2010-08-18 17:40:10 +0000
committergabor <gabor@FreeBSD.org>2010-08-18 17:40:10 +0000
commitfb7d7246f68844fc4062c081c8cab78a7a9320db (patch)
tree31c9b1f57fe3aca149aa8365f6e17a9bb8f206fb /usr.bin/grep
parent82e4bb536ac304054c6d49d74651f89b7846106d (diff)
downloadFreeBSD-src-fb7d7246f68844fc4062c081c8cab78a7a9320db.zip
FreeBSD-src-fb7d7246f68844fc4062c081c8cab78a7a9320db.tar.gz
- Refactor file reading code to use pure syscalls and an internal buffer
instead of stdio. This gives BSD grep a very big performance boost, its speed is now almost comparable to GNU grep. Submitted by: Dimitry Andric <dimitry@andric.com> Approved by: delphij (mentor)
Diffstat (limited to 'usr.bin/grep')
-rw-r--r--usr.bin/grep/fastgrep.c2
-rw-r--r--usr.bin/grep/file.c320
-rw-r--r--usr.bin/grep/grep.h9
-rw-r--r--usr.bin/grep/util.c4
4 files changed, 157 insertions, 178 deletions
diff --git a/usr.bin/grep/fastgrep.c b/usr.bin/grep/fastgrep.c
index c66f0a7..30a2dc5 100644
--- a/usr.bin/grep/fastgrep.c
+++ b/usr.bin/grep/fastgrep.c
@@ -198,7 +198,7 @@ fastcomp(fastgrep_t *fg, const char *pat)
}
int
-grep_search(fastgrep_t *fg, unsigned char *data, size_t len, regmatch_t *pmatch)
+grep_search(fastgrep_t *fg, const unsigned char *data, size_t len, regmatch_t *pmatch)
{
unsigned int j;
int ret = REG_NOMATCH;
diff --git a/usr.bin/grep/file.c b/usr.bin/grep/file.c
index 1872d0e..51be53d 100644
--- a/usr.bin/grep/file.c
+++ b/usr.bin/grep/file.c
@@ -2,7 +2,8 @@
/*-
* Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
- * Copyright (C) 2008-2009 Gabor Kovesdan <gabor@FreeBSD.org>
+ * Copyright (C) 2008-2010 Gabor Kovesdan <gabor@FreeBSD.org>
+ * Copyright (C) 2010 Dimitry Andric <dimitry@andric.com>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -37,7 +38,8 @@ __FBSDID("$FreeBSD$");
#include <bzlib.h>
#include <err.h>
#include <errno.h>
-#include <stdio.h>
+#include <fcntl.h>
+#include <stddef.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
@@ -47,222 +49,204 @@ __FBSDID("$FreeBSD$");
#include "grep.h"
-static char fname[MAXPATHLEN]; /* file name */
+#define MAXBUFSIZ (32 * 1024)
+#define LNBUFBUMP 80
-#define MAXBUFSIZ (16 * 1024)
-#define PREREAD_M 0.2
+static gzFile gzbufdesc;
+static BZFILE* bzbufdesc;
-/* Some global variables for the buffering and reading. */
-static char *lnbuf;
-static size_t lnbuflen;
-static unsigned char *binbuf;
-static int binbufsiz;
-unsigned char *binbufptr;
-static int bzerr;
+static unsigned char buffer[MAXBUFSIZ];
+static unsigned char *bufpos;
+static size_t bufrem;
-#define iswbinary(ch) (!iswspace((ch)) && iswcntrl((ch)) && \
- (ch != L'\b') && (ch != L'\0'))
+static unsigned char *lnbuf;
+static size_t lnbuflen;
-/*
- * Returns a single character according to the file type.
- * Returns -1 on failure.
- */
static inline int
-grep_fgetc(struct file *f)
+grep_refill(struct file *f)
{
- unsigned char c;
-
- switch (filebehave) {
- case FILE_STDIO:
- return (getc_unlocked(f->f));
- case FILE_GZIP:
- return (gzgetc(f->gzf));
- case FILE_BZIP:
- BZ2_bzRead(&bzerr, f->bzf, &c, 1);
- if (bzerr == BZ_STREAM_END)
- return (-1);
- else if (bzerr != BZ_SEQUENCE_ERROR && bzerr != BZ_OK)
- errx(2, "%s", getstr(2));
- return (c);
- }
- return (-1);
+ ssize_t nr;
+ int bzerr;
+
+ bufpos = buffer;
+ bufrem = 0;
+
+ if (filebehave == FILE_GZIP)
+ nr = gzread(gzbufdesc, buffer, MAXBUFSIZ);
+ else if (filebehave == FILE_BZIP && bzbufdesc != NULL) {
+ nr = BZ2_bzRead(&bzerr, bzbufdesc, buffer, MAXBUFSIZ);
+ switch (bzerr) {
+ case BZ_OK:
+ case BZ_STREAM_END:
+ /* No problem, nr will be okay */
+ break;
+ case BZ_DATA_ERROR_MAGIC:
+ /*
+ * As opposed to gzread(), which simply returns the
+ * plain file data, if it is not in the correct
+ * compressed format, BZ2_bzRead() instead aborts.
+ *
+ * So, just restart at the beginning of the file again,
+ * and use plain reads from now on.
+ */
+ BZ2_bzReadClose(&bzerr, bzbufdesc);
+ bzbufdesc = NULL;
+ if (lseek(f->fd, 0, SEEK_SET) == -1)
+ return (-1);
+ nr = read(f->fd, buffer, MAXBUFSIZ);
+ break;
+ default:
+ /* Make sure we exit with an error */
+ nr = -1;
+ }
+ } else
+ nr = read(f->fd, buffer, MAXBUFSIZ);
+
+ if (nr < 0)
+ return (-1);
+
+ bufrem = nr;
+ return (0);
}
-/*
- * Returns true if the file position is a EOF, returns false
- * otherwise.
- */
static inline int
-grep_feof(struct file *f)
+grep_lnbufgrow(size_t newlen)
{
- switch (filebehave) {
- case FILE_STDIO:
- return (feof_unlocked(f->f));
- case FILE_GZIP:
- return (gzeof(f->gzf));
- case FILE_BZIP:
- return (bzerr == BZ_STREAM_END);
+ if (lnbuflen < newlen) {
+ lnbuf = grep_realloc(lnbuf, newlen);
+ lnbuflen = newlen;
}
- return (1);
+
+ return (0);
}
-/*
- * At the first call, fills in an internal buffer and checks if the given
- * file is a binary file and sets the binary flag accordingly. Then returns
- * a single line and sets len to the length of the returned line.
- * At any other call returns a single line either from the internal buffer
- * or from the file if the buffer is exhausted and sets len to the length
- * of the line.
- */
char *
-grep_fgetln(struct file *f, size_t *len)
+grep_fgetln(struct file *f, size_t *lenp)
{
- struct stat st;
- size_t bufsiz, i = 0;
- int ch = 0;
-
- /* Fill in the buffer if it is empty. */
- if (binbufptr == NULL) {
-
- /* Only pre-read to the buffer if we need the binary check. */
- if (binbehave != BINFILE_TEXT) {
- if (f->stdin)
- st.st_size = MAXBUFSIZ;
- else if (stat(fname, &st) != 0)
- err(2, NULL);
- /* no need to allocate buffer. */
- if (st.st_size == 0)
- return (NULL);
-
- bufsiz = (MAXBUFSIZ > (st.st_size * PREREAD_M)) ?
- (st.st_size / 2) : MAXBUFSIZ;
-
- binbuf = grep_malloc(sizeof(char) * bufsiz);
-
- while (i < bufsiz) {
- ch = grep_fgetc(f);
- if (ch == EOF)
- break;
- binbuf[i++] = ch;
- if ((ch == '\n') && lbflag)
- break;
- }
-
- f->binary = memchr(binbuf, (filebehave != FILE_GZIP) ?
- '\0' : '\200', i - 1) != NULL;
- }
- binbufsiz = i;
- binbufptr = binbuf;
+ unsigned char *p;
+ char *ret;
+ size_t len;
+ size_t off;
+ ptrdiff_t diff;
+
+ /* Fill the buffer, if necessary */
+ if (bufrem == 0 && grep_refill(f) != 0)
+ goto error;
+
+ if (bufrem == 0) {
+ /* Return zero length to indicate EOF */
+ *lenp = 0;
+ return (bufpos);
}
- /* Read a line whether from the buffer or from the file itself. */
- for (i = 0; !(grep_feof(f) &&
- (binbufptr == &binbuf[binbufsiz])); i++) {
- if (binbufptr == &binbuf[binbufsiz]) {
- ch = grep_fgetc(f);
- } else {
- ch = binbufptr[0];
- binbufptr++;
- }
- if (i >= lnbuflen) {
- lnbuflen *= 2;
- lnbuf = grep_realloc(lnbuf, ++lnbuflen);
- }
- if ((ch == '\n') || (ch == EOF)) {
- lnbuf[i] = '\0';
+ /* Look for a newline in the remaining part of the buffer */
+ if ((p = memchr(bufpos, '\n', bufrem)) != NULL) {
+ ++p; /* advance over newline */
+ ret = bufpos;
+ len = p - bufpos;
+ bufrem -= len;
+ bufpos = p;
+ *lenp = len;
+ return (ret);
+ }
+
+ /* We have to copy the current buffered data to the line buffer */
+ for (len = bufrem, off = 0; ; len += bufrem) {
+ /* Make sure there is room for more data */
+ if (grep_lnbufgrow(len + LNBUFBUMP))
+ goto error;
+ memcpy(lnbuf + off, bufpos, len - off);
+ off = len;
+ if (grep_refill(f) != 0)
+ goto error;
+ if (bufrem == 0)
+ /* EOF: return partial line */
break;
- } else
- lnbuf[i] = ch;
+ if ((p = memchr(bufpos, '\n', bufrem)) == NULL)
+ continue;
+ /* got it: finish up the line (like code above) */
+ ++p;
+ diff = p - bufpos;
+ len += diff;
+ if (grep_lnbufgrow(len))
+ goto error;
+ memcpy(lnbuf + off, bufpos, diff);
+ bufrem -= diff;
+ bufpos = p;
+ break;
}
- if (grep_feof(f) && (i == 0) && (ch != '\n'))
- return (NULL);
- *len = i;
+ *lenp = len;
return (lnbuf);
+
+error:
+ *lenp = 0;
+ return (NULL);
}
-/*
- * Opens the standard input for processing.
- */
-struct file *
-grep_stdin_open(void)
+static inline struct file *
+grep_file_init(struct file *f)
{
- struct file *f;
- /* Processing stdin implies --line-buffered for tail -f to work. */
- lbflag = true;
+ if (filebehave == FILE_GZIP &&
+ (gzbufdesc = gzdopen(f->fd, "r")) == NULL)
+ goto error;
- snprintf(fname, sizeof fname, "%s", getstr(1));
+ if (filebehave == FILE_BZIP &&
+ (bzbufdesc = BZ2_bzdopen(f->fd, "r")) == NULL)
+ goto error;
- f = grep_malloc(sizeof *f);
+ /* Fill read buffer, also catches errors early */
+ if (grep_refill(f) != 0)
+ goto error;
- binbuf = NULL;
- if ((f->f = fdopen(STDIN_FILENO, "r")) != NULL) {
- flockfile(f->f);
- f->stdin = true;
- return (f);
- }
+ /* Check for binary stuff, if necessary */
+ if (binbehave != BINFILE_TEXT && memchr(bufpos, '\0', bufrem) != NULL)
+ f->binary = true;
+ return (f);
+error:
+ close(f->fd);
free(f);
return (NULL);
}
/*
- * Opens a normal, a gzipped or a bzip2 compressed file for processing.
+ * Opens a file for processing.
*/
struct file *
grep_open(const char *path)
{
struct file *f;
- snprintf(fname, sizeof fname, "%s", path);
-
f = grep_malloc(sizeof *f);
-
- binbuf = NULL;
- f->stdin = false;
- switch (filebehave) {
- case FILE_STDIO:
- if ((f->f = fopen(path, "r")) != NULL) {
- flockfile(f->f);
- return (f);
- }
- break;
- case FILE_GZIP:
- if ((f->gzf = gzopen(fname, "r")) != NULL)
- return (f);
- break;
- case FILE_BZIP:
- if ((f->bzf = BZ2_bzopen(fname, "r")) != NULL)
- return (f);
- break;
+ memset(f, 0, sizeof *f);
+ if (path == NULL) {
+ /* Processing stdin implies --line-buffered. */
+ lbflag = true;
+ f->fd = STDIN_FILENO;
+ } else if ((f->fd = open(path, O_RDONLY)) == -1) {
+ free(f);
+ return (NULL);
}
- free(f);
- return (NULL);
+ return (grep_file_init(f));
}
/*
- * Closes a normal, a gzipped or a bzip2 compressed file.
+ * Closes a file.
*/
void
grep_close(struct file *f)
{
- switch (filebehave) {
- case FILE_STDIO:
- funlockfile(f->f);
- fclose(f->f);
- break;
- case FILE_GZIP:
- gzclose(f->gzf);
- break;
- case FILE_BZIP:
- BZ2_bzclose(f->bzf);
- break;
- }
+ close(f->fd);
+
+ /* Reset read buffer and line buffer */
+ bufpos = buffer;
+ bufrem = 0;
- /* Reset read buffer for the file we are closing */
- binbufptr = NULL;
- free(binbuf);
+ free(lnbuf);
+ lnbuf = NULL;
+ lnbuflen = 0;
}
diff --git a/usr.bin/grep/grep.h b/usr.bin/grep/grep.h
index 2f743e4..cf4443d 100644
--- a/usr.bin/grep/grep.h
+++ b/usr.bin/grep/grep.h
@@ -77,12 +77,8 @@ extern const char *errstr[];
#define MAX_LINE_MATCHES 32
struct file {
- struct mmfile *mmf;
- BZFILE *bzf;
- FILE *f;
- gzFile *gzf;
+ int fd;
bool binary;
- bool stdin;
};
struct str {
@@ -150,11 +146,10 @@ void clearqueue(void);
/* file.c */
void grep_close(struct file *f);
-struct file *grep_stdin_open(void);
struct file *grep_open(const char *path);
char *grep_fgetln(struct file *f, size_t *len);
/* fastgrep.c */
int fastcomp(fastgrep_t *, const char *);
void fgrepcomp(fastgrep_t *, const char *);
-int grep_search(fastgrep_t *, unsigned char *, size_t, regmatch_t *);
+int grep_search(fastgrep_t *, const unsigned char *, size_t, regmatch_t *);
diff --git a/usr.bin/grep/util.c b/usr.bin/grep/util.c
index f084c3d..7ebbb55 100644
--- a/usr.bin/grep/util.c
+++ b/usr.bin/grep/util.c
@@ -184,7 +184,7 @@ procfile(const char *fn)
if (strcmp(fn, "-") == 0) {
fn = label != NULL ? label : getstr(1);
- f = grep_stdin_open();
+ f = grep_open(NULL);
} else {
if (!stat(fn, &sb)) {
/* Check if we need to process the file */
@@ -215,7 +215,7 @@ procfile(const char *fn)
for (c = 0; c == 0 || !(lflag || qflag); ) {
ln.off += ln.len + 1;
- if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL) {
+ if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL || ln.len == 0) {
if (ln.line_no == 0 && matchall)
exit(0);
else
OpenPOWER on IntegriCloud