summaryrefslogtreecommitdiffstats
path: root/usr.bin/grep
diff options
context:
space:
mode:
authorgabor <gabor@FreeBSD.org>2011-10-05 09:56:43 +0000
committergabor <gabor@FreeBSD.org>2011-10-05 09:56:43 +0000
commit1cb16d98872b7dcec16ad22150c6a1c95e4a0796 (patch)
tree23f8a074a9eca70aa3ae8f97192648beb5aa6572 /usr.bin/grep
parent58a76bb04f6febb9b77e88729b156b605c0be242 (diff)
downloadFreeBSD-src-1cb16d98872b7dcec16ad22150c6a1c95e4a0796.zip
FreeBSD-src-1cb16d98872b7dcec16ad22150c6a1c95e4a0796.tar.gz
Update BSD grep to the latest development version. It has some code
backported that was written for the TRE integration project in Google Summer of Code 2011. This is a temporary solution until the whole regex library is not replaced so that BSD grep development can continue and the backported code gets some review and testing. This change only improves scalability slightly, there is no big performance boost yet but several minor bugs have been found and fixed. Approved by: delphij (mentor) Sposored by: Google Summer of Code 2011 MFC after: 1 week
Diffstat (limited to 'usr.bin/grep')
-rw-r--r--usr.bin/grep/Makefile40
-rw-r--r--usr.bin/grep/fastgrep.c333
-rw-r--r--usr.bin/grep/file.c125
-rw-r--r--usr.bin/grep/grep.c128
-rw-r--r--usr.bin/grep/grep.h37
-rw-r--r--usr.bin/grep/regex/fastmatch.c169
-rw-r--r--usr.bin/grep/regex/fastmatch.h108
-rw-r--r--usr.bin/grep/regex/glue.h67
-rw-r--r--usr.bin/grep/regex/hashtable.c268
-rw-r--r--usr.bin/grep/regex/hashtable.h35
-rw-r--r--usr.bin/grep/regex/tre-compile.c103
-rw-r--r--usr.bin/grep/regex/tre-fastmatch.c1042
-rw-r--r--usr.bin/grep/regex/tre-fastmatch.h21
-rw-r--r--usr.bin/grep/regex/xmalloc.c349
-rw-r--r--usr.bin/grep/regex/xmalloc.h79
-rw-r--r--usr.bin/grep/util.c144
16 files changed, 2527 insertions, 521 deletions
diff --git a/usr.bin/grep/Makefile b/usr.bin/grep/Makefile
index f09a7d6..75fad49 100644
--- a/usr.bin/grep/Makefile
+++ b/usr.bin/grep/Makefile
@@ -8,28 +8,52 @@
PROG= grep
.else
PROG= bsdgrep
+CLEANFILES+= bsdgrep.1
+
+bsdgrep.1: grep.1
+ cp ${.ALLSRC} ${.TARGET}
.endif
-SRCS= fastgrep.c file.c grep.c queue.c util.c
+SRCS= file.c grep.c queue.c util.c
+
+# Extra files ported backported form some regex improvements
+.PATH: ${.CURDIR}/regex
+SRCS+= fastmatch.c hashtable.c tre-compile.c tre-fastmatch.c xmalloc.c
+CFLAGS+=-I${.CURDIR}/regex
.if ${MK_BSD_GREP} == "yes"
LINKS= ${BINDIR}/grep ${BINDIR}/egrep \
${BINDIR}/grep ${BINDIR}/fgrep \
${BINDIR}/grep ${BINDIR}/zgrep \
${BINDIR}/grep ${BINDIR}/zegrep \
- ${BINDIR}/grep ${BINDIR}/zfgrep
+ ${BINDIR}/grep ${BINDIR}/zfgrep \
+ ${BINDIR}/grep ${BINDIR}/bzgrep \
+ ${BINDIR}/grep ${BINDIR}/bzegrep \
+ ${BINDIR}/grep ${BINDIR}/bzfgrep \
+ ${BINDIR}/grep ${BINDIR}/xzgrep \
+ ${BINDIR}/grep ${BINDIR}/xzegrep \
+ ${BINDIR}/grep ${BINDIR}/xzfgrep \
+ ${BINDIR}/grep ${BINDIR}/lzgrep \
+ ${BINDIR}/grep ${BINDIR}/lzegrep \
+ ${BINDIR}/grep ${BINDIR}/lzfgrep
MLINKS= grep.1 egrep.1 \
grep.1 fgrep.1 \
grep.1 zgrep.1 \
grep.1 zegrep.1 \
- grep.1 zfgrep.1
+ grep.1 zfgrep.1 \
+ grep.1 bzgrep.1 \
+ grep.1 bzegrep.1 \
+ grep.1 bzfgrep.1 \
+ grep.1 xzgrep.1 \
+ grep.1 xzegrep.1 \
+ grep.1 xzfgrep.1 \
+ grep.1 lzgrep.1 \
+ grep.1 lzegrep.1 \
+ grep.1 lzfgrep.1
.endif
-bsdgrep.1: grep.1
- cp ${.ALLSRC} ${.TARGET}
-
-LDADD= -lz -lbz2
-DPADD= ${LIBZ} ${LIBBZ2}
+LDADD= -lz -lbz2 -llzma
+DPADD= ${LIBZ} ${LIBBZ2} ${LIBLZMA}
.if !defined(WITHOUT_GNU_COMPAT)
CFLAGS+= -I/usr/include/gnu
diff --git a/usr.bin/grep/fastgrep.c b/usr.bin/grep/fastgrep.c
deleted file mode 100644
index bc8a3af..0000000
--- a/usr.bin/grep/fastgrep.c
+++ /dev/null
@@ -1,333 +0,0 @@
-/* $OpenBSD: util.c,v 1.36 2007/10/02 17:59:18 otto Exp $ */
-/* $NetBSD: fastgrep.c,v 1.4 2011/02/27 17:33:37 joerg Exp $ */
-/* $FreeBSD$ */
-
-/*-
- * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
- * Copyright (C) 2008 Gabor Kovesdan <gabor@FreeBSD.org>
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*
- * XXX: This file is a speed up for grep to cover the defects of the
- * regex library. These optimizations should practically be implemented
- * there keeping this code clean. This is a future TODO, but for the
- * meantime, we need to use this workaround.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
-
-#include <limits.h>
-#include <stdbool.h>
-#include <stdlib.h>
-#include <string.h>
-#include <wchar.h>
-#include <wctype.h>
-
-#include "grep.h"
-
-static inline int grep_cmp(const unsigned char *, const unsigned char *, size_t);
-static inline void grep_revstr(unsigned char *, int);
-
-void
-fgrepcomp(fastgrep_t *fg, const char *pat)
-{
- unsigned int i;
-
- /* Initialize. */
- fg->len = strlen(pat);
- fg->bol = false;
- fg->eol = false;
- fg->reversed = false;
-
- fg->pattern = (unsigned char *)grep_strdup(pat);
-
- /* Preprocess pattern. */
- for (i = 0; i <= UCHAR_MAX; i++)
- fg->qsBc[i] = fg->len;
- for (i = 1; i < fg->len; i++)
- fg->qsBc[fg->pattern[i]] = fg->len - i;
-}
-
-/*
- * Returns: -1 on failure, 0 on success
- */
-int
-fastcomp(fastgrep_t *fg, const char *pat)
-{
- unsigned int i;
- int firstHalfDot = -1;
- int firstLastHalfDot = -1;
- int hasDot = 0;
- int lastHalfDot = 0;
- int shiftPatternLen;
-
- /* Initialize. */
- fg->len = strlen(pat);
- fg->bol = false;
- fg->eol = false;
- fg->reversed = false;
- fg->word = false;
-
- /* Remove end-of-line character ('$'). */
- if (fg->len > 0 && pat[fg->len - 1] == '$') {
- fg->eol = true;
- fg->len--;
- }
-
- /* Remove beginning-of-line character ('^'). */
- if (pat[0] == '^') {
- fg->bol = true;
- fg->len--;
- pat++;
- }
-
- if (fg->len >= 14 &&
- memcmp(pat, "[[:<:]]", 7) == 0 &&
- memcmp(pat + fg->len - 7, "[[:>:]]", 7) == 0) {
- fg->len -= 14;
- pat += 7;
- /* Word boundary is handled separately in util.c */
- fg->word = true;
- }
-
- /*
- * pat has been adjusted earlier to not include '^', '$' or
- * the word match character classes at the beginning and ending
- * of the string respectively.
- */
- fg->pattern = grep_malloc(fg->len + 1);
- memcpy(fg->pattern, pat, fg->len);
- fg->pattern[fg->len] = '\0';
-
- /* Look for ways to cheat...er...avoid the full regex engine. */
- for (i = 0; i < fg->len; i++) {
- /* Can still cheat? */
- if (fg->pattern[i] == '.') {
- hasDot = i;
- if (i < fg->len / 2) {
- if (firstHalfDot < 0)
- /* Closest dot to the beginning */
- firstHalfDot = i;
- } else {
- /* Closest dot to the end of the pattern. */
- lastHalfDot = i;
- if (firstLastHalfDot < 0)
- firstLastHalfDot = i;
- }
- } else {
- /* Free memory and let others know this is empty. */
- free(fg->pattern);
- fg->pattern = NULL;
- return (-1);
- }
- }
-
- /*
- * Determine if a reverse search would be faster based on the placement
- * of the dots.
- */
- if ((!(lflag || cflag)) && ((!(fg->bol || fg->eol)) &&
- ((lastHalfDot) && ((firstHalfDot < 0) ||
- ((fg->len - (lastHalfDot + 1)) < (size_t)firstHalfDot)))) &&
- !oflag && !color) {
- fg->reversed = true;
- hasDot = fg->len - (firstHalfDot < 0 ?
- firstLastHalfDot : firstHalfDot) - 1;
- grep_revstr(fg->pattern, fg->len);
- }
-
- /*
- * Normal Quick Search would require a shift based on the position the
- * next character after the comparison is within the pattern. With
- * wildcards, the position of the last dot effects the maximum shift
- * distance.
- * The closer to the end the wild card is the slower the search. A
- * reverse version of this algorithm would be useful for wildcards near
- * the end of the string.
- *
- * Examples:
- * Pattern Max shift
- * ------- ---------
- * this 5
- * .his 4
- * t.is 3
- * th.s 2
- * thi. 1
- */
-
- /* Adjust the shift based on location of the last dot ('.'). */
- shiftPatternLen = fg->len - hasDot;
-
- /* Preprocess pattern. */
- for (i = 0; i <= (signed)UCHAR_MAX; i++)
- fg->qsBc[i] = shiftPatternLen;
- for (i = hasDot + 1; i < fg->len; i++) {
- fg->qsBc[fg->pattern[i]] = fg->len - i;
- }
-
- /*
- * Put pattern back to normal after pre-processing to allow for easy
- * comparisons later.
- */
- if (fg->reversed)
- grep_revstr(fg->pattern, fg->len);
-
- return (0);
-}
-
-int
-grep_search(fastgrep_t *fg, const unsigned char *data, size_t len, regmatch_t *pmatch)
-{
- unsigned int j;
- int ret = REG_NOMATCH;
-
- if (pmatch->rm_so == (ssize_t)len)
- return (ret);
-
- if (fg->bol && pmatch->rm_so != 0) {
- pmatch->rm_so = len;
- pmatch->rm_eo = len;
- return (ret);
- }
-
- /* No point in going farther if we do not have enough data. */
- if (len < fg->len)
- return (ret);
-
- /* Only try once at the beginning or ending of the line. */
- if (fg->bol || fg->eol) {
- /* Simple text comparison. */
- /* Verify data is >= pattern length before searching on it. */
- if (len >= fg->len) {
- /* Determine where in data to start search at. */
- j = fg->eol ? len - fg->len : 0;
- if (!((fg->bol && fg->eol) && (len != fg->len)))
- if (grep_cmp(fg->pattern, data + j,
- fg->len) == -1) {
- pmatch->rm_so = j;
- pmatch->rm_eo = j + fg->len;
- ret = 0;
- }
- }
- } else if (fg->reversed) {
- /* Quick Search algorithm. */
- j = len;
- do {
- if (grep_cmp(fg->pattern, data + j - fg->len,
- fg->len) == -1) {
- pmatch->rm_so = j - fg->len;
- pmatch->rm_eo = j;
- ret = 0;
- break;
- }
- /* Shift if within bounds, otherwise, we are done. */
- if (j == fg->len)
- break;
- j -= fg->qsBc[data[j - fg->len - 1]];
- } while (j >= fg->len);
- } else {
- /* Quick Search algorithm. */
- j = pmatch->rm_so;
- do {
- if (grep_cmp(fg->pattern, data + j, fg->len) == -1) {
- pmatch->rm_so = j;
- pmatch->rm_eo = j + fg->len;
- ret = 0;
- break;
- }
-
- /* Shift if within bounds, otherwise, we are done. */
- if (j + fg->len == len)
- break;
- else
- j += fg->qsBc[data[j + fg->len]];
- } while (j <= (len - fg->len));
- }
-
- return (ret);
-}
-
-/*
- * Returns: i >= 0 on failure (position that it failed)
- * -1 on success
- */
-static inline int
-grep_cmp(const unsigned char *pat, const unsigned char *data, size_t len)
-{
- size_t size;
- wchar_t *wdata, *wpat;
- unsigned int i;
-
- if (iflag) {
- if ((size = mbstowcs(NULL, (const char *)data, 0)) ==
- ((size_t) - 1))
- return (-1);
-
- wdata = grep_malloc(size * sizeof(wint_t));
-
- if (mbstowcs(wdata, (const char *)data, size) ==
- ((size_t) - 1))
- return (-1);
-
- if ((size = mbstowcs(NULL, (const char *)pat, 0)) ==
- ((size_t) - 1))
- return (-1);
-
- wpat = grep_malloc(size * sizeof(wint_t));
-
- if (mbstowcs(wpat, (const char *)pat, size) == ((size_t) - 1))
- return (-1);
- for (i = 0; i < len; i++) {
- if ((towlower(wpat[i]) == towlower(wdata[i])) ||
- ((grepbehave != GREP_FIXED) && wpat[i] == L'.'))
- continue;
- free(wpat);
- free(wdata);
- return (i);
- }
- } else {
- for (i = 0; i < len; i++) {
- if ((pat[i] == data[i]) || ((grepbehave != GREP_FIXED) &&
- pat[i] == '.'))
- continue;
- return (i);
- }
- }
- return (-1);
-}
-
-static inline void
-grep_revstr(unsigned char *str, int len)
-{
- int i;
- char c;
-
- for (i = 0; i < len / 2; i++) {
- c = str[i];
- str[i] = str[len - i - 1];
- str[len - i - 1] = c;
- }
-}
diff --git a/usr.bin/grep/file.c b/usr.bin/grep/file.c
index a6216e5..644c788 100644
--- a/usr.bin/grep/file.c
+++ b/usr.bin/grep/file.c
@@ -34,13 +34,15 @@
__FBSDID("$FreeBSD$");
#include <sys/param.h>
-#include <sys/types.h>
+#include <sys/mman.h>
#include <sys/stat.h>
+#include <sys/types.h>
#include <bzlib.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
+#include <lzma.h>
#include <stddef.h>
#include <stdlib.h>
#include <string.h>
@@ -56,10 +58,12 @@ __FBSDID("$FreeBSD$");
static gzFile gzbufdesc;
static BZFILE* bzbufdesc;
+static lzma_stream lstrm = LZMA_STREAM_INIT;
-static unsigned char buffer[MAXBUFSIZ];
+static unsigned char *buffer;
static unsigned char *bufpos;
static size_t bufrem;
+static size_t fsiz;
static unsigned char *lnbuf;
static size_t lnbuflen;
@@ -70,6 +74,9 @@ grep_refill(struct file *f)
ssize_t nr;
int bzerr;
+ if (filebehave == FILE_MMAP)
+ return (0);
+
bufpos = buffer;
bufrem = 0;
@@ -101,6 +108,36 @@ grep_refill(struct file *f)
/* Make sure we exit with an error */
nr = -1;
}
+ } else if ((filebehave == FILE_XZ) || (filebehave == FILE_LZMA)) {
+ lzma_action action = LZMA_RUN;
+ uint8_t in_buf[MAXBUFSIZ];
+ lzma_ret ret;
+
+ ret = (filebehave == FILE_XZ) ?
+ lzma_stream_decoder(&lstrm, UINT64_MAX,
+ LZMA_CONCATENATED) :
+ lzma_alone_decoder(&lstrm, UINT64_MAX);
+
+ if (ret != LZMA_OK)
+ return (-1);
+
+ lstrm.next_out = buffer;
+ lstrm.avail_out = MAXBUFSIZ;
+ lstrm.next_in = in_buf;
+ nr = read(f->fd, in_buf, MAXBUFSIZ);
+
+ if (nr < 0)
+ return (-1);
+ else if (nr == 0)
+ action = LZMA_FINISH;
+
+ lstrm.avail_in = nr;
+ ret = lzma_code(&lstrm, action);
+
+ if (ret != LZMA_OK && ret != LZMA_STREAM_END)
+ return (-1);
+ bufrem = MAXBUFSIZ - lstrm.avail_out;
+ return (0);
} else
nr = read(f->fd, buffer, MAXBUFSIZ);
@@ -186,56 +223,76 @@ error:
return (NULL);
}
-static inline struct file *
-grep_file_init(struct file *f)
+/*
+ * Opens a file for processing.
+ */
+struct file *
+grep_open(const char *path)
{
+ struct file *f;
+
+ f = grep_malloc(sizeof *f);
+ memset(f, 0, sizeof *f);
+ if (path == NULL) {
+ /* Processing stdin implies --line-buffered. */
+ lbflag = true;
+ f->fd = STDIN_FILENO;
+ } else if ((f->fd = open(path, O_RDONLY)) == -1)
+ goto error1;
+
+ if (filebehave == FILE_MMAP) {
+ struct stat st;
+
+ if ((fstat(f->fd, &st) == -1) || (st.st_size > OFF_MAX) ||
+ (!S_ISREG(st.st_mode)))
+ filebehave = FILE_STDIO;
+ else {
+ int flags = MAP_PRIVATE | MAP_NOCORE | MAP_NOSYNC;
+#ifdef MAP_PREFAULT_READ
+ flags |= MAP_PREFAULT_READ;
+#endif
+ fsiz = st.st_size;
+ buffer = mmap(NULL, fsiz, PROT_READ, flags,
+ f->fd, (off_t)0);
+ if (buffer == MAP_FAILED)
+ filebehave = FILE_STDIO;
+ else {
+ bufrem = st.st_size;
+ bufpos = buffer;
+ madvise(buffer, st.st_size, MADV_SEQUENTIAL);
+ }
+ }
+ }
+
+ if ((buffer == NULL) || (buffer == MAP_FAILED))
+ buffer = grep_malloc(MAXBUFSIZ);
if (filebehave == FILE_GZIP &&
(gzbufdesc = gzdopen(f->fd, "r")) == NULL)
- goto error;
+ goto error2;
if (filebehave == FILE_BZIP &&
(bzbufdesc = BZ2_bzdopen(f->fd, "r")) == NULL)
- goto error;
+ goto error2;
/* Fill read buffer, also catches errors early */
- if (grep_refill(f) != 0)
- goto error;
+ if (bufrem == 0 && grep_refill(f) != 0)
+ goto error2;
/* Check for binary stuff, if necessary */
if (binbehave != BINFILE_TEXT && memchr(bufpos, '\0', bufrem) != NULL)
- f->binary = true;
+ f->binary = true;
return (f);
-error:
+
+error2:
close(f->fd);
+error1:
free(f);
return (NULL);
}
/*
- * Opens a file for processing.
- */
-struct file *
-grep_open(const char *path)
-{
- struct file *f;
-
- f = grep_malloc(sizeof *f);
- memset(f, 0, sizeof *f);
- if (path == NULL) {
- /* Processing stdin implies --line-buffered. */
- lbflag = true;
- f->fd = STDIN_FILENO;
- } else if ((f->fd = open(path, O_RDONLY)) == -1) {
- free(f);
- return (NULL);
- }
-
- return (grep_file_init(f));
-}
-
-/*
* Closes a file.
*/
void
@@ -245,6 +302,10 @@ grep_close(struct file *f)
close(f->fd);
/* Reset read buffer and line buffer */
+ if (filebehave == FILE_MMAP) {
+ munmap(buffer, fsiz);
+ buffer = NULL;
+ }
bufpos = buffer;
bufrem = 0;
diff --git a/usr.bin/grep/grep.c b/usr.bin/grep/grep.c
index 73e1d3d..e0f1d71 100644
--- a/usr.bin/grep/grep.c
+++ b/usr.bin/grep/grep.c
@@ -38,6 +38,7 @@ __FBSDID("$FreeBSD$");
#include <ctype.h>
#include <err.h>
#include <errno.h>
+#include <fcntl.h>
#include <getopt.h>
#include <limits.h>
#include <libgen.h>
@@ -48,6 +49,7 @@ __FBSDID("$FreeBSD$");
#include <string.h>
#include <unistd.h>
+#include "fastmatch.h"
#include "grep.h"
#ifndef WITHOUT_NLS
@@ -81,9 +83,9 @@ bool matchall;
/* Searching patterns */
unsigned int patterns, pattern_sz;
-char **pattern;
+struct pat *pattern;
regex_t *r_pattern;
-fastgrep_t *fg_pattern;
+fastmatch_t *fg_pattern;
/* Filename exclusion/inclusion patterns */
unsigned int fpatterns, fpattern_sz;
@@ -104,7 +106,7 @@ bool hflag; /* -h: don't print filename headers */
bool iflag; /* -i: ignore case */
bool lflag; /* -l: only show names of files with matches */
bool mflag; /* -m x: stop reading the files after x matches */
-unsigned long long mcount; /* count for -m */
+long long mcount; /* count for -m */
bool nflag; /* -n: show line numbers in front of matching lines */
bool oflag; /* -o: print only matching part */
bool qflag; /* -q: quiet mode (don't output anything) */
@@ -164,7 +166,7 @@ usage(void)
exit(2);
}
-static const char *optstr = "0123456789A:B:C:D:EFGHIJLOPSRUVZabcd:e:f:hilm:nopqrsuvwxy";
+static const char *optstr = "0123456789A:B:C:D:EFGHIJMLOPSRUVZabcd:e:f:hilm:nopqrsuvwxXy";
struct option long_options[] =
{
@@ -200,6 +202,7 @@ struct option long_options[] =
{"files-with-matches", no_argument, NULL, 'l'},
{"files-without-match", no_argument, NULL, 'L'},
{"max-count", required_argument, NULL, 'm'},
+ {"lzma", no_argument, NULL, 'M'},
{"line-number", no_argument, NULL, 'n'},
{"only-matching", no_argument, NULL, 'o'},
{"quiet", no_argument, NULL, 'q'},
@@ -212,6 +215,7 @@ struct option long_options[] =
{"version", no_argument, NULL, 'V'},
{"word-regexp", no_argument, NULL, 'w'},
{"line-regexp", no_argument, NULL, 'x'},
+ {"xz", no_argument, NULL, 'X'},
{"decompress", no_argument, NULL, 'Z'},
{NULL, no_argument, NULL, 0}
};
@@ -223,23 +227,35 @@ static void
add_pattern(char *pat, size_t len)
{
+ /* Do not add further pattern is we already match everything */
+ if (matchall)
+ return;
+
/* Check if we can do a shortcut */
- if (len == 0 || matchall) {
+ if (len == 0) {
matchall = true;
+ for (unsigned int i = 0; i < patterns; i++) {
+ free(pattern[i].pat);
+ }
+ pattern = grep_realloc(pattern, sizeof(struct pat));
+ pattern[0].pat = NULL;
+ pattern[0].len = 0;
+ patterns = 1;
return;
}
/* Increase size if necessary */
if (patterns == pattern_sz) {
pattern_sz *= 2;
pattern = grep_realloc(pattern, ++pattern_sz *
- sizeof(*pattern));
+ sizeof(struct pat));
}
if (len > 0 && pat[len - 1] == '\n')
--len;
/* pat may not be NUL-terminated */
- pattern[patterns] = grep_malloc(len + 1);
- memcpy(pattern[patterns], pat, len);
- pattern[patterns][len] = '\0';
+ pattern[patterns].pat = grep_malloc(len + 1);
+ memcpy(pattern[patterns].pat, pat, len);
+ pattern[patterns].len = len;
+ pattern[patterns].pat[len] = '\0';
++patterns;
}
@@ -285,14 +301,19 @@ add_dpattern(const char *pat, int mode)
static void
read_patterns(const char *fn)
{
+ struct stat st;
FILE *f;
char *line;
size_t len;
if ((f = fopen(fn, "r")) == NULL)
err(2, "%s", fn);
- while ((line = fgetln(f, &len)) != NULL)
- add_pattern(line, *line == '\n' ? 0 : len);
+ if ((fstat(fileno(f), &st) == -1) || (S_ISDIR(st.st_mode))) {
+ fclose(f);
+ return;
+ }
+ while ((line = fgetln(f, &len)) != NULL)
+ add_pattern(line, line[0] == '\n' ? 0 : len);
if (ferror(f))
err(2, "%s", fn);
fclose(f);
@@ -311,7 +332,7 @@ int
main(int argc, char *argv[])
{
char **aargv, **eargv, *eopts;
- char *ep;
+ char *pn, *ep;
unsigned long long l;
unsigned int aargc, eargc, i;
int c, lastc, needpattern, newarg, prevoptind;
@@ -325,30 +346,27 @@ main(int argc, char *argv[])
/* Check what is the program name of the binary. In this
way we can have all the funcionalities in one binary
without the need of scripting and using ugly hacks. */
- switch (__progname[0]) {
+ pn = __progname;
+ if (pn[0] == 'b' && pn[1] == 'z') {
+ filebehave = FILE_BZIP;
+ pn += 2;
+ } else if (pn[0] == 'x' && pn[1] == 'z') {
+ filebehave = FILE_XZ;
+ pn += 2;
+ } else if (pn[0] == 'l' && pn[1] == 'z') {
+ filebehave = FILE_LZMA;
+ pn += 2;
+ } else if (pn[0] == 'z') {
+ filebehave = FILE_GZIP;
+ pn += 1;
+ }
+ switch (pn[0]) {
case 'e':
grepbehave = GREP_EXTENDED;
break;
case 'f':
grepbehave = GREP_FIXED;
break;
- case 'g':
- grepbehave = GREP_BASIC;
- break;
- case 'z':
- filebehave = FILE_GZIP;
- switch(__progname[1]) {
- case 'e':
- grepbehave = GREP_EXTENDED;
- break;
- case 'f':
- grepbehave = GREP_FIXED;
- break;
- case 'g':
- grepbehave = GREP_BASIC;
- break;
- }
- break;
}
lastc = '\0';
@@ -503,8 +521,8 @@ main(int argc, char *argv[])
case 'm':
mflag = true;
errno = 0;
- mcount = strtoull(optarg, &ep, 10);
- if (((errno == ERANGE) && (mcount == ULLONG_MAX)) ||
+ mcount = strtoll(optarg, &ep, 10);
+ if (((errno == ERANGE) && (mcount == LLONG_MAX)) ||
((errno == EINVAL) && (mcount == 0)))
err(2, NULL);
else if (ep[0] != '\0') {
@@ -512,6 +530,9 @@ main(int argc, char *argv[])
err(2, NULL);
}
break;
+ case 'M':
+ filebehave = FILE_LZMA;
+ break;
case 'n':
nflag = true;
break;
@@ -544,7 +565,7 @@ main(int argc, char *argv[])
break;
case 'u':
case MMAP_OPT:
- /* noop, compatibility */
+ filebehave = FILE_MMAP;
break;
case 'V':
printf(getstr(9), __progname, VERSION);
@@ -560,6 +581,9 @@ main(int argc, char *argv[])
xflag = true;
cflags &= ~REG_NOSUB;
break;
+ case 'X':
+ filebehave = FILE_XZ;
+ break;
case 'Z':
filebehave = FILE_GZIP;
break;
@@ -630,6 +654,10 @@ main(int argc, char *argv[])
aargc -= optind;
aargv += optind;
+ /* Empty pattern file matches nothing */
+ if (!needpattern && (patterns == 0))
+ exit(1);
+
/* Fail if we don't have any pattern */
if (aargc == 0 && needpattern)
usage();
@@ -642,9 +670,12 @@ main(int argc, char *argv[])
}
switch (grepbehave) {
- case GREP_FIXED:
case GREP_BASIC:
break;
+ case GREP_FIXED:
+ /* XXX: header mess, REG_LITERAL not defined in gnu/regex.h */
+ cflags |= 0020;
+ break;
case GREP_EXTENDED:
cflags |= REG_EXTENDED;
break;
@@ -655,24 +686,17 @@ main(int argc, char *argv[])
fg_pattern = grep_calloc(patterns, sizeof(*fg_pattern));
r_pattern = grep_calloc(patterns, sizeof(*r_pattern));
-/*
- * XXX: fgrepcomp() and fastcomp() are workarounds for regexec() performance.
- * Optimizations should be done there.
- */
- /* Check if cheating is allowed (always is for fgrep). */
- if (grepbehave == GREP_FIXED) {
- for (i = 0; i < patterns; ++i)
- fgrepcomp(&fg_pattern[i], pattern[i]);
- } else {
- for (i = 0; i < patterns; ++i) {
- if (fastcomp(&fg_pattern[i], pattern[i])) {
- /* Fall back to full regex library */
- c = regcomp(&r_pattern[i], pattern[i], cflags);
- if (c != 0) {
- regerror(c, &r_pattern[i], re_error,
- RE_ERROR_BUF);
- errx(2, "%s", re_error);
- }
+
+ /* Check if cheating is allowed (always is for fgrep). */
+ for (i = 0; i < patterns; ++i) {
+ if (fastncomp(&fg_pattern[i], pattern[i].pat,
+ pattern[i].len, cflags) != 0) {
+ /* Fall back to full regex library */
+ c = regcomp(&r_pattern[i], pattern[i].pat, cflags);
+ if (c != 0) {
+ regerror(c, &r_pattern[i], re_error,
+ RE_ERROR_BUF);
+ errx(2, "%s", re_error);
}
}
}
diff --git a/usr.bin/grep/grep.h b/usr.bin/grep/grep.h
index fe15c17..47d4ab9 100644
--- a/usr.bin/grep/grep.h
+++ b/usr.bin/grep/grep.h
@@ -36,6 +36,8 @@
#include <stdio.h>
#include <zlib.h>
+#include "fastmatch.h"
+
#ifdef WITHOUT_NLS
#define getstr(n) errstr[n]
#else
@@ -58,8 +60,11 @@ extern const char *errstr[];
#define BINFILE_TEXT 2
#define FILE_STDIO 0
-#define FILE_GZIP 1
-#define FILE_BZIP 2
+#define FILE_MMAP 1
+#define FILE_GZIP 2
+#define FILE_BZIP 3
+#define FILE_XZ 4
+#define FILE_LZMA 5
#define DIR_READ 0
#define DIR_SKIP 1
@@ -90,22 +95,16 @@ struct str {
int line_no;
};
+struct pat {
+ char *pat;
+ int len;
+};
+
struct epat {
char *pat;
int mode;
};
-typedef struct {
- size_t len;
- unsigned char *pattern;
- int qsBc[UCHAR_MAX + 1];
- /* flags */
- bool bol;
- bool eol;
- bool reversed;
- bool word;
-} fastgrep_t;
-
/* Flags passed to regcomp() and regexec() */
extern int cflags, eflags;
@@ -114,7 +113,8 @@ extern bool Eflag, Fflag, Gflag, Hflag, Lflag,
bflag, cflag, hflag, iflag, lflag, mflag, nflag, oflag,
qflag, sflag, vflag, wflag, xflag;
extern bool dexclude, dinclude, fexclude, finclude, lbflag, nullflag;
-extern unsigned long long Aflag, Bflag, mcount;
+extern unsigned long long Aflag, Bflag;
+extern long long mcount;
extern char *label;
extern const char *color;
extern int binbehave, devbehave, dirbehave, filebehave, grepbehave, linkbehave;
@@ -122,10 +122,10 @@ extern int binbehave, devbehave, dirbehave, filebehave, grepbehave, linkbehave;
extern bool first, matchall, notfound, prev;
extern int tail;
extern unsigned int dpatterns, fpatterns, patterns;
-extern char **pattern;
+extern struct pat *pattern;
extern struct epat *dpattern, *fpattern;
extern regex_t *er_pattern, *r_pattern;
-extern fastgrep_t *fg_pattern;
+extern fastmatch_t *fg_pattern;
/* For regex errors */
#define RE_ERROR_BUF 512
@@ -150,8 +150,3 @@ void clearqueue(void);
void grep_close(struct file *f);
struct file *grep_open(const char *path);
char *grep_fgetln(struct file *f, size_t *len);
-
-/* fastgrep.c */
-int fastcomp(fastgrep_t *, const char *);
-void fgrepcomp(fastgrep_t *, const char *);
-int grep_search(fastgrep_t *, const unsigned char *, size_t, regmatch_t *);
diff --git a/usr.bin/grep/regex/fastmatch.c b/usr.bin/grep/regex/fastmatch.c
new file mode 100644
index 0000000..990d847
--- /dev/null
+++ b/usr.bin/grep/regex/fastmatch.c
@@ -0,0 +1,169 @@
+/* $FreeBSD$ */
+
+/*-
+ * Copyright (C) 2011 Gabor Kovesdan <gabor@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "glue.h"
+
+#include <errno.h>
+#include <fastmatch.h>
+#include <regex.h>
+#include <string.h>
+
+#include "tre-fastmatch.h"
+#include "xmalloc.h"
+
+int
+tre_fixncomp(fastmatch_t *preg, const char *regex, size_t n, int cflags)
+{
+ int ret;
+ tre_char_t *wregex;
+ size_t wlen;
+
+ if (n != 0)
+ {
+ ret = tre_convert_pattern(regex, n, &wregex, &wlen);
+ if (ret != REG_OK)
+ return ret;
+ else
+ ret = tre_compile_literal(preg, wregex, wlen, cflags);
+ tre_free_pattern(wregex);
+ return ret;
+ }
+ else
+ return tre_compile_literal(preg, NULL, 0, cflags);
+}
+
+int
+tre_fastncomp(fastmatch_t *preg, const char *regex, size_t n, int cflags)
+{
+ int ret;
+ tre_char_t *wregex;
+ size_t wlen;
+
+ if (n != 0)
+ {
+ ret = tre_convert_pattern(regex, n, &wregex, &wlen);
+ if (ret != REG_OK)
+ return ret;
+ else
+ ret = (cflags & REG_LITERAL)
+ ? tre_compile_literal(preg, wregex, wlen, cflags)
+ : tre_compile_fast(preg, wregex, wlen, cflags);
+ tre_free_pattern(wregex);
+ return ret;
+ }
+ else
+ return tre_compile_literal(preg, NULL, 0, cflags);
+}
+
+
+int
+tre_fixcomp(fastmatch_t *preg, const char *regex, int cflags)
+{
+ return tre_fixncomp(preg, regex, regex ? strlen(regex) : 0, cflags);
+}
+
+int
+tre_fastcomp(fastmatch_t *preg, const char *regex, int cflags)
+{
+ return tre_fastncomp(preg, regex, regex ? strlen(regex) : 0, cflags);
+}
+
+int
+tre_fixwncomp(fastmatch_t *preg, const wchar_t *regex, size_t n, int cflags)
+{
+ return tre_compile_literal(preg, regex, n, cflags);
+}
+
+int
+tre_fastwncomp(fastmatch_t *preg, const wchar_t *regex, size_t n, int cflags)
+{
+ return (cflags & REG_LITERAL) ?
+ tre_compile_literal(preg, regex, n, cflags) :
+ tre_compile_fast(preg, regex, n, cflags);
+}
+
+int
+tre_fixwcomp(fastmatch_t *preg, const wchar_t *regex, int cflags)
+{
+ return tre_fixwncomp(preg, regex, regex ? tre_strlen(regex) : 0, cflags);
+}
+
+int
+tre_fastwcomp(fastmatch_t *preg, const wchar_t *regex, int cflags)
+{
+ return tre_fastwncomp(preg, regex, regex ? tre_strlen(regex) : 0, cflags);
+}
+
+void
+tre_fastfree(fastmatch_t *preg)
+{
+ tre_free_fast(preg);
+}
+
+int
+tre_fastnexec(const fastmatch_t *preg, const char *string, size_t len,
+ size_t nmatch, regmatch_t pmatch[], int eflags)
+{
+ tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS;
+
+ if (eflags & REG_STARTEND)
+ CALL_WITH_OFFSET(tre_match_fast(preg, &string[offset], slen,
+ type, nmatch, pmatch, eflags));
+ else
+ return tre_match_fast(preg, string, len, type, nmatch,
+ pmatch, eflags);
+}
+
+int
+tre_fastexec(const fastmatch_t *preg, const char *string, size_t nmatch,
+ regmatch_t pmatch[], int eflags)
+{
+ return tre_fastnexec(preg, string, (size_t)-1, nmatch, pmatch, eflags);
+}
+
+int
+tre_fastwnexec(const fastmatch_t *preg, const wchar_t *string, size_t len,
+ size_t nmatch, regmatch_t pmatch[], int eflags)
+{
+ tre_str_type_t type = STR_WIDE;
+
+ if (eflags & REG_STARTEND)
+ CALL_WITH_OFFSET(tre_match_fast(preg, &string[offset], slen,
+ type, nmatch, pmatch, eflags));
+ else
+ return tre_match_fast(preg, string, len, type, nmatch,
+ pmatch, eflags);
+}
+
+int
+tre_fastwexec(const fastmatch_t *preg, const wchar_t *string,
+ size_t nmatch, regmatch_t pmatch[], int eflags)
+{
+ return tre_fastwnexec(preg, string, (size_t)-1, nmatch, pmatch, eflags);
+}
+
diff --git a/usr.bin/grep/regex/fastmatch.h b/usr.bin/grep/regex/fastmatch.h
new file mode 100644
index 0000000..68a50c6
--- /dev/null
+++ b/usr.bin/grep/regex/fastmatch.h
@@ -0,0 +1,108 @@
+/* $FreeBSD$ */
+
+#ifndef FASTMATCH_H
+#define FASTMATCH_H 1
+
+#include <limits.h>
+#include <regex.h>
+#include <stdbool.h>
+#include <wchar.h>
+
+typedef struct {
+ size_t wlen;
+ size_t len;
+ wchar_t *wpattern;
+ bool *wescmap;
+ unsigned int qsBc[UCHAR_MAX + 1];
+ unsigned int *bmGs;
+ char *pattern;
+ bool *escmap;
+ unsigned int defBc;
+ void *qsBc_table;
+ unsigned int *sbmGs;
+ const char *re_endp;
+
+ /* flags */
+ bool hasdot;
+ bool bol;
+ bool eol;
+ bool word;
+ bool icase;
+ bool newline;
+ bool nosub;
+ bool matchall;
+ bool reversed;
+} fastmatch_t;
+
+extern int
+tre_fixcomp(fastmatch_t *preg, const char *regex, int cflags);
+
+extern int
+tre_fastcomp(fastmatch_t *preg, const char *regex, int cflags);
+
+extern int
+tre_fastexec(const fastmatch_t *preg, const char *string, size_t nmatch,
+ regmatch_t pmatch[], int eflags);
+
+extern void
+tre_fastfree(fastmatch_t *preg);
+
+extern int
+tre_fixwcomp(fastmatch_t *preg, const wchar_t *regex, int cflags);
+
+extern int
+tre_fastwcomp(fastmatch_t *preg, const wchar_t *regex, int cflags);
+
+extern int
+tre_fastwexec(const fastmatch_t *preg, const wchar_t *string,
+ size_t nmatch, regmatch_t pmatch[], int eflags);
+
+/* Versions with a maximum length argument and therefore the capability to
+ handle null characters in the middle of the strings. */
+extern int
+tre_fixncomp(fastmatch_t *preg, const char *regex, size_t len, int cflags);
+
+extern int
+tre_fastncomp(fastmatch_t *preg, const char *regex, size_t len, int cflags);
+
+extern int
+tre_fastnexec(const fastmatch_t *preg, const char *string, size_t len,
+ size_t nmatch, regmatch_t pmatch[], int eflags);
+
+extern int
+tre_fixwncomp(fastmatch_t *preg, const wchar_t *regex, size_t len, int cflags);
+
+extern int
+tre_fastwncomp(fastmatch_t *preg, const wchar_t *regex, size_t len, int cflags);
+
+extern int
+tre_fastwnexec(const fastmatch_t *preg, const wchar_t *string, size_t len,
+ size_t nmatch, regmatch_t pmatch[], int eflags);
+
+#define fixncomp tre_fixncomp
+#define fastncomp tre_fastncomp
+#define fixcomp tre_fixcomp
+#define fastcomp tre_fastcomp
+#define fixwncomp tre_fixwncomp
+#define fastwncomp tre_fastwncomp
+#define fixwcomp tre_fixwcomp
+#define fastwcomp tre_fastwcomp
+#define fastfree tre_fastfree
+#define fastnexec tre_fastnexec
+#define fastexec tre_fastexec
+#define fastwnexec tre_fastwnexec
+#define fastwexec tre_fastwexec
+#define fixcomp tre_fixcomp
+#define fastcomp tre_fastcomp
+#define fastexec tre_fastexec
+#define fastfree tre_fastfree
+#define fixwcomp tre_fixwcomp
+#define fastwcomp tre_fastwcomp
+#define fastwexec tre_fastwexec
+#define fixncomp tre_fixncomp
+#define fastncomp tre_fastncomp
+#define fastnexec tre_fastnexec
+#define fixwncomp tre_fixwncomp
+#define fastwncomp tre_fastwncomp
+#define fastwnexec tre_fastwnexec
+#endif /* FASTMATCH_H */
diff --git a/usr.bin/grep/regex/glue.h b/usr.bin/grep/regex/glue.h
new file mode 100644
index 0000000..2fea4fd
--- /dev/null
+++ b/usr.bin/grep/regex/glue.h
@@ -0,0 +1,67 @@
+/* $FreeBSD$ */
+
+#ifndef GLUE_H
+#define GLUE_H
+
+#include <limits.h>
+#undef RE_DUP_MAX
+#include <regex.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#define TRE_WCHAR 1
+#define TRE_MULTIBYTE 1
+#define HAVE_MBSTATE_T 1
+
+#define TRE_CHAR(n) L##n
+#define CHF "%lc"
+
+#define tre_char_t wchar_t
+#define tre_mbrtowc(pwc, s, n, ps) (mbrtowc((pwc), (s), (n), (ps)))
+#define tre_strlen wcslen
+#define tre_isspace iswspace
+#define tre_isalnum iswalnum
+
+#define REG_OK 0
+#define REG_LITERAL 0020
+#define REG_WORD 0100
+#define REG_GNU 0400
+
+#define TRE_MB_CUR_MAX MB_CUR_MAX
+
+#ifndef _GREP_DEBUG
+#define DPRINT(msg)
+#else
+#define DPRINT(msg) do {printf msg; fflush(stdout);} while(/*CONSTCOND*/0)
+#endif
+
+#define MIN(a,b) ((a > b) ? (b) : (a))
+#define MAX(a,b) ((a > b) ? (a) : (b))
+
+typedef enum { STR_WIDE, STR_BYTE, STR_MBS, STR_USER } tre_str_type_t;
+
+#define CALL_WITH_OFFSET(fn) \
+ do \
+ { \
+ size_t slen = (size_t)(pmatch[0].rm_eo - pmatch[0].rm_so); \
+ size_t offset = pmatch[0].rm_so; \
+ int ret; \
+ \
+ if ((long long)pmatch[0].rm_eo - pmatch[0].rm_so < 0) \
+ return REG_NOMATCH; \
+ ret = fn; \
+ for (unsigned i = 0; (!(eflags & REG_NOSUB) && (i < nmatch)); i++)\
+ { \
+ pmatch[i].rm_so += offset; \
+ pmatch[i].rm_eo += offset; \
+ } \
+ return ret; \
+ } while (0 /*CONSTCOND*/)
+
+int
+tre_convert_pattern(const char *regex, size_t n, tre_char_t **w,
+ size_t *wn);
+
+void
+tre_free_pattern(tre_char_t *wregex);
+#endif
diff --git a/usr.bin/grep/regex/hashtable.c b/usr.bin/grep/regex/hashtable.c
new file mode 100644
index 0000000..41ebcd1
--- /dev/null
+++ b/usr.bin/grep/regex/hashtable.c
@@ -0,0 +1,268 @@
+/* $FreeBSD$ */
+
+/*-
+ * Copyright (C) 2011 Gabor Kovesdan <gabor@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "glue.h"
+
+#include <errno.h>
+#include <inttypes.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "hashtable.h"
+
+
+/*
+ * Return a 32-bit hash of the given buffer. The init
+ * value should be 0, or the previous hash value to extend
+ * the previous hash.
+ */
+static uint32_t
+hash32_buf(const void *buf, size_t len, uint32_t hash)
+{
+ const unsigned char *p = buf;
+
+ while (len--)
+ hash = HASHSTEP(hash, *p++);
+
+ return hash;
+}
+
+/*
+ * Initializes a hash table that can hold table_size number of entries,
+ * each of which has a key of key_size bytes and a value of value_size
+ * bytes. On successful allocation returns a pointer to the hash table.
+ * Otherwise, returns NULL and sets errno to indicate the error.
+ */
+hashtable
+*hashtable_init(size_t table_size, size_t key_size, size_t value_size)
+{
+ hashtable *tbl;
+
+ DPRINT(("hashtable_init: table_size %zu, key_size %zu, value_size %zu\n",
+ table_size, key_size, value_size));
+
+ tbl = malloc(sizeof(hashtable));
+ if (tbl == NULL)
+ goto mem1;
+
+ tbl->entries = calloc(sizeof(hashtable_entry *), table_size);
+ if (tbl->entries == NULL)
+ goto mem2;
+
+ tbl->table_size = table_size;
+ tbl->usage = 0;
+ tbl->key_size = key_size;
+ tbl->value_size = value_size;
+
+ return (tbl);
+
+mem2:
+ free(tbl);
+mem1:
+ DPRINT(("hashtable_init: allocation failed\n"));
+ errno = ENOMEM;
+ return (NULL);
+}
+
+/*
+ * Places the key-value pair to the hashtable tbl.
+ * Returns:
+ * HASH_OK: if the key was not present in the hash table yet
+ * but the kay-value pair has been successfully added.
+ * HASH_UPDATED: if the value for the key has been updated with the
+ * new value.
+ * HASH_FULL: if the hash table is full and the entry could not
+ * be added.
+ * HASH_FAIL: if an error has occurred and errno has been set to
+ * indicate the error.
+ */
+int
+hashtable_put(hashtable *tbl, const void *key, const void *value)
+{
+ uint32_t hash = 0;
+
+ if (tbl->table_size == tbl->usage)
+ {
+ DPRINT(("hashtable_put: hashtable is full\n"));
+ return (HASH_FULL);
+ }
+
+ hash = hash32_buf(key, tbl->key_size, hash) % tbl->table_size;
+ DPRINT(("hashtable_put: calculated hash %" PRIu32 "\n", hash));
+
+ /*
+ * On hash collision entries are inserted at the next free space,
+ * so we have to increase the index until we either find an entry
+ * with the same key (and update it) or we find a free space.
+ */
+ for(;;)
+ {
+ if (tbl->entries[hash] == NULL)
+ break;
+ else if (memcmp(tbl->entries[hash]->key, key, tbl->key_size) == 0)
+ {
+ memcpy(tbl->entries[hash]->value, value, tbl->value_size);
+ DPRINT(("hashtable_put: effective location is %" PRIu32
+ ", entry updated\n", hash));
+ return (HASH_UPDATED);
+ }
+ if (++hash == tbl->table_size)
+ hash = 0;
+ }
+
+ DPRINT(("hashtable_put: effective location is %" PRIu32 "\n", hash));
+
+ tbl->entries[hash] = malloc(sizeof(hashtable_entry));
+ if (tbl->entries[hash] == NULL)
+ {
+ errno = ENOMEM;
+ goto mem1;
+ }
+
+ tbl->entries[hash]->key = malloc(tbl->key_size);
+ if (tbl->entries[hash]->key == NULL)
+ {
+ errno = ENOMEM;
+ goto mem2;
+ }
+
+ tbl->entries[hash]->value = malloc(tbl->value_size);
+ if (tbl->entries[hash]->value == NULL)
+ {
+ errno = ENOMEM;
+ goto mem3;
+ }
+
+ memcpy(tbl->entries[hash]->key, key, tbl->key_size);
+ memcpy(tbl->entries[hash]->value, value, tbl->value_size);
+ tbl->usage++;
+
+ DPRINT(("hashtable_put: entry successfully inserted\n"));
+
+ return (HASH_OK);
+
+mem3:
+ free(tbl->entries[hash]->key);
+mem2:
+ free(tbl->entries[hash]);
+mem1:
+ DPRINT(("hashtable_put: insertion failed\n"));
+ return (HASH_FAIL);
+}
+
+static hashtable_entry
+**hashtable_lookup(const hashtable *tbl, const void *key)
+{
+ uint32_t hash = 0;
+
+ hash = hash32_buf(key, tbl->key_size, hash) % tbl->table_size;
+
+ for (;;)
+ {
+ if (tbl->entries[hash] == NULL)
+ return (NULL);
+ else if (memcmp(key, tbl->entries[hash]->key, tbl->key_size) == 0)
+ {
+ DPRINT(("hashtable_lookup: entry found at location %" PRIu32 "\n", hash));
+ return (&tbl->entries[hash]);
+ }
+
+ if (++hash == tbl->table_size)
+ hash = 0;
+ }
+}
+
+/*
+ * Retrieves the value for key from the hash table tbl and places
+ * it to the space indicated by the value argument.
+ * Returns HASH_OK if the value has been found and retrieved or
+ * HASH_NOTFOUND otherwise.
+ */
+int
+hashtable_get(hashtable *tbl, const void *key, void *value)
+{
+ hashtable_entry **entry;
+
+ entry = hashtable_lookup(tbl, key);
+ if (entry == NULL)
+ {
+ DPRINT(("hashtable_get: entry is not available in the hashtable\n"));
+ return (HASH_NOTFOUND);
+ }
+
+ memcpy(value, (*entry)->value, tbl->value_size);
+ DPRINT(("hashtable_get: entry successfully copied into output buffer\n"));
+ return (HASH_OK);
+}
+
+/*
+ * Removes the entry with the specifified key from the hash table
+ * tbl. Returns HASH_OK if the entry has been found and removed
+ * or HASH_NOTFOUND otherwise.
+ */
+int
+hashtable_remove(hashtable *tbl, const void *key)
+{
+ hashtable_entry **entry;
+
+ entry = hashtable_lookup(tbl, key);
+ if (entry == NULL)
+ {
+ DPRINT(("hashtable_remove: entry is not available in the hashtable\n"));
+ return (HASH_NOTFOUND);
+ }
+
+ free((*entry)->key);
+ free((*entry)->value);
+ free(*entry);
+ *entry = NULL;
+
+ tbl->usage--;
+ DPRINT(("hashtable_remove: entry successfully removed\n"));
+ return (HASH_OK);
+}
+
+/*
+ * Frees the resources associated with the hash table tbl.
+ */
+void
+hashtable_free(hashtable *tbl)
+{
+ if (tbl == NULL)
+ return;
+
+ for (unsigned int i = 0; i < tbl->table_size; i++)
+ if ((tbl->entries[i] != NULL))
+ {
+ free(tbl->entries[i]->key);
+ free(tbl->entries[i]->value);
+ }
+
+ free(tbl->entries);
+ DPRINT(("hashtable_free: resources are successfully freed\n"));
+}
diff --git a/usr.bin/grep/regex/hashtable.h b/usr.bin/grep/regex/hashtable.h
new file mode 100644
index 0000000..f8b1daa
--- /dev/null
+++ b/usr.bin/grep/regex/hashtable.h
@@ -0,0 +1,35 @@
+/* $FreeBSD$ */
+
+#ifndef HASHTABLE_H
+#define HASHTABLE_H 1
+
+#include <sys/types.h>
+
+#define HASH_OK 0
+#define HASH_UPDATED 1
+#define HASH_FAIL 2
+#define HASH_FULL 3
+#define HASH_NOTFOUND 4
+
+#define HASHSTEP(x,c) (((x << 5) + x) + (c))
+
+typedef struct {
+ void *key;
+ void *value;
+} hashtable_entry;
+
+typedef struct {
+ size_t key_size;
+ size_t table_size;
+ size_t usage;
+ size_t value_size;
+ hashtable_entry **entries;
+} hashtable;
+
+void hashtable_free(hashtable *);
+int hashtable_get(hashtable *, const void *, void *);
+hashtable *hashtable_init(size_t, size_t, size_t);
+int hashtable_put(hashtable *, const void *, const void *);
+int hashtable_remove(hashtable *, const void *);
+
+#endif /* HASHTABLE.H */
diff --git a/usr.bin/grep/regex/tre-compile.c b/usr.bin/grep/regex/tre-compile.c
new file mode 100644
index 0000000..f037c49
--- /dev/null
+++ b/usr.bin/grep/regex/tre-compile.c
@@ -0,0 +1,103 @@
+/* $FreeBSD$ */
+
+#include "glue.h"
+
+#include <stdio.h>
+#include <assert.h>
+#include <errno.h>
+#include <regex.h>
+#include <string.h>
+#include <wchar.h>
+
+#include "xmalloc.h"
+
+int
+tre_convert_pattern(const char *regex, size_t n, tre_char_t **w,
+ size_t *wn)
+{
+#if TRE_WCHAR
+ tre_char_t *wregex;
+ size_t wlen;
+
+ wregex = xmalloc(sizeof(tre_char_t) * (n + 1));
+ if (wregex == NULL)
+ return REG_ESPACE;
+
+ /* If the current locale uses the standard single byte encoding of
+ characters, we don't do a multibyte string conversion. If we did,
+ many applications which use the default locale would break since
+ the default "C" locale uses the 7-bit ASCII character set, and
+ all characters with the eighth bit set would be considered invalid. */
+#if TRE_MULTIBYTE
+ if (TRE_MB_CUR_MAX == 1)
+#endif /* TRE_MULTIBYTE */
+ {
+ unsigned int i;
+ const unsigned char *str = (const unsigned char *)regex;
+ tre_char_t *wstr = wregex;
+
+ for (i = 0; i < n; i++)
+ *(wstr++) = *(str++);
+ wlen = n;
+ }
+#if TRE_MULTIBYTE
+ else
+ {
+ int consumed;
+ tre_char_t *wcptr = wregex;
+#ifdef HAVE_MBSTATE_T
+ mbstate_t state;
+ memset(&state, '\0', sizeof(state));
+#endif /* HAVE_MBSTATE_T */
+ while (n > 0)
+ {
+ consumed = tre_mbrtowc(wcptr, regex, n, &state);
+
+ switch (consumed)
+ {
+ case 0:
+ if (*regex == '\0')
+ consumed = 1;
+ else
+ {
+ xfree(wregex);
+ return REG_BADPAT;
+ }
+ break;
+ case -1:
+ DPRINT(("mbrtowc: error %d: %s.\n", errno, strerror(errno)));
+ xfree(wregex);
+ return REG_BADPAT;
+ case -2:
+ /* The last character wasn't complete. Let's not call it a
+ fatal error. */
+ consumed = n;
+ break;
+ }
+ regex += consumed;
+ n -= consumed;
+ wcptr++;
+ }
+ wlen = wcptr - wregex;
+ }
+#endif /* TRE_MULTIBYTE */
+ wregex[wlen] = L'\0';
+ *w = wregex;
+ *wn = wlen;
+ return REG_OK;
+#else /* !TRE_WCHAR */
+ {
+ *w = (tre_char_t * const *)regex;
+ *wn = n;
+ return REG_OK;
+ }
+#endif /* !TRE_WCHAR */
+}
+
+void
+tre_free_pattern(tre_char_t *wregex)
+{
+#if TRE_WCHAR
+ xfree(wregex);
+#endif
+}
diff --git a/usr.bin/grep/regex/tre-fastmatch.c b/usr.bin/grep/regex/tre-fastmatch.c
new file mode 100644
index 0000000..9c5aab1
--- /dev/null
+++ b/usr.bin/grep/regex/tre-fastmatch.c
@@ -0,0 +1,1042 @@
+/* $FreeBSD$ */
+
+/*-
+ * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
+ * Copyright (C) 2008-2011 Gabor Kovesdan <gabor@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "glue.h"
+
+#include <ctype.h>
+#include <limits.h>
+#include <regex.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef TRE_WCHAR
+#include <wchar.h>
+#include <wctype.h>
+#endif
+
+#include "hashtable.h"
+#include "tre-fastmatch.h"
+#include "xmalloc.h"
+
+static int fastcmp(const fastmatch_t *fg, const void *data,
+ tre_str_type_t type);
+
+/*
+ * Clean up if pattern compilation fails.
+ */
+#define FAIL_COMP(errcode) \
+ { \
+ if (fg->pattern) \
+ xfree(fg->pattern); \
+ if (fg->wpattern) \
+ xfree(fg->wpattern); \
+ if (fg->qsBc_table) \
+ hashtable_free(fg->qsBc_table); \
+ fg = NULL; \
+ return errcode; \
+ }
+
+/*
+ * Skips n characters in the input string and assigns the start
+ * address to startptr. Note: as per IEEE Std 1003.1-2008
+ * matching is based on bit pattern not character representations
+ * so we can handle MB strings as byte sequences just like
+ * SB strings.
+ */
+#define SKIP_CHARS(n) \
+ switch (type) \
+ { \
+ case STR_WIDE: \
+ startptr = str_wide + n; \
+ break; \
+ default: \
+ startptr = str_byte + n; \
+ }
+
+/*
+ * Converts the wide string pattern to SB/MB string and stores
+ * it in fg->pattern. Sets fg->len to the byte length of the
+ * converted string.
+ */
+#define STORE_MBS_PAT \
+ { \
+ size_t siz; \
+ \
+ siz = wcstombs(NULL, fg->wpattern, 0); \
+ if (siz == (size_t)-1) \
+ return REG_BADPAT; \
+ fg->len = siz; \
+ fg->pattern = xmalloc(siz + 1); \
+ if (fg->pattern == NULL) \
+ return REG_ESPACE; \
+ wcstombs(fg->pattern, fg->wpattern, siz); \
+ fg->pattern[siz] = '\0'; \
+ } \
+
+#define IS_OUT_OF_BOUNDS \
+ ((!fg->reversed \
+ ? ((type == STR_WIDE) ? ((j + fg->wlen) > len) \
+ : ((j + fg->len) > len)) \
+ : (j < 0)))
+
+/*
+ * Checks whether the new position after shifting in the input string
+ * is out of the bounds and break out from the loop if so.
+ */
+#define CHECKBOUNDS \
+ if (IS_OUT_OF_BOUNDS) \
+ break; \
+
+/*
+ * Shifts in the input string after a mismatch. The position of the
+ * mismatch is stored in the mismatch variable.
+ */
+#define SHIFT \
+ CHECKBOUNDS; \
+ \
+ { \
+ int r = -1; \
+ unsigned int bc = 0, gs = 0, ts; \
+ \
+ switch (type) \
+ { \
+ case STR_WIDE: \
+ if (!fg->hasdot) \
+ { \
+ if (u != 0 && (unsigned)mismatch == fg->wlen - 1 - shift) \
+ mismatch -= u; \
+ v = fg->wlen - 1 - mismatch; \
+ r = hashtable_get(fg->qsBc_table, \
+ &str_wide[!fg->reversed ? (size_t)j + fg->wlen \
+ : (size_t)j - 1], &bc); \
+ gs = fg->bmGs[mismatch]; \
+ } \
+ bc = (r == HASH_OK) ? bc : fg->defBc; \
+ DPRINT(("tre_fast_match: mismatch on character" CHF ", " \
+ "BC %d, GS %d\n", \
+ ((const tre_char_t *)startptr)[mismatch + 1], \
+ bc, gs)); \
+ break; \
+ default: \
+ if (!fg->hasdot) \
+ { \
+ if (u != 0 && (unsigned)mismatch == fg->len - 1 - shift) \
+ mismatch -= u; \
+ v = fg->len - 1 - mismatch; \
+ gs = fg->sbmGs[mismatch]; \
+ } \
+ bc = fg->qsBc[((const unsigned char *)str_byte) \
+ [!fg->reversed ? (size_t)j + fg->len \
+ : (size_t)j - 1]]; \
+ DPRINT(("tre_fast_match: mismatch on character %c, " \
+ "BC %d, GS %d\n", \
+ ((const unsigned char *)startptr)[mismatch + 1], \
+ bc, gs)); \
+ } \
+ if (fg->hasdot) \
+ shift = bc; \
+ else \
+ { \
+ ts = ((long)u - v < 0) ? 0 : (u - v); \
+ shift = MAX(ts, bc); \
+ shift = MAX(shift, gs); \
+ if (shift == gs) \
+ u = MIN((type == STR_WIDE ? fg->wlen : fg->len) - shift, v); \
+ else \
+ { \
+ if (ts < bc) \
+ shift = MAX(shift, u + 1); \
+ u = 0; \
+ } \
+ } \
+ DPRINT(("tre_fast_match: shifting %u characters\n", shift)); \
+ j = !fg->reversed ? j + shift : j - shift; \
+ }
+
+/*
+ * Normal Quick Search would require a shift based on the position the
+ * next character after the comparison is within the pattern. With
+ * wildcards, the position of the last dot effects the maximum shift
+ * distance.
+ * The closer to the end the wild card is the slower the search.
+ *
+ * Examples:
+ * Pattern Max shift
+ * ------- ---------
+ * this 5
+ * .his 4
+ * t.is 3
+ * th.s 2
+ * thi. 1
+ */
+
+/*
+ * Fills in the bad character shift array for SB/MB strings.
+ */
+#define FILL_QSBC \
+ if (fg->reversed) \
+ { \
+ _FILL_QSBC_REVERSED \
+ } \
+ else \
+ { \
+ _FILL_QSBC \
+ }
+
+#define _FILL_QSBC \
+ for (unsigned int i = 0; i <= UCHAR_MAX; i++) \
+ fg->qsBc[i] = fg->len - hasdot; \
+ for (unsigned int i = hasdot + 1; i < fg->len; i++) \
+ { \
+ fg->qsBc[(unsigned char)fg->pattern[i]] = fg->len - i; \
+ DPRINT(("BC shift for char %c is %zu\n", fg->pattern[i], \
+ fg->len - i)); \
+ if (fg->icase) \
+ { \
+ char c = islower((unsigned char)fg->pattern[i]) ? \
+ toupper((unsigned char)fg->pattern[i]) : \
+ tolower((unsigned char)fg->pattern[i]); \
+ fg->qsBc[(unsigned char)c] = fg->len - i; \
+ DPRINT(("BC shift for char %c is %zu\n", c, fg->len - i)); \
+ } \
+ }
+
+#define _FILL_QSBC_REVERSED \
+ for (unsigned int i = 0; i <= UCHAR_MAX; i++) \
+ fg->qsBc[i] = firstdot + 1; \
+ for (int i = firstdot - 1; i >= 0; i--) \
+ { \
+ fg->qsBc[(unsigned char)fg->pattern[i]] = i + 1; \
+ DPRINT(("Reverse BC shift for char %c is %d\n", fg->pattern[i], \
+ i + 1)); \
+ if (fg->icase) \
+ { \
+ char c = islower((unsigned char)fg->pattern[i]) ? \
+ toupper((unsigned char)fg->pattern[i]) : \
+ tolower((unsigned char)fg->pattern[i]); \
+ fg->qsBc[(unsigned char)c] = i + 1; \
+ DPRINT(("Reverse BC shift for char %c is %d\n", c, i + 1)); \
+ } \
+ }
+
+/*
+ * Fills in the bad character shifts into a hastable for wide strings.
+ * With wide characters it is not possible any more to use a normal
+ * array because there are too many characters and we could not
+ * provide enough memory. Fortunately, we only have to store distinct
+ * values for so many characters as the number of distinct characters
+ * in the pattern, so we can store them in a hashtable and store a
+ * default shift value for the rest.
+ */
+#define FILL_QSBC_WIDE \
+ if (fg->reversed) \
+ { \
+ _FILL_QSBC_WIDE_REVERSED \
+ } \
+ else \
+ { \
+ _FILL_QSBC_WIDE \
+ }
+
+#define _FILL_QSBC_WIDE \
+ /* Adjust the shift based on location of the last dot ('.'). */ \
+ fg->defBc = fg->wlen - whasdot; \
+ \
+ /* Preprocess pattern. */ \
+ fg->qsBc_table = hashtable_init(fg->wlen * (fg->icase ? 8 : 4), \
+ sizeof(tre_char_t), sizeof(int)); \
+ if (!fg->qsBc_table) \
+ FAIL_COMP(REG_ESPACE); \
+ for (unsigned int i = whasdot + 1; i < fg->wlen; i++) \
+ { \
+ int k = fg->wlen - i; \
+ int r; \
+ \
+ r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k); \
+ if ((r == HASH_FAIL) || (r == HASH_FULL)) \
+ FAIL_COMP(REG_ESPACE); \
+ DPRINT(("BC shift for wide char " CHF " is %d\n", fg->wpattern[i],\
+ k)); \
+ if (fg->icase) \
+ { \
+ tre_char_t wc = iswlower(fg->wpattern[i]) ? \
+ towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]); \
+ r = hashtable_put(fg->qsBc_table, &wc, &k); \
+ if ((r == HASH_FAIL) || (r == HASH_FULL)) \
+ FAIL_COMP(REG_ESPACE); \
+ DPRINT(("BC shift for wide char " CHF " is %d\n", wc, k)); \
+ } \
+ }
+
+#define _FILL_QSBC_WIDE_REVERSED \
+ /* Adjust the shift based on location of the last dot ('.'). */ \
+ fg->defBc = (size_t)wfirstdot; \
+ \
+ /* Preprocess pattern. */ \
+ fg->qsBc_table = hashtable_init(fg->wlen * (fg->icase ? 8 : 4), \
+ sizeof(tre_char_t), sizeof(int)); \
+ if (!fg->qsBc_table) \
+ FAIL_COMP(REG_ESPACE); \
+ for (int i = wfirstdot - 1; i >= 0; i--) \
+ { \
+ int k = i + 1; \
+ int r; \
+ \
+ r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k); \
+ if ((r == HASH_FAIL) || (r == HASH_FULL)) \
+ FAIL_COMP(REG_ESPACE); \
+ DPRINT(("Reverse BC shift for wide char " CHF " is %d\n", \
+ fg->wpattern[i], k)); \
+ if (fg->icase) \
+ { \
+ tre_char_t wc = iswlower(fg->wpattern[i]) ? \
+ towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]); \
+ r = hashtable_put(fg->qsBc_table, &wc, &k); \
+ if ((r == HASH_FAIL) || (r == HASH_FULL)) \
+ FAIL_COMP(REG_ESPACE); \
+ DPRINT(("Reverse BC shift for wide char " CHF " is %d\n", wc, \
+ k)); \
+ } \
+ }
+
+#ifdef _GREP_DEBUG
+#define DPRINT_BMGS(len, fmt_str, sh) \
+ for (unsigned int i = 0; i < len; i++) \
+ DPRINT((fmt_str, i, sh[i]));
+#else
+#define DPRINT_BMGS(len, fmt_str, sh) \
+ do { } while(/*CONSTCOND*/0)
+#endif
+
+/*
+ * Fills in the good suffix table for SB/MB strings.
+ */
+#define FILL_BMGS \
+ if (!fg->hasdot) \
+ { \
+ fg->sbmGs = xmalloc(fg->len * sizeof(int)); \
+ if (!fg->sbmGs) \
+ return REG_ESPACE; \
+ if (fg->len == 1) \
+ fg->sbmGs[0] = 1; \
+ else \
+ _FILL_BMGS(fg->sbmGs, fg->pattern, fg->len, false); \
+ DPRINT_BMGS(fg->len, "GS shift for pos %d is %d\n", fg->sbmGs); \
+ }
+
+/*
+ * Fills in the good suffix table for wide strings.
+ */
+#define FILL_BMGS_WIDE \
+ if (!fg->hasdot) \
+ { \
+ fg->bmGs = xmalloc(fg->wlen * sizeof(int)); \
+ if (!fg->bmGs) \
+ return REG_ESPACE; \
+ if (fg->wlen == 1) \
+ fg->bmGs[0] = 1; \
+ else \
+ _FILL_BMGS(fg->bmGs, fg->wpattern, fg->wlen, true); \
+ DPRINT_BMGS(fg->wlen, "GS shift (wide) for pos %d is %d\n", \
+ fg->bmGs); \
+ }
+
+#define _FILL_BMGS(arr, pat, plen, wide) \
+ { \
+ char *p; \
+ tre_char_t *wp; \
+ \
+ if (wide) \
+ { \
+ if (fg->icase) \
+ { \
+ wp = xmalloc(plen * sizeof(tre_char_t)); \
+ if (wp == NULL) \
+ return REG_ESPACE; \
+ for (unsigned int i = 0; i < plen; i++) \
+ wp[i] = towlower(pat[i]); \
+ _CALC_BMGS(arr, wp, plen); \
+ xfree(wp); \
+ } \
+ else \
+ _CALC_BMGS(arr, pat, plen); \
+ } \
+ else \
+ { \
+ if (fg->icase) \
+ { \
+ p = xmalloc(plen); \
+ if (p == NULL) \
+ return REG_ESPACE; \
+ for (unsigned int i = 0; i < plen; i++) \
+ p[i] = tolower(pat[i]); \
+ _CALC_BMGS(arr, p, plen); \
+ xfree(p); \
+ } \
+ else \
+ _CALC_BMGS(arr, pat, plen); \
+ } \
+ }
+
+#define _CALC_BMGS(arr, pat, plen) \
+ { \
+ int f = 0, g; \
+ \
+ int *suff = xmalloc(plen * sizeof(int)); \
+ if (suff == NULL) \
+ return REG_ESPACE; \
+ \
+ suff[plen - 1] = plen; \
+ g = plen - 1; \
+ for (int i = plen - 2; i >= 0; i--) \
+ { \
+ if (i > g && suff[i + plen - 1 - f] < i - g) \
+ suff[i] = suff[i + plen - 1 - f]; \
+ else \
+ { \
+ if (i < g) \
+ g = i; \
+ f = i; \
+ while (g >= 0 && pat[g] == pat[g + plen - 1 - f]) \
+ g--; \
+ suff[i] = f - g; \
+ } \
+ } \
+ \
+ for (unsigned int i = 0; i < plen; i++) \
+ arr[i] = plen; \
+ g = 0; \
+ for (int i = plen - 1; i >= 0; i--) \
+ if (suff[i] == i + 1) \
+ for(; (unsigned long)g < plen - 1 - i; g++) \
+ if (arr[g] == plen) \
+ arr[g] = plen - 1 - i; \
+ for (unsigned int i = 0; i <= plen - 2; i++) \
+ arr[plen - 1 - suff[i]] = plen - 1 - i; \
+ \
+ xfree(suff); \
+ }
+
+/*
+ * Copies the pattern pat having lenght n to p and stores
+ * the size in l.
+ */
+#define SAVE_PATTERN(src, srclen, dst, dstlen) \
+ dstlen = srclen; \
+ dst = xmalloc((dstlen + 1) * sizeof(tre_char_t)); \
+ if (dst == NULL) \
+ return REG_ESPACE; \
+ if (dstlen > 0) \
+ memcpy(dst, src, dstlen * sizeof(tre_char_t)); \
+ dst[dstlen] = TRE_CHAR('\0');
+
+/*
+ * Initializes pattern compiling.
+ */
+#define INIT_COMP \
+ /* Initialize. */ \
+ memset(fg, 0, sizeof(*fg)); \
+ fg->icase = (cflags & REG_ICASE); \
+ fg->word = (cflags & REG_WORD); \
+ fg->newline = (cflags & REG_NEWLINE); \
+ fg->nosub = (cflags & REG_NOSUB); \
+ \
+ /* Cannot handle REG_ICASE with MB string */ \
+ if (fg->icase && (TRE_MB_CUR_MAX > 1)) \
+ { \
+ DPRINT(("Cannot use fast matcher for MBS with REG_ICASE\n")); \
+ return REG_BADPAT; \
+ }
+
+/*
+ * Checks whether we have a 0-length pattern that will match
+ * anything. If literal is set to false, the EOL anchor is also
+ * taken into account.
+ */
+#define CHECK_MATCHALL(literal) \
+ if (!literal && n == 1 && pat[0] == TRE_CHAR('$')) \
+ { \
+ n--; \
+ fg->eol = true; \
+ } \
+ \
+ if (n == 0) \
+ { \
+ fg->matchall = true; \
+ fg->pattern = xmalloc(sizeof(char)); \
+ if (!fg->pattern) \
+ FAIL_COMP(REG_ESPACE); \
+ fg->pattern[0] = '\0'; \
+ fg->wpattern = xmalloc(sizeof(tre_char_t)); \
+ if (!fg->wpattern) \
+ FAIL_COMP(REG_ESPACE); \
+ fg->wpattern[0] = TRE_CHAR('\0'); \
+ DPRINT(("Matching every input\n")); \
+ return REG_OK; \
+ }
+
+/*
+ * Returns: REG_OK on success, error code otherwise
+ */
+int
+tre_compile_literal(fastmatch_t *fg, const tre_char_t *pat, size_t n,
+ int cflags)
+{
+ size_t hasdot = 0, whasdot = 0;
+ ssize_t firstdot = -1, wfirstdot = -1;
+
+ INIT_COMP;
+
+ CHECK_MATCHALL(true);
+
+ /* Cannot handle word boundaries with MB string */
+ if (fg->word && (TRE_MB_CUR_MAX > 1))
+ return REG_BADPAT;
+
+#ifdef TRE_WCHAR
+ SAVE_PATTERN(pat, n, fg->wpattern, fg->wlen);
+ STORE_MBS_PAT;
+#else
+ SAVE_PATTERN(pat, n, fg->pattern, fg->len);
+#endif
+
+ DPRINT(("tre_compile_literal: pattern: %s, len %zu, icase: %c, word: %c, "
+ "newline %c\n", fg->pattern, fg->len, fg->icase ? 'y' : 'n',
+ fg->word ? 'y' : 'n', fg->newline ? 'y' : 'n'));
+
+ FILL_QSBC;
+ FILL_BMGS;
+#ifdef TRE_WCHAR
+ FILL_QSBC_WIDE;
+ FILL_BMGS_WIDE;
+#endif
+
+ return REG_OK;
+}
+
+/*
+ * Returns: REG_OK on success, error code otherwise
+ */
+int
+tre_compile_fast(fastmatch_t *fg, const tre_char_t *pat, size_t n,
+ int cflags)
+{
+ tre_char_t *tmp;
+ size_t pos = 0, hasdot = 0, whasdot = 0;;
+ ssize_t firstdot = -1, wfirstdot = -1;
+ bool escaped = false;
+ bool *_escmap = NULL;
+
+ INIT_COMP;
+
+ /* Remove beginning-of-line character ('^'). */
+ if (pat[0] == TRE_CHAR('^'))
+ {
+ fg->bol = true;
+ n--;
+ pat++;
+ }
+
+ CHECK_MATCHALL(false);
+
+ /* Handle word-boundary matching when GNU extensions are enabled */
+ if ((cflags & REG_GNU) && (n >= 14) &&
+ (memcmp(pat, TRE_CHAR("[[:<:]]"), 7 * sizeof(tre_char_t)) == 0) &&
+ (memcmp(pat + n - 7, TRE_CHAR("[[:>:]]"),
+ 7 * sizeof(tre_char_t)) == 0))
+ {
+ n -= 14;
+ pat += 7;
+ fg->word = true;
+ }
+
+ /* Cannot handle word boundaries with MB string */
+ if (fg->word && (TRE_MB_CUR_MAX > 1))
+ return REG_BADPAT;
+
+ tmp = xmalloc((n + 1) * sizeof(tre_char_t));
+ if (tmp == NULL)
+ return REG_ESPACE;
+
+/* Copies the char into the stored pattern and skips to the next char. */
+#define STORE_CHAR \
+ do \
+ { \
+ tmp[pos++] = pat[i]; \
+ escaped = false; \
+ continue; \
+ } while (0)
+
+ /* Traverse the input pattern for processing */
+ for (unsigned int i = 0; i < n; i++)
+ {
+ switch (pat[i])
+ {
+ case TRE_CHAR('\\'):
+ if (escaped)
+ STORE_CHAR;
+ else if (i == n - 1)
+ goto badpat;
+ else
+ escaped = true;
+ continue;
+ case TRE_CHAR('['):
+ if (escaped)
+ STORE_CHAR;
+ else
+ goto badpat;
+ continue;
+ case TRE_CHAR('*'):
+ if (escaped || (!(cflags & REG_EXTENDED) && (i == 0)))
+ STORE_CHAR;
+ else
+ goto badpat;
+ continue;
+ case TRE_CHAR('+'):
+ case TRE_CHAR('?'):
+ if ((cflags & REG_EXTENDED) && (i == 0))
+ continue;
+ else if ((cflags & REG_EXTENDED) ^ !escaped)
+ STORE_CHAR;
+ else
+ goto badpat;
+ continue;
+ case TRE_CHAR('.'):
+ if (escaped)
+ {
+ if (!_escmap)
+ _escmap = xmalloc(n * sizeof(bool));
+ if (!_escmap)
+ {
+ xfree(tmp);
+ return REG_ESPACE;
+ }
+ _escmap[i] = true;
+ STORE_CHAR;
+ }
+ else
+ {
+ whasdot = i;
+ if (wfirstdot == -1)
+ wfirstdot = i;
+ STORE_CHAR;
+ }
+ continue;
+ case TRE_CHAR('^'):
+ STORE_CHAR;
+ continue;
+ case TRE_CHAR('$'):
+ if (!escaped && (i == n - 1))
+ fg->eol = true;
+ else
+ STORE_CHAR;
+ continue;
+ case TRE_CHAR('('):
+ if ((cflags & REG_EXTENDED) ^ escaped)
+ goto badpat;
+ else
+ STORE_CHAR;
+ continue;
+ case TRE_CHAR('{'):
+ if (!(cflags & REG_EXTENDED) ^ escaped)
+ STORE_CHAR;
+ else if (!(cflags & REG_EXTENDED) && (i == 0))
+ STORE_CHAR;
+ else if ((cflags & REG_EXTENDED) && (i == 0))
+ continue;
+ else
+ goto badpat;
+ continue;
+ case TRE_CHAR('|'):
+ if ((cflags & REG_EXTENDED) ^ escaped)
+ goto badpat;
+ else
+ STORE_CHAR;
+ continue;
+ default:
+ if (escaped)
+ goto badpat;
+ else
+ STORE_CHAR;
+ continue;
+ }
+ continue;
+badpat:
+ xfree(tmp);
+ DPRINT(("tre_compile_fast: compilation of pattern failed, falling"
+ "back to NFA\n"));
+ return REG_BADPAT;
+ }
+
+ fg->hasdot = whasdot;
+
+ /*
+ * The pattern has been processed and copied to tmp as a literal string
+ * with escapes, anchors (^$) and the word boundary match character
+ * classes stripped out.
+ */
+#ifdef TRE_WCHAR
+ SAVE_PATTERN(tmp, pos, fg->wpattern, fg->wlen);
+ fg->wescmap = _escmap;
+ STORE_MBS_PAT;
+
+ /*
+ * The position of dots and escaped dots is different in the MB string
+ * than in to the wide string so traverse the converted string, as well,
+ * to store these positions.
+ */
+ if (fg->hasdot || (fg->wescmap != NULL))
+ {
+ if (fg->wescmap != NULL)
+ {
+ fg->escmap = xmalloc(fg->len * sizeof(bool));
+ if (!fg->escmap)
+ {
+ tre_free_fast(fg);
+ return REG_ESPACE;
+ }
+ }
+
+ escaped = false;
+ for (unsigned int i = 0; i < fg->len; i++)
+ if (fg->pattern[i] == '\\')
+ escaped = !escaped;
+ else if (fg->pattern[i] == '.' && escaped)
+ {
+ fg->escmap[i] = true;
+ escaped = false;
+ }
+ else if (fg->pattern[i] == '.' && !escaped)
+ {
+ hasdot = i;
+ if (firstdot == -1)
+ firstdot = i;
+ }
+ else
+ escaped = false;
+ }
+#else
+ SAVE_PATTERN(tmp, pos, fg->pattern, fg->len);
+ fg->escmap = _escmap;
+#endif
+
+ xfree(tmp);
+
+ DPRINT(("tre_compile_fast: pattern: %s, len %zu, bol %c, eol %c, "
+ "icase: %c, word: %c, newline %c\n", fg->pattern, fg->len,
+ fg->bol ? 'y' : 'n', fg->eol ? 'y' : 'n',
+ fg->icase ? 'y' : 'n', fg->word ? 'y' : 'n',
+ fg->newline ? 'y' : 'n'));
+
+ /* Check whether reverse QS algorithm is more efficient */
+ if ((wfirstdot > -1) && (fg->wlen - whasdot + 1 < (size_t)wfirstdot) &&
+ fg->nosub)
+ {
+ fg->reversed = true;
+ DPRINT(("tre_compile_fast: using reverse QS algorithm\n"));
+ }
+
+ FILL_QSBC;
+ FILL_BMGS;
+#ifdef TRE_WCHAR
+ FILL_QSBC_WIDE;
+ FILL_BMGS_WIDE;
+#endif
+
+ return REG_OK;
+}
+
+#define _SHIFT_ONE \
+ { \
+ shift = 1; \
+ j = !fg->reversed ? j + shift : j - shift; \
+ continue; \
+ }
+
+#define _BBOUND_COND \
+ ((type == STR_WIDE) ? \
+ ((j == 0) || !(tre_isalnum(str_wide[j - 1]) || \
+ (str_wide[j - 1] == TRE_CHAR('_')))) : \
+ ((j == 0) || !(tre_isalnum(str_byte[j - 1]) || \
+ (str_byte[j - 1] == '_'))))
+
+#define _EBOUND_COND \
+ ((type == STR_WIDE) ? \
+ ((j + fg->wlen == len) || !(tre_isalnum(str_wide[j + fg->wlen]) || \
+ (str_wide[j + fg->wlen] == TRE_CHAR('_')))) : \
+ ((j + fg->len == len) || !(tre_isalnum(str_byte[j + fg->len]) || \
+ (str_byte[j + fg->len] == '_'))))
+
+/*
+ * Condition to check whether the match on position j is on a
+ * word boundary.
+ */
+#define IS_ON_WORD_BOUNDARY \
+ (_BBOUND_COND && _EBOUND_COND)
+
+/*
+ * Checks word boundary and shifts one if match is not on a
+ * boundary.
+ */
+#define CHECK_WORD_BOUNDARY \
+ if (!IS_ON_WORD_BOUNDARY) \
+ _SHIFT_ONE;
+
+#define _BOL_COND \
+ ((j == 0) || ((type == STR_WIDE) ? (str_wide[j - 1] == TRE_CHAR('\n'))\
+ : (str_byte[j - 1] == '\n')))
+
+/*
+ * Checks BOL anchor and shifts one if match is not on a
+ * boundary.
+ */
+#define CHECK_BOL_ANCHOR \
+ if (!_BOL_COND) \
+ _SHIFT_ONE;
+
+#define _EOL_COND \
+ ((type == STR_WIDE) \
+ ? ((j + fg->wlen == len) || \
+ (str_wide[j + fg->wlen] == TRE_CHAR('\n'))) \
+ : ((j + fg->len == len) || (str_byte[j + fg->wlen] == '\n')))
+
+/*
+ * Checks EOL anchor and shifts one if match is not on a
+ * boundary.
+ */
+#define CHECK_EOL_ANCHOR \
+ if (!_EOL_COND) \
+ _SHIFT_ONE;
+
+/*
+ * Executes matching of the precompiled pattern on the input string.
+ * Returns REG_OK or REG_NOMATCH depending on if we find a match or not.
+ */
+int
+tre_match_fast(const fastmatch_t *fg, const void *data, size_t len,
+ tre_str_type_t type, int nmatch, regmatch_t pmatch[], int eflags)
+{
+ unsigned int shift, u = 0, v = 0;
+ ssize_t j = 0;
+ int ret = REG_NOMATCH;
+ int mismatch;
+ const char *str_byte = data;
+ const void *startptr = NULL;
+ const tre_char_t *str_wide = data;
+
+ /* Calculate length if unspecified. */
+ if (len == (size_t)-1)
+ switch (type)
+ {
+ case STR_WIDE:
+ len = tre_strlen(str_wide);
+ break;
+ default:
+ len = strlen(str_byte);
+ break;
+ }
+
+ /* Shortcut for empty pattern */
+ if (fg->matchall)
+ {
+ if (!fg->nosub && nmatch >= 1)
+ {
+ pmatch[0].rm_so = 0;
+ pmatch[0].rm_eo = len;
+ }
+ if (fg->bol && fg->eol)
+ return (len == 0) ? REG_OK : REG_NOMATCH;
+ else
+ return REG_OK;
+ }
+
+ /* No point in going farther if we do not have enough data. */
+ switch (type)
+ {
+ case STR_WIDE:
+ if (len < fg->wlen)
+ return ret;
+ shift = fg->wlen;
+ break;
+ default:
+ if (len < fg->len)
+ return ret;
+ shift = fg->len;
+ }
+
+ /*
+ * REG_NOTBOL means not anchoring ^ to the beginning of the line, so we
+ * can shift one because there can't be a match at the beginning.
+ */
+ if (fg->bol && (eflags & REG_NOTBOL))
+ j = 1;
+
+ /*
+ * Like above, we cannot have a match at the very end when anchoring to
+ * the end and REG_NOTEOL is specified.
+ */
+ if (fg->eol && (eflags & REG_NOTEOL))
+ len--;
+
+ if (fg->reversed)
+ j = len - (type == STR_WIDE ? fg->wlen : fg->len);
+
+
+ /* Only try once at the beginning or ending of the line. */
+ if ((fg->bol || fg->eol) && !fg->newline && !(eflags & REG_NOTBOL) &&
+ !(eflags & REG_NOTEOL))
+ {
+ /* Simple text comparison. */
+ if (!((fg->bol && fg->eol) &&
+ (type == STR_WIDE ? (len != fg->wlen) : (len != fg->len))))
+ {
+ /* Determine where in data to start search at. */
+ j = fg->eol ? len - (type == STR_WIDE ? fg->wlen : fg->len) : 0;
+ SKIP_CHARS(j);
+ mismatch = fastcmp(fg, startptr, type);
+ if (mismatch == REG_OK)
+ {
+ if (fg->word && !IS_ON_WORD_BOUNDARY)
+ return ret;
+ if (!fg->nosub && nmatch >= 1)
+ {
+ pmatch[0].rm_so = j;
+ pmatch[0].rm_eo = j + (type == STR_WIDE ? fg->wlen : fg->len);
+ }
+ return REG_OK;
+ }
+ }
+ }
+ else
+ {
+ /* Quick Search / Turbo Boyer-Moore algorithm. */
+ do
+ {
+ SKIP_CHARS(j);
+ mismatch = fastcmp(fg, startptr, type);
+ if (mismatch == REG_OK)
+ {
+ if (fg->word)
+ CHECK_WORD_BOUNDARY;
+ if (fg->bol)
+ CHECK_BOL_ANCHOR;
+ if (fg->eol)
+ CHECK_EOL_ANCHOR;
+ if (!fg->nosub && nmatch >= 1)
+ {
+ pmatch[0].rm_so = j;
+ pmatch[0].rm_eo = j + ((type == STR_WIDE) ? fg->wlen : fg->len);
+ }
+ return REG_OK;
+ }
+ else if (mismatch > 0)
+ return mismatch;
+ mismatch = -mismatch - 1;
+ SHIFT;
+ } while (!IS_OUT_OF_BOUNDS);
+ }
+ return ret;
+}
+
+/*
+ * Frees the resources that were allocated when the pattern was compiled.
+ */
+void
+tre_free_fast(fastmatch_t *fg)
+{
+
+ DPRINT(("tre_fast_free: freeing structures for pattern %s\n",
+ fg->pattern));
+
+#ifdef TRE_WCHAR
+ hashtable_free(fg->qsBc_table);
+ if (!fg->hasdot)
+ xfree(fg->bmGs);
+ if (fg->wescmap)
+ xfree(fg->wescmap);
+ xfree(fg->wpattern);
+#endif
+ if (!fg->hasdot)
+ xfree(fg->sbmGs);
+ if (fg->escmap)
+ xfree(fg->escmap);
+ xfree(fg->pattern);
+}
+
+/*
+ * Returns: -(i + 1) on failure (position that it failed with minus sign)
+ * error code on error
+ * REG_OK on success
+ */
+static inline int
+fastcmp(const fastmatch_t *fg, const void *data, tre_str_type_t type)
+{
+ const char *str_byte = data;
+ const char *pat_byte = fg->pattern;
+ const tre_char_t *str_wide = data;
+ const tre_char_t *pat_wide = fg->wpattern;
+ const bool *escmap = (type == STR_WIDE) ? fg->wescmap : fg->escmap;
+ size_t len = (type == STR_WIDE) ? fg->wlen : fg->len;
+ int ret = REG_OK;
+
+ /* Compare the pattern and the input char-by-char from the last position. */
+ for (int i = len - 1; i >= 0; i--) {
+ switch (type)
+ {
+ case STR_WIDE:
+
+ /* Check dot */
+ if (fg->hasdot && pat_wide[i] == TRE_CHAR('.') &&
+ (!escmap || !escmap[i]) &&
+ (!fg->newline || (str_wide[i] != TRE_CHAR('\n'))))
+ continue;
+
+ /* Compare */
+ if (fg->icase ? (towlower(pat_wide[i]) == towlower(str_wide[i]))
+ : (pat_wide[i] == str_wide[i]))
+ continue;
+ break;
+ default:
+ /* Check dot */
+ if (fg->hasdot && pat_byte[i] == '.' &&
+ (!escmap || !escmap[i]) &&
+ (!fg->newline || (str_byte[i] != '\n')))
+ continue;
+
+ /* Compare */
+ if (fg->icase ? (tolower(pat_byte[i]) == tolower(str_byte[i]))
+ : (pat_byte[i] == str_byte[i]))
+ continue;
+ }
+ DPRINT(("fastcmp: mismatch at position %d\n", i));
+ ret = -(i + 1);
+ break;
+ }
+ return ret;
+}
diff --git a/usr.bin/grep/regex/tre-fastmatch.h b/usr.bin/grep/regex/tre-fastmatch.h
new file mode 100644
index 0000000..f72397c
--- /dev/null
+++ b/usr.bin/grep/regex/tre-fastmatch.h
@@ -0,0 +1,21 @@
+/* $FreeBSD$ */
+
+#ifndef TRE_FASTMATCH_H
+#define TRE_FASTMATCH_H 1
+
+#include <fastmatch.h>
+#include <hashtable.h>
+#include <limits.h>
+#include <regex.h>
+#include <stdbool.h>
+
+#include "hashtable.h"
+
+int tre_compile_literal(fastmatch_t *preg, const tre_char_t *regex,
+ size_t, int);
+int tre_compile_fast(fastmatch_t *preg, const tre_char_t *regex, size_t, int);
+int tre_match_fast(const fastmatch_t *fg, const void *data, size_t len,
+ tre_str_type_t type, int nmatch, regmatch_t pmatch[], int eflags);
+void tre_free_fast(fastmatch_t *preg);
+
+#endif /* TRE_FASTMATCH_H */
diff --git a/usr.bin/grep/regex/xmalloc.c b/usr.bin/grep/regex/xmalloc.c
new file mode 100644
index 0000000..b1acc9c
--- /dev/null
+++ b/usr.bin/grep/regex/xmalloc.c
@@ -0,0 +1,349 @@
+/* $FreeBSD$ */
+
+/*
+ xmalloc.c - Simple malloc debugging library implementation
+
+ This software is released under a BSD-style license.
+ See the file LICENSE for details and copyright.
+
+*/
+
+/*
+ TODO:
+ - red zones
+ - group dumps by source location
+*/
+
+#include <stdlib.h>
+#include <assert.h>
+#include <stdio.h>
+#define XMALLOC_INTERNAL 1
+#include "xmalloc.h"
+
+
+/*
+ Internal stuff.
+*/
+
+typedef struct hashTableItemRec {
+ void *ptr;
+ int bytes;
+ const char *file;
+ int line;
+ const char *func;
+ struct hashTableItemRec *next;
+} hashTableItem;
+
+typedef struct {
+ hashTableItem **table;
+} hashTable;
+
+static int xmalloc_peak;
+int xmalloc_current;
+static int xmalloc_peak_blocks;
+int xmalloc_current_blocks;
+static int xmalloc_fail_after;
+
+#define TABLE_BITS 8
+#define TABLE_MASK ((1 << TABLE_BITS) - 1)
+#define TABLE_SIZE (1 << TABLE_BITS)
+
+static hashTable *
+hash_table_new(void)
+{
+ hashTable *tbl;
+
+ tbl = malloc(sizeof(*tbl));
+
+ if (tbl != NULL)
+ {
+ tbl->table = calloc(TABLE_SIZE, sizeof(*tbl->table));
+
+ if (tbl->table == NULL)
+ {
+ free(tbl);
+ return NULL;
+ }
+ }
+
+ return tbl;
+}
+
+static int
+hash_void_ptr(void *ptr)
+{
+ int hash;
+ int i;
+
+ /* I took this hash function just off the top of my head, I have
+ no idea whether it is bad or very bad. */
+ hash = 0;
+ for (i = 0; i < (int)sizeof(ptr)*8 / TABLE_BITS; i++)
+ {
+ hash ^= (unsigned long)ptr >> i*8;
+ hash += i * 17;
+ hash &= TABLE_MASK;
+ }
+ return hash;
+}
+
+static void
+hash_table_add(hashTable *tbl, void *ptr, int bytes,
+ const char *file, int line, const char *func)
+{
+ int i;
+ hashTableItem *item, *new;
+
+ i = hash_void_ptr(ptr);
+
+ item = tbl->table[i];
+ if (item != NULL)
+ while (item->next != NULL)
+ item = item->next;
+
+ new = malloc(sizeof(*new));
+ assert(new != NULL);
+ new->ptr = ptr;
+ new->bytes = bytes;
+ new->file = file;
+ new->line = line;
+ new->func = func;
+ new->next = NULL;
+ if (item != NULL)
+ item->next = new;
+ else
+ tbl->table[i] = new;
+
+ xmalloc_current += bytes;
+ if (xmalloc_current > xmalloc_peak)
+ xmalloc_peak = xmalloc_current;
+ xmalloc_current_blocks++;
+ if (xmalloc_current_blocks > xmalloc_peak_blocks)
+ xmalloc_peak_blocks = xmalloc_current_blocks;
+}
+
+static void
+hash_table_del(hashTable *tbl, void *ptr)
+{
+ int i;
+ hashTableItem *item, *prev;
+
+ i = hash_void_ptr(ptr);
+
+ item = tbl->table[i];
+ if (item == NULL)
+ {
+ printf("xfree: invalid ptr %p\n", ptr);
+ abort();
+ }
+ prev = NULL;
+ while (item->ptr != ptr)
+ {
+ prev = item;
+ item = item->next;
+ }
+ if (item->ptr != ptr)
+ {
+ printf("xfree: invalid ptr %p\n", ptr);
+ abort();
+ }
+
+ xmalloc_current -= item->bytes;
+ xmalloc_current_blocks--;
+
+ if (prev != NULL)
+ {
+ prev->next = item->next;
+ free(item);
+ }
+ else
+ {
+ tbl->table[i] = item->next;
+ free(item);
+ }
+}
+
+static hashTable *xmalloc_table = NULL;
+
+static void
+xmalloc_init(void)
+{
+ if (xmalloc_table == NULL)
+ {
+ xmalloc_table = hash_table_new();
+ xmalloc_peak = 0;
+ xmalloc_peak_blocks = 0;
+ xmalloc_current = 0;
+ xmalloc_current_blocks = 0;
+ xmalloc_fail_after = -1;
+ }
+ assert(xmalloc_table != NULL);
+ assert(xmalloc_table->table != NULL);
+}
+
+
+
+/*
+ Public API.
+*/
+
+void
+xmalloc_configure(int fail_after)
+{
+ xmalloc_init();
+ xmalloc_fail_after = fail_after;
+}
+
+int
+xmalloc_dump_leaks(void)
+{
+ int i;
+ int num_leaks = 0;
+ int leaked_bytes = 0;
+ hashTableItem *item;
+
+ xmalloc_init();
+
+ for (i = 0; i < TABLE_SIZE; i++)
+ {
+ item = xmalloc_table->table[i];
+ while (item != NULL)
+ {
+ printf("%s:%d: %s: %d bytes at %p not freed\n",
+ item->file, item->line, item->func, item->bytes, item->ptr);
+ num_leaks++;
+ leaked_bytes += item->bytes;
+ item = item->next;
+ }
+ }
+ if (num_leaks == 0)
+ printf("No memory leaks.\n");
+ else
+ printf("%d unfreed memory chuncks, total %d unfreed bytes.\n",
+ num_leaks, leaked_bytes);
+ printf("Peak memory consumption %d bytes (%.1f kB, %.1f MB) in %d blocks ",
+ xmalloc_peak, (double)xmalloc_peak / 1024,
+ (double)xmalloc_peak / (1024*1024), xmalloc_peak_blocks);
+ printf("(average ");
+ if (xmalloc_peak_blocks)
+ printf("%d", ((xmalloc_peak + xmalloc_peak_blocks / 2)
+ / xmalloc_peak_blocks));
+ else
+ printf("N/A");
+ printf(" bytes per block).\n");
+
+ return num_leaks;
+}
+
+void *
+xmalloc_impl(size_t size, const char *file, int line, const char *func)
+{
+ void *ptr;
+
+ xmalloc_init();
+ assert(size > 0);
+
+ if (xmalloc_fail_after == 0)
+ {
+ xmalloc_fail_after = -2;
+#if 0
+ printf("xmalloc: forced failure %s:%d: %s\n", file, line, func);
+#endif
+ return NULL;
+ }
+ else if (xmalloc_fail_after == -2)
+ {
+ printf("xmalloc: called after failure from %s:%d: %s\n",
+ file, line, func);
+ assert(0);
+ }
+ else if (xmalloc_fail_after > 0)
+ xmalloc_fail_after--;
+
+ ptr = malloc(size);
+ if (ptr != NULL)
+ hash_table_add(xmalloc_table, ptr, (int)size, file, line, func);
+ return ptr;
+}
+
+void *
+xcalloc_impl(size_t nmemb, size_t size, const char *file, int line,
+ const char *func)
+{
+ void *ptr;
+
+ xmalloc_init();
+ assert(size > 0);
+
+ if (xmalloc_fail_after == 0)
+ {
+ xmalloc_fail_after = -2;
+#if 0
+ printf("xcalloc: forced failure %s:%d: %s\n", file, line, func);
+#endif
+ return NULL;
+ }
+ else if (xmalloc_fail_after == -2)
+ {
+ printf("xcalloc: called after failure from %s:%d: %s\n",
+ file, line, func);
+ assert(0);
+ }
+ else if (xmalloc_fail_after > 0)
+ xmalloc_fail_after--;
+
+ ptr = calloc(nmemb, size);
+ if (ptr != NULL)
+ hash_table_add(xmalloc_table, ptr, (int)(nmemb * size), file, line, func);
+ return ptr;
+}
+
+void
+xfree_impl(void *ptr, const char *file, int line, const char *func)
+{
+ /*LINTED*/(void)&file;
+ /*LINTED*/(void)&line;
+ /*LINTED*/(void)&func;
+ xmalloc_init();
+
+ if (ptr != NULL)
+ hash_table_del(xmalloc_table, ptr);
+ free(ptr);
+}
+
+void *
+xrealloc_impl(void *ptr, size_t new_size, const char *file, int line,
+ const char *func)
+{
+ void *new_ptr;
+
+ xmalloc_init();
+ assert(ptr != NULL);
+ assert(new_size > 0);
+
+ if (xmalloc_fail_after == 0)
+ {
+ xmalloc_fail_after = -2;
+ return NULL;
+ }
+ else if (xmalloc_fail_after == -2)
+ {
+ printf("xrealloc: called after failure from %s:%d: %s\n",
+ file, line, func);
+ assert(0);
+ }
+ else if (xmalloc_fail_after > 0)
+ xmalloc_fail_after--;
+
+ new_ptr = realloc(ptr, new_size);
+ if (new_ptr != NULL)
+ {
+ hash_table_del(xmalloc_table, ptr);
+ hash_table_add(xmalloc_table, new_ptr, (int)new_size, file, line, func);
+ }
+ return new_ptr;
+}
+
+
+
+/* EOF */
diff --git a/usr.bin/grep/regex/xmalloc.h b/usr.bin/grep/regex/xmalloc.h
new file mode 100644
index 0000000..5cde986
--- /dev/null
+++ b/usr.bin/grep/regex/xmalloc.h
@@ -0,0 +1,79 @@
+/* $FreeBSD$ */
+
+/*
+ xmalloc.h - Simple malloc debugging library API
+
+ This software is released under a BSD-style license.
+ See the file LICENSE for details and copyright.
+
+*/
+
+#ifndef _XMALLOC_H
+#define _XMALLOC_H 1
+
+void *xmalloc_impl(size_t size, const char *file, int line, const char *func);
+void *xcalloc_impl(size_t nmemb, size_t size, const char *file, int line,
+ const char *func);
+void xfree_impl(void *ptr, const char *file, int line, const char *func);
+void *xrealloc_impl(void *ptr, size_t new_size, const char *file, int line,
+ const char *func);
+int xmalloc_dump_leaks(void);
+void xmalloc_configure(int fail_after);
+
+
+#ifndef XMALLOC_INTERNAL
+#ifdef MALLOC_DEBUGGING
+
+/* Version 2.4 and later of GCC define a magical variable `__PRETTY_FUNCTION__'
+ which contains the name of the function currently being defined.
+# define __XMALLOC_FUNCTION __PRETTY_FUNCTION__
+ This is broken in G++ before version 2.6.
+ C9x has a similar variable called __func__, but prefer the GCC one since
+ it demangles C++ function names. */
+# ifdef __GNUC__
+# if __GNUC__ > 2 || (__GNUC__ == 2 \
+ && __GNUC_MINOR__ >= (defined __cplusplus ? 6 : 4))
+# define __XMALLOC_FUNCTION __PRETTY_FUNCTION__
+# else
+# define __XMALLOC_FUNCTION ((const char *) 0)
+# endif
+# else
+# if defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L
+# define __XMALLOC_FUNCTION __func__
+# else
+# define __XMALLOC_FUNCTION ((const char *) 0)
+# endif
+# endif
+
+#define xmalloc(size) xmalloc_impl(size, __FILE__, __LINE__, \
+ __XMALLOC_FUNCTION)
+#define xcalloc(nmemb, size) xcalloc_impl(nmemb, size, __FILE__, __LINE__, \
+ __XMALLOC_FUNCTION)
+#define xfree(ptr) xfree_impl(ptr, __FILE__, __LINE__, __XMALLOC_FUNCTION)
+#define xrealloc(ptr, new_size) xrealloc_impl(ptr, new_size, __FILE__, \
+ __LINE__, __XMALLOC_FUNCTION)
+#undef malloc
+#undef calloc
+#undef free
+#undef realloc
+
+#define malloc USE_XMALLOC_INSTEAD_OF_MALLOC
+#define calloc USE_XCALLOC_INSTEAD_OF_CALLOC
+#define free USE_XFREE_INSTEAD_OF_FREE
+#define realloc USE_XREALLOC_INSTEAD_OF_REALLOC
+
+#else /* !MALLOC_DEBUGGING */
+
+#include <stdlib.h>
+
+#define xmalloc(size) malloc(size)
+#define xcalloc(nmemb, size) calloc(nmemb, size)
+#define xfree(ptr) free(ptr)
+#define xrealloc(ptr, new_size) realloc(ptr, new_size)
+
+#endif /* !MALLOC_DEBUGGING */
+#endif /* !XMALLOC_INTERNAL */
+
+#endif /* _XMALLOC_H */
+
+/* EOF */
diff --git a/usr.bin/grep/util.c b/usr.bin/grep/util.c
index 780f0fc..e209968 100644
--- a/usr.bin/grep/util.c
+++ b/usr.bin/grep/util.c
@@ -49,6 +49,7 @@ __FBSDID("$FreeBSD$");
#include <wchar.h>
#include <wctype.h>
+#include "fastmatch.h"
#include "grep.h"
static int linesqueued;
@@ -232,13 +233,8 @@ procfile(const char *fn)
linesqueued++;
}
c += t;
-
- /* Count the matches if we have a match limit */
- if (mflag) {
- mcount -= t;
- if (mcount <= 0)
- break;
- }
+ if (mflag && mcount < 0)
+ break;
}
if (Bflag > 0)
clearqueue();
@@ -280,79 +276,77 @@ procline(struct str *l, int nottext)
unsigned int i;
int c = 0, m = 0, r = 0;
- if (!matchall) {
- /* Loop to process the whole line */
- while (st <= l->len) {
- pmatch.rm_so = st;
- pmatch.rm_eo = l->len;
+ /* Loop to process the whole line */
+ while (st <= l->len) {
+ pmatch.rm_so = st;
+ pmatch.rm_eo = l->len;
- /* Loop to compare with all the patterns */
- for (i = 0; i < patterns; i++) {
-/*
- * XXX: grep_search() is a workaround for speed up and should be
- * removed in the future. See fastgrep.c.
- */
- if (fg_pattern[i].pattern)
- r = grep_search(&fg_pattern[i],
- (unsigned char *)l->dat,
- l->len, &pmatch);
- else
- r = regexec(&r_pattern[i], l->dat, 1,
- &pmatch, eflags);
- r = (r == 0) ? 0 : REG_NOMATCH;
- st = (cflags & REG_NOSUB)
- ? (size_t)l->len
- : (size_t)pmatch.rm_eo;
- if (r == REG_NOMATCH)
- continue;
- /* Check for full match */
- if (r == 0 && xflag)
- if (pmatch.rm_so != 0 ||
- (size_t)pmatch.rm_eo != l->len)
- r = REG_NOMATCH;
- /* Check for whole word match */
- if (r == 0 && (wflag || fg_pattern[i].word)) {
- wint_t wbegin, wend;
-
- wbegin = wend = L' ';
- if (pmatch.rm_so != 0 &&
- sscanf(&l->dat[pmatch.rm_so - 1],
- "%lc", &wbegin) != 1)
- r = REG_NOMATCH;
- else if ((size_t)pmatch.rm_eo !=
- l->len &&
- sscanf(&l->dat[pmatch.rm_eo],
- "%lc", &wend) != 1)
- r = REG_NOMATCH;
- else if (iswword(wbegin) ||
- iswword(wend))
- r = REG_NOMATCH;
- }
- if (r == 0) {
- if (m == 0)
- c++;
- if (m < MAX_LINE_MATCHES)
- matches[m++] = pmatch;
- /* matches - skip further patterns */
- if ((color == NULL && !oflag) ||
- qflag || lflag)
- break;
- }
+ /* Loop to compare with all the patterns */
+ for (i = 0; i < patterns; i++) {
+ if (fg_pattern[i].pattern)
+ r = fastexec(&fg_pattern[i],
+ l->dat, 1, &pmatch, eflags);
+ else
+ r = regexec(&r_pattern[i], l->dat, 1,
+ &pmatch, eflags);
+ r = (r == 0) ? 0 : REG_NOMATCH;
+ st = (cflags & REG_NOSUB)
+ ? (size_t)l->len
+ : (size_t)pmatch.rm_eo;
+ if (r == REG_NOMATCH)
+ continue;
+ /* Check for full match */
+ if (r == 0 && xflag)
+ if (pmatch.rm_so != 0 ||
+ (size_t)pmatch.rm_eo != l->len)
+ r = REG_NOMATCH;
+ /* Check for whole word match */
+ if (r == 0 && (wflag || fg_pattern[i].word)) {
+ wint_t wbegin, wend;
+
+ wbegin = wend = L' ';
+ if (pmatch.rm_so != 0 &&
+ sscanf(&l->dat[pmatch.rm_so - 1],
+ "%lc", &wbegin) != 1)
+ r = REG_NOMATCH;
+ else if ((size_t)pmatch.rm_eo !=
+ l->len &&
+ sscanf(&l->dat[pmatch.rm_eo],
+ "%lc", &wend) != 1)
+ r = REG_NOMATCH;
+ else if (iswword(wbegin) ||
+ iswword(wend))
+ r = REG_NOMATCH;
}
-
- if (vflag) {
- c = !c;
- break;
+ if (r == 0) {
+ if (m == 0)
+ c++;
+ if (m < MAX_LINE_MATCHES)
+ matches[m++] = pmatch;
+ /* matches - skip further patterns */
+ if ((color == NULL && !oflag) ||
+ qflag || lflag)
+ break;
}
- /* One pass if we are not recording matches */
- if ((color == NULL && !oflag) || qflag || lflag)
- break;
+ }
- if (st == (size_t)pmatch.rm_so)
- break; /* No matches */
+ if (vflag) {
+ c = !c;
+ break;
}
- } else
- c = !vflag;
+
+ /* One pass if we are not recording matches */
+ if ((color == NULL && !oflag) || qflag || lflag)
+ break;
+
+ if (st == (size_t)pmatch.rm_so)
+ break; /* No matches */
+ }
+
+
+ /* Count the matches if we have a match limit */
+ if (mflag)
+ mcount -= c;
if (c && binbehave == BINFILE_BIN && nottext)
return (c); /* Binary file */
OpenPOWER on IntegriCloud