diff options
Diffstat (limited to 'usr.bin/grep/regex')
-rw-r--r-- | usr.bin/grep/regex/fastmatch.c | 169 | ||||
-rw-r--r-- | usr.bin/grep/regex/fastmatch.h | 108 | ||||
-rw-r--r-- | usr.bin/grep/regex/glue.h | 67 | ||||
-rw-r--r-- | usr.bin/grep/regex/hashtable.c | 268 | ||||
-rw-r--r-- | usr.bin/grep/regex/hashtable.h | 35 | ||||
-rw-r--r-- | usr.bin/grep/regex/tre-compile.c | 103 | ||||
-rw-r--r-- | usr.bin/grep/regex/tre-fastmatch.c | 1042 | ||||
-rw-r--r-- | usr.bin/grep/regex/tre-fastmatch.h | 21 | ||||
-rw-r--r-- | usr.bin/grep/regex/xmalloc.c | 349 | ||||
-rw-r--r-- | usr.bin/grep/regex/xmalloc.h | 79 |
10 files changed, 2241 insertions, 0 deletions
diff --git a/usr.bin/grep/regex/fastmatch.c b/usr.bin/grep/regex/fastmatch.c new file mode 100644 index 0000000..990d847 --- /dev/null +++ b/usr.bin/grep/regex/fastmatch.c @@ -0,0 +1,169 @@ +/* $FreeBSD$ */ + +/*- + * Copyright (C) 2011 Gabor Kovesdan <gabor@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "glue.h" + +#include <errno.h> +#include <fastmatch.h> +#include <regex.h> +#include <string.h> + +#include "tre-fastmatch.h" +#include "xmalloc.h" + +int +tre_fixncomp(fastmatch_t *preg, const char *regex, size_t n, int cflags) +{ + int ret; + tre_char_t *wregex; + size_t wlen; + + if (n != 0) + { + ret = tre_convert_pattern(regex, n, &wregex, &wlen); + if (ret != REG_OK) + return ret; + else + ret = tre_compile_literal(preg, wregex, wlen, cflags); + tre_free_pattern(wregex); + return ret; + } + else + return tre_compile_literal(preg, NULL, 0, cflags); +} + +int +tre_fastncomp(fastmatch_t *preg, const char *regex, size_t n, int cflags) +{ + int ret; + tre_char_t *wregex; + size_t wlen; + + if (n != 0) + { + ret = tre_convert_pattern(regex, n, &wregex, &wlen); + if (ret != REG_OK) + return ret; + else + ret = (cflags & REG_LITERAL) + ? tre_compile_literal(preg, wregex, wlen, cflags) + : tre_compile_fast(preg, wregex, wlen, cflags); + tre_free_pattern(wregex); + return ret; + } + else + return tre_compile_literal(preg, NULL, 0, cflags); +} + + +int +tre_fixcomp(fastmatch_t *preg, const char *regex, int cflags) +{ + return tre_fixncomp(preg, regex, regex ? strlen(regex) : 0, cflags); +} + +int +tre_fastcomp(fastmatch_t *preg, const char *regex, int cflags) +{ + return tre_fastncomp(preg, regex, regex ? strlen(regex) : 0, cflags); +} + +int +tre_fixwncomp(fastmatch_t *preg, const wchar_t *regex, size_t n, int cflags) +{ + return tre_compile_literal(preg, regex, n, cflags); +} + +int +tre_fastwncomp(fastmatch_t *preg, const wchar_t *regex, size_t n, int cflags) +{ + return (cflags & REG_LITERAL) ? + tre_compile_literal(preg, regex, n, cflags) : + tre_compile_fast(preg, regex, n, cflags); +} + +int +tre_fixwcomp(fastmatch_t *preg, const wchar_t *regex, int cflags) +{ + return tre_fixwncomp(preg, regex, regex ? tre_strlen(regex) : 0, cflags); +} + +int +tre_fastwcomp(fastmatch_t *preg, const wchar_t *regex, int cflags) +{ + return tre_fastwncomp(preg, regex, regex ? tre_strlen(regex) : 0, cflags); +} + +void +tre_fastfree(fastmatch_t *preg) +{ + tre_free_fast(preg); +} + +int +tre_fastnexec(const fastmatch_t *preg, const char *string, size_t len, + size_t nmatch, regmatch_t pmatch[], int eflags) +{ + tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS; + + if (eflags & REG_STARTEND) + CALL_WITH_OFFSET(tre_match_fast(preg, &string[offset], slen, + type, nmatch, pmatch, eflags)); + else + return tre_match_fast(preg, string, len, type, nmatch, + pmatch, eflags); +} + +int +tre_fastexec(const fastmatch_t *preg, const char *string, size_t nmatch, + regmatch_t pmatch[], int eflags) +{ + return tre_fastnexec(preg, string, (size_t)-1, nmatch, pmatch, eflags); +} + +int +tre_fastwnexec(const fastmatch_t *preg, const wchar_t *string, size_t len, + size_t nmatch, regmatch_t pmatch[], int eflags) +{ + tre_str_type_t type = STR_WIDE; + + if (eflags & REG_STARTEND) + CALL_WITH_OFFSET(tre_match_fast(preg, &string[offset], slen, + type, nmatch, pmatch, eflags)); + else + return tre_match_fast(preg, string, len, type, nmatch, + pmatch, eflags); +} + +int +tre_fastwexec(const fastmatch_t *preg, const wchar_t *string, + size_t nmatch, regmatch_t pmatch[], int eflags) +{ + return tre_fastwnexec(preg, string, (size_t)-1, nmatch, pmatch, eflags); +} + diff --git a/usr.bin/grep/regex/fastmatch.h b/usr.bin/grep/regex/fastmatch.h new file mode 100644 index 0000000..68a50c6 --- /dev/null +++ b/usr.bin/grep/regex/fastmatch.h @@ -0,0 +1,108 @@ +/* $FreeBSD$ */ + +#ifndef FASTMATCH_H +#define FASTMATCH_H 1 + +#include <limits.h> +#include <regex.h> +#include <stdbool.h> +#include <wchar.h> + +typedef struct { + size_t wlen; + size_t len; + wchar_t *wpattern; + bool *wescmap; + unsigned int qsBc[UCHAR_MAX + 1]; + unsigned int *bmGs; + char *pattern; + bool *escmap; + unsigned int defBc; + void *qsBc_table; + unsigned int *sbmGs; + const char *re_endp; + + /* flags */ + bool hasdot; + bool bol; + bool eol; + bool word; + bool icase; + bool newline; + bool nosub; + bool matchall; + bool reversed; +} fastmatch_t; + +extern int +tre_fixcomp(fastmatch_t *preg, const char *regex, int cflags); + +extern int +tre_fastcomp(fastmatch_t *preg, const char *regex, int cflags); + +extern int +tre_fastexec(const fastmatch_t *preg, const char *string, size_t nmatch, + regmatch_t pmatch[], int eflags); + +extern void +tre_fastfree(fastmatch_t *preg); + +extern int +tre_fixwcomp(fastmatch_t *preg, const wchar_t *regex, int cflags); + +extern int +tre_fastwcomp(fastmatch_t *preg, const wchar_t *regex, int cflags); + +extern int +tre_fastwexec(const fastmatch_t *preg, const wchar_t *string, + size_t nmatch, regmatch_t pmatch[], int eflags); + +/* Versions with a maximum length argument and therefore the capability to + handle null characters in the middle of the strings. */ +extern int +tre_fixncomp(fastmatch_t *preg, const char *regex, size_t len, int cflags); + +extern int +tre_fastncomp(fastmatch_t *preg, const char *regex, size_t len, int cflags); + +extern int +tre_fastnexec(const fastmatch_t *preg, const char *string, size_t len, + size_t nmatch, regmatch_t pmatch[], int eflags); + +extern int +tre_fixwncomp(fastmatch_t *preg, const wchar_t *regex, size_t len, int cflags); + +extern int +tre_fastwncomp(fastmatch_t *preg, const wchar_t *regex, size_t len, int cflags); + +extern int +tre_fastwnexec(const fastmatch_t *preg, const wchar_t *string, size_t len, + size_t nmatch, regmatch_t pmatch[], int eflags); + +#define fixncomp tre_fixncomp +#define fastncomp tre_fastncomp +#define fixcomp tre_fixcomp +#define fastcomp tre_fastcomp +#define fixwncomp tre_fixwncomp +#define fastwncomp tre_fastwncomp +#define fixwcomp tre_fixwcomp +#define fastwcomp tre_fastwcomp +#define fastfree tre_fastfree +#define fastnexec tre_fastnexec +#define fastexec tre_fastexec +#define fastwnexec tre_fastwnexec +#define fastwexec tre_fastwexec +#define fixcomp tre_fixcomp +#define fastcomp tre_fastcomp +#define fastexec tre_fastexec +#define fastfree tre_fastfree +#define fixwcomp tre_fixwcomp +#define fastwcomp tre_fastwcomp +#define fastwexec tre_fastwexec +#define fixncomp tre_fixncomp +#define fastncomp tre_fastncomp +#define fastnexec tre_fastnexec +#define fixwncomp tre_fixwncomp +#define fastwncomp tre_fastwncomp +#define fastwnexec tre_fastwnexec +#endif /* FASTMATCH_H */ diff --git a/usr.bin/grep/regex/glue.h b/usr.bin/grep/regex/glue.h new file mode 100644 index 0000000..2fea4fd --- /dev/null +++ b/usr.bin/grep/regex/glue.h @@ -0,0 +1,67 @@ +/* $FreeBSD$ */ + +#ifndef GLUE_H +#define GLUE_H + +#include <limits.h> +#undef RE_DUP_MAX +#include <regex.h> +#include <stdio.h> +#include <stdlib.h> + +#define TRE_WCHAR 1 +#define TRE_MULTIBYTE 1 +#define HAVE_MBSTATE_T 1 + +#define TRE_CHAR(n) L##n +#define CHF "%lc" + +#define tre_char_t wchar_t +#define tre_mbrtowc(pwc, s, n, ps) (mbrtowc((pwc), (s), (n), (ps))) +#define tre_strlen wcslen +#define tre_isspace iswspace +#define tre_isalnum iswalnum + +#define REG_OK 0 +#define REG_LITERAL 0020 +#define REG_WORD 0100 +#define REG_GNU 0400 + +#define TRE_MB_CUR_MAX MB_CUR_MAX + +#ifndef _GREP_DEBUG +#define DPRINT(msg) +#else +#define DPRINT(msg) do {printf msg; fflush(stdout);} while(/*CONSTCOND*/0) +#endif + +#define MIN(a,b) ((a > b) ? (b) : (a)) +#define MAX(a,b) ((a > b) ? (a) : (b)) + +typedef enum { STR_WIDE, STR_BYTE, STR_MBS, STR_USER } tre_str_type_t; + +#define CALL_WITH_OFFSET(fn) \ + do \ + { \ + size_t slen = (size_t)(pmatch[0].rm_eo - pmatch[0].rm_so); \ + size_t offset = pmatch[0].rm_so; \ + int ret; \ + \ + if ((long long)pmatch[0].rm_eo - pmatch[0].rm_so < 0) \ + return REG_NOMATCH; \ + ret = fn; \ + for (unsigned i = 0; (!(eflags & REG_NOSUB) && (i < nmatch)); i++)\ + { \ + pmatch[i].rm_so += offset; \ + pmatch[i].rm_eo += offset; \ + } \ + return ret; \ + } while (0 /*CONSTCOND*/) + +int +tre_convert_pattern(const char *regex, size_t n, tre_char_t **w, + size_t *wn); + +void +tre_free_pattern(tre_char_t *wregex); +#endif diff --git a/usr.bin/grep/regex/hashtable.c b/usr.bin/grep/regex/hashtable.c new file mode 100644 index 0000000..41ebcd1 --- /dev/null +++ b/usr.bin/grep/regex/hashtable.c @@ -0,0 +1,268 @@ +/* $FreeBSD$ */ + +/*- + * Copyright (C) 2011 Gabor Kovesdan <gabor@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "glue.h" + +#include <errno.h> +#include <inttypes.h> +#include <stdlib.h> +#include <string.h> + +#include "hashtable.h" + + +/* + * Return a 32-bit hash of the given buffer. The init + * value should be 0, or the previous hash value to extend + * the previous hash. + */ +static uint32_t +hash32_buf(const void *buf, size_t len, uint32_t hash) +{ + const unsigned char *p = buf; + + while (len--) + hash = HASHSTEP(hash, *p++); + + return hash; +} + +/* + * Initializes a hash table that can hold table_size number of entries, + * each of which has a key of key_size bytes and a value of value_size + * bytes. On successful allocation returns a pointer to the hash table. + * Otherwise, returns NULL and sets errno to indicate the error. + */ +hashtable +*hashtable_init(size_t table_size, size_t key_size, size_t value_size) +{ + hashtable *tbl; + + DPRINT(("hashtable_init: table_size %zu, key_size %zu, value_size %zu\n", + table_size, key_size, value_size)); + + tbl = malloc(sizeof(hashtable)); + if (tbl == NULL) + goto mem1; + + tbl->entries = calloc(sizeof(hashtable_entry *), table_size); + if (tbl->entries == NULL) + goto mem2; + + tbl->table_size = table_size; + tbl->usage = 0; + tbl->key_size = key_size; + tbl->value_size = value_size; + + return (tbl); + +mem2: + free(tbl); +mem1: + DPRINT(("hashtable_init: allocation failed\n")); + errno = ENOMEM; + return (NULL); +} + +/* + * Places the key-value pair to the hashtable tbl. + * Returns: + * HASH_OK: if the key was not present in the hash table yet + * but the kay-value pair has been successfully added. + * HASH_UPDATED: if the value for the key has been updated with the + * new value. + * HASH_FULL: if the hash table is full and the entry could not + * be added. + * HASH_FAIL: if an error has occurred and errno has been set to + * indicate the error. + */ +int +hashtable_put(hashtable *tbl, const void *key, const void *value) +{ + uint32_t hash = 0; + + if (tbl->table_size == tbl->usage) + { + DPRINT(("hashtable_put: hashtable is full\n")); + return (HASH_FULL); + } + + hash = hash32_buf(key, tbl->key_size, hash) % tbl->table_size; + DPRINT(("hashtable_put: calculated hash %" PRIu32 "\n", hash)); + + /* + * On hash collision entries are inserted at the next free space, + * so we have to increase the index until we either find an entry + * with the same key (and update it) or we find a free space. + */ + for(;;) + { + if (tbl->entries[hash] == NULL) + break; + else if (memcmp(tbl->entries[hash]->key, key, tbl->key_size) == 0) + { + memcpy(tbl->entries[hash]->value, value, tbl->value_size); + DPRINT(("hashtable_put: effective location is %" PRIu32 + ", entry updated\n", hash)); + return (HASH_UPDATED); + } + if (++hash == tbl->table_size) + hash = 0; + } + + DPRINT(("hashtable_put: effective location is %" PRIu32 "\n", hash)); + + tbl->entries[hash] = malloc(sizeof(hashtable_entry)); + if (tbl->entries[hash] == NULL) + { + errno = ENOMEM; + goto mem1; + } + + tbl->entries[hash]->key = malloc(tbl->key_size); + if (tbl->entries[hash]->key == NULL) + { + errno = ENOMEM; + goto mem2; + } + + tbl->entries[hash]->value = malloc(tbl->value_size); + if (tbl->entries[hash]->value == NULL) + { + errno = ENOMEM; + goto mem3; + } + + memcpy(tbl->entries[hash]->key, key, tbl->key_size); + memcpy(tbl->entries[hash]->value, value, tbl->value_size); + tbl->usage++; + + DPRINT(("hashtable_put: entry successfully inserted\n")); + + return (HASH_OK); + +mem3: + free(tbl->entries[hash]->key); +mem2: + free(tbl->entries[hash]); +mem1: + DPRINT(("hashtable_put: insertion failed\n")); + return (HASH_FAIL); +} + +static hashtable_entry +**hashtable_lookup(const hashtable *tbl, const void *key) +{ + uint32_t hash = 0; + + hash = hash32_buf(key, tbl->key_size, hash) % tbl->table_size; + + for (;;) + { + if (tbl->entries[hash] == NULL) + return (NULL); + else if (memcmp(key, tbl->entries[hash]->key, tbl->key_size) == 0) + { + DPRINT(("hashtable_lookup: entry found at location %" PRIu32 "\n", hash)); + return (&tbl->entries[hash]); + } + + if (++hash == tbl->table_size) + hash = 0; + } +} + +/* + * Retrieves the value for key from the hash table tbl and places + * it to the space indicated by the value argument. + * Returns HASH_OK if the value has been found and retrieved or + * HASH_NOTFOUND otherwise. + */ +int +hashtable_get(hashtable *tbl, const void *key, void *value) +{ + hashtable_entry **entry; + + entry = hashtable_lookup(tbl, key); + if (entry == NULL) + { + DPRINT(("hashtable_get: entry is not available in the hashtable\n")); + return (HASH_NOTFOUND); + } + + memcpy(value, (*entry)->value, tbl->value_size); + DPRINT(("hashtable_get: entry successfully copied into output buffer\n")); + return (HASH_OK); +} + +/* + * Removes the entry with the specifified key from the hash table + * tbl. Returns HASH_OK if the entry has been found and removed + * or HASH_NOTFOUND otherwise. + */ +int +hashtable_remove(hashtable *tbl, const void *key) +{ + hashtable_entry **entry; + + entry = hashtable_lookup(tbl, key); + if (entry == NULL) + { + DPRINT(("hashtable_remove: entry is not available in the hashtable\n")); + return (HASH_NOTFOUND); + } + + free((*entry)->key); + free((*entry)->value); + free(*entry); + *entry = NULL; + + tbl->usage--; + DPRINT(("hashtable_remove: entry successfully removed\n")); + return (HASH_OK); +} + +/* + * Frees the resources associated with the hash table tbl. + */ +void +hashtable_free(hashtable *tbl) +{ + if (tbl == NULL) + return; + + for (unsigned int i = 0; i < tbl->table_size; i++) + if ((tbl->entries[i] != NULL)) + { + free(tbl->entries[i]->key); + free(tbl->entries[i]->value); + } + + free(tbl->entries); + DPRINT(("hashtable_free: resources are successfully freed\n")); +} diff --git a/usr.bin/grep/regex/hashtable.h b/usr.bin/grep/regex/hashtable.h new file mode 100644 index 0000000..f8b1daa --- /dev/null +++ b/usr.bin/grep/regex/hashtable.h @@ -0,0 +1,35 @@ +/* $FreeBSD$ */ + +#ifndef HASHTABLE_H +#define HASHTABLE_H 1 + +#include <sys/types.h> + +#define HASH_OK 0 +#define HASH_UPDATED 1 +#define HASH_FAIL 2 +#define HASH_FULL 3 +#define HASH_NOTFOUND 4 + +#define HASHSTEP(x,c) (((x << 5) + x) + (c)) + +typedef struct { + void *key; + void *value; +} hashtable_entry; + +typedef struct { + size_t key_size; + size_t table_size; + size_t usage; + size_t value_size; + hashtable_entry **entries; +} hashtable; + +void hashtable_free(hashtable *); +int hashtable_get(hashtable *, const void *, void *); +hashtable *hashtable_init(size_t, size_t, size_t); +int hashtable_put(hashtable *, const void *, const void *); +int hashtable_remove(hashtable *, const void *); + +#endif /* HASHTABLE.H */ diff --git a/usr.bin/grep/regex/tre-compile.c b/usr.bin/grep/regex/tre-compile.c new file mode 100644 index 0000000..f037c49 --- /dev/null +++ b/usr.bin/grep/regex/tre-compile.c @@ -0,0 +1,103 @@ +/* $FreeBSD$ */ + +#include "glue.h" + +#include <stdio.h> +#include <assert.h> +#include <errno.h> +#include <regex.h> +#include <string.h> +#include <wchar.h> + +#include "xmalloc.h" + +int +tre_convert_pattern(const char *regex, size_t n, tre_char_t **w, + size_t *wn) +{ +#if TRE_WCHAR + tre_char_t *wregex; + size_t wlen; + + wregex = xmalloc(sizeof(tre_char_t) * (n + 1)); + if (wregex == NULL) + return REG_ESPACE; + + /* If the current locale uses the standard single byte encoding of + characters, we don't do a multibyte string conversion. If we did, + many applications which use the default locale would break since + the default "C" locale uses the 7-bit ASCII character set, and + all characters with the eighth bit set would be considered invalid. */ +#if TRE_MULTIBYTE + if (TRE_MB_CUR_MAX == 1) +#endif /* TRE_MULTIBYTE */ + { + unsigned int i; + const unsigned char *str = (const unsigned char *)regex; + tre_char_t *wstr = wregex; + + for (i = 0; i < n; i++) + *(wstr++) = *(str++); + wlen = n; + } +#if TRE_MULTIBYTE + else + { + int consumed; + tre_char_t *wcptr = wregex; +#ifdef HAVE_MBSTATE_T + mbstate_t state; + memset(&state, '\0', sizeof(state)); +#endif /* HAVE_MBSTATE_T */ + while (n > 0) + { + consumed = tre_mbrtowc(wcptr, regex, n, &state); + + switch (consumed) + { + case 0: + if (*regex == '\0') + consumed = 1; + else + { + xfree(wregex); + return REG_BADPAT; + } + break; + case -1: + DPRINT(("mbrtowc: error %d: %s.\n", errno, strerror(errno))); + xfree(wregex); + return REG_BADPAT; + case -2: + /* The last character wasn't complete. Let's not call it a + fatal error. */ + consumed = n; + break; + } + regex += consumed; + n -= consumed; + wcptr++; + } + wlen = wcptr - wregex; + } +#endif /* TRE_MULTIBYTE */ + wregex[wlen] = L'\0'; + *w = wregex; + *wn = wlen; + return REG_OK; +#else /* !TRE_WCHAR */ + { + *w = (tre_char_t * const *)regex; + *wn = n; + return REG_OK; + } +#endif /* !TRE_WCHAR */ +} + +void +tre_free_pattern(tre_char_t *wregex) +{ +#if TRE_WCHAR + xfree(wregex); +#endif +} diff --git a/usr.bin/grep/regex/tre-fastmatch.c b/usr.bin/grep/regex/tre-fastmatch.c new file mode 100644 index 0000000..6f1aec6 --- /dev/null +++ b/usr.bin/grep/regex/tre-fastmatch.c @@ -0,0 +1,1042 @@ +/* $FreeBSD$ */ + +/*- + * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav + * Copyright (C) 2008-2011 Gabor Kovesdan <gabor@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "glue.h" + +#include <ctype.h> +#include <limits.h> +#include <regex.h> +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> +#ifdef TRE_WCHAR +#include <wchar.h> +#include <wctype.h> +#endif + +#include "hashtable.h" +#include "tre-fastmatch.h" +#include "xmalloc.h" + +static int fastcmp(const fastmatch_t *fg, const void *data, + tre_str_type_t type); + +/* + * Clean up if pattern compilation fails. + */ +#define FAIL_COMP(errcode) \ + { \ + if (fg->pattern) \ + xfree(fg->pattern); \ + if (fg->wpattern) \ + xfree(fg->wpattern); \ + if (fg->qsBc_table) \ + hashtable_free(fg->qsBc_table); \ + fg = NULL; \ + return errcode; \ + } + +/* + * Skips n characters in the input string and assigns the start + * address to startptr. Note: as per IEEE Std 1003.1-2008 + * matching is based on bit pattern not character representations + * so we can handle MB strings as byte sequences just like + * SB strings. + */ +#define SKIP_CHARS(n) \ + switch (type) \ + { \ + case STR_WIDE: \ + startptr = str_wide + n; \ + break; \ + default: \ + startptr = str_byte + n; \ + } + +/* + * Converts the wide string pattern to SB/MB string and stores + * it in fg->pattern. Sets fg->len to the byte length of the + * converted string. + */ +#define STORE_MBS_PAT \ + { \ + size_t siz; \ + \ + siz = wcstombs(NULL, fg->wpattern, 0); \ + if (siz == (size_t)-1) \ + return REG_BADPAT; \ + fg->len = siz; \ + fg->pattern = xmalloc(siz + 1); \ + if (fg->pattern == NULL) \ + return REG_ESPACE; \ + wcstombs(fg->pattern, fg->wpattern, siz); \ + fg->pattern[siz] = '\0'; \ + } \ + +#define IS_OUT_OF_BOUNDS \ + ((!fg->reversed \ + ? ((type == STR_WIDE) ? ((j + fg->wlen) > len) \ + : ((j + fg->len) > len)) \ + : (j < 0))) + +/* + * Checks whether the new position after shifting in the input string + * is out of the bounds and break out from the loop if so. + */ +#define CHECKBOUNDS \ + if (IS_OUT_OF_BOUNDS) \ + break; \ + +/* + * Shifts in the input string after a mismatch. The position of the + * mismatch is stored in the mismatch variable. + */ +#define SHIFT \ + CHECKBOUNDS; \ + \ + { \ + int r = -1; \ + unsigned int bc = 0, gs = 0, ts; \ + \ + switch (type) \ + { \ + case STR_WIDE: \ + if (!fg->hasdot) \ + { \ + if (u != 0 && (unsigned)mismatch == fg->wlen - 1 - shift) \ + mismatch -= u; \ + v = fg->wlen - 1 - mismatch; \ + r = hashtable_get(fg->qsBc_table, \ + &str_wide[!fg->reversed ? (size_t)j + fg->wlen \ + : (size_t)j - 1], &bc); \ + gs = fg->bmGs[mismatch]; \ + } \ + bc = (r == HASH_OK) ? bc : fg->defBc; \ + DPRINT(("tre_fast_match: mismatch on character" CHF ", " \ + "BC %d, GS %d\n", \ + ((const tre_char_t *)startptr)[mismatch + 1], \ + bc, gs)); \ + break; \ + default: \ + if (!fg->hasdot) \ + { \ + if (u != 0 && (unsigned)mismatch == fg->len - 1 - shift) \ + mismatch -= u; \ + v = fg->len - 1 - mismatch; \ + gs = fg->sbmGs[mismatch]; \ + } \ + bc = fg->qsBc[((const unsigned char *)str_byte) \ + [!fg->reversed ? (size_t)j + fg->len \ + : (size_t)j - 1]]; \ + DPRINT(("tre_fast_match: mismatch on character %c, " \ + "BC %d, GS %d\n", \ + ((const unsigned char *)startptr)[mismatch + 1], \ + bc, gs)); \ + } \ + if (fg->hasdot) \ + shift = bc; \ + else \ + { \ + ts = (u >= v) ? (u - v) : 0; \ + shift = MAX(ts, bc); \ + shift = MAX(shift, gs); \ + if (shift == gs) \ + u = MIN((type == STR_WIDE ? fg->wlen : fg->len) - shift, v); \ + else \ + { \ + if (ts < bc) \ + shift = MAX(shift, u + 1); \ + u = 0; \ + } \ + } \ + DPRINT(("tre_fast_match: shifting %u characters\n", shift)); \ + j = !fg->reversed ? j + shift : j - shift; \ + } + +/* + * Normal Quick Search would require a shift based on the position the + * next character after the comparison is within the pattern. With + * wildcards, the position of the last dot effects the maximum shift + * distance. + * The closer to the end the wild card is the slower the search. + * + * Examples: + * Pattern Max shift + * ------- --------- + * this 5 + * .his 4 + * t.is 3 + * th.s 2 + * thi. 1 + */ + +/* + * Fills in the bad character shift array for SB/MB strings. + */ +#define FILL_QSBC \ + if (fg->reversed) \ + { \ + _FILL_QSBC_REVERSED \ + } \ + else \ + { \ + _FILL_QSBC \ + } + +#define _FILL_QSBC \ + for (unsigned int i = 0; i <= UCHAR_MAX; i++) \ + fg->qsBc[i] = fg->len - hasdot; \ + for (unsigned int i = hasdot + 1; i < fg->len; i++) \ + { \ + fg->qsBc[(unsigned char)fg->pattern[i]] = fg->len - i; \ + DPRINT(("BC shift for char %c is %zu\n", fg->pattern[i], \ + fg->len - i)); \ + if (fg->icase) \ + { \ + char c = islower((unsigned char)fg->pattern[i]) ? \ + toupper((unsigned char)fg->pattern[i]) : \ + tolower((unsigned char)fg->pattern[i]); \ + fg->qsBc[(unsigned char)c] = fg->len - i; \ + DPRINT(("BC shift for char %c is %zu\n", c, fg->len - i)); \ + } \ + } + +#define _FILL_QSBC_REVERSED \ + for (unsigned int i = 0; i <= UCHAR_MAX; i++) \ + fg->qsBc[i] = firstdot + 1; \ + for (int i = firstdot - 1; i >= 0; i--) \ + { \ + fg->qsBc[(unsigned char)fg->pattern[i]] = i + 1; \ + DPRINT(("Reverse BC shift for char %c is %d\n", fg->pattern[i], \ + i + 1)); \ + if (fg->icase) \ + { \ + char c = islower((unsigned char)fg->pattern[i]) ? \ + toupper((unsigned char)fg->pattern[i]) : \ + tolower((unsigned char)fg->pattern[i]); \ + fg->qsBc[(unsigned char)c] = i + 1; \ + DPRINT(("Reverse BC shift for char %c is %d\n", c, i + 1)); \ + } \ + } + +/* + * Fills in the bad character shifts into a hastable for wide strings. + * With wide characters it is not possible any more to use a normal + * array because there are too many characters and we could not + * provide enough memory. Fortunately, we only have to store distinct + * values for so many characters as the number of distinct characters + * in the pattern, so we can store them in a hashtable and store a + * default shift value for the rest. + */ +#define FILL_QSBC_WIDE \ + if (fg->reversed) \ + { \ + _FILL_QSBC_WIDE_REVERSED \ + } \ + else \ + { \ + _FILL_QSBC_WIDE \ + } + +#define _FILL_QSBC_WIDE \ + /* Adjust the shift based on location of the last dot ('.'). */ \ + fg->defBc = fg->wlen - whasdot; \ + \ + /* Preprocess pattern. */ \ + fg->qsBc_table = hashtable_init(fg->wlen * (fg->icase ? 8 : 4), \ + sizeof(tre_char_t), sizeof(int)); \ + if (!fg->qsBc_table) \ + FAIL_COMP(REG_ESPACE); \ + for (unsigned int i = whasdot + 1; i < fg->wlen; i++) \ + { \ + int k = fg->wlen - i; \ + int r; \ + \ + r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k); \ + if ((r == HASH_FAIL) || (r == HASH_FULL)) \ + FAIL_COMP(REG_ESPACE); \ + DPRINT(("BC shift for wide char " CHF " is %d\n", fg->wpattern[i],\ + k)); \ + if (fg->icase) \ + { \ + tre_char_t wc = iswlower(fg->wpattern[i]) ? \ + towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]); \ + r = hashtable_put(fg->qsBc_table, &wc, &k); \ + if ((r == HASH_FAIL) || (r == HASH_FULL)) \ + FAIL_COMP(REG_ESPACE); \ + DPRINT(("BC shift for wide char " CHF " is %d\n", wc, k)); \ + } \ + } + +#define _FILL_QSBC_WIDE_REVERSED \ + /* Adjust the shift based on location of the last dot ('.'). */ \ + fg->defBc = (size_t)wfirstdot; \ + \ + /* Preprocess pattern. */ \ + fg->qsBc_table = hashtable_init(fg->wlen * (fg->icase ? 8 : 4), \ + sizeof(tre_char_t), sizeof(int)); \ + if (!fg->qsBc_table) \ + FAIL_COMP(REG_ESPACE); \ + for (int i = wfirstdot - 1; i >= 0; i--) \ + { \ + int k = i + 1; \ + int r; \ + \ + r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k); \ + if ((r == HASH_FAIL) || (r == HASH_FULL)) \ + FAIL_COMP(REG_ESPACE); \ + DPRINT(("Reverse BC shift for wide char " CHF " is %d\n", \ + fg->wpattern[i], k)); \ + if (fg->icase) \ + { \ + tre_char_t wc = iswlower(fg->wpattern[i]) ? \ + towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]); \ + r = hashtable_put(fg->qsBc_table, &wc, &k); \ + if ((r == HASH_FAIL) || (r == HASH_FULL)) \ + FAIL_COMP(REG_ESPACE); \ + DPRINT(("Reverse BC shift for wide char " CHF " is %d\n", wc, \ + k)); \ + } \ + } + +#ifdef _GREP_DEBUG +#define DPRINT_BMGS(len, fmt_str, sh) \ + for (unsigned int i = 0; i < len; i++) \ + DPRINT((fmt_str, i, sh[i])); +#else +#define DPRINT_BMGS(len, fmt_str, sh) \ + do { } while(/*CONSTCOND*/0) +#endif + +/* + * Fills in the good suffix table for SB/MB strings. + */ +#define FILL_BMGS \ + if (!fg->hasdot) \ + { \ + fg->sbmGs = xmalloc(fg->len * sizeof(int)); \ + if (!fg->sbmGs) \ + return REG_ESPACE; \ + if (fg->len == 1) \ + fg->sbmGs[0] = 1; \ + else \ + _FILL_BMGS(fg->sbmGs, fg->pattern, fg->len, false); \ + DPRINT_BMGS(fg->len, "GS shift for pos %d is %d\n", fg->sbmGs); \ + } + +/* + * Fills in the good suffix table for wide strings. + */ +#define FILL_BMGS_WIDE \ + if (!fg->hasdot) \ + { \ + fg->bmGs = xmalloc(fg->wlen * sizeof(int)); \ + if (!fg->bmGs) \ + return REG_ESPACE; \ + if (fg->wlen == 1) \ + fg->bmGs[0] = 1; \ + else \ + _FILL_BMGS(fg->bmGs, fg->wpattern, fg->wlen, true); \ + DPRINT_BMGS(fg->wlen, "GS shift (wide) for pos %d is %d\n", \ + fg->bmGs); \ + } + +#define _FILL_BMGS(arr, pat, plen, wide) \ + { \ + char *p; \ + tre_char_t *wp; \ + \ + if (wide) \ + { \ + if (fg->icase) \ + { \ + wp = xmalloc(plen * sizeof(tre_char_t)); \ + if (wp == NULL) \ + return REG_ESPACE; \ + for (unsigned int i = 0; i < plen; i++) \ + wp[i] = towlower(pat[i]); \ + _CALC_BMGS(arr, wp, plen); \ + xfree(wp); \ + } \ + else \ + _CALC_BMGS(arr, pat, plen); \ + } \ + else \ + { \ + if (fg->icase) \ + { \ + p = xmalloc(plen); \ + if (p == NULL) \ + return REG_ESPACE; \ + for (unsigned int i = 0; i < plen; i++) \ + p[i] = tolower(pat[i]); \ + _CALC_BMGS(arr, p, plen); \ + xfree(p); \ + } \ + else \ + _CALC_BMGS(arr, pat, plen); \ + } \ + } + +#define _CALC_BMGS(arr, pat, plen) \ + { \ + int f = 0, g; \ + \ + int *suff = xmalloc(plen * sizeof(int)); \ + if (suff == NULL) \ + return REG_ESPACE; \ + \ + suff[plen - 1] = plen; \ + g = plen - 1; \ + for (int i = plen - 2; i >= 0; i--) \ + { \ + if (i > g && suff[i + plen - 1 - f] < i - g) \ + suff[i] = suff[i + plen - 1 - f]; \ + else \ + { \ + if (i < g) \ + g = i; \ + f = i; \ + while (g >= 0 && pat[g] == pat[g + plen - 1 - f]) \ + g--; \ + suff[i] = f - g; \ + } \ + } \ + \ + for (unsigned int i = 0; i < plen; i++) \ + arr[i] = plen; \ + g = 0; \ + for (int i = plen - 1; i >= 0; i--) \ + if (suff[i] == i + 1) \ + for(; (unsigned long)g < plen - 1 - i; g++) \ + if (arr[g] == plen) \ + arr[g] = plen - 1 - i; \ + for (unsigned int i = 0; i <= plen - 2; i++) \ + arr[plen - 1 - suff[i]] = plen - 1 - i; \ + \ + xfree(suff); \ + } + +/* + * Copies the pattern pat having lenght n to p and stores + * the size in l. + */ +#define SAVE_PATTERN(src, srclen, dst, dstlen) \ + dstlen = srclen; \ + dst = xmalloc((dstlen + 1) * sizeof(tre_char_t)); \ + if (dst == NULL) \ + return REG_ESPACE; \ + if (dstlen > 0) \ + memcpy(dst, src, dstlen * sizeof(tre_char_t)); \ + dst[dstlen] = TRE_CHAR('\0'); + +/* + * Initializes pattern compiling. + */ +#define INIT_COMP \ + /* Initialize. */ \ + memset(fg, 0, sizeof(*fg)); \ + fg->icase = (cflags & REG_ICASE); \ + fg->word = (cflags & REG_WORD); \ + fg->newline = (cflags & REG_NEWLINE); \ + fg->nosub = (cflags & REG_NOSUB); \ + \ + /* Cannot handle REG_ICASE with MB string */ \ + if (fg->icase && (TRE_MB_CUR_MAX > 1)) \ + { \ + DPRINT(("Cannot use fast matcher for MBS with REG_ICASE\n")); \ + return REG_BADPAT; \ + } + +/* + * Checks whether we have a 0-length pattern that will match + * anything. If literal is set to false, the EOL anchor is also + * taken into account. + */ +#define CHECK_MATCHALL(literal) \ + if (!literal && n == 1 && pat[0] == TRE_CHAR('$')) \ + { \ + n--; \ + fg->eol = true; \ + } \ + \ + if (n == 0) \ + { \ + fg->matchall = true; \ + fg->pattern = xmalloc(sizeof(char)); \ + if (!fg->pattern) \ + FAIL_COMP(REG_ESPACE); \ + fg->pattern[0] = '\0'; \ + fg->wpattern = xmalloc(sizeof(tre_char_t)); \ + if (!fg->wpattern) \ + FAIL_COMP(REG_ESPACE); \ + fg->wpattern[0] = TRE_CHAR('\0'); \ + DPRINT(("Matching every input\n")); \ + return REG_OK; \ + } + +/* + * Returns: REG_OK on success, error code otherwise + */ +int +tre_compile_literal(fastmatch_t *fg, const tre_char_t *pat, size_t n, + int cflags) +{ + size_t hasdot = 0, whasdot = 0; + ssize_t firstdot = -1, wfirstdot = -1; + + INIT_COMP; + + CHECK_MATCHALL(true); + + /* Cannot handle word boundaries with MB string */ + if (fg->word && (TRE_MB_CUR_MAX > 1)) + return REG_BADPAT; + +#ifdef TRE_WCHAR + SAVE_PATTERN(pat, n, fg->wpattern, fg->wlen); + STORE_MBS_PAT; +#else + SAVE_PATTERN(pat, n, fg->pattern, fg->len); +#endif + + DPRINT(("tre_compile_literal: pattern: %s, len %zu, icase: %c, word: %c, " + "newline %c\n", fg->pattern, fg->len, fg->icase ? 'y' : 'n', + fg->word ? 'y' : 'n', fg->newline ? 'y' : 'n')); + + FILL_QSBC; + FILL_BMGS; +#ifdef TRE_WCHAR + FILL_QSBC_WIDE; + FILL_BMGS_WIDE; +#endif + + return REG_OK; +} + +/* + * Returns: REG_OK on success, error code otherwise + */ +int +tre_compile_fast(fastmatch_t *fg, const tre_char_t *pat, size_t n, + int cflags) +{ + tre_char_t *tmp; + size_t pos = 0, hasdot = 0, whasdot = 0; + ssize_t firstdot = -1, wfirstdot = -1; + bool escaped = false; + bool *_escmap = NULL; + + INIT_COMP; + + /* Remove beginning-of-line character ('^'). */ + if (pat[0] == TRE_CHAR('^')) + { + fg->bol = true; + n--; + pat++; + } + + CHECK_MATCHALL(false); + + /* Handle word-boundary matching when GNU extensions are enabled */ + if ((cflags & REG_GNU) && (n >= 14) && + (memcmp(pat, TRE_CHAR("[[:<:]]"), 7 * sizeof(tre_char_t)) == 0) && + (memcmp(pat + n - 7, TRE_CHAR("[[:>:]]"), + 7 * sizeof(tre_char_t)) == 0)) + { + n -= 14; + pat += 7; + fg->word = true; + } + + /* Cannot handle word boundaries with MB string */ + if (fg->word && (TRE_MB_CUR_MAX > 1)) + return REG_BADPAT; + + tmp = xmalloc((n + 1) * sizeof(tre_char_t)); + if (tmp == NULL) + return REG_ESPACE; + +/* Copies the char into the stored pattern and skips to the next char. */ +#define STORE_CHAR \ + do \ + { \ + tmp[pos++] = pat[i]; \ + escaped = false; \ + continue; \ + } while (0) + + /* Traverse the input pattern for processing */ + for (unsigned int i = 0; i < n; i++) + { + switch (pat[i]) + { + case TRE_CHAR('\\'): + if (escaped) + STORE_CHAR; + else if (i == n - 1) + goto badpat; + else + escaped = true; + continue; + case TRE_CHAR('['): + if (escaped) + STORE_CHAR; + else + goto badpat; + continue; + case TRE_CHAR('*'): + if (escaped || (!(cflags & REG_EXTENDED) && (i == 0))) + STORE_CHAR; + else + goto badpat; + continue; + case TRE_CHAR('+'): + case TRE_CHAR('?'): + if ((cflags & REG_EXTENDED) && (i == 0)) + continue; + else if ((cflags & REG_EXTENDED) ^ !escaped) + STORE_CHAR; + else + goto badpat; + continue; + case TRE_CHAR('.'): + if (escaped) + { + if (!_escmap) + _escmap = xmalloc(n * sizeof(bool)); + if (!_escmap) + { + xfree(tmp); + return REG_ESPACE; + } + _escmap[i] = true; + STORE_CHAR; + } + else + { + whasdot = i; + if (wfirstdot == -1) + wfirstdot = i; + STORE_CHAR; + } + continue; + case TRE_CHAR('^'): + STORE_CHAR; + continue; + case TRE_CHAR('$'): + if (!escaped && (i == n - 1)) + fg->eol = true; + else + STORE_CHAR; + continue; + case TRE_CHAR('('): + if ((cflags & REG_EXTENDED) ^ escaped) + goto badpat; + else + STORE_CHAR; + continue; + case TRE_CHAR('{'): + if (!(cflags & REG_EXTENDED) ^ escaped) + STORE_CHAR; + else if (!(cflags & REG_EXTENDED) && (i == 0)) + STORE_CHAR; + else if ((cflags & REG_EXTENDED) && (i == 0)) + continue; + else + goto badpat; + continue; + case TRE_CHAR('|'): + if ((cflags & REG_EXTENDED) ^ escaped) + goto badpat; + else + STORE_CHAR; + continue; + default: + if (escaped) + goto badpat; + else + STORE_CHAR; + continue; + } + continue; +badpat: + xfree(tmp); + DPRINT(("tre_compile_fast: compilation of pattern failed, falling" + "back to NFA\n")); + return REG_BADPAT; + } + + fg->hasdot = wfirstdot > -1; + + /* + * The pattern has been processed and copied to tmp as a literal string + * with escapes, anchors (^$) and the word boundary match character + * classes stripped out. + */ +#ifdef TRE_WCHAR + SAVE_PATTERN(tmp, pos, fg->wpattern, fg->wlen); + fg->wescmap = _escmap; + STORE_MBS_PAT; + + /* + * The position of dots and escaped dots is different in the MB string + * than in to the wide string so traverse the converted string, as well, + * to store these positions. + */ + if (fg->hasdot || (fg->wescmap != NULL)) + { + if (fg->wescmap != NULL) + { + fg->escmap = xmalloc(fg->len * sizeof(bool)); + if (!fg->escmap) + { + tre_free_fast(fg); + return REG_ESPACE; + } + } + + escaped = false; + for (unsigned int i = 0; i < fg->len; i++) + if (fg->pattern[i] == '\\') + escaped = !escaped; + else if (fg->pattern[i] == '.' && escaped) + { + fg->escmap[i] = true; + escaped = false; + } + else if (fg->pattern[i] == '.' && !escaped) + { + hasdot = i; + if (firstdot == -1) + firstdot = i; + } + else + escaped = false; + } +#else + SAVE_PATTERN(tmp, pos, fg->pattern, fg->len); + fg->escmap = _escmap; +#endif + + xfree(tmp); + + DPRINT(("tre_compile_fast: pattern: %s, len %zu, bol %c, eol %c, " + "icase: %c, word: %c, newline %c\n", fg->pattern, fg->len, + fg->bol ? 'y' : 'n', fg->eol ? 'y' : 'n', + fg->icase ? 'y' : 'n', fg->word ? 'y' : 'n', + fg->newline ? 'y' : 'n')); + + /* Check whether reverse QS algorithm is more efficient */ + if ((wfirstdot > -1) && (fg->wlen - whasdot + 1 < (size_t)wfirstdot) && + fg->nosub) + { + fg->reversed = true; + DPRINT(("tre_compile_fast: using reverse QS algorithm\n")); + } + + FILL_QSBC; + FILL_BMGS; +#ifdef TRE_WCHAR + FILL_QSBC_WIDE; + FILL_BMGS_WIDE; +#endif + + return REG_OK; +} + +#define _SHIFT_ONE \ + { \ + shift = 1; \ + j = !fg->reversed ? j + shift : j - shift; \ + continue; \ + } + +#define _BBOUND_COND \ + ((type == STR_WIDE) ? \ + ((j == 0) || !(tre_isalnum(str_wide[j - 1]) || \ + (str_wide[j - 1] == TRE_CHAR('_')))) : \ + ((j == 0) || !(tre_isalnum(str_byte[j - 1]) || \ + (str_byte[j - 1] == '_')))) + +#define _EBOUND_COND \ + ((type == STR_WIDE) ? \ + ((j + fg->wlen == len) || !(tre_isalnum(str_wide[j + fg->wlen]) || \ + (str_wide[j + fg->wlen] == TRE_CHAR('_')))) : \ + ((j + fg->len == len) || !(tre_isalnum(str_byte[j + fg->len]) || \ + (str_byte[j + fg->len] == '_')))) + +/* + * Condition to check whether the match on position j is on a + * word boundary. + */ +#define IS_ON_WORD_BOUNDARY \ + (_BBOUND_COND && _EBOUND_COND) + +/* + * Checks word boundary and shifts one if match is not on a + * boundary. + */ +#define CHECK_WORD_BOUNDARY \ + if (!IS_ON_WORD_BOUNDARY) \ + _SHIFT_ONE; + +#define _BOL_COND \ + ((j == 0) || ((type == STR_WIDE) ? (str_wide[j - 1] == TRE_CHAR('\n'))\ + : (str_byte[j - 1] == '\n'))) + +/* + * Checks BOL anchor and shifts one if match is not on a + * boundary. + */ +#define CHECK_BOL_ANCHOR \ + if (!_BOL_COND) \ + _SHIFT_ONE; + +#define _EOL_COND \ + ((type == STR_WIDE) \ + ? ((j + fg->wlen == len) || \ + (str_wide[j + fg->wlen] == TRE_CHAR('\n'))) \ + : ((j + fg->len == len) || (str_byte[j + fg->wlen] == '\n'))) + +/* + * Checks EOL anchor and shifts one if match is not on a + * boundary. + */ +#define CHECK_EOL_ANCHOR \ + if (!_EOL_COND) \ + _SHIFT_ONE; + +/* + * Executes matching of the precompiled pattern on the input string. + * Returns REG_OK or REG_NOMATCH depending on if we find a match or not. + */ +int +tre_match_fast(const fastmatch_t *fg, const void *data, size_t len, + tre_str_type_t type, int nmatch, regmatch_t pmatch[], int eflags) +{ + unsigned int shift, u = 0, v = 0; + ssize_t j = 0; + int ret = REG_NOMATCH; + int mismatch; + const char *str_byte = data; + const void *startptr = NULL; + const tre_char_t *str_wide = data; + + /* Calculate length if unspecified. */ + if (len == (size_t)-1) + switch (type) + { + case STR_WIDE: + len = tre_strlen(str_wide); + break; + default: + len = strlen(str_byte); + break; + } + + /* Shortcut for empty pattern */ + if (fg->matchall) + { + if (!fg->nosub && nmatch >= 1) + { + pmatch[0].rm_so = 0; + pmatch[0].rm_eo = len; + } + if (fg->bol && fg->eol) + return (len == 0) ? REG_OK : REG_NOMATCH; + else + return REG_OK; + } + + /* No point in going farther if we do not have enough data. */ + switch (type) + { + case STR_WIDE: + if (len < fg->wlen) + return ret; + shift = fg->wlen; + break; + default: + if (len < fg->len) + return ret; + shift = fg->len; + } + + /* + * REG_NOTBOL means not anchoring ^ to the beginning of the line, so we + * can shift one because there can't be a match at the beginning. + */ + if (fg->bol && (eflags & REG_NOTBOL)) + j = 1; + + /* + * Like above, we cannot have a match at the very end when anchoring to + * the end and REG_NOTEOL is specified. + */ + if (fg->eol && (eflags & REG_NOTEOL)) + len--; + + if (fg->reversed) + j = len - (type == STR_WIDE ? fg->wlen : fg->len); + + + /* Only try once at the beginning or ending of the line. */ + if ((fg->bol || fg->eol) && !fg->newline && !(eflags & REG_NOTBOL) && + !(eflags & REG_NOTEOL)) + { + /* Simple text comparison. */ + if (!((fg->bol && fg->eol) && + (type == STR_WIDE ? (len != fg->wlen) : (len != fg->len)))) + { + /* Determine where in data to start search at. */ + j = fg->eol ? len - (type == STR_WIDE ? fg->wlen : fg->len) : 0; + SKIP_CHARS(j); + mismatch = fastcmp(fg, startptr, type); + if (mismatch == REG_OK) + { + if (fg->word && !IS_ON_WORD_BOUNDARY) + return ret; + if (!fg->nosub && nmatch >= 1) + { + pmatch[0].rm_so = j; + pmatch[0].rm_eo = j + (type == STR_WIDE ? fg->wlen : fg->len); + } + return REG_OK; + } + } + } + else + { + /* Quick Search / Turbo Boyer-Moore algorithm. */ + do + { + SKIP_CHARS(j); + mismatch = fastcmp(fg, startptr, type); + if (mismatch == REG_OK) + { + if (fg->word) + CHECK_WORD_BOUNDARY; + if (fg->bol) + CHECK_BOL_ANCHOR; + if (fg->eol) + CHECK_EOL_ANCHOR; + if (!fg->nosub && nmatch >= 1) + { + pmatch[0].rm_so = j; + pmatch[0].rm_eo = j + ((type == STR_WIDE) ? fg->wlen : fg->len); + } + return REG_OK; + } + else if (mismatch > 0) + return mismatch; + mismatch = -mismatch - 1; + SHIFT; + } while (!IS_OUT_OF_BOUNDS); + } + return ret; +} + +/* + * Frees the resources that were allocated when the pattern was compiled. + */ +void +tre_free_fast(fastmatch_t *fg) +{ + + DPRINT(("tre_fast_free: freeing structures for pattern %s\n", + fg->pattern)); + +#ifdef TRE_WCHAR + hashtable_free(fg->qsBc_table); + if (!fg->hasdot) + xfree(fg->bmGs); + if (fg->wescmap) + xfree(fg->wescmap); + xfree(fg->wpattern); +#endif + if (!fg->hasdot) + xfree(fg->sbmGs); + if (fg->escmap) + xfree(fg->escmap); + xfree(fg->pattern); +} + +/* + * Returns: -(i + 1) on failure (position that it failed with minus sign) + * error code on error + * REG_OK on success + */ +static inline int +fastcmp(const fastmatch_t *fg, const void *data, tre_str_type_t type) +{ + const char *str_byte = data; + const char *pat_byte = fg->pattern; + const tre_char_t *str_wide = data; + const tre_char_t *pat_wide = fg->wpattern; + const bool *escmap = (type == STR_WIDE) ? fg->wescmap : fg->escmap; + size_t len = (type == STR_WIDE) ? fg->wlen : fg->len; + int ret = REG_OK; + + /* Compare the pattern and the input char-by-char from the last position. */ + for (int i = len - 1; i >= 0; i--) { + switch (type) + { + case STR_WIDE: + + /* Check dot */ + if (fg->hasdot && pat_wide[i] == TRE_CHAR('.') && + (!escmap || !escmap[i]) && + (!fg->newline || (str_wide[i] != TRE_CHAR('\n')))) + continue; + + /* Compare */ + if (fg->icase ? (towlower(pat_wide[i]) == towlower(str_wide[i])) + : (pat_wide[i] == str_wide[i])) + continue; + break; + default: + /* Check dot */ + if (fg->hasdot && pat_byte[i] == '.' && + (!escmap || !escmap[i]) && + (!fg->newline || (str_byte[i] != '\n'))) + continue; + + /* Compare */ + if (fg->icase ? (tolower(pat_byte[i]) == tolower(str_byte[i])) + : (pat_byte[i] == str_byte[i])) + continue; + } + DPRINT(("fastcmp: mismatch at position %d\n", i)); + ret = -(i + 1); + break; + } + return ret; +} diff --git a/usr.bin/grep/regex/tre-fastmatch.h b/usr.bin/grep/regex/tre-fastmatch.h new file mode 100644 index 0000000..f72397c --- /dev/null +++ b/usr.bin/grep/regex/tre-fastmatch.h @@ -0,0 +1,21 @@ +/* $FreeBSD$ */ + +#ifndef TRE_FASTMATCH_H +#define TRE_FASTMATCH_H 1 + +#include <fastmatch.h> +#include <hashtable.h> +#include <limits.h> +#include <regex.h> +#include <stdbool.h> + +#include "hashtable.h" + +int tre_compile_literal(fastmatch_t *preg, const tre_char_t *regex, + size_t, int); +int tre_compile_fast(fastmatch_t *preg, const tre_char_t *regex, size_t, int); +int tre_match_fast(const fastmatch_t *fg, const void *data, size_t len, + tre_str_type_t type, int nmatch, regmatch_t pmatch[], int eflags); +void tre_free_fast(fastmatch_t *preg); + +#endif /* TRE_FASTMATCH_H */ diff --git a/usr.bin/grep/regex/xmalloc.c b/usr.bin/grep/regex/xmalloc.c new file mode 100644 index 0000000..b1acc9c --- /dev/null +++ b/usr.bin/grep/regex/xmalloc.c @@ -0,0 +1,349 @@ +/* $FreeBSD$ */ + +/* + xmalloc.c - Simple malloc debugging library implementation + + This software is released under a BSD-style license. + See the file LICENSE for details and copyright. + +*/ + +/* + TODO: + - red zones + - group dumps by source location +*/ + +#include <stdlib.h> +#include <assert.h> +#include <stdio.h> +#define XMALLOC_INTERNAL 1 +#include "xmalloc.h" + + +/* + Internal stuff. +*/ + +typedef struct hashTableItemRec { + void *ptr; + int bytes; + const char *file; + int line; + const char *func; + struct hashTableItemRec *next; +} hashTableItem; + +typedef struct { + hashTableItem **table; +} hashTable; + +static int xmalloc_peak; +int xmalloc_current; +static int xmalloc_peak_blocks; +int xmalloc_current_blocks; +static int xmalloc_fail_after; + +#define TABLE_BITS 8 +#define TABLE_MASK ((1 << TABLE_BITS) - 1) +#define TABLE_SIZE (1 << TABLE_BITS) + +static hashTable * +hash_table_new(void) +{ + hashTable *tbl; + + tbl = malloc(sizeof(*tbl)); + + if (tbl != NULL) + { + tbl->table = calloc(TABLE_SIZE, sizeof(*tbl->table)); + + if (tbl->table == NULL) + { + free(tbl); + return NULL; + } + } + + return tbl; +} + +static int +hash_void_ptr(void *ptr) +{ + int hash; + int i; + + /* I took this hash function just off the top of my head, I have + no idea whether it is bad or very bad. */ + hash = 0; + for (i = 0; i < (int)sizeof(ptr)*8 / TABLE_BITS; i++) + { + hash ^= (unsigned long)ptr >> i*8; + hash += i * 17; + hash &= TABLE_MASK; + } + return hash; +} + +static void +hash_table_add(hashTable *tbl, void *ptr, int bytes, + const char *file, int line, const char *func) +{ + int i; + hashTableItem *item, *new; + + i = hash_void_ptr(ptr); + + item = tbl->table[i]; + if (item != NULL) + while (item->next != NULL) + item = item->next; + + new = malloc(sizeof(*new)); + assert(new != NULL); + new->ptr = ptr; + new->bytes = bytes; + new->file = file; + new->line = line; + new->func = func; + new->next = NULL; + if (item != NULL) + item->next = new; + else + tbl->table[i] = new; + + xmalloc_current += bytes; + if (xmalloc_current > xmalloc_peak) + xmalloc_peak = xmalloc_current; + xmalloc_current_blocks++; + if (xmalloc_current_blocks > xmalloc_peak_blocks) + xmalloc_peak_blocks = xmalloc_current_blocks; +} + +static void +hash_table_del(hashTable *tbl, void *ptr) +{ + int i; + hashTableItem *item, *prev; + + i = hash_void_ptr(ptr); + + item = tbl->table[i]; + if (item == NULL) + { + printf("xfree: invalid ptr %p\n", ptr); + abort(); + } + prev = NULL; + while (item->ptr != ptr) + { + prev = item; + item = item->next; + } + if (item->ptr != ptr) + { + printf("xfree: invalid ptr %p\n", ptr); + abort(); + } + + xmalloc_current -= item->bytes; + xmalloc_current_blocks--; + + if (prev != NULL) + { + prev->next = item->next; + free(item); + } + else + { + tbl->table[i] = item->next; + free(item); + } +} + +static hashTable *xmalloc_table = NULL; + +static void +xmalloc_init(void) +{ + if (xmalloc_table == NULL) + { + xmalloc_table = hash_table_new(); + xmalloc_peak = 0; + xmalloc_peak_blocks = 0; + xmalloc_current = 0; + xmalloc_current_blocks = 0; + xmalloc_fail_after = -1; + } + assert(xmalloc_table != NULL); + assert(xmalloc_table->table != NULL); +} + + + +/* + Public API. +*/ + +void +xmalloc_configure(int fail_after) +{ + xmalloc_init(); + xmalloc_fail_after = fail_after; +} + +int +xmalloc_dump_leaks(void) +{ + int i; + int num_leaks = 0; + int leaked_bytes = 0; + hashTableItem *item; + + xmalloc_init(); + + for (i = 0; i < TABLE_SIZE; i++) + { + item = xmalloc_table->table[i]; + while (item != NULL) + { + printf("%s:%d: %s: %d bytes at %p not freed\n", + item->file, item->line, item->func, item->bytes, item->ptr); + num_leaks++; + leaked_bytes += item->bytes; + item = item->next; + } + } + if (num_leaks == 0) + printf("No memory leaks.\n"); + else + printf("%d unfreed memory chuncks, total %d unfreed bytes.\n", + num_leaks, leaked_bytes); + printf("Peak memory consumption %d bytes (%.1f kB, %.1f MB) in %d blocks ", + xmalloc_peak, (double)xmalloc_peak / 1024, + (double)xmalloc_peak / (1024*1024), xmalloc_peak_blocks); + printf("(average "); + if (xmalloc_peak_blocks) + printf("%d", ((xmalloc_peak + xmalloc_peak_blocks / 2) + / xmalloc_peak_blocks)); + else + printf("N/A"); + printf(" bytes per block).\n"); + + return num_leaks; +} + +void * +xmalloc_impl(size_t size, const char *file, int line, const char *func) +{ + void *ptr; + + xmalloc_init(); + assert(size > 0); + + if (xmalloc_fail_after == 0) + { + xmalloc_fail_after = -2; +#if 0 + printf("xmalloc: forced failure %s:%d: %s\n", file, line, func); +#endif + return NULL; + } + else if (xmalloc_fail_after == -2) + { + printf("xmalloc: called after failure from %s:%d: %s\n", + file, line, func); + assert(0); + } + else if (xmalloc_fail_after > 0) + xmalloc_fail_after--; + + ptr = malloc(size); + if (ptr != NULL) + hash_table_add(xmalloc_table, ptr, (int)size, file, line, func); + return ptr; +} + +void * +xcalloc_impl(size_t nmemb, size_t size, const char *file, int line, + const char *func) +{ + void *ptr; + + xmalloc_init(); + assert(size > 0); + + if (xmalloc_fail_after == 0) + { + xmalloc_fail_after = -2; +#if 0 + printf("xcalloc: forced failure %s:%d: %s\n", file, line, func); +#endif + return NULL; + } + else if (xmalloc_fail_after == -2) + { + printf("xcalloc: called after failure from %s:%d: %s\n", + file, line, func); + assert(0); + } + else if (xmalloc_fail_after > 0) + xmalloc_fail_after--; + + ptr = calloc(nmemb, size); + if (ptr != NULL) + hash_table_add(xmalloc_table, ptr, (int)(nmemb * size), file, line, func); + return ptr; +} + +void +xfree_impl(void *ptr, const char *file, int line, const char *func) +{ + /*LINTED*/(void)&file; + /*LINTED*/(void)&line; + /*LINTED*/(void)&func; + xmalloc_init(); + + if (ptr != NULL) + hash_table_del(xmalloc_table, ptr); + free(ptr); +} + +void * +xrealloc_impl(void *ptr, size_t new_size, const char *file, int line, + const char *func) +{ + void *new_ptr; + + xmalloc_init(); + assert(ptr != NULL); + assert(new_size > 0); + + if (xmalloc_fail_after == 0) + { + xmalloc_fail_after = -2; + return NULL; + } + else if (xmalloc_fail_after == -2) + { + printf("xrealloc: called after failure from %s:%d: %s\n", + file, line, func); + assert(0); + } + else if (xmalloc_fail_after > 0) + xmalloc_fail_after--; + + new_ptr = realloc(ptr, new_size); + if (new_ptr != NULL) + { + hash_table_del(xmalloc_table, ptr); + hash_table_add(xmalloc_table, new_ptr, (int)new_size, file, line, func); + } + return new_ptr; +} + + + +/* EOF */ diff --git a/usr.bin/grep/regex/xmalloc.h b/usr.bin/grep/regex/xmalloc.h new file mode 100644 index 0000000..5cde986 --- /dev/null +++ b/usr.bin/grep/regex/xmalloc.h @@ -0,0 +1,79 @@ +/* $FreeBSD$ */ + +/* + xmalloc.h - Simple malloc debugging library API + + This software is released under a BSD-style license. + See the file LICENSE for details and copyright. + +*/ + +#ifndef _XMALLOC_H +#define _XMALLOC_H 1 + +void *xmalloc_impl(size_t size, const char *file, int line, const char *func); +void *xcalloc_impl(size_t nmemb, size_t size, const char *file, int line, + const char *func); +void xfree_impl(void *ptr, const char *file, int line, const char *func); +void *xrealloc_impl(void *ptr, size_t new_size, const char *file, int line, + const char *func); +int xmalloc_dump_leaks(void); +void xmalloc_configure(int fail_after); + + +#ifndef XMALLOC_INTERNAL +#ifdef MALLOC_DEBUGGING + +/* Version 2.4 and later of GCC define a magical variable `__PRETTY_FUNCTION__' + which contains the name of the function currently being defined. +# define __XMALLOC_FUNCTION __PRETTY_FUNCTION__ + This is broken in G++ before version 2.6. + C9x has a similar variable called __func__, but prefer the GCC one since + it demangles C++ function names. */ +# ifdef __GNUC__ +# if __GNUC__ > 2 || (__GNUC__ == 2 \ + && __GNUC_MINOR__ >= (defined __cplusplus ? 6 : 4)) +# define __XMALLOC_FUNCTION __PRETTY_FUNCTION__ +# else +# define __XMALLOC_FUNCTION ((const char *) 0) +# endif +# else +# if defined __STDC_VERSION__ && __STDC_VERSION__ >= 199901L +# define __XMALLOC_FUNCTION __func__ +# else +# define __XMALLOC_FUNCTION ((const char *) 0) +# endif +# endif + +#define xmalloc(size) xmalloc_impl(size, __FILE__, __LINE__, \ + __XMALLOC_FUNCTION) +#define xcalloc(nmemb, size) xcalloc_impl(nmemb, size, __FILE__, __LINE__, \ + __XMALLOC_FUNCTION) +#define xfree(ptr) xfree_impl(ptr, __FILE__, __LINE__, __XMALLOC_FUNCTION) +#define xrealloc(ptr, new_size) xrealloc_impl(ptr, new_size, __FILE__, \ + __LINE__, __XMALLOC_FUNCTION) +#undef malloc +#undef calloc +#undef free +#undef realloc + +#define malloc USE_XMALLOC_INSTEAD_OF_MALLOC +#define calloc USE_XCALLOC_INSTEAD_OF_CALLOC +#define free USE_XFREE_INSTEAD_OF_FREE +#define realloc USE_XREALLOC_INSTEAD_OF_REALLOC + +#else /* !MALLOC_DEBUGGING */ + +#include <stdlib.h> + +#define xmalloc(size) malloc(size) +#define xcalloc(nmemb, size) calloc(nmemb, size) +#define xfree(ptr) free(ptr) +#define xrealloc(ptr, new_size) realloc(ptr, new_size) + +#endif /* !MALLOC_DEBUGGING */ +#endif /* !XMALLOC_INTERNAL */ + +#endif /* _XMALLOC_H */ + +/* EOF */ |