diff options
author | tjr <tjr@FreeBSD.org> | 2004-07-29 03:13:10 +0000 |
---|---|---|
committer | tjr <tjr@FreeBSD.org> | 2004-07-29 03:13:10 +0000 |
commit | 76ab8ea7ebc4775b3a4c011fae8330ce56baf20d (patch) | |
tree | 3b396e55f2bcaae193c252396711e47731f54a2b /lib/libc | |
parent | 206c50d39399b1c5bdae5fd5f7414938e67156b5 (diff) | |
download | FreeBSD-src-76ab8ea7ebc4775b3a4c011fae8330ce56baf20d.zip FreeBSD-src-76ab8ea7ebc4775b3a4c011fae8330ce56baf20d.tar.gz |
Add support for multibyte characters.
Diffstat (limited to 'lib/libc')
-rw-r--r-- | lib/libc/gen/fnmatch.c | 161 |
1 files changed, 112 insertions, 49 deletions
```diff
diff --git a/lib/libc/gen/fnmatch.c b/lib/libc/gen/fnmatch.c
index a07a2d0..3c3377f 100644
--- a/lib/libc/gen/fnmatch.c
+++ b/lib/libc/gen/fnmatch.c
@@ -45,10 +45,22 @@ __FBSDID("$FreeBSD$");
  * Compares a filename or pathname to a pattern.
  */
 
-#include <ctype.h>
+/*
+ * Some notes on multibyte character support:
+ * 1. Patterns with illegal byte sequences match nothing.
+ * 2. Illegal byte sequences in the "string" argument are handled by treating
+ *    them as single-byte characters with a value of the first byte of the
+ *    sequence cast to wchar_t.
+ * 3. Multibyte conversion state objects (mbstate_t) are passed around and
+ *    used for most, but not all, conversions. Further work will be required
+ *    to support state-dependent encodings.
+ */
+
 #include <fnmatch.h>
+#include <limits.h>
 #include <string.h>
-#include <stdio.h>
+#include <wchar.h>
+#include <wctype.h>
 
 #include "collate.h"
 
@@ -58,33 +70,57 @@ __FBSDID("$FreeBSD$");
 #define	RANGE_NOMATCH	0
 #define	RANGE_ERROR	(-1)
 
-static int rangematch(const char *, char, int, char **);
+static int rangematch(const char *, wchar_t, int, char **, mbstate_t *);
+static int fnmatch1(const char *, const char *, int, mbstate_t, mbstate_t);
 
 int
 fnmatch(pattern, string, flags)
 	const char *pattern, *string;
 	int flags;
 {
+	static const mbstate_t initial;
+
+	return (fnmatch1(pattern, string, flags, initial, initial));
+}
+
+static int
+fnmatch1(pattern, string, flags, patmbs, strmbs)
+	const char *pattern, *string;
+	int flags;
+	mbstate_t patmbs, strmbs;
+{
 	const char *stringstart;
 	char *newp;
-	char c, test;
+	char c;
+	wchar_t pc, sc;
+	size_t pclen, sclen;
 
-	for (stringstart = string;;)
-		switch (c = *pattern++) {
+	for (stringstart = string;;) {
+		pclen = mbrtowc(&pc, pattern, MB_LEN_MAX, &patmbs);
+		if (pclen == (size_t)-1 || pclen == (size_t)-2)
+			return (FNM_NOMATCH);
+		pattern += pclen;
+		sclen = mbrtowc(&sc, string, MB_LEN_MAX, &strmbs);
+		if (sclen == (size_t)-1 || sclen == (size_t)-2) {
+			sc = (unsigned char)*string;
+			sclen = 1;
+			memset(&strmbs, 0, sizeof(strmbs));
+		}
+		switch (pc) {
 		case EOS:
-			if ((flags & FNM_LEADING_DIR) && *string == '/')
+			if ((flags & FNM_LEADING_DIR) && sc == '/')
 				return (0);
-			return (*string == EOS ? 0 : FNM_NOMATCH);
+			return (sc == EOS ? 0 : FNM_NOMATCH);
 		case '?':
-			if (*string == EOS)
+			if (sc == EOS)
 				return (FNM_NOMATCH);
-			if (*string == '/' && (flags & FNM_PATHNAME))
+			if (sc == '/' && (flags & FNM_PATHNAME))
 				return (FNM_NOMATCH);
-			if (*string == '.' && (flags & FNM_PERIOD) &&
+			if (sc == '.' && (flags & FNM_PERIOD) &&
 			    (string == stringstart ||
 			    ((flags & FNM_PATHNAME) && *(string - 1) == '/')))
 				return (FNM_NOMATCH);
-			++string;
+			string += sclen;
 			break;
 		case '*':
 			c = *pattern;
@@ -92,7 +128,7 @@ fnmatch(pattern, string, flags)
 			while (c == '*')
 				c = *++pattern;
 
-			if (*string == '.' && (flags & FNM_PERIOD) &&
+			if (sc == '.' && (flags & FNM_PERIOD) &&
 			    (string == stringstart ||
 			    ((flags & FNM_PATHNAME) && *(string - 1) == '/')))
 				return (FNM_NOMATCH);
@@ -112,25 +148,35 @@ fnmatch(pattern, string, flags)
 			}
 
 			/* General case, use recursion. */
-			while ((test = *string) != EOS) {
-				if (!fnmatch(pattern, string, flags & ~FNM_PERIOD))
+			while (sc != EOS) {
+				if (!fnmatch1(pattern, string,
+				    flags & ~FNM_PERIOD, patmbs, strmbs))
 					return (0);
-				if (test == '/' && flags & FNM_PATHNAME)
+				sclen = mbrtowc(&sc, string, MB_LEN_MAX,
+				    &strmbs);
+				if (sclen == (size_t)-1 ||
+				    sclen == (size_t)-2) {
+					sc = (unsigned char)*string;
+					sclen = 1;
+					memset(&strmbs, 0, sizeof(strmbs));
+				}
+				if (sc == '/' && flags & FNM_PATHNAME)
 					break;
-				++string;
+				string += sclen;
 			}
 			return (FNM_NOMATCH);
 		case '[':
-			if (*string == EOS)
+			if (sc == EOS)
 				return (FNM_NOMATCH);
-			if (*string == '/' && (flags & FNM_PATHNAME))
+			if (sc == '/' && (flags & FNM_PATHNAME))
 				return (FNM_NOMATCH);
-			if (*string == '.' && (flags & FNM_PERIOD) &&
+			if (sc == '.' && (flags & FNM_PERIOD) &&
 			    (string == stringstart ||
 			    ((flags & FNM_PATHNAME) && *(string - 1) == '/')))
 				return (FNM_NOMATCH);
 
-			switch (rangematch(pattern, *string, flags, &newp)) {
+			switch (rangematch(pattern, sc, flags, &newp,
+			    &patmbs)) {
 			case RANGE_ERROR:
 				goto norm;
 			case RANGE_MATCH:
@@ -139,41 +185,47 @@ fnmatch(pattern, string, flags)
 			case RANGE_NOMATCH:
 				return (FNM_NOMATCH);
 			}
-			++string;
+			string += sclen;
 			break;
 		case '\\':
 			if (!(flags & FNM_NOESCAPE)) {
-				if ((c = *pattern++) == EOS) {
-					c = '\\';
-					--pattern;
-				}
+				pclen = mbrtowc(&pc, pattern, MB_LEN_MAX,
+				    &patmbs);
+				if (pclen == (size_t)-1 || pclen == (size_t)-2)
+					return (FNM_NOMATCH);
+				if (pclen == 0)
+					pc = '\\';
+				pattern += pclen;
 			}
 			/* FALLTHROUGH */
 		default:
 		norm:
-			if (c == *string)
+			if (pc == sc)
 				;
 			else if ((flags & FNM_CASEFOLD) &&
-				 (tolower((unsigned char)c) ==
-				  tolower((unsigned char)*string)))
+				 (towlower(pc) == towlower(sc)))
 				;
 			else
 				return (FNM_NOMATCH);
-			string++;
+			string += sclen;
 			break;
 		}
+	}
 	/* NOTREACHED */
 }
 
 static int
-rangematch(pattern, test, flags, newp)
+rangematch(pattern, test, flags, newp, patmbs)
 	const char *pattern;
-	char test;
+	wchar_t test;
 	int flags;
 	char **newp;
+	mbstate_t *patmbs;
 {
 	int negate, ok;
-	char c, c2;
+	wchar_t c, c2;
+	size_t pclen;
+	const char *origpat;
 
 	/*
 	 * A bracket expression starting with an unquoted circumflex
@@ -186,7 +238,7 @@ rangematch(pattern, test, flags, newp)
 		++pattern;
 
 	if (flags & FNM_CASEFOLD)
-		test = tolower((unsigned char)test);
+		test = towlower(test);
 
 	/*
 	 * A right bracket shall lose its special meaning and represent
@@ -194,29 +246,40 @@ rangematch(pattern, test, flags, newp)
 	 * -- POSIX.2 2.8.3.2
 	 */
 	ok = 0;
-	c = *pattern++;
-	do {
-		if (c == '\\' && !(flags & FNM_NOESCAPE))
-			c = *pattern++;
-		if (c == EOS)
+	origpat = pattern;
+	for (;;) {
+		if (*pattern == ']' && pattern > origpat) {
+			pattern++;
+			break;
+		} else if (*pattern == '\0') {
 			return (RANGE_ERROR);
-
-		if (c == '/' && (flags & FNM_PATHNAME))
+		} else if (*pattern == '/' && (flags & FNM_PATHNAME)) {
+			pattern++;
+			return (RANGE_NOMATCH);
+		} else if (*pattern == '\\' && !(flags & FNM_NOESCAPE))
+			pattern++;
+		pclen = mbrtowc(&c, pattern, MB_LEN_MAX, patmbs);
+		if (pclen == (size_t)-1 || pclen == (size_t)-2)
 			return (RANGE_NOMATCH);
+		pattern += pclen;
 
 		if (flags & FNM_CASEFOLD)
-			c = tolower((unsigned char)c);
+			c = towlower(c);
 
-		if (*pattern == '-'
-		    && (c2 = *(pattern+1)) != EOS && c2 != ']') {
-			pattern += 2;
-			if (c2 == '\\' && !(flags & FNM_NOESCAPE))
-				c2 = *pattern++;
+		if (*pattern == '-' && *(pattern + 1) != EOS &&
+		    *(pattern + 1) != ']') {
+			if (*++pattern == '\\' && !(flags & FNM_NOESCAPE))
+				if (*pattern != EOS)
+					pattern++;
+			pclen = mbrtowc(&c2, pattern, MB_LEN_MAX, patmbs);
+			if (pclen == (size_t)-1 || pclen == (size_t)-2)
+				return (RANGE_NOMATCH);
+			pattern += pclen;
 			if (c2 == EOS)
 				return (RANGE_ERROR);
 
 			if (flags & FNM_CASEFOLD)
-				c2 = tolower((unsigned char)c2);
+				c2 = towlower(c2);
 
 			if (__collate_load_error ?
 			    c <= test && test <= c2 :
@@ -226,7 +289,7 @@ rangematch(pattern, test, flags, newp)
 				ok = 1;
 		} else if (c == test)
 			ok = 1;
-	} while ((c = *pattern++) != ']');
+	}
 
 	*newp = (char *)pattern;
 	return (ok == negate ? RANGE_NOMATCH : RANGE_MATCH);
```