From 9e984856c23b7c44c439ac5abc77b3532540fbbc Mon Sep 17 00:00:00 2001 From: tjr Date: Mon, 19 Jul 2004 11:12:02 +0000 Subject: Add support for multibyte characters. While here, fix a longstanding bug in the implementation of the -d option: we were skipping too many characters when a non-alphanumeric character was encountered. --- usr.bin/look/look.1 | 6 --- usr.bin/look/look.c | 121 +++++++++++++++++++++++++++++----------------------- 2 files changed, 68 insertions(+), 59 deletions(-) (limited to 'usr.bin/look') diff --git a/usr.bin/look/look.1 b/usr.bin/look/look.1 index 4c75787..c3d0db2 100644 --- a/usr.bin/look/look.1 +++ b/usr.bin/look/look.1 @@ -106,12 +106,6 @@ A utility appeared in .At v7 . .Sh BUGS -The -.Nm -utility does not recognize multibyte characters. -.Pp -Lines are not compared according to the current locale's collating -order. Input files must be sorted with .Ev LC_COLLATE set to diff --git a/usr.bin/look/look.c b/usr.bin/look/look.c index 33a1295..68d6755 100644 --- a/usr.bin/look/look.c +++ b/usr.bin/look/look.c @@ -60,7 +60,6 @@ __FBSDID("$FreeBSD$"); #include <sys/mman.h> #include <sys/stat.h> -#include <ctype.h> #include <err.h> #include <errno.h> #include <fcntl.h> @@ -70,33 +69,25 @@ __FBSDID("$FreeBSD$"); #include <stdlib.h> #include <string.h> #include <unistd.h> +#include <wchar.h> +#include <wctype.h> #include "pathnames.h" static char _path_words[] = _PATH_WORDS; -/* - * FOLD and DICT convert characters to a normal form for comparison, - * according to the user specified flags. - * - * DICT expects integers because it uses a non-character value to - * indicate a character which should not participate in comparisons. - */ #define EQUAL 0 #define GREATER 1 #define LESS (-1) -#define NO_COMPARE (-2) - -#define FOLD(c) (isupper(c) ? tolower(c) : (unsigned char) (c)) -#define DICT(c) (isalnum(c) ? 
(c) & 0xFF /* int */ : NO_COMPARE) int dflag, fflag; -char *binary_search(unsigned char *, unsigned char *, unsigned char *); -int compare(unsigned char *, unsigned char *, unsigned char *); -char *linear_search(unsigned char *, unsigned char *, unsigned char *); -int look(unsigned char *, unsigned char *, unsigned char *); -void print_from(unsigned char *, unsigned char *, unsigned char *); +char *binary_search(wchar_t *, unsigned char *, unsigned char *); +int compare(wchar_t *, unsigned char *, unsigned char *); +char *linear_search(wchar_t *, unsigned char *, unsigned char *); +int look(wchar_t *, unsigned char *, unsigned char *); +wchar_t *prepkey(const char *, wchar_t); +void print_from(wchar_t *, unsigned char *, unsigned char *); static void usage(void); @@ -104,14 +95,16 @@ int main(int argc, char *argv[]) { struct stat sb; - int ch, fd, termchar, match; - unsigned char *back, *front, *string, *p; + int ch, fd, match; + wchar_t termchar; + unsigned char *back, *front; unsigned const char *file; + wchar_t *key; (void) setlocale(LC_CTYPE, ""); file = _path_words; - termchar = '\0'; + termchar = L'\0'; while ((ch = getopt(argc, argv, "dft:")) != -1) switch(ch) { case 'd': @@ -121,7 +114,9 @@ main(int argc, char *argv[]) fflag = 1; break; case 't': - termchar = *optarg; + if (mbrtowc(&termchar, optarg, MB_LEN_MAX, NULL) != + strlen(optarg)) + errx(2, "invalid termination character"); break; case '?': default: @@ -134,12 +129,10 @@ main(int argc, char *argv[]) usage(); if (argc == 1) /* But set -df by default. 
*/ dflag = fflag = 1; - string = *argv++; + key = prepkey(*argv++, termchar); if (argc >= 2) file = *argv++; - if (termchar != '\0' && (p = strchr(string, termchar)) != NULL) - *++p = '\0'; match = 1; do { @@ -150,29 +143,47 @@ main(int argc, char *argv[]) if ((front = mmap(NULL, (size_t)sb.st_size, PROT_READ, MAP_SHARED, fd, (off_t)0)) == MAP_FAILED) err(2, "%s", file); back = front + sb.st_size; - match *= (look(string, front, back)); + match *= (look(key, front, back)); close(fd); } while (argc-- > 2 && (file = *argv++)); exit(match); } -int -look(unsigned char *string, unsigned char *front, unsigned char *back) +wchar_t * +prepkey(const char *string, wchar_t termchar) { - int ch; - unsigned char *readp, *writep; + const char *readp; + wchar_t *key, *writep; + wchar_t ch; + size_t clen; - /* Reformat string string to avoid doing it multiple times later. */ - for (readp = writep = string; (ch = *readp++);) { + /* + * Reformat search string and convert to wide character representation + * to avoid doing it multiple times later. 
+ */ + if ((key = malloc(sizeof(wchar_t) * (strlen(string) + 1))) == NULL) + err(2, NULL); + readp = string; + writep = key; + while ((clen = mbrtowc(&ch, readp, MB_LEN_MAX, NULL)) != 0) { + if (clen == (size_t)-1 || clen == (size_t)-2) + errc(2, EILSEQ, NULL); if (fflag) - ch = FOLD(ch); - if (dflag) - ch = DICT(ch); - if (ch != NO_COMPARE) - *(writep++) = ch; + ch = towlower(ch); + if (!dflag || iswalnum(ch)) + *writep++ = ch; + readp += clen; } - *writep = '\0'; + *writep = L'\0'; + if (termchar != L'\0' && (writep = wcschr(key, termchar)) != NULL) + *++writep = L'\0'; + return (key); +} + +int +look(wchar_t *string, unsigned char *front, unsigned char *back) +{ front = binary_search(string, front, back); front = linear_search(string, front, back); @@ -225,7 +236,7 @@ look(unsigned char *string, unsigned char *front, unsigned char *back) while (p < back && *p++ != '\n'); char * -binary_search(unsigned char *string, unsigned char *front, unsigned char *back) +binary_search(wchar_t *string, unsigned char *front, unsigned char *back) { unsigned char *p; @@ -259,7 +270,7 @@ binary_search(unsigned char *string, unsigned char *front, unsigned char *back) * o front is before or at the first line to be printed. */ char * -linear_search(unsigned char *string, unsigned char *front, unsigned char *back) +linear_search(wchar_t *string, unsigned char *front, unsigned char *back) { while (front < back) { switch (compare(string, front, back)) { @@ -281,7 +292,7 @@ linear_search(unsigned char *string, unsigned char *front, unsigned char *back) * Print as many lines as match string, starting at front. 
*/ void -print_from(unsigned char *string, unsigned char *front, unsigned char *back) +print_from(wchar_t *string, unsigned char *front, unsigned char *back) { for (; front < back && compare(string, front, back) == EQUAL; ++front) { for (; front < back && *front != '\n'; ++front) @@ -306,23 +317,27 @@ print_from(unsigned char *string, unsigned char *front, unsigned char *back) * "back" terminated). */ int -compare(unsigned char *s1, unsigned char *s2, unsigned char *back) +compare(wchar_t *s1, unsigned char *s2, unsigned char *back) { - int ch; - - for (; *s1 && s2 < back && *s2 != '\n'; ++s1, ++s2) { - ch = *s2; + wchar_t ch1, ch2; + size_t len2; + + for (; *s1 && s2 < back && *s2 != '\n'; ++s1, s2 += len2) { + ch1 = *s1; + len2 = mbrtowc(&ch2, s2, back - s2, NULL); + if (len2 == (size_t)-1 || len2 == (size_t)-2) { + ch2 = *s2; + len2 = 1; + } if (fflag) - ch = FOLD(ch); - if (dflag) - ch = DICT(ch); - - if (ch == NO_COMPARE) { - ++s2; /* Ignore character in comparison. */ + ch2 = towlower(ch2); + if (dflag && !iswalnum(ch2)) { + /* Ignore character in comparison. */ + --s1; continue; } - if (*s1 != ch) - return (*s1 < ch ? LESS : GREATER); + if (ch1 != ch2) + return (ch1 < ch2 ? LESS : GREATER); } return (*s1 ? GREATER : EQUAL); } -- cgit v1.1