diff options
author | tjr <tjr@FreeBSD.org> | 2004-07-09 02:08:07 +0000 |
---|---|---|
committer | tjr <tjr@FreeBSD.org> | 2004-07-09 02:08:07 +0000 |
commit | d291df1e3f703e3724c6d1f0e4bf796ba44b546e (patch) | |
tree | 1c3e4abaa437f441d05a1cc08f7e7b91684fbad4 /usr.bin/tr/str.c | |
parent | fb654efba8e1f77bca7a08d83618ffdac3c226a0 (diff) | |
download | FreeBSD-src-d291df1e3f703e3724c6d1f0e4bf796ba44b546e.zip FreeBSD-src-d291df1e3f703e3724c6d1f0e4bf796ba44b546e.tar.gz |
Add support for multibyte characters. The challenge here was to use
data structures that scale better with large character sets, instead of
arrays indexed by character value:
- Sets of characters to delete/squeeze are stored in a new "cset" structure,
which is implemented as a splay tree of extents. This structure has the
ability to store character classes (à la wctype(3)), but this is not
currently fully utilized.
- Mappings between characters are stored in a new "cmap" structure, which
is also a splay tree.
- The parser no longer builds arrays containing all the characters in a
particular class; instead, next() determines them on-the-fly using
nextwctype(3).
Diffstat (limited to 'usr.bin/tr/str.c')
-rw-r--r-- | usr.bin/tr/str.c | 163 |
1 files changed, 77 insertions, 86 deletions
diff --git a/usr.bin/tr/str.c b/usr.bin/tr/str.c index 3365caf..f28b243 100644 --- a/usr.bin/tr/str.c +++ b/usr.bin/tr/str.c @@ -44,26 +44,31 @@ static const char sccsid[] = "@(#)str.c 8.2 (Berkeley) 4/28/95"; #include <ctype.h> #include <err.h> +#include <errno.h> #include <stddef.h> #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <wchar.h> +#include <wctype.h> #include "extern.h" static int backslash(STR *, int *); static int bracket(STR *); -static int c_class(const void *, const void *); static void genclass(STR *); static void genequiv(STR *); static int genrange(STR *, int); static void genseq(STR *); -int +wint_t next(s) STR *s; { - int ch, is_octal; + int is_octal; + wint_t ch; + wchar_t wch; + size_t clen; switch (s->state) { case EOS: @@ -71,7 +76,7 @@ next(s) case INFINITE: return (1); case NORMAL: - switch (ch = (u_char)*s->str) { + switch (*s->str) { case '\0': s->state = EOS; return (0); @@ -83,9 +88,13 @@ next(s) return (next(s)); /* FALLTHROUGH */ default: + clen = mbrtowc(&wch, s->str, MB_LEN_MAX, NULL); + if (clen == (size_t)-1 || clen == (size_t)-2 || + clen == 0) + errc(1, EILSEQ, NULL); is_octal = 0; - ++s->str; - s->lastch = ch; + s->lastch = wch; + s->str += clen; break; } @@ -106,9 +115,18 @@ next(s) return (next(s)); } return (1); + case CCLASS: + case CCLASS_UPPER: + case CCLASS_LOWER: + s->cnt++; + ch = nextwctype(s->lastch, s->cclass); + if (ch == -1) { + s->state = NORMAL; + return (next(s)); + } + s->lastch = ch; + return (1); case SET: - case SET_UPPER: - case SET_LOWER: if ((ch = s->set[s->cnt++]) == OOBCH) { s->state = NORMAL; return (next(s)); @@ -159,74 +177,21 @@ bracket(s) /* NOTREACHED */ } -typedef struct { - const char *name; - int (*func)(int); - int *set; -} CLASS; - -static CLASS classes[] = { -#undef isalnum - { "alnum", isalnum, NULL }, -#undef isalpha - { "alpha", isalpha, NULL }, -#undef isblank - { "blank", isblank, NULL }, -#undef iscntrl - { "cntrl", iscntrl, NULL }, -#undef isdigit - { "digit", 
isdigit, NULL }, -#undef isgraph - { "graph", isgraph, NULL }, -#undef islower - { "lower", islower, NULL }, -#undef isprint - { "print", isprint, NULL }, -#undef ispunct - { "punct", ispunct, NULL }, -#undef isspace - { "space", isspace, NULL }, -#undef isupper - { "upper", isupper, NULL }, -#undef isxdigit - { "xdigit", isxdigit, NULL }, -}; - static void genclass(s) STR *s; { - int cnt, (*func)(int); - CLASS *cp, tmp; - int *p; - tmp.name = s->str; - if ((cp = (CLASS *)bsearch(&tmp, classes, sizeof(classes) / - sizeof(CLASS), sizeof(CLASS), c_class)) == NULL) + if ((s->cclass = wctype(s->str)) == 0) errx(1, "unknown class %s", s->str); - - if ((cp->set = p = malloc((NCHARS + 1) * sizeof(int))) == NULL) - err(1, "genclass() malloc"); - for (cnt = 0, func = cp->func; cnt < NCHARS; ++cnt) - if ((func)(cnt)) - *p++ = cnt; - *p = OOBCH; - s->cnt = 0; - s->set = cp->set; + s->lastch = -1; /* incremented before check in next() */ if (strcmp(s->str, "upper") == 0) - s->state = SET_UPPER; + s->state = CCLASS_UPPER; else if (strcmp(s->str, "lower") == 0) - s->state = SET_LOWER; + s->state = CCLASS_LOWER; else - s->state = SET; -} - -static int -c_class(a, b) - const void *a, *b; -{ - return (strcmp(((const CLASS *)a)->name, ((const CLASS *)b)->name)); + s->state = CCLASS; } static void @@ -235,6 +200,8 @@ genequiv(s) { int i, p, pri; char src[2], dst[3]; + size_t clen; + wchar_t wc; if (*s->str == '\\') { s->equiv[0] = backslash(s, NULL); @@ -242,10 +209,13 @@ genequiv(s) errx(1, "misplaced equivalence equals sign"); s->str += 2; } else { - s->equiv[0] = s->str[0]; - if (s->str[1] != '=') + clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL); + if (clen == (size_t)-1 || clen == (size_t)-2 || clen == 0) + errc(1, EILSEQ, NULL); + s->equiv[0] = wc; + if (s->str[clen] != '=') errx(1, "misplaced equivalence equals sign"); - s->str += 3; + s->str += clen + 2; } /* @@ -255,12 +225,13 @@ genequiv(s) * XXX Knows too much about how strxfrm() is implemented. 
Assumes * it fills the string with primary collation weight bytes. Only one- * to-one mappings are supported. + * XXX Equivalence classes not supported in multibyte locales. */ - src[0] = s->equiv[0]; + src[0] = (char)s->equiv[0]; src[1] = '\0'; - if (strxfrm(dst, src, sizeof(dst)) == 1) { + if (MB_CUR_MAX == 1 && strxfrm(dst, src, sizeof(dst)) == 1) { pri = (unsigned char)*dst; - for (p = 1, i = 1; i < NCHARS; i++) { + for (p = 1, i = 1; i < NCHARS_SB; i++) { *src = i; if (strxfrm(dst, src, sizeof(dst)) == 1 && pri && pri == (unsigned char)*dst) @@ -280,28 +251,41 @@ genrange(STR *s, int was_octal) int stopval, octal; char *savestart; int n, cnt, *p; + size_t clen; + wchar_t wc; octal = 0; savestart = s->str; - stopval = *++s->str == '\\' ? backslash(s, &octal) : (u_char)*s->str++; - if (!octal) - octal = was_octal; - - if ((octal && stopval < s->lastch) || - (!octal && - charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0)) { - s->str = savestart; - return (0); + if (*++s->str == '\\') + stopval = backslash(s, &octal); + else { + clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL); + if (clen == (size_t)-1 || clen == (size_t)-2) + errc(1, EILSEQ, NULL); + stopval = wc; + s->str += clen; } - if (octal) { + /* + * XXX Characters are not ordered according to collating sequence in + * multibyte locales. 
+ */ + if (octal || was_octal || MB_CUR_MAX > 1) { + if (stopval < s->lastch) { + s->str = savestart; + return (0); + } s->cnt = stopval - s->lastch + 1; s->state = RANGE; --s->lastch; return (1); } - if ((s->set = p = malloc((NCHARS + 1) * sizeof(int))) == NULL) + if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) { + s->str = savestart; + return (0); + } + if ((s->set = p = malloc((NCHARS_SB + 1) * sizeof(int))) == NULL) err(1, "genrange() malloc"); - for (cnt = 0; cnt < NCHARS; cnt++) + for (cnt = 0; cnt < NCHARS_SB; cnt++) if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 && charcoll((const void *)&cnt, (const void *)&stopval) <= 0) *p++ = cnt; @@ -320,14 +304,21 @@ genseq(s) STR *s; { char *ep; + wchar_t wc; + size_t clen; if (s->which == STRING1) errx(1, "sequences only valid in string2"); if (*s->str == '\\') s->lastch = backslash(s, NULL); - else - s->lastch = *s->str++; + else { + clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL); + if (clen == (size_t)-1 || clen == (size_t)-2) + errc(1, EILSEQ, NULL); + s->lastch = wc; + s->str += clen; + } if (*s->str != '*') errx(1, "misplaced sequence asterisk"); |