From b7f5e217dda791d61c549a147e0e6ad6cd1b3f3d Mon Sep 17 00:00:00 2001 From: tjr Date: Wed, 14 Jul 2004 10:06:22 +0000 Subject: Make the 'y' (translate) command aware of multibyte characters. --- usr.bin/sed/compile.c | 82 +++++++++++++++++++++++++++++++++++++++++++-------- usr.bin/sed/defs.h | 16 +++++++++- usr.bin/sed/main.c | 1 + usr.bin/sed/misc.c | 1 + usr.bin/sed/process.c | 62 ++++++++++++++++++++++++++++++++++++-- 5 files changed, 145 insertions(+), 17 deletions(-) (limited to 'usr.bin') diff --git a/usr.bin/sed/compile.c b/usr.bin/sed/compile.c index f0a6c9b..afeaa01 100644 --- a/usr.bin/sed/compile.c +++ b/usr.bin/sed/compile.c @@ -47,12 +47,14 @@ static const char sccsid[] = "@(#)compile.c 8.1 (Berkeley) 6/6/93"; #include #include +#include #include #include #include #include #include #include +#include #include "defs.h" #include "extern.h" @@ -73,7 +75,7 @@ static char *compile_flags(char *, struct s_subst *); static char *compile_re(char *, regex_t **); static char *compile_subst(char *, struct s_subst *); static char *compile_text(void); -static char *compile_tr(char *, char **); +static char *compile_tr(char *, struct s_tr **); static struct s_command **compile_stream(struct s_command **); static char *duptoeol(char *, const char *); @@ -337,7 +339,7 @@ nonsel: /* Now parse the command */ break; case TR: /* y */ p++; - p = compile_tr(p, (char **)&cmd->u.y); + p = compile_tr(p, &cmd->u.y); EATSPACE(); if (*p == ';') { p++; @@ -619,12 +621,20 @@ compile_flags(char *p, struct s_subst *s) * Compile a translation set of strings into a lookup table. */ static char * -compile_tr(char *p, char **transtab) +compile_tr(char *p, struct s_tr **py) { + struct s_tr *y; int i; - char *lt, *op, *np; + const char *op, *np; char old[_POSIX2_LINE_MAX + 1]; char new[_POSIX2_LINE_MAX + 1]; + size_t oclen, oldlen, nclen, newlen; + mbstate_t mbs1, mbs2; + + if ((*py = y = malloc(sizeof(*y))) == NULL) + err(1, NULL); + y->multis = NULL; + y->nmultis = 0; if (*p == '\0' || *p == '\\') errx(1, @@ -639,17 +649,63 @@ compile_tr(char *p, char **transtab) errx(1, "%lu: %s: unterminated transform target string", linenum, fname); EATSPACE(); - if (strlen(new) != strlen(old)) + op = old; + oldlen = mbsrtowcs(NULL, &op, 0, NULL); + if (oldlen == (size_t)-1) + err(1, NULL); + np = new; + newlen = mbsrtowcs(NULL, &np, 0, NULL); + if (newlen == (size_t)-1) + err(1, NULL); + if (newlen != oldlen) errx(1, "%lu: %s: transform strings are not the same length", linenum, fname); - /* We assume characters are 8 bits */ - if ((lt = malloc(UCHAR_MAX)) == NULL) - err(1, "malloc"); - for (i = 0; i <= UCHAR_MAX; i++) - lt[i] = (char)i; - for (op = old, np = new; *op; op++, np++) - lt[(u_char)*op] = *np; - *transtab = lt; + if (MB_CUR_MAX == 1) { + /* + * The single-byte encoding case is easy: generate a + * lookup table. + */ + for (i = 0; i <= UCHAR_MAX; i++) + y->bytetab[i] = (char)i; + for (; *op; op++, np++) + y->bytetab[(u_char)*op] = *np; + } else { + /* + * Multi-byte encoding case: generate a lookup table as + * above, but only for single-byte characters. The first + * bytes of multi-byte characters have their lookup table + * entries set to 0, which causes do_tr() to search through + * an auxiliary vector of multi-byte mappings. + */ + memset(&mbs1, 0, sizeof(mbs1)); + memset(&mbs2, 0, sizeof(mbs2)); + for (i = 0; i <= UCHAR_MAX; i++) + y->bytetab[i] = (btowc(i) != WEOF) ? i : 0; + while (*op != '\0') { + oclen = mbrlen(op, MB_LEN_MAX, &mbs1); + if (oclen == (size_t)-1 || oclen == (size_t)-2) + errc(1, EILSEQ, NULL); + nclen = mbrlen(np, MB_LEN_MAX, &mbs2); + if (nclen == (size_t)-1 || nclen == (size_t)-2) + errc(1, EILSEQ, NULL); + if (oclen == 1 && nclen == 1) + y->bytetab[(u_char)*op] = *np; + else { + y->bytetab[(u_char)*op] = 0; + y->multis = realloc(y->multis, + (y->nmultis + 1) * sizeof(*y->multis)); + if (y->multis == NULL) + err(1, NULL); + i = y->nmultis++; + y->multis[i].fromlen = oclen; + memcpy(y->multis[i].from, op, oclen); + y->multis[i].tolen = nclen; + memcpy(y->multis[i].to, np, nclen); + } + op += oclen; + np += nclen; + } + } return (p); } diff --git a/usr.bin/sed/defs.h b/usr.bin/sed/defs.h index 0e77014..bc487d9 100644 --- a/usr.bin/sed/defs.h +++ b/usr.bin/sed/defs.h @@ -35,6 +35,7 @@ * SUCH DAMAGE. * * @(#)defs.h 8.1 (Berkeley) 6/6/93 + * $FreeBSD$ */ /* @@ -71,6 +72,19 @@ struct s_subst { char *new; /* Replacement text */ }; +/* + * Translate command. + */ +struct s_tr { + unsigned char bytetab[256]; + struct trmulti { + int fromlen; + char from[MB_LEN_MAX]; + int tolen; + char to[MB_LEN_MAX]; + } *multis; + int nmultis; +}; /* * An internally compiled command. @@ -84,7 +98,7 @@ struct s_command { union { struct s_command *c; /* Command(s) for b t { */ struct s_subst *s; /* Substitute command */ - u_char *y; /* Replace command array */ + struct s_tr *y; /* Replace command array */ int fd; /* File descriptor for w */ } u; char code; /* Command code */ diff --git a/usr.bin/sed/main.c b/usr.bin/sed/main.c index bb5ffec..670298e 100644 --- a/usr.bin/sed/main.c +++ b/usr.bin/sed/main.c @@ -56,6 +56,7 @@ static const char sccsid[] = "@(#)main.c 8.2 (Berkeley) 1/3/94"; #include #include #include +#include #include #include #include diff --git a/usr.bin/sed/misc.c b/usr.bin/sed/misc.c index 5b47f3d..91fff53 100644 --- a/usr.bin/sed/misc.c +++ b/usr.bin/sed/misc.c @@ -45,6 +45,7 @@ static const char sccsid[] = "@(#)misc.c 8.1 (Berkeley) 6/6/93"; #include #include +#include #include #include #include diff --git a/usr.bin/sed/process.c b/usr.bin/sed/process.c index 0d63994..1858b65 100644 --- a/usr.bin/sed/process.c +++ b/usr.bin/sed/process.c @@ -63,7 +63,7 @@ static const char sccsid[] = "@(#)process.c 8.6 (Berkeley) 4/20/94"; #include "defs.h" #include "extern.h" -static SPACE HS, PS, SS; +static SPACE HS, PS, SS, YS; #define pd PS.deleted #define ps PS.space #define psl PS.len @@ -71,6 +71,7 @@ static SPACE HS, PS, SS; #define hsl HS.len static __inline int applies(struct s_command *); +static void do_tr(struct s_tr *); static void flush_appends(void); static void lputs(char *, size_t); static __inline int regexec_e(regex_t *, const char *, int, int, size_t); @@ -97,6 +98,7 @@ process(void) SPACE tspace; size_t len, oldpsl = 0; char *p; + char nc; p = NULL; @@ -247,8 +249,7 @@ redirect: case 'y': if (pd || psl == 0) break; - for (p = ps, len = psl; len--; ++p) - *p = cp->u.y[(unsigned char)*p]; + do_tr(cp->u.y); break; case ':': case '}': @@ -426,6 +427,61 @@ substitute(struct s_command *cp) } /* + * do_tr -- + * Perform translation ('y' command) in the pattern space. + */ +static void +do_tr(struct s_tr *y) +{ + SPACE tmp; + char c, *p; + size_t clen, left; + int i; + + if (MB_CUR_MAX == 1) { + /* + * Single-byte encoding: perform in-place translation + * of the pattern space. + */ + for (p = ps; p < &ps[psl]; p++) + *p = y->bytetab[(u_char)*p]; + } else { + /* + * Multi-byte encoding: perform translation into the + * translation space, then swap the translation and + * pattern spaces. + */ + /* Clean translation space. */ + YS.len = 0; + for (p = ps, left = psl; left > 0; p += clen, left -= clen) { + if ((c = y->bytetab[(u_char)*p]) != '\0') { + cspace(&YS, &c, 1, APPEND); + clen = 1; + continue; + } + for (i = 0; i < y->nmultis; i++) + if (left >= y->multis[i].fromlen && + memcmp(p, y->multis[i].from, + y->multis[i].fromlen) == 0) + break; + if (i < y->nmultis) { + cspace(&YS, y->multis[i].to, + y->multis[i].tolen, APPEND); + clen = y->multis[i].fromlen; + } else { + cspace(&YS, p, 1, APPEND); + clen = 1; + } + } + /* Swap the translation space and the pattern space. */ + tmp = PS; + PS = YS; + YS = tmp; + YS.space = YS.back; + } +} + +/* * Flush append requests. Always called before reading a line, * therefore it also resets the substitution done (sdone) flag. */ -- cgit v1.1