summary | refs | log | tree | commit | diff | stats
path: root/usr.bin/sed
diff options
context:
space:
mode:
author tjr <tjr@FreeBSD.org> 2004-07-14 10:06:22 +0000
committer tjr <tjr@FreeBSD.org> 2004-07-14 10:06:22 +0000
commit b7f5e217dda791d61c549a147e0e6ad6cd1b3f3d (patch)
tree cebf0634774be08929212c168278981cd54ae195 /usr.bin/sed
parent 084c37915e361d7646a6eefa02b04a5db5958496 (diff)
download FreeBSD-src-b7f5e217dda791d61c549a147e0e6ad6cd1b3f3d.zip
FreeBSD-src-b7f5e217dda791d61c549a147e0e6ad6cd1b3f3d.tar.gz
Make the 'y' (translate) command aware of multibyte characters.
Diffstat (limited to 'usr.bin/sed')
-rw-r--r-- usr.bin/sed/compile.c 82
-rw-r--r-- usr.bin/sed/defs.h 16
-rw-r--r-- usr.bin/sed/main.c 1
-rw-r--r-- usr.bin/sed/misc.c 1
-rw-r--r-- usr.bin/sed/process.c 62
5 files changed, 145 insertions, 17 deletions
diff --git a/usr.bin/sed/compile.c b/usr.bin/sed/compile.c
index f0a6c9b..afeaa01 100644
--- a/usr.bin/sed/compile.c
+++ b/usr.bin/sed/compile.c
@@ -47,12 +47,14 @@ static const char sccsid[] = "@(#)compile.c 8.1 (Berkeley) 6/6/93";
#include <ctype.h>
#include <err.h>
+#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <wchar.h>
#include "defs.h"
#include "extern.h"
@@ -73,7 +75,7 @@ static char *compile_flags(char *, struct s_subst *);
static char *compile_re(char *, regex_t **);
static char *compile_subst(char *, struct s_subst *);
static char *compile_text(void);
-static char *compile_tr(char *, char **);
+static char *compile_tr(char *, struct s_tr **);
static struct s_command
**compile_stream(struct s_command **);
static char *duptoeol(char *, const char *);
@@ -337,7 +339,7 @@ nonsel: /* Now parse the command */
break;
case TR: /* y */
p++;
- p = compile_tr(p, (char **)&cmd->u.y);
+ p = compile_tr(p, &cmd->u.y);
EATSPACE();
if (*p == ';') {
p++;
@@ -619,12 +621,20 @@ compile_flags(char *p, struct s_subst *s)
* Compile a translation set of strings into a lookup table.
*/
static char *
-compile_tr(char *p, char **transtab)
+compile_tr(char *p, struct s_tr **py)
{
+ struct s_tr *y;
int i;
- char *lt, *op, *np;
+ const char *op, *np;
char old[_POSIX2_LINE_MAX + 1];
char new[_POSIX2_LINE_MAX + 1];
+ size_t oclen, oldlen, nclen, newlen;
+ mbstate_t mbs1, mbs2;
+
+ if ((*py = y = malloc(sizeof(*y))) == NULL)
+ err(1, NULL);
+ y->multis = NULL;
+ y->nmultis = 0;
if (*p == '\0' || *p == '\\')
errx(1,
@@ -639,17 +649,63 @@ compile_tr(char *p, char **transtab)
errx(1, "%lu: %s: unterminated transform target string",
linenum, fname);
EATSPACE();
- if (strlen(new) != strlen(old))
+ op = old;
+ oldlen = mbsrtowcs(NULL, &op, 0, NULL);
+ if (oldlen == (size_t)-1)
+ err(1, NULL);
+ np = new;
+ newlen = mbsrtowcs(NULL, &np, 0, NULL);
+ if (newlen == (size_t)-1)
+ err(1, NULL);
+ if (newlen != oldlen)
errx(1, "%lu: %s: transform strings are not the same length",
linenum, fname);
- /* We assume characters are 8 bits */
- if ((lt = malloc(UCHAR_MAX)) == NULL)
- err(1, "malloc");
- for (i = 0; i <= UCHAR_MAX; i++)
- lt[i] = (char)i;
- for (op = old, np = new; *op; op++, np++)
- lt[(u_char)*op] = *np;
- *transtab = lt;
+ if (MB_CUR_MAX == 1) {
+ /*
+ * The single-byte encoding case is easy: generate a
+ * lookup table.
+ */
+ for (i = 0; i <= UCHAR_MAX; i++)
+ y->bytetab[i] = (char)i;
+ for (; *op; op++, np++)
+ y->bytetab[(u_char)*op] = *np;
+ } else {
+ /*
+ * Multi-byte encoding case: generate a lookup table as
+ * above, but only for single-byte characters. The first
+ * bytes of multi-byte characters have their lookup table
+ * entries set to 0, which causes do_tr() to search through
+ * an auxiliary vector of multi-byte mappings.
+ */
+ memset(&mbs1, 0, sizeof(mbs1));
+ memset(&mbs2, 0, sizeof(mbs2));
+ for (i = 0; i <= UCHAR_MAX; i++)
+ y->bytetab[i] = (btowc(i) != WEOF) ? i : 0;
+ while (*op != '\0') {
+ oclen = mbrlen(op, MB_LEN_MAX, &mbs1);
+ if (oclen == (size_t)-1 || oclen == (size_t)-2)
+ errc(1, EILSEQ, NULL);
+ nclen = mbrlen(np, MB_LEN_MAX, &mbs2);
+ if (nclen == (size_t)-1 || nclen == (size_t)-2)
+ errc(1, EILSEQ, NULL);
+ if (oclen == 1 && nclen == 1)
+ y->bytetab[(u_char)*op] = *np;
+ else {
+ y->bytetab[(u_char)*op] = 0;
+ y->multis = realloc(y->multis,
+ (y->nmultis + 1) * sizeof(*y->multis));
+ if (y->multis == NULL)
+ err(1, NULL);
+ i = y->nmultis++;
+ y->multis[i].fromlen = oclen;
+ memcpy(y->multis[i].from, op, oclen);
+ y->multis[i].tolen = nclen;
+ memcpy(y->multis[i].to, np, nclen);
+ }
+ op += oclen;
+ np += nclen;
+ }
+ }
return (p);
}
diff --git a/usr.bin/sed/defs.h b/usr.bin/sed/defs.h
index 0e77014..bc487d9 100644
--- a/usr.bin/sed/defs.h
+++ b/usr.bin/sed/defs.h
@@ -35,6 +35,7 @@
* SUCH DAMAGE.
*
* @(#)defs.h 8.1 (Berkeley) 6/6/93
+ * $FreeBSD$
*/
/*
@@ -71,6 +72,19 @@ struct s_subst {
char *new; /* Replacement text */
};
+/*
+ * Translate command.
+ */
+struct s_tr {
+ unsigned char bytetab[256];
+ struct trmulti {
+ int fromlen;
+ char from[MB_LEN_MAX];
+ int tolen;
+ char to[MB_LEN_MAX];
+ } *multis;
+ int nmultis;
+};
/*
* An internally compiled command.
@@ -84,7 +98,7 @@ struct s_command {
union {
struct s_command *c; /* Command(s) for b t { */
struct s_subst *s; /* Substitute command */
- u_char *y; /* Replace command array */
+ struct s_tr *y; /* Replace command array */
int fd; /* File descriptor for w */
} u;
char code; /* Command code */
diff --git a/usr.bin/sed/main.c b/usr.bin/sed/main.c
index bb5ffec..670298e 100644
--- a/usr.bin/sed/main.c
+++ b/usr.bin/sed/main.c
@@ -56,6 +56,7 @@ static const char sccsid[] = "@(#)main.c 8.2 (Berkeley) 1/3/94";
#include <err.h>
#include <errno.h>
#include <fcntl.h>
+#include <limits.h>
#include <locale.h>
#include <regex.h>
#include <stddef.h>
diff --git a/usr.bin/sed/misc.c b/usr.bin/sed/misc.c
index 5b47f3d..91fff53 100644
--- a/usr.bin/sed/misc.c
+++ b/usr.bin/sed/misc.c
@@ -45,6 +45,7 @@ static const char sccsid[] = "@(#)misc.c 8.1 (Berkeley) 6/6/93";
#include <sys/types.h>
#include <err.h>
+#include <limits.h>
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
diff --git a/usr.bin/sed/process.c b/usr.bin/sed/process.c
index 0d63994..1858b65 100644
--- a/usr.bin/sed/process.c
+++ b/usr.bin/sed/process.c
@@ -63,7 +63,7 @@ static const char sccsid[] = "@(#)process.c 8.6 (Berkeley) 4/20/94";
#include "defs.h"
#include "extern.h"
-static SPACE HS, PS, SS;
+static SPACE HS, PS, SS, YS;
#define pd PS.deleted
#define ps PS.space
#define psl PS.len
@@ -71,6 +71,7 @@ static SPACE HS, PS, SS;
#define hsl HS.len
static __inline int applies(struct s_command *);
+static void do_tr(struct s_tr *);
static void flush_appends(void);
static void lputs(char *, size_t);
static __inline int regexec_e(regex_t *, const char *, int, int, size_t);
@@ -97,6 +98,7 @@ process(void)
SPACE tspace;
size_t len, oldpsl = 0;
char *p;
+ char nc;
p = NULL;
@@ -247,8 +249,7 @@ redirect:
case 'y':
if (pd || psl == 0)
break;
- for (p = ps, len = psl; len--; ++p)
- *p = cp->u.y[(unsigned char)*p];
+ do_tr(cp->u.y);
break;
case ':':
case '}':
@@ -426,6 +427,61 @@ substitute(struct s_command *cp)
}
/*
+ * do_tr --
+ * Perform translation ('y' command) in the pattern space.
+ */
+static void
+do_tr(struct s_tr *y)
+{
+ SPACE tmp;
+ char c, *p;
+ size_t clen, left;
+ int i;
+
+ if (MB_CUR_MAX == 1) {
+ /*
+ * Single-byte encoding: perform in-place translation
+ * of the pattern space.
+ */
+ for (p = ps; p < &ps[psl]; p++)
+ *p = y->bytetab[(u_char)*p];
+ } else {
+ /*
+ * Multi-byte encoding: perform translation into the
+ * translation space, then swap the translation and
+ * pattern spaces.
+ */
+ /* Clean translation space. */
+ YS.len = 0;
+ for (p = ps, left = psl; left > 0; p += clen, left -= clen) {
+ if ((c = y->bytetab[(u_char)*p]) != '\0') {
+ cspace(&YS, &c, 1, APPEND);
+ clen = 1;
+ continue;
+ }
+ for (i = 0; i < y->nmultis; i++)
+ if (left >= y->multis[i].fromlen &&
+ memcmp(p, y->multis[i].from,
+ y->multis[i].fromlen) == 0)
+ break;
+ if (i < y->nmultis) {
+ cspace(&YS, y->multis[i].to,
+ y->multis[i].tolen, APPEND);
+ clen = y->multis[i].fromlen;
+ } else {
+ cspace(&YS, p, 1, APPEND);
+ clen = 1;
+ }
+ }
+ /* Swap the translation space and the pattern space. */
+ tmp = PS;
+ PS = YS;
+ YS = tmp;
+ YS.space = YS.back;
+ }
+}
+
+/*
* Flush append requests. Always called before reading a line,
* therefore it also resets the substitution done (sdone) flag.
*/
OpenPOWER on IntegriCloud