summaryrefslogtreecommitdiffstats
path: root/usr.bin/tr
diff options
context:
space:
mode:
authortjr <tjr@FreeBSD.org>2004-07-09 02:08:07 +0000
committertjr <tjr@FreeBSD.org>2004-07-09 02:08:07 +0000
commitd291df1e3f703e3724c6d1f0e4bf796ba44b546e (patch)
tree1c3e4abaa437f441d05a1cc08f7e7b91684fbad4 /usr.bin/tr
parentfb654efba8e1f77bca7a08d83618ffdac3c226a0 (diff)
downloadFreeBSD-src-d291df1e3f703e3724c6d1f0e4bf796ba44b546e.zip
FreeBSD-src-d291df1e3f703e3724c6d1f0e4bf796ba44b546e.tar.gz
Add support for multibyte characters. The challenge here was to use
data structures that scale better with large character sets, instead of
arrays indexed by character value:
- Sets of characters to delete/squeeze are stored in a new "cset" structure, which is implemented as a splay tree of extents. This structure has the ability to store character classes (a la wctype(3)), but this is not currently fully utilized.
- Mappings between characters are stored in a new "cmap" structure, which is also a splay tree.
- The parser no longer builds arrays containing all the characters in a particular class; instead, next() determines them on-the-fly using nextwctype(3).
Diffstat (limited to 'usr.bin/tr')
-rw-r--r--usr.bin/tr/Makefile3
-rw-r--r--usr.bin/tr/cmap.c212
-rw-r--r--usr.bin/tr/cmap.h83
-rw-r--r--usr.bin/tr/cset.c303
-rw-r--r--usr.bin/tr/cset.h75
-rw-r--r--usr.bin/tr/extern.h21
-rw-r--r--usr.bin/tr/str.c163
-rw-r--r--usr.bin/tr/tr.c220
8 files changed, 880 insertions, 200 deletions
diff --git a/usr.bin/tr/Makefile b/usr.bin/tr/Makefile
index 7124942..00bdd8d 100644
--- a/usr.bin/tr/Makefile
+++ b/usr.bin/tr/Makefile
@@ -1,6 +1,7 @@
# @(#)Makefile 8.1 (Berkeley) 6/6/93
+# $FreeBSD$
PROG= tr
-SRCS= str.c tr.c
+SRCS= cmap.c cset.c str.c tr.c
.include <bsd.prog.mk>
diff --git a/usr.bin/tr/cmap.c b/usr.bin/tr/cmap.c
new file mode 100644
index 0000000..a2cac33
--- /dev/null
+++ b/usr.bin/tr/cmap.c
@@ -0,0 +1,212 @@
+/*-
+ * Copyright (c) 2004 Tim J. Robbins.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * "Character map" ADT. Stores mappings between pairs of characters in a
+ * splay tree, with a lookup table cache to simplify looking up the first
+ * bunch of characters (which are presumably more common than others).
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <assert.h>
+#include <limits.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <wchar.h>
+#include "cmap.h"
+
+static struct cmapnode *cmap_splay(struct cmapnode *, wint_t);
+
+/*
+ * cmap_alloc --
+ * Allocate a character map.
+ */
+struct cmap *
+cmap_alloc(void)
+{
+ struct cmap *cm;
+
+ cm = malloc(sizeof(*cm));
+ if (cm == NULL)
+ return (NULL);
+ cm->cm_root = NULL;
+ cm->cm_def = CM_DEF_SELF;
+ cm->cm_havecache = false;
+ cm->cm_min = cm->cm_max = 0;
+ return (cm);
+}
+
+/*
+ * cmap_add --
+ * Add a mapping from "from" to "to" to the map.
+ */
+bool
+cmap_add(struct cmap *cm, wint_t from, wint_t to)
+{
+ struct cmapnode *cmn, *ncmn;
+
+ cm->cm_havecache = false;
+
+ if (cm->cm_root == NULL) {
+ cmn = malloc(sizeof(*cmn));
+ if (cmn == NULL)
+ return (false);
+ cmn->cmn_from = from;
+ cmn->cmn_to = to;
+ cmn->cmn_left = cmn->cmn_right = NULL;
+ cm->cm_root = cmn;
+ cm->cm_min = cm->cm_max = from;
+ return (true);
+ }
+
+ cmn = cm->cm_root = cmap_splay(cm->cm_root, from);
+
+ if (cmn->cmn_from == from) {
+ cmn->cmn_to = to;
+ return (true);
+ }
+
+ ncmn = malloc(sizeof(*ncmn));
+ if (ncmn == NULL)
+ return (false);
+ ncmn->cmn_from = from;
+ ncmn->cmn_to = to;
+ if (from < cmn->cmn_from) {
+ ncmn->cmn_left = cmn->cmn_left;
+ ncmn->cmn_right = cmn;
+ cmn->cmn_left = NULL;
+ } else {
+ ncmn->cmn_right = cmn->cmn_right;
+ ncmn->cmn_left = cmn;
+ cmn->cmn_right = NULL;
+ }
+ if (from < cm->cm_min)
+ cm->cm_min = from;
+ if (from > cm->cm_max)
+ cm->cm_max = from;
+ cm->cm_root = ncmn;
+
+ return (true);
+}
+
+/*
+ * cmap_lookup_hard --
+ * Look up the mapping for a character without using the cache.
+ */
+wint_t
+cmap_lookup_hard(struct cmap *cm, wint_t ch)
+{
+
+ if (cm->cm_root != NULL) {
+ cm->cm_root = cmap_splay(cm->cm_root, ch);
+ if (cm->cm_root->cmn_from == ch)
+ return (cm->cm_root->cmn_to);
+ }
+ return (cm->cm_def == CM_DEF_SELF ? ch : cm->cm_def);
+}
+
+/*
+ * cmap_cache --
+ * Update the cache.
+ */
+void
+cmap_cache(struct cmap *cm)
+{
+ wint_t ch;
+
+ for (ch = 0; ch < CM_CACHE_SIZE; ch++)
+ cm->cm_cache[ch] = cmap_lookup_hard(cm, ch);
+
+ cm->cm_havecache = true;
+}
+
+/*
+ * cmap_default --
+ * Change the value that characters without mappings map to, and
+ * return the old value. The special character value CM_DEF_SELF
+ * means characters map to themselves.
+ */
+wint_t
+cmap_default(struct cmap *cm, wint_t def)
+{
+ wint_t old;
+
+ old = cm->cm_def;
+ cm->cm_def = def;
+ cm->cm_havecache = false;
+ return (old);
+}
+
+static struct cmapnode *
+cmap_splay(struct cmapnode *t, wint_t ch)
+{
+ struct cmapnode N, *l, *r, *y;
+
+ /*
+ * Based on public domain code from Sleator.
+ */
+
+ assert(t != NULL);
+
+ N.cmn_left = N.cmn_right = NULL;
+ l = r = &N;
+ for (;;) {
+ if (ch < t->cmn_from) {
+ if (t->cmn_left != NULL &&
+ ch < t->cmn_left->cmn_from) {
+ y = t->cmn_left;
+ t->cmn_left = y->cmn_right;
+ y->cmn_right = t;
+ t = y;
+ }
+ if (t->cmn_left == NULL)
+ break;
+ r->cmn_left = t;
+ r = t;
+ t = t->cmn_left;
+ } else if (ch > t->cmn_from) {
+ if (t->cmn_right != NULL &&
+ ch > t->cmn_right->cmn_from) {
+ y = t->cmn_right;
+ t->cmn_right = y->cmn_left;
+ y->cmn_left = t;
+ t = y;
+ }
+ if (t->cmn_right == NULL)
+ break;
+ l->cmn_right = t;
+ l = t;
+ t = t->cmn_right;
+ } else
+ break;
+ }
+ l->cmn_right = t->cmn_left;
+ r->cmn_left = t->cmn_right;
+ t->cmn_left = N.cmn_right;
+ t->cmn_right = N.cmn_left;
+ return (t);
+}
diff --git a/usr.bin/tr/cmap.h b/usr.bin/tr/cmap.h
new file mode 100644
index 0000000..9a81e13
--- /dev/null
+++ b/usr.bin/tr/cmap.h
@@ -0,0 +1,83 @@
+/*-
+ * Copyright (c) 2004 Tim J. Robbins.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef CMAP_H
+#define CMAP_H
+
+#include <limits.h>
+#include <stdbool.h>
+#include <wchar.h>
+
+struct cmapnode {
+ wint_t cmn_from;
+ wint_t cmn_to;
+ struct cmapnode *cmn_left;
+ struct cmapnode *cmn_right;
+};
+
+struct cmap {
+#define CM_CACHE_SIZE 128
+ wint_t cm_cache[CM_CACHE_SIZE];
+ bool cm_havecache;
+ struct cmapnode *cm_root;
+#define CM_DEF_SELF -2
+ wint_t cm_def;
+ wint_t cm_min;
+ wint_t cm_max;
+};
+
+struct cmap * cmap_alloc(void);
+bool cmap_add(struct cmap *, wint_t, wint_t);
+wint_t cmap_lookup_hard(struct cmap *, wint_t);
+void cmap_cache(struct cmap *);
+wint_t cmap_default(struct cmap *, wint_t);
+
+static __inline wint_t
+cmap_lookup(struct cmap *cm, wint_t from)
+{
+
+ if (from < CM_CACHE_SIZE && cm->cm_havecache)
+ return (cm->cm_cache[from]);
+ return (cmap_lookup_hard(cm, from));
+}
+
+static __inline wint_t
+cmap_min(struct cmap *cm)
+{
+
+ return (cm->cm_min);
+}
+
+static __inline wint_t
+cmap_max(struct cmap *cm)
+{
+
+ return (cm->cm_max);
+}
+
+#endif
diff --git a/usr.bin/tr/cset.c b/usr.bin/tr/cset.c
new file mode 100644
index 0000000..05dbd77
--- /dev/null
+++ b/usr.bin/tr/cset.c
@@ -0,0 +1,303 @@
+/*-
+ * Copyright (c) 2004 Tim J. Robbins.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * "Set of characters" ADT implemented as a splay tree of extents, with
+ * a lookup table cache to simplify looking up the first bunch of
+ * characters (which are presumably more common than others).
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <wchar.h>
+#include <wctype.h>
+#include "cset.h"
+
+static struct csnode * cset_delete(struct csnode *, wchar_t);
+static __inline int cset_rangecmp(struct csnode *, wchar_t);
+static struct csnode * cset_splay(struct csnode *, wchar_t);
+
+/*
+ * cset_alloc --
+ * Allocate an empty, non-inverted set of characters (NULL on failure).
+ */
+struct cset *
+cset_alloc(void)
+{
+ struct cset *cs;
+
+ if ((cs = malloc(sizeof(*cs))) == NULL)
+ return (NULL);
+ cs->cs_root = NULL;
+ cs->cs_classes = NULL;
+ cs->cs_havecache = cs->cs_invert = false; /* malloc() does not zero */
+ return (cs);
+}
+
+/*
+ * cset_add --
+ * Add a character to the set.
+ */
+bool
+cset_add(struct cset *cs, wchar_t ch)
+{
+ struct csnode *csn, *ncsn;
+ wchar_t oval;
+
+ cs->cs_havecache = false;
+
+ /*
+ * Inserting into empty tree; new item becomes the root.
+ */
+ if (cs->cs_root == NULL) {
+ csn = malloc(sizeof(*cs->cs_root));
+ if (csn == NULL)
+ return (false);
+ csn->csn_left = csn->csn_right = NULL;
+ csn->csn_min = csn->csn_max = ch;
+ cs->cs_root = csn;
+ return (true);
+ }
+
+ /*
+ * Splay to check whether the item already exists, and otherwise,
+ * where we should put it.
+ */
+ csn = cs->cs_root = cset_splay(cs->cs_root, ch);
+
+ /*
+ * Easy cases where we can avoid allocating a new node:
+ * (a) node already exists.
+ * (b) we can lower the extent's "min" to accommodate this
+ * character without having to coalesce.
+ * (c) we can raise the extent's "max" without having
+ * to coalesce.
+ */
+ if (cset_rangecmp(csn, ch) == 0)
+ return (true);
+ if (ch + 1 == csn->csn_min && (csn->csn_left == NULL ||
+ ch > csn->csn_left->csn_max + 1)) {
+ csn->csn_min--;
+ return (true);
+ }
+ if (ch == csn->csn_max + 1 && (csn->csn_right == NULL ||
+ ch + 1 < csn->csn_right->csn_min)) {
+ csn->csn_max++;
+ return (true);
+ }
+
+ /*
+ * Allocate a new node and link it into the tree as a direct
+ * child of the root.
+ */
+ ncsn = malloc(sizeof(*ncsn));
+ if (ncsn == NULL)
+ return (false);
+ ncsn->csn_min = ncsn->csn_max = ch;
+ if (cset_rangecmp(csn, ch) < 0) {
+ ncsn->csn_left = csn->csn_left;
+ ncsn->csn_right = csn;
+ csn->csn_left = NULL;
+ } else {
+ ncsn->csn_right = csn->csn_right;
+ ncsn->csn_left = csn;
+ csn->csn_right = NULL;
+ }
+ cs->cs_root = ncsn;
+
+ /*
+ * Splay to bring the newly inserted node to the root, then
+ * coalesce with left and right neighbours if possible.
+ */
+ csn = cs->cs_root = cset_splay(cs->cs_root, ch);
+ if (csn->csn_left != NULL &&
+ csn->csn_left->csn_max + 1 == csn->csn_min) {
+ oval = csn->csn_left->csn_min;
+ cs->cs_root = cset_delete(cs->cs_root,
+ csn->csn_left->csn_min);
+ ncsn->csn_min = oval;
+ }
+ csn = cs->cs_root = cset_splay(cs->cs_root, ch);
+ if (csn->csn_right != NULL &&
+ csn->csn_right->csn_min - 1 == csn->csn_max) {
+ oval = csn->csn_right->csn_max;
+ cs->cs_root = cset_delete(cs->cs_root,
+ csn->csn_right->csn_min);
+ ncsn->csn_max = oval;
+ }
+
+ return (true);
+}
+
+/*
+ * cset_in_hard --
+ * Determine whether a character is in the set without using
+ * the cache.
+ */
+bool
+cset_in_hard(struct cset *cs, wchar_t ch)
+{
+ struct csclass *csc;
+
+ for (csc = cs->cs_classes; csc != NULL; csc = csc->csc_next)
+ if (csc->csc_invert ^ iswctype(ch, csc->csc_type) != 0)
+ return (cs->cs_invert ^ true);
+ if (cs->cs_root != NULL) {
+ cs->cs_root = cset_splay(cs->cs_root, ch);
+ return (cs->cs_invert ^ cset_rangecmp(cs->cs_root, ch) == 0);
+ }
+ return (cs->cs_invert ^ false);
+}
+
+/*
+ * cset_cache --
+ * Update the cache.
+ */
+void
+cset_cache(struct cset *cs)
+{
+ wchar_t i;
+
+ for (i = 0; i < CS_CACHE_SIZE; i++)
+ cs->cs_cache[i] = cset_in_hard(cs, i);
+
+ cs->cs_havecache = true;
+}
+
+/*
+ * cset_invert --
+ * Invert the character set.
+ */
+void
+cset_invert(struct cset *cs)
+{
+
+ cs->cs_invert ^= true;
+ cs->cs_havecache = false;
+}
+
+/*
+ * cset_addclass --
+ * Add a wctype()-style character class to the set, optionally
+ * inverting it.
+ */
+bool
+cset_addclass(struct cset *cs, wctype_t type, bool invert)
+{
+ struct csclass *csc;
+
+ csc = malloc(sizeof(*csc));
+ if (csc == NULL)
+ return (false);
+ csc->csc_type = type;
+ csc->csc_invert = invert;
+ csc->csc_next = cs->cs_classes;
+ cs->cs_classes = csc;
+ cs->cs_havecache = false;
+ return (true);
+}
+
+static __inline int
+cset_rangecmp(struct csnode *t, wchar_t ch)
+{
+
+ if (ch < t->csn_min)
+ return (-1);
+ if (ch > t->csn_max)
+ return (1);
+ return (0);
+}
+
+static struct csnode *
+cset_splay(struct csnode *t, wchar_t ch)
+{
+ struct csnode N, *l, *r, *y;
+
+ /*
+ * Based on public domain code from Sleator.
+ */
+
+ assert(t != NULL);
+
+ N.csn_left = N.csn_right = NULL;
+ l = r = &N;
+ for (;;) {
+ if (cset_rangecmp(t, ch) < 0) {
+ if (t->csn_left != NULL &&
+ cset_rangecmp(t->csn_left, ch) < 0) {
+ y = t->csn_left;
+ t->csn_left = y->csn_right;
+ y->csn_right = t;
+ t = y;
+ }
+ if (t->csn_left == NULL)
+ break;
+ r->csn_left = t;
+ r = t;
+ t = t->csn_left;
+ } else if (cset_rangecmp(t, ch) > 0) {
+ if (t->csn_right != NULL &&
+ cset_rangecmp(t->csn_right, ch) > 0) {
+ y = t->csn_right;
+ t->csn_right = y->csn_left;
+ y->csn_left = t;
+ t = y;
+ }
+ if (t->csn_right == NULL)
+ break;
+ l->csn_right = t;
+ l = t;
+ t = t->csn_right;
+ } else
+ break;
+ }
+ l->csn_right = t->csn_left;
+ r->csn_left = t->csn_right;
+ t->csn_left = N.csn_right;
+ t->csn_right = N.csn_left;
+ return (t);
+}
+
+static struct csnode *
+cset_delete(struct csnode *t, wchar_t ch)
+{
+ struct csnode *x;
+
+ assert(t != NULL);
+ t = cset_splay(t, ch);
+ assert(cset_rangecmp(t, ch) == 0);
+ if (t->csn_left == NULL)
+ x = t->csn_right;
+ else {
+ x = cset_splay(t->csn_left, ch);
+ x->csn_right = t->csn_right;
+ }
+ free(t);
+ return x;
+}
diff --git a/usr.bin/tr/cset.h b/usr.bin/tr/cset.h
new file mode 100644
index 0000000..b853493
--- /dev/null
+++ b/usr.bin/tr/cset.h
@@ -0,0 +1,75 @@
+/*-
+ * Copyright (c) 2004 Tim J. Robbins.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef CSET_H
+#define CSET_H
+
+#include <stdbool.h>
+#include <wchar.h>
+#include <wctype.h>
+
+struct csnode {
+ wchar_t csn_min;
+ wchar_t csn_max;
+ struct csnode *csn_left;
+ struct csnode *csn_right;
+};
+
+struct csclass {
+ wctype_t csc_type;
+ bool csc_invert;
+ bool csc_value;
+ struct csclass *csc_next;
+};
+
+struct cset {
+#define CS_CACHE_SIZE 256
+ bool cs_cache[CS_CACHE_SIZE];
+ bool cs_havecache;
+ struct csclass *cs_classes;
+ struct csnode *cs_root;
+ bool cs_invert;
+};
+
+bool cset_addclass(struct cset *, wctype_t, bool);
+struct cset * cset_alloc(void);
+bool cset_add(struct cset *, wchar_t);
+void cset_invert(struct cset *);
+bool cset_in_hard(struct cset *, wchar_t);
+void cset_cache(struct cset *);
+
+static __inline bool
+cset_in(struct cset *cs, wchar_t ch)
+{
+
+ if (ch < CS_CACHE_SIZE && cs->cs_havecache)
+ return (cs->cs_cache[ch]);
+ return (cset_in_hard(cs, ch));
+}
+
+#endif /* CSET_H */
diff --git a/usr.bin/tr/extern.h b/usr.bin/tr/extern.h
index a5ed577..2fdbdf3 100644
--- a/usr.bin/tr/extern.h
+++ b/usr.bin/tr/extern.h
@@ -35,20 +35,21 @@
*/
#include <limits.h>
-#define NCHARS (UCHAR_MAX + 1) /* Number of possible characters. */
-#define OOBCH (UCHAR_MAX + 1) /* Out of band character value. */
+
+#define NCHARS_SB (UCHAR_MAX + 1) /* Number of single-byte characters. */
+#define OOBCH -1 /* Out of band character value. */
typedef struct {
enum { STRING1, STRING2 } which;
enum { EOS, INFINITE, NORMAL, RANGE, SEQUENCE,
- SET, SET_UPPER, SET_LOWER } state;
- int cnt; /* character count */
- int lastch; /* last character */
- int equiv[NCHARS]; /* equivalence set */
- int *set; /* set of characters */
- char *str; /* user's string */
+ CCLASS, CCLASS_UPPER, CCLASS_LOWER, SET } state;
+ int cnt; /* character count */
+ wint_t lastch; /* last character */
+ wctype_t cclass; /* character class from wctype() */
+ wint_t equiv[NCHARS_SB]; /* equivalence set */
+ wint_t *set; /* set of characters */
+ char *str; /* user's string */
} STR;
-int next(STR *);
+wint_t next(STR *);
int charcoll(const void *, const void *);
-
diff --git a/usr.bin/tr/str.c b/usr.bin/tr/str.c
index 3365caf..f28b243 100644
--- a/usr.bin/tr/str.c
+++ b/usr.bin/tr/str.c
@@ -44,26 +44,31 @@ static const char sccsid[] = "@(#)str.c 8.2 (Berkeley) 4/28/95";
#include <ctype.h>
#include <err.h>
+#include <errno.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <wchar.h>
+#include <wctype.h>
#include "extern.h"
static int backslash(STR *, int *);
static int bracket(STR *);
-static int c_class(const void *, const void *);
static void genclass(STR *);
static void genequiv(STR *);
static int genrange(STR *, int);
static void genseq(STR *);
-int
+wint_t
next(s)
STR *s;
{
- int ch, is_octal;
+ int is_octal;
+ wint_t ch;
+ wchar_t wch;
+ size_t clen;
switch (s->state) {
case EOS:
@@ -71,7 +76,7 @@ next(s)
case INFINITE:
return (1);
case NORMAL:
- switch (ch = (u_char)*s->str) {
+ switch (*s->str) {
case '\0':
s->state = EOS;
return (0);
@@ -83,9 +88,13 @@ next(s)
return (next(s));
/* FALLTHROUGH */
default:
+ clen = mbrtowc(&wch, s->str, MB_LEN_MAX, NULL);
+ if (clen == (size_t)-1 || clen == (size_t)-2 ||
+ clen == 0)
+ errc(1, EILSEQ, NULL);
is_octal = 0;
- ++s->str;
- s->lastch = ch;
+ s->lastch = wch;
+ s->str += clen;
break;
}
@@ -106,9 +115,18 @@ next(s)
return (next(s));
}
return (1);
+ case CCLASS:
+ case CCLASS_UPPER:
+ case CCLASS_LOWER:
+ s->cnt++;
+ ch = nextwctype(s->lastch, s->cclass);
+ if (ch == -1) {
+ s->state = NORMAL;
+ return (next(s));
+ }
+ s->lastch = ch;
+ return (1);
case SET:
- case SET_UPPER:
- case SET_LOWER:
if ((ch = s->set[s->cnt++]) == OOBCH) {
s->state = NORMAL;
return (next(s));
@@ -159,74 +177,21 @@ bracket(s)
/* NOTREACHED */
}
-typedef struct {
- const char *name;
- int (*func)(int);
- int *set;
-} CLASS;
-
-static CLASS classes[] = {
-#undef isalnum
- { "alnum", isalnum, NULL },
-#undef isalpha
- { "alpha", isalpha, NULL },
-#undef isblank
- { "blank", isblank, NULL },
-#undef iscntrl
- { "cntrl", iscntrl, NULL },
-#undef isdigit
- { "digit", isdigit, NULL },
-#undef isgraph
- { "graph", isgraph, NULL },
-#undef islower
- { "lower", islower, NULL },
-#undef isprint
- { "print", isprint, NULL },
-#undef ispunct
- { "punct", ispunct, NULL },
-#undef isspace
- { "space", isspace, NULL },
-#undef isupper
- { "upper", isupper, NULL },
-#undef isxdigit
- { "xdigit", isxdigit, NULL },
-};
-
static void
genclass(s)
STR *s;
{
- int cnt, (*func)(int);
- CLASS *cp, tmp;
- int *p;
- tmp.name = s->str;
- if ((cp = (CLASS *)bsearch(&tmp, classes, sizeof(classes) /
- sizeof(CLASS), sizeof(CLASS), c_class)) == NULL)
+ if ((s->cclass = wctype(s->str)) == 0)
errx(1, "unknown class %s", s->str);
-
- if ((cp->set = p = malloc((NCHARS + 1) * sizeof(int))) == NULL)
- err(1, "genclass() malloc");
- for (cnt = 0, func = cp->func; cnt < NCHARS; ++cnt)
- if ((func)(cnt))
- *p++ = cnt;
- *p = OOBCH;
-
s->cnt = 0;
- s->set = cp->set;
+ s->lastch = -1; /* incremented before check in next() */
if (strcmp(s->str, "upper") == 0)
- s->state = SET_UPPER;
+ s->state = CCLASS_UPPER;
else if (strcmp(s->str, "lower") == 0)
- s->state = SET_LOWER;
+ s->state = CCLASS_LOWER;
else
- s->state = SET;
-}
-
-static int
-c_class(a, b)
- const void *a, *b;
-{
- return (strcmp(((const CLASS *)a)->name, ((const CLASS *)b)->name));
+ s->state = CCLASS;
}
static void
@@ -235,6 +200,8 @@ genequiv(s)
{
int i, p, pri;
char src[2], dst[3];
+ size_t clen;
+ wchar_t wc;
if (*s->str == '\\') {
s->equiv[0] = backslash(s, NULL);
@@ -242,10 +209,13 @@ genequiv(s)
errx(1, "misplaced equivalence equals sign");
s->str += 2;
} else {
- s->equiv[0] = s->str[0];
- if (s->str[1] != '=')
+ clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
+ if (clen == (size_t)-1 || clen == (size_t)-2 || clen == 0)
+ errc(1, EILSEQ, NULL);
+ s->equiv[0] = wc;
+ if (s->str[clen] != '=')
errx(1, "misplaced equivalence equals sign");
- s->str += 3;
+ s->str += clen + 2;
}
/*
@@ -255,12 +225,13 @@ genequiv(s)
* XXX Knows too much about how strxfrm() is implemented. Assumes
* it fills the string with primary collation weight bytes. Only one-
* to-one mappings are supported.
+ * XXX Equivalence classes not supported in multibyte locales.
*/
- src[0] = s->equiv[0];
+ src[0] = (char)s->equiv[0];
src[1] = '\0';
- if (strxfrm(dst, src, sizeof(dst)) == 1) {
+ if (MB_CUR_MAX == 1 && strxfrm(dst, src, sizeof(dst)) == 1) {
pri = (unsigned char)*dst;
- for (p = 1, i = 1; i < NCHARS; i++) {
+ for (p = 1, i = 1; i < NCHARS_SB; i++) {
*src = i;
if (strxfrm(dst, src, sizeof(dst)) == 1 && pri &&
pri == (unsigned char)*dst)
@@ -280,28 +251,41 @@ genrange(STR *s, int was_octal)
int stopval, octal;
char *savestart;
int n, cnt, *p;
+ size_t clen;
+ wchar_t wc;
octal = 0;
savestart = s->str;
- stopval = *++s->str == '\\' ? backslash(s, &octal) : (u_char)*s->str++;
- if (!octal)
- octal = was_octal;
-
- if ((octal && stopval < s->lastch) ||
- (!octal &&
- charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0)) {
- s->str = savestart;
- return (0);
+ if (*++s->str == '\\')
+ stopval = backslash(s, &octal);
+ else {
+ clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
+ if (clen == (size_t)-1 || clen == (size_t)-2)
+ errc(1, EILSEQ, NULL);
+ stopval = wc;
+ s->str += clen;
}
- if (octal) {
+ /*
+ * XXX Characters are not ordered according to collating sequence in
+ * multibyte locales.
+ */
+ if (octal || was_octal || MB_CUR_MAX > 1) {
+ if (stopval < s->lastch) {
+ s->str = savestart;
+ return (0);
+ }
s->cnt = stopval - s->lastch + 1;
s->state = RANGE;
--s->lastch;
return (1);
}
- if ((s->set = p = malloc((NCHARS + 1) * sizeof(int))) == NULL)
+ if (charcoll((const void *)&stopval, (const void *)&(s->lastch)) < 0) {
+ s->str = savestart;
+ return (0);
+ }
+ if ((s->set = p = malloc((NCHARS_SB + 1) * sizeof(int))) == NULL)
err(1, "genrange() malloc");
- for (cnt = 0; cnt < NCHARS; cnt++)
+ for (cnt = 0; cnt < NCHARS_SB; cnt++)
if (charcoll((const void *)&cnt, (const void *)&(s->lastch)) >= 0 &&
charcoll((const void *)&cnt, (const void *)&stopval) <= 0)
*p++ = cnt;
@@ -320,14 +304,21 @@ genseq(s)
STR *s;
{
char *ep;
+ wchar_t wc;
+ size_t clen;
if (s->which == STRING1)
errx(1, "sequences only valid in string2");
if (*s->str == '\\')
s->lastch = backslash(s, NULL);
- else
- s->lastch = *s->str++;
+ else {
+ clen = mbrtowc(&wc, s->str, MB_LEN_MAX, NULL);
+ if (clen == (size_t)-1 || clen == (size_t)-2)
+ errc(1, EILSEQ, NULL);
+ s->lastch = wc;
+ s->str += clen;
+ }
if (*s->str != '*')
errx(1, "misplaced sequence asterisk");
diff --git a/usr.bin/tr/tr.c b/usr.bin/tr/tr.c
index a22ba13..f3ce7ee 100644
--- a/usr.bin/tr/tr.c
+++ b/usr.bin/tr/tr.c
@@ -49,67 +49,34 @@ static const char sccsid[] = "@(#)tr.c 8.2 (Berkeley) 5/4/95";
#include <ctype.h>
#include <err.h>
+#include <limits.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
+#include <wchar.h>
+#include <wctype.h>
+#include "cmap.h"
+#include "cset.h"
#include "extern.h"
-/*
- * For -C option: determine whether a byte is a valid character in the
- * current character set (as defined by LC_CTYPE).
- */
-#define ISCHAR(c) (iscntrl(c) || isprint(c))
-
-static int string1[NCHARS] = {
- 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* ASCII */
- 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
- 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
- 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
- 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
- 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
- 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
- 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f,
- 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
- 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
- 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
- 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
- 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
- 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
- 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
- 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f,
- 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
- 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
- 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
- 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
- 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
- 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
- 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
- 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
- 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
- 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
- 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
- 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
- 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
- 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
- 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
- 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
-}, string2[NCHARS];
-
-STR s1 = { STRING1, NORMAL, 0, OOBCH, { 0, OOBCH }, NULL, NULL };
-STR s2 = { STRING2, NORMAL, 0, OOBCH, { 0, OOBCH }, NULL, NULL };
-
-static void setup(int *, char *, STR *, int, int);
+STR s1 = { STRING1, NORMAL, 0, OOBCH, 0, { 0, OOBCH }, NULL, NULL };
+STR s2 = { STRING2, NORMAL, 0, OOBCH, 0, { 0, OOBCH }, NULL, NULL };
+
+static struct cset *setup(char *, STR *, int, int);
static void usage(void);
int
main(int argc, char **argv)
{
- static int carray[NCHARS];
- int ch, cnt, n, lastch, *p;
+ static int carray[NCHARS_SB];
+ struct cmap *map;
+ struct cset *delete, *squeeze;
+ int n, *p;
int Cflag, cflag, dflag, sflag, isstring2;
+ wint_t ch, cnt, i, lastch;
(void)setlocale(LC_ALL, "");
@@ -162,13 +129,14 @@ main(int argc, char **argv)
if (!isstring2)
usage();
- setup(string1, argv[0], &s1, cflag, Cflag);
- setup(string2, argv[1], &s2, 0, 0);
+ delete = setup(argv[0], &s1, cflag, Cflag);
+ squeeze = setup(argv[1], &s2, 0, 0);
- for (lastch = OOBCH; (ch = getchar()) != EOF;)
- if (!string1[ch] && (!string2[ch] || lastch != ch)) {
+ for (lastch = OOBCH; (ch = getwchar()) != WEOF;)
+ if (!cset_in(delete, ch) &&
+ (lastch != ch || !cset_in(squeeze, ch))) {
lastch = ch;
- (void)putchar(ch);
+ (void)putwchar(ch);
}
exit(0);
}
@@ -181,11 +149,11 @@ main(int argc, char **argv)
if (isstring2)
usage();
- setup(string1, argv[0], &s1, cflag, Cflag);
+ delete = setup(argv[0], &s1, cflag, Cflag);
- while ((ch = getchar()) != EOF)
- if (!string1[ch])
- (void)putchar(ch);
+ while ((ch = getwchar()) != WEOF)
+ if (!cset_in(delete, ch))
+ (void)putwchar(ch);
exit(0);
}
@@ -194,12 +162,12 @@ main(int argc, char **argv)
* Squeeze all characters (or complemented characters) in string1.
*/
if (sflag && !isstring2) {
- setup(string1, argv[0], &s1, cflag, Cflag);
+ squeeze = setup(argv[0], &s1, cflag, Cflag);
- for (lastch = OOBCH; (ch = getchar()) != EOF;)
- if (!string1[ch] || lastch != ch) {
+ for (lastch = OOBCH; (ch = getwchar()) != WEOF;)
+ if (lastch != ch || !cset_in(squeeze, ch)) {
lastch = ch;
- (void)putchar(ch);
+ (void)putwchar(ch);
}
exit(0);
}
@@ -213,13 +181,19 @@ main(int argc, char **argv)
if (!isstring2)
usage();
+ map = cmap_alloc();
+ if (map == NULL)
+ err(1, NULL);
+ squeeze = cset_alloc();
+ if (squeeze == NULL)
+ err(1, NULL);
+
s1.str = argv[0];
- if (cflag || Cflag) {
+
+ if (Cflag || cflag) {
+ cmap_default(map, OOBCH);
if ((s2.str = strdup(argv[1])) == NULL)
errx(1, "strdup(argv[1])");
-
- for (cnt = NCHARS, p = string1; cnt--;)
- *p++ = OOBCH;
} else
s2.str = argv[1];
@@ -235,52 +209,83 @@ main(int argc, char **argv)
/* If string2 runs out of characters, use the last one specified. */
while (next(&s1)) {
again:
- if (s1.state == SET_LOWER &&
- s2.state == SET_UPPER &&
+ if (s1.state == CCLASS_LOWER &&
+ s2.state == CCLASS_UPPER &&
s1.cnt == 1 && s2.cnt == 1) {
do {
- string1[s1.lastch] = ch = toupper(s1.lastch);
- if (sflag && isupper(ch))
- string2[ch] = 1;
+ ch = towupper(s1.lastch);
+ cmap_add(map, s1.lastch, ch);
+ if (sflag && iswupper(ch))
+ cset_add(squeeze, ch);
if (!next(&s1))
goto endloop;
- } while (s1.state == SET_LOWER && s1.cnt > 1);
+ } while (s1.state == CCLASS_LOWER && s1.cnt > 1);
/* skip upper set */
do {
if (!next(&s2))
break;
- } while (s2.state == SET_UPPER && s2.cnt > 1);
+ } while (s2.state == CCLASS_UPPER && s2.cnt > 1);
goto again;
- } else if (s1.state == SET_UPPER &&
- s2.state == SET_LOWER &&
+ } else if (s1.state == CCLASS_UPPER &&
+ s2.state == CCLASS_LOWER &&
s1.cnt == 1 && s2.cnt == 1) {
do {
- string1[s1.lastch] = ch = tolower(s1.lastch);
- if (sflag && islower(ch))
- string2[ch] = 1;
+ ch = towlower(s1.lastch);
+ cmap_add(map, s1.lastch, ch);
+ if (sflag && iswlower(ch))
+ cset_add(squeeze, ch);
if (!next(&s1))
goto endloop;
- } while (s1.state == SET_UPPER && s1.cnt > 1);
+ } while (s1.state == CCLASS_UPPER && s1.cnt > 1);
/* skip lower set */
do {
if (!next(&s2))
break;
- } while (s2.state == SET_LOWER && s2.cnt > 1);
+ } while (s2.state == CCLASS_LOWER && s2.cnt > 1);
goto again;
} else {
- string1[s1.lastch] = s2.lastch;
+ cmap_add(map, s1.lastch, s2.lastch);
if (sflag)
- string2[s2.lastch] = 1;
+ cset_add(squeeze, s2.lastch);
}
(void)next(&s2);
}
endloop:
- if (cflag || Cflag) {
- for (p = carray, cnt = 0; cnt < NCHARS; cnt++) {
- if (string1[cnt] == OOBCH && (!Cflag || ISCHAR(cnt)))
+ if (cflag || (Cflag && MB_CUR_MAX > 1)) {
+ /*
+ * This is somewhat tricky: since the character set is
+ * potentially huge, we need to avoid allocating a map
+ * entry for every character. Our strategy is to set the
+ * default mapping to the last character of string #2
+ * (= the one that gets automatically repeated), then to
+ * add back identity mappings for characters that should
+ * remain unchanged. We don't waste space on identity mappings
+ * for non-characters with the -C option; those are simulated
+ * in the I/O loop.
+ */
+ s2.str = argv[1];
+ s2.state = NORMAL;
+ for (cnt = 0; cnt < WCHAR_MAX; cnt++) {
+ if (Cflag && !iswrune(cnt))
+ continue;
+ if (cmap_lookup(map, cnt) == OOBCH) {
+ if (next(&s2))
+ cmap_add(map, cnt, s2.lastch);
+ if (sflag)
+ cset_add(squeeze, s2.lastch);
+ } else
+ cmap_add(map, cnt, cnt);
+ if ((s2.state == EOS || s2.state == INFINITE) &&
+ cnt >= cmap_max(map))
+ break;
+ }
+ cmap_default(map, s2.lastch);
+ } else if (Cflag) {
+ for (p = carray, cnt = 0; cnt < NCHARS_SB; cnt++) {
+ if (cmap_lookup(map, cnt) == OOBCH && iswrune(cnt))
*p++ = cnt;
else
- string1[cnt] = cnt;
+ cmap_add(map, cnt, cnt);
}
n = p - carray;
if (Cflag && n > 1)
@@ -290,46 +295,55 @@ endloop:
s2.state = NORMAL;
for (cnt = 0; cnt < n; cnt++) {
(void)next(&s2);
- string1[carray[cnt]] = s2.lastch;
+ cmap_add(map, carray[cnt], s2.lastch);
/*
* Chars taken from s2 can be different this time
* due to lack of complex upper/lower processing,
* so fill string2 again to not miss some.
*/
if (sflag)
- string2[s2.lastch] = 1;
+ cset_add(squeeze, s2.lastch);
}
}
+ cset_cache(squeeze);
+ cmap_cache(map);
+
if (sflag)
- for (lastch = OOBCH; (ch = getchar()) != EOF;) {
- ch = string1[ch];
- if (!string2[ch] || lastch != ch) {
+ for (lastch = OOBCH; (ch = getwchar()) != WEOF;) {
+ if (!Cflag || iswrune(ch))
+ ch = cmap_lookup(map, ch);
+ if (lastch != ch || !cset_in(squeeze, ch)) {
lastch = ch;
- (void)putchar(ch);
+ (void)putwchar(ch);
}
}
else
- while ((ch = getchar()) != EOF)
- (void)putchar(string1[ch]);
+ while ((ch = getwchar()) != WEOF) {
+ if (!Cflag || iswrune(ch))
+ ch = cmap_lookup(map, ch);
+ (void)putwchar(ch);
+ }
exit (0);
}
-static void
-setup(int *string, char *arg, STR *str, int cflag, int Cflag)
+/*
+ * setup --
+ *	Build and return the character set described by the string
+ *	specification `arg'.
+ *
+ *	`str' is the parser state; its str member is pointed at `arg' and
+ *	every character that next() then yields in str->lastch is added
+ *	to the set.
+ *	`cflag' or `Cflag' nonzero: the finished set is inverted with
+ *	cset_invert().
+ *	`Cflag' nonzero: the "rune" character class is additionally
+ *	registered with the set before the inversion.
+ *
+ *	Returns the newly allocated set.  If cset_alloc() fails, err(3) is
+ *	called and the program exits with status 1.
+ */
+static struct cset *
+setup(char *arg, STR *str, int cflag, int Cflag)
{
- int cnt, *p;
+ struct cset *cs;
+ cs = cset_alloc();
+ if (cs == NULL)
+ err(1, NULL);
str->str = arg;
- bzero(string, NCHARS * sizeof(int));
while (next(str))
- string[str->lastch] = 1;
- if (cflag)
- for (p = string, cnt = NCHARS; cnt--; ++p)
- *p = !*p;
- else if (Cflag)
- for (cnt = 0; cnt < NCHARS; cnt++)
- string[cnt] = !string[cnt] && ISCHAR(cnt);
+ cset_add(cs, str->lastch);
+ /*
+ * NOTE(review): the `true' argument presumably registers the class in
+ * inverted form, so that the cset_invert() below keeps non-characters
+ * out of the -C complement -- confirm against cset_addclass().
+ */
+ if (Cflag)
+ cset_addclass(cs, wctype("rune"), true);
+ if (cflag || Cflag)
+ cset_invert(cs);
+ /*
+ * NOTE(review): cset_cache() presumably precomputes membership results
+ * to speed up later cset_in() calls -- confirm in cset.c.
+ */
+ cset_cache(cs);
+ return (cs);
}
int
OpenPOWER on IntegriCloud