summary refs log tree commit diff stats
path: root/usr.bin/join
diff options
context:
space:
mode:
author tjr <tjr@FreeBSD.org> 2004-07-05 13:20:03 +0000
committer tjr <tjr@FreeBSD.org> 2004-07-05 13:20:03 +0000
commit 99daa57e073ba3543a1793b38239966d5621bab6 (patch)
tree e2a6a2901474941b4746d604d99924425ae03951 /usr.bin/join
parent 335407ec32c5c60a1d37f6acd40f6869328f3fa9 (diff)
download FreeBSD-src-99daa57e073ba3543a1793b38239966d5621bab6.zip
FreeBSD-src-99daa57e073ba3543a1793b38239966d5621bab6.tar.gz
Add support for multibyte characters.
Diffstat (limited to 'usr.bin/join')
-rw-r--r-- usr.bin/join/join.1 6
-rw-r--r-- usr.bin/join/join.c 78
2 files changed, 73 insertions, 11 deletions
diff --git a/usr.bin/join/join.1 b/usr.bin/join/join.1
index 8a807d5..aa6f739 100644
--- a/usr.bin/join/join.1
+++ b/usr.bin/join/join.1
@@ -35,7 +35,7 @@
.\" @(#)join.1 8.3 (Berkeley) 4/28/95
.\" $FreeBSD$
.\"
-.Dd June 25, 2004
+.Dd July 5, 2004
.Dt JOIN 1
.Os
.Sh NAME
@@ -217,7 +217,3 @@ command conforms to
.Xr paste 1 ,
.Xr sort 1 ,
.Xr uniq 1
-.Sh BUGS
-The
-.Nm
-utility does not recognize multibyte characters.
diff --git a/usr.bin/join/join.c b/usr.bin/join/join.c
index e360f86..097ecbe 100644
--- a/usr.bin/join/join.c
+++ b/usr.bin/join/join.c
@@ -53,11 +53,13 @@ __FBSDID("$FreeBSD$");
#include <err.h>
#include <errno.h>
+#include <limits.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
+#include <wchar.h>
/*
* There's a structure per input file which encapsulates the state of the
@@ -100,17 +102,20 @@ int joinout = 1; /* show lines with matched join fields (-v) */
int needsep; /* need separator character */
int spans = 1; /* span multiple delimiters (-t) */
char *empty; /* empty field replacement string (-e) */
-static char default_tabchar[] = " \t";
-char *tabchar = default_tabchar;/* delimiter characters (-t) */
+static wchar_t default_tabchar[] = L" \t";
+wchar_t *tabchar = default_tabchar;/* delimiter characters (-t) */
int cmp(LINE *, u_long, LINE *, u_long);
void fieldarg(char *);
void joinlines(INPUT *, INPUT *);
+int mbscoll(const char *, const char *);
+char *mbssep(char **, const wchar_t *);
void obsolete(char **);
void outfield(LINE *, u_long, int);
void outoneline(INPUT *, LINE *);
void outtwoline(INPUT *, LINE *, INPUT *, LINE *);
void slurp(INPUT *);
+wchar_t *towcs(const char *);
void usage(void);
int
@@ -180,8 +185,10 @@ main(int argc, char *argv[])
break;
case 't':
spans = 0;
- if (strlen(tabchar = optarg) != 1)
+ if (mbrtowc(&tabchar[0], optarg, MB_LEN_MAX, NULL) !=
+ strlen(optarg))
errx(1, "illegal tab character specification");
+ tabchar[1] = L'\0';
break;
case 'v':
vflag = 1;
@@ -335,7 +342,7 @@ slurp(INPUT *F)
/* Split the line into fields, allocate space as necessary. */
lp->fieldcnt = 0;
- while ((fieldp = strsep(&bp, tabchar)) != NULL) {
+ while ((fieldp = mbssep(&bp, tabchar)) != NULL) {
if (spans && *fieldp == '\0')
continue;
if (lp->fieldcnt == lp->fieldalloc) {
@@ -356,6 +363,35 @@ slurp(INPUT *F)
}
}
+char *
+mbssep(char **stringp, const wchar_t *delim)
+{ /*
+ * Multibyte-aware stand-in for strsep(3); slurp() calls it where it
+ * previously called strsep().  Returns the token that starts at *stringp
+ * and ends at the first character found in delim, or NULL when *stringp
+ * is already NULL.  The token is terminated in place and *stringp is
+ * advanced past the delimiter; when the end of the string is reached
+ * instead, *stringp is set to NULL so that the next call returns NULL.
+ * An invalid or incomplete multibyte sequence ends the program through
+ * errc(1, EILSEQ, NULL).
+ */
+char *s, *tok;
+const wchar_t *spanp;
+wchar_t c, sc;
+size_t n;
+
+if ((s = *stringp) == NULL)
+return (NULL);
+for (tok = s;;) {
+n = mbrtowc(&c, s, MB_LEN_MAX, NULL); /* decode one character; n = bytes consumed (0 for the terminating NUL) */
+if (n == (size_t)-1 || n == (size_t)-2)
+errc(1, EILSEQ, NULL); /* XXX */
+s += n; /* s now points just past the decoded character */
+spanp = delim;
+do { /* delim's own terminator is compared as well, so c == 0 always matches */
+if ((sc = *spanp++) == c) {
+if (c == 0)
+s = NULL; /* end of input: no further tokens */
+else
+s[-n] = '\0'; /* overwrite the delimiter's first byte to end the token */
+*stringp = s;
+return (tok);
+}
+} while (sc != 0);
+}
+}
+
int
cmp(LINE *lp1, u_long fieldno1, LINE *lp2, u_long fieldno2)
{
@@ -363,7 +399,37 @@ cmp(LINE *lp1, u_long fieldno1, LINE *lp2, u_long fieldno2)
return (lp2->fieldcnt <= fieldno2 ? 0 : 1);
if (lp2->fieldcnt <= fieldno2)
return (-1);
- return (strcoll(lp1->fields[fieldno1], lp2->fields[fieldno2]));
+ return (mbscoll(lp1->fields[fieldno1], lp2->fields[fieldno2]));
+}
+
+int
+mbscoll(const char *s1, const char *s2)
+{ /*
+ * Collation comparison of two multibyte strings; cmp() calls it where it
+ * previously called strcoll().  Returns the result of strcoll() or
+ * wcscoll() unchanged.  If either string cannot be converted to wide
+ * characters, the program exits through err(1, NULL).
+ */
+wchar_t *w1, *w2;
+int ret;
+
+if (MB_CUR_MAX == 1) /* single-byte locale: no wide conversion needed */
+return (strcoll(s1, s2));
+if ((w1 = towcs(s1)) == NULL || (w2 = towcs(s2)) == NULL)
+err(1, NULL); /* XXX */
+ret = wcscoll(w1, w2);
+free(w1); /* towcs() results are malloc'd; release both copies */
+free(w2);
+return (ret);
+}
+
+wchar_t *
+towcs(const char *s)
+{ /*
+ * Converts the multibyte string s into a newly malloc'd wide-character
+ * string, which the caller must free().  Returns NULL if the length
+ * query reports (size_t)-1 or if malloc() fails.
+ */
+wchar_t *wcs;
+size_t n;
+
+if ((n = mbsrtowcs(NULL, &s, 0, NULL)) == (size_t)-1) /* NULL destination: only count the wide characters needed */
+return (NULL);
+if ((wcs = malloc((n + 1) * sizeof(*wcs))) == NULL) /* n excludes the terminator, hence n + 1 */
+return (NULL);
+mbsrtowcs(wcs, &s, n + 1, NULL); /* second pass performs the conversion, terminator included */
+return (wcs);
+}
void
@@ -454,7 +520,7 @@ void
outfield(LINE *lp, u_long fieldno, int out_empty)
{
if (needsep++)
- (void)printf("%c", *tabchar);
+ (void)printf("%lc", *tabchar);
if (!ferror(stdout)) {
if (lp->fieldcnt <= fieldno || out_empty) {
if (empty != NULL)
OpenPOWER on IntegriCloud