summary | refs | log | tree | commit | diff | stats
path: root/contrib/gnu-sort
diff options
context:
space:
mode:
author: tjr <tjr@FreeBSD.org> 2004-07-02 11:07:42 +0000
committer: tjr <tjr@FreeBSD.org> 2004-07-02 11:07:42 +0000
commit: ceb7da92e59eb53fa5b1ce5fd2ee262afcbc6936 (patch)
tree: 291921021f334786c86d788cf0ce23051ebe41ff /contrib/gnu-sort
parent: 64efc9f020477de2e03dfe3491ea1f820979588f (diff)
download: FreeBSD-src-ceb7da92e59eb53fa5b1ce5fd2ee262afcbc6936.zip
FreeBSD-src-ceb7da92e59eb53fa5b1ce5fd2ee262afcbc6936.tar.gz
Add support for multi-byte character sets.
Obtained from: Mitsuru Chinen (IBM) via The Fedora Project
Diffstat (limited to 'contrib/gnu-sort')
-rw-r--r-- contrib/gnu-sort/src/sort.c 805
1 files changed, 763 insertions, 42 deletions
diff --git a/contrib/gnu-sort/src/sort.c b/contrib/gnu-sort/src/sort.c
index b67582a..8a6e897 100644
--- a/contrib/gnu-sort/src/sort.c
+++ b/contrib/gnu-sort/src/sort.c
@@ -1,3 +1,4 @@
+/* $FreeBSD$ */
/* sort - sort lines of text (with all kinds of options).
Copyright (C) 88, 1991-2004 Free Software Foundation, Inc.
@@ -23,10 +24,31 @@
#include <config.h>
+#include <assert.h>
#include <getopt.h>
#include <sys/types.h>
#include <signal.h>
#include <stdio.h>
+
+/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */
+/* Get mbstate_t, mbrtowc(), wcwidth(). */
+#if HAVE_WCHAR_H
+# include <wchar.h>
+#endif
+
+/* Get isw* functions. */
+#if HAVE_WCTYPE_H
+# include <wctype.h>
+#endif
+
+/* Get nl_langinfo(). */
+#if HAVE_LANGINFO_CODESET
+# include <langinfo.h>
+#endif
+
+/* Include this after wctype.h so that we `#undef' ISPRINT
+ (from Solaris's euc.h, from widec.h, from wctype.h) before
+ redefining and using it. */
#include "system.h"
#include "error.h"
#include "hard-locale.h"
@@ -46,6 +68,17 @@ struct rlimit { size_t rlim_cur; };
# define getrlimit(Resource, Rlp) (-1)
#endif
+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC
+ installation; work around this configuration error. */
+#if !defined MB_LEN_MAX || MB_LEN_MAX == 1
+# define MB_LEN_MAX 16
+#endif
+
+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */
+#if HAVE_MBRTOWC && defined mbstate_t
+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0)
+#endif
+
/* The official name of this program (e.g., no `g' prefix). */
#define PROGRAM_NAME "sort"
@@ -91,6 +124,7 @@ enum
static char decimal_point;
static int th_sep; /* if CHAR_MAX + 1, then there is no thousands separator */
+static int force_general_numcompare = 0;
/* Nonzero if the corresponding locales are hard. */
static bool hard_LC_COLLATE;
@@ -109,6 +143,28 @@ static bool hard_LC_TIME;
#define NONZERO(x) (x != 0)
+/* Store in MBLENGTH the byte length of the multibyte character that
+   starts at PTR, looking at no more than LIM - PTR bytes and updating the
+   conversion state STATE.  An invalid or incomplete sequence and the NUL
+   character all count as one byte; for an invalid or incomplete sequence
+   STATE is put back to the value it had before the call.
+   MBLENGTH and STATE must be modifiable lvalues.  */
+#define GET_BYTELEN_OF_CHAR(LIM, PTR, MBLENGTH, STATE) \
+  do \
+    { \
+      wchar_t wc; \
+      mbstate_t state_bak; \
+ \
+      state_bak = (STATE); \
+      /* Assign through the macro parameter, not a caller-named variable. */ \
+      (MBLENGTH) = mbrtowc (&wc, (PTR), (LIM) - (PTR), &(STATE)); \
+ \
+      switch (MBLENGTH) \
+        { \
+        case (size_t) -1: \
+        case (size_t) -2: \
+          (STATE) = state_bak; \
+          /* Fall through. */ \
+        case 0: \
+          (MBLENGTH) = 1; \
+          break; \
+        default: \
+          break; \
+        } \
+    } \
+  while (0)
+
/* The kind of blanks for '-b' to skip in various options. */
enum blanktype { bl_start, bl_end, bl_both };
@@ -251,7 +307,8 @@ enum { TAB_DEFAULT = CHAR_MAX + 1 };
/* Tab character separating fields. If TAB_DEFAULT, then fields are
separated by the empty string between a non-blank character and a blank
character. */
-static int tab = TAB_DEFAULT;
+static int tab[MB_LEN_MAX + 1] = { TAB_DEFAULT };
+static size_t tab_length = 1;
/* Flag to remove consecutive duplicate lines from the output.
Only the last of a sequence of equal lines will be output. */
@@ -384,6 +441,46 @@ struct tempnode
};
static struct tempnode *volatile temphead;
+/* Function pointers.  main () points each of these at the multibyte
+   (_mb) implementation when HAVE_MBRTOWC is set and MB_CUR_MAX > 1, and
+   at the single-byte (_uni) implementation otherwise, so the rest of the
+   program calls one name for both cases.  */
+
+/* Table initialisation; main () calls it once after selecting it.  */
+static void
+(*inittables) (void);
+
+/* Locate the start of the field of LINE described by KEY.  */
+static char *
+(* begfield) (const struct line *line, const struct keyfield *key);
+
+/* Locate the limit (first character after) the field of LINE described
+   by KEY.  */
+static char *
+(* limfield) (const struct line *line, const struct keyfield *key);
+
+/* Map the LEN-byte month name at S to its number; 0 if unrecognized.  */
+static int
+(*getmonth) (const char *s, size_t len);
+
+/* Compare lines A and B key by key.  */
+static int
+(* keycompare) (const struct line *a, const struct line *b);
+
+/* Decide whether the multibyte character starting at STR is a blank.
+   No more than LEN bytes are examined.  *LENGTH receives the number of
+   bytes the character occupies; an invalid or incomplete sequence and the
+   NUL character are all reported as one byte long.
+   Return nonzero for a blank and zero otherwise.  */
+#if HAVE_MBRTOWC
+static int
+ismbblank (const char *str, size_t len, size_t *length)
+{
+  mbstate_t shift;
+  wchar_t decoded;
+  size_t consumed;
+
+  /* Every call decodes from the initial conversion state.  */
+  memset (&shift, '\0', sizeof shift);
+  consumed = mbrtowc (&decoded, str, len, &shift);
+
+  switch (consumed)
+    {
+    case (size_t) -1:
+    case (size_t) -2:
+      /* Undecodable input: step over one byte; it is never a blank.  */
+      *length = 1;
+      return 0;
+
+    case 0:
+      /* NUL still occupies one byte.  */
+      *length = 1;
+      break;
+
+    default:
+      *length = consumed;
+      break;
+    }
+
+  return iswblank (decoded);
+}
+#endif
+
/* Clean up any remaining temporary files. */
static void
@@ -521,7 +618,7 @@ zaptemp (const char *name)
}
}
-#if HAVE_NL_LANGINFO
+#if HAVE_LANGINFO_CODESET
static int
struct_month_cmp (const void *m1, const void *m2)
@@ -536,7 +633,7 @@ struct_month_cmp (const void *m1, const void *m2)
/* Initialize the character class tables. */
static void
-inittables (void)
+inittables_uni (void)
{
int i;
@@ -574,6 +671,64 @@ inittables (void)
#endif
}
+#if HAVE_MBRTOWC
+/* Fill MONTHTAB for a multibyte locale.  For each month, take the
+   abbreviated name reported by nl_langinfo, drop its leading blanks,
+   upper-case it one character at a time, and store the result together
+   with the month number (1-based).  The table is then sorted with
+   struct_month_cmp.  */
+static void
+inittables_mb (void)
+{
+  int i;
+
+  for (i = 0; i < MONTHS_PER_YEAR; i++)
+    {
+      char mbc[MB_LEN_MAX];
+      char *name;
+      const char *s = nl_langinfo (ABMON_1 + i);
+      size_t s_len = strlen (s);
+      size_t j, k, mblength;
+      wchar_t wc, pwc;
+      mbstate_t state_mb, state_wc;
+
+      /* An upper-cased character may need more bytes than the original.
+         Every input character uses at least one byte and yields at most
+         MB_LEN_MAX bytes, so this bound cannot be exceeded.  */
+      name = (char *) xmalloc (s_len * MB_LEN_MAX + 1);
+      monthtab[i].name = name;
+      monthtab[i].val = i + 1;
+
+      memset (&state_mb, '\0', sizeof (mbstate_t));
+      memset (&state_wc, '\0', sizeof (mbstate_t));
+
+      /* Skip leading blanks.  */
+      for (j = 0; j < s_len;)
+        {
+          if (!ismbblank (s + j, s_len - j, &mblength))
+            break;
+          j += mblength;
+        }
+
+      for (k = 0; j < s_len;)
+        {
+          mblength = mbrtowc (&wc, s + j, s_len - j, &state_mb);
+          assert (mblength != (size_t) -1 && mblength != (size_t) -2);
+          if (mblength == 0)
+            break;
+
+          pwc = towupper (wc);
+          if (pwc == wc)
+            {
+              /* Already upper case: keep the original bytes.  */
+              memcpy (mbc, s + j, mblength);
+              j += mblength;
+            }
+          else
+            {
+              j += mblength;
+              mblength = wcrtomb (mbc, pwc, &state_wc);
+              assert (mblength != (size_t) 0 && mblength != (size_t) -1);
+            }
+
+          memcpy (name + k, mbc, mblength);
+          k += mblength;
+        }
+      name[k] = '\0';
+    }
+  qsort ((void *) monthtab, MONTHS_PER_YEAR,
+         sizeof (struct month), struct_month_cmp);
+}
+#endif
+
/* Specify the amount of main memory to use when sorting. */
static void
specify_sort_size (char const *s)
@@ -784,7 +939,7 @@ buffer_linelim (struct buffer const *buf)
by KEY in LINE. */
static char *
-begfield (const struct line *line, const struct keyfield *key)
+begfield_uni (const struct line *line, const struct keyfield *key)
{
register char *ptr = line->text, *lim = ptr + line->length - 1;
register size_t sword = key->sword;
@@ -794,10 +949,10 @@ begfield (const struct line *line, const struct keyfield *key)
/* The leading field separator itself is included in a field when -t
is absent. */
- if (tab != TAB_DEFAULT)
+ if (tab[0] != TAB_DEFAULT)
while (ptr < lim && sword--)
{
- while (ptr < lim && *ptr != tab)
+ while (ptr < lim && *ptr != tab[0])
++ptr;
if (ptr < lim)
++ptr;
@@ -825,11 +980,70 @@ begfield (const struct line *line, const struct keyfield *key)
return ptr;
}
+#if HAVE_MBRTOWC
+/* Return nonzero if the field separator (the first TAB_LENGTH elements
+   of TAB) occurs at PTR and ends no later than LIM.  TAB is an array of
+   int, so it is compared element by element rather than with memcmp,
+   which would compare against the bytes of the int representation.  */
+static int
+begfield_mb_at_tab (const char *ptr, const char *lim)
+{
+  size_t n;
+
+  if ((size_t) (lim - ptr) < tab_length)
+    return 0;
+  for (n = 0; n < tab_length; n++)
+    if (ptr[n] != (char) tab[n])
+      return 0;
+  return 1;
+}
+
+/* Multibyte version of begfield: return a pointer into LINE's text after
+   skipping KEY->sword fields, then leading blanks if KEY->skipsblanks is
+   set, then up to KEY->schar characters.  The result never passes the
+   last byte of the line (text + length - 1).  */
+static char *
+begfield_mb (const struct line *line, const struct keyfield *key)
+{
+  char *ptr = line->text, *lim = ptr + line->length - 1;
+  size_t sword = key->sword;
+  size_t schar = key->schar;
+  size_t i;
+  size_t mblength;
+  mbstate_t state;
+
+  memset (&state, '\0', sizeof (mbstate_t));
+
+  if (tab[0] != TAB_DEFAULT)
+    while (ptr < lim && sword--)
+      {
+        /* Advance to the next separator, one character at a time...  */
+        while (ptr < lim && !begfield_mb_at_tab (ptr, lim))
+          {
+            GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
+            ptr += mblength;
+          }
+        /* ...and step over it.  */
+        if (ptr < lim)
+          {
+            GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
+            ptr += mblength;
+          }
+      }
+  else
+    while (ptr < lim && sword--)
+      {
+        while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
+          ptr += mblength;
+        if (ptr < lim)
+          {
+            GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
+            ptr += mblength;
+          }
+        while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength))
+          ptr += mblength;
+      }
+
+  if (key->skipsblanks)
+    while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
+      ptr += mblength;
+
+  /* Advance by SCHAR characters, but never beyond LIM.  */
+  for (i = 0; i < schar; i++)
+    {
+      GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
+
+      if (ptr + mblength > lim)
+        break;
+      else
+        ptr += mblength;
+    }
+
+  return ptr;
+}
+#endif
+
/* Return the limit of (a pointer to the first character after) the field
in LINE specified by KEY. */
static char *
-limfield (const struct line *line, const struct keyfield *key)
+limfield_uni (const struct line *line, const struct keyfield *key)
{
register char *ptr = line->text, *lim = ptr + line->length - 1;
register size_t eword = key->eword, echar = key->echar;
@@ -842,10 +1056,10 @@ limfield (const struct line *line, const struct keyfield *key)
`beginning' is the first character following the delimiting TAB.
Otherwise, leave PTR pointing at the first `blank' character after
the preceding field. */
- if (tab != TAB_DEFAULT)
+ if (tab[0] != TAB_DEFAULT)
while (ptr < lim && eword--)
{
- while (ptr < lim && *ptr != tab)
+ while (ptr < lim && *ptr != tab[0])
++ptr;
if (ptr < lim && (eword | echar))
++ptr;
@@ -891,10 +1105,10 @@ limfield (const struct line *line, const struct keyfield *key)
*/
/* Make LIM point to the end of (one byte past) the current field. */
- if (tab != TAB_DEFAULT)
+ if (tab[0] != TAB_DEFAULT)
{
char *newlim;
- newlim = memchr (ptr, tab, lim - ptr);
+ newlim = memchr (ptr, tab[0], lim - ptr);
if (newlim)
lim = newlim;
}
@@ -926,15 +1140,137 @@ limfield (const struct line *line, const struct keyfield *key)
return ptr;
}
+#if HAVE_MBRTOWC
+/* Return nonzero if the field separator (the first TAB_LENGTH elements
+   of TAB) occurs at PTR and ends no later than LIM.  TAB is an array of
+   int, so it is compared element by element rather than with memcmp,
+   which would compare against the bytes of the int representation.  */
+static int
+limfield_mb_at_tab (const char *ptr, const char *lim)
+{
+  size_t n;
+
+  if ((size_t) (lim - ptr) < tab_length)
+    return 0;
+  for (n = 0; n < tab_length; n++)
+    if (ptr[n] != (char) tab[n])
+      return 0;
+  return 1;
+}
+
+/* Multibyte version of limfield: return the limit of (a pointer to the
+   first character after) the field in LINE specified by KEY.  PTR is
+   moved past KEY->eword fields, then past leading blanks if
+   KEY->skipsblanks is set, then past up to KEY->echar characters.  */
+static char *
+limfield_mb (const struct line *line, const struct keyfield *key)
+{
+  char *ptr = line->text, *lim = ptr + line->length - 1;
+  size_t eword = key->eword, echar = key->echar;
+  size_t i;
+  size_t mblength;
+  mbstate_t state;
+
+  memset (&state, '\0', sizeof (mbstate_t));
+
+  /* Test against TAB_DEFAULT, not zero: the default value is nonzero and
+     a NUL separator is legitimate.  */
+  if (tab[0] != TAB_DEFAULT)
+    while (ptr < lim && eword--)
+      {
+        while (ptr < lim && !limfield_mb_at_tab (ptr, lim))
+          {
+            GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
+            ptr += mblength;
+          }
+        /* Step over the separator unless this is the final field and no
+           character offset was requested.  */
+        if (ptr < lim && (eword | echar))
+          {
+            GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
+            ptr += mblength;
+          }
+      }
+  else
+    while (ptr < lim && eword--)
+      {
+        while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
+          ptr += mblength;
+        if (ptr < lim)
+          {
+            GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
+            ptr += mblength;
+          }
+        while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength))
+          ptr += mblength;
+      }
+
+
+# ifdef POSIX_UNSPECIFIED
+  /* Make LIM point to the end of (one byte past) the current field.  */
+  if (tab[0] != TAB_DEFAULT)
+    {
+      char *p;
+
+      for (p = ptr; p < lim;)
+        {
+          if (limfield_mb_at_tab (p, lim))
+            {
+              lim = p;
+              break;
+            }
+
+          /* Measure the character at P, the scan position.  */
+          GET_BYTELEN_OF_CHAR (lim, p, mblength, state);
+          p += mblength;
+        }
+    }
+  else
+    {
+      char *newlim = ptr;
+
+      while (newlim < lim && ismbblank (newlim, lim - newlim, &mblength))
+        newlim += mblength;
+      /* Only NEWLIM moves here; PTR must stay at the field start.  */
+      if (newlim < lim)
+        {
+          GET_BYTELEN_OF_CHAR (lim, newlim, mblength, state);
+          newlim += mblength;
+        }
+      while (newlim < lim && !ismbblank (newlim, lim - newlim, &mblength))
+        newlim += mblength;
+      lim = newlim;
+    }
+# endif
+
+  /* If we're skipping leading blanks, don't start counting characters
+     until after skipping past any leading blanks.  */
+  if (key->skipsblanks)
+    while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength))
+      ptr += mblength;
+
+  memset (&state, '\0', sizeof (mbstate_t));
+
+  /* Advance PTR by ECHAR (if possible), but no further than LIM.  */
+  for (i = 0; i < echar; i++)
+    {
+      GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state);
+
+      if (ptr + mblength > lim)
+        break;
+      else
+        ptr += mblength;
+    }
+
+  return ptr;
+}
+#endif
+
/* Return the number of trailing blanks in FIELD, with LEN bytes. */
static size_t
trailing_blanks (char const *field, size_t len)
{
- size_t i;
- for (i = len; 0 < i && blanks[UCHAR (field[i - 1])]; i--)
- continue;
- return len - i;
+#if HAVE_MBRTOWC
+  if (MB_CUR_MAX > 1)
+    {
+      /* Multibyte text cannot be scanned backwards, so walk forwards and
+         keep the size of the current run of blanks.  The run is counted
+         in bytes, not characters, because callers subtract the result
+         from a byte pointer.  */
+      size_t trailing = 0;
+
+      while (len)
+        {
+          size_t mblength;
+
+          if (ismbblank (field, len, &mblength))
+            trailing += mblength;
+          else
+            trailing = 0;
+
+          field += mblength, len -= mblength;
+        }
+
+      return trailing;
+    }
+  else
+#endif
+    {
+      size_t i;
+      for (i = len; 0 < i && blanks[UCHAR (field[i - 1])]; i--)
+        continue;
+      return len - i;
+    }
}
/* Fill BUF reading from FP, moving buf->left bytes from the end
@@ -1019,8 +1355,22 @@ fillbuf (struct buffer *buf, register FILE *fp, char const *file)
else
{
if (key->skipsblanks)
- while (blanks[UCHAR (*line_start)])
- line_start++;
+#if HAVE_MBRTOWC
+ {
+ if (MB_CUR_MAX > 1)
+ {
+ size_t mblength;
+
+ while (ismbblank (line_start, ptr - line_start, &mblength))
+ line_start += mblength;
+ }
+ else
+#endif
+ {
+ while (blanks[UCHAR (*line_start)])
+ line_start++;
+ }
+ }
line->keybeg = line_start;
}
if (key->skipeblanks)
@@ -1128,13 +1478,32 @@ numcompare (register const char *a, register const char *b)
register int tmpa, tmpb, tmp;
register size_t log_a, log_b;
- tmpa = *a;
- tmpb = *b;
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1)
+ {
+ size_t mblength;
+ size_t alen = strnlen (a, MB_LEN_MAX);
+ size_t blen = strnlen (b, MB_LEN_MAX);
+
+ while (ismbblank (a, alen, &mblength))
+ a += mblength, alen -= mblength;
+ while (ismbblank (b, blen, &mblength))
+ b += mblength, blen -= mblength;
- while (blanks[UCHAR (tmpa)])
- tmpa = *++a;
- while (blanks[UCHAR (tmpb)])
- tmpb = *++b;
+ tmpa = *a;
+ tmpb = *b;
+ }
+ else
+#endif
+ {
+ tmpa = *a;
+ tmpb = *b;
+
+ while (blanks[UCHAR (tmpa)])
+ tmpa = *++a;
+ while (blanks[UCHAR (tmpb)])
+ tmpb = *++b;
+ }
if (tmpa == NEGATION_SIGN)
{
@@ -1268,15 +1637,60 @@ general_numcompare (const char *sa, const char *sb)
/* FIXME: maybe add option to try expensive FP conversion
only if A and B can't be compared more cheaply/accurately. */
- char *ea;
- char *eb;
- double a = strtod (sa, &ea);
- double b = strtod (sb, &eb);
+ char *bufa, *ea;
+ char *bufb, *eb;
+ double a;
+ double b;
+
+ char *p;
+ struct lconv *lconvp = localeconv ();
+ size_t thousands_sep_len = strlen (lconvp->thousands_sep);
+
+ bufa = (char *) xmalloc (strlen (sa) + 1);
+ bufb = (char *) xmalloc (strlen (sb) + 1);
+ strcpy (bufa, sa);
+ strcpy (bufb, sb);
+
+ if (force_general_numcompare)
+ {
+ while (1)
+ {
+ a = strtod (bufa, &ea);
+ if (memcmp (ea, lconvp->thousands_sep, thousands_sep_len) == 0)
+ {
+ for (p = ea; *(p + thousands_sep_len) != '\0'; p++)
+ *p = *(p + thousands_sep_len);
+ *p = '\0';
+ continue;
+ }
+ break;
+ }
+
+ while (1)
+ {
+ b = strtod (bufb, &eb);
+ if (memcmp (eb, lconvp->thousands_sep, thousands_sep_len) == 0)
+ {
+ for (p = eb; *(p + thousands_sep_len) != '\0'; p++)
+ *p = *(p + thousands_sep_len);
+ *p = '\0';
+ continue;
+ }
+ break;
+ }
+ }
+ else
+ {
+ a = strtod (bufa, &ea);
+ b = strtod (bufb, &eb);
+ }
/* Put conversion errors at the start of the collating sequence. */
- if (sa == ea)
- return sb == eb ? 0 : -1;
- if (sb == eb)
+ free (bufa);
+ free (bufb);
+ if (bufa == ea)
+ return bufb == eb ? 0 : -1;
+ if (bufb == eb)
return 1;
/* Sort numbers in the usual way, where -0 == +0. Put NaNs after
@@ -1294,7 +1708,7 @@ general_numcompare (const char *sa, const char *sb)
Return 0 if the name in S is not recognized. */
static int
-getmonth (const char *s, size_t len)
+getmonth_uni (const char *s, size_t len)
{
char *month;
register size_t i;
@@ -1332,11 +1746,79 @@ getmonth (const char *s, size_t len)
return result;
}
+#if HAVE_MBRTOWC
+/* Multibyte version of getmonth: return the month number of the name
+   that starts the LEN-byte text at S, or 0 if it is not recognized.
+   Leading blanks are skipped, the text is upper-cased and cut at its
+   first blank, and the result is looked up in MONTHTAB by binary search.
+   A table entry matches when it is a prefix of the converted text.
+   Text that cannot be decoded in the current locale is reported as
+   unrecognized instead of aborting the program.  */
+static int
+getmonth_mb (const char *s, size_t len)
+{
+  char *month;
+  char *tmp;
+  const char *src;
+  const wchar_t *wsrc;
+  wchar_t *month_wcs;
+  size_t i, wclength, mblength, month_size;
+  int lo = 0, hi = MONTHS_PER_YEAR, result;
+  mbstate_t state;
+
+  while (len > 0 && ismbblank (s, len, &mblength))
+    {
+      s += mblength;
+      len -= mblength;
+    }
+
+  if (len == 0)
+    return 0;
+
+  /* mbsrtowcs needs a NUL-terminated source, so work on a copy.  */
+  tmp = (char *) alloca (len + 1);
+  memcpy (tmp, s, len);
+  tmp[len] = '\0';
+  month_wcs = (wchar_t *) alloca ((len + 1) * sizeof (wchar_t));
+
+  memset (&state, '\0', sizeof (mbstate_t));
+  src = tmp;
+  wclength = mbsrtowcs (month_wcs, &src, len + 1, &state);
+  /* The key is user data: an invalid sequence is not a month name.  */
+  if (wclength == (size_t) -1 || src != NULL)
+    return 0;
+
+  for (i = 0; i < wclength; i++)
+    {
+      month_wcs[i] = towupper (month_wcs[i]);
+      if (iswblank (month_wcs[i]))
+        {
+          month_wcs[i] = L'\0';
+          break;
+        }
+    }
+
+  /* The upper-cased text may take more bytes than the original; each
+     wide character converts to at most MB_CUR_MAX bytes.  */
+  month_size = wclength * MB_CUR_MAX + 1;
+  month = (char *) alloca (month_size);
+
+  /* Convert back starting from the initial shift state.  */
+  memset (&state, '\0', sizeof (mbstate_t));
+  wsrc = month_wcs;
+  mblength = wcsrtombs (month, &wsrc, month_size, &state);
+  if (mblength == (size_t) -1 || wsrc != NULL)
+    return 0;
+
+  do
+    {
+      int ix = (lo + hi) / 2;
+
+      if (strncmp (month, monthtab[ix].name, strlen (monthtab[ix].name)) < 0)
+        hi = ix;
+      else
+        lo = ix;
+    }
+  while (hi - lo > 1);
+
+  result = (!strncmp (month, monthtab[lo].name, strlen (monthtab[lo].name))
+            ? monthtab[lo].val : 0);
+
+  return result;
+}
+#endif
+
/* Compare two lines A and B trying every key in sequence until there
are no more keys or a difference is found. */
static int
-keycompare (const struct line *a, const struct line *b)
+keycompare_uni (const struct line *a, const struct line *b)
{
struct keyfield const *key = keylist;
@@ -1507,6 +1989,187 @@ keycompare (const struct line *a, const struct line *b)
return key->reverse ? -diff : diff;
}
+#if HAVE_MBRTOWC
+/* Copy the LEN-byte key TEXT into COPY one multibyte character at a
+   time and return the number of bytes stored; COPY is NUL-terminated.
+   If IGNORE is `nonprinting', non-printable characters are dropped; if it
+   is `nondictionary', characters that are neither alphanumeric nor blank
+   are dropped.  If FOLD_CASE is set, the remaining characters are
+   upper-cased.  Bytes that cannot be decoded, and NUL bytes, are copied
+   unchanged when IGNORE is null and dropped otherwise.
+   STATE is the conversion state for TEXT and is updated.
+   COPY must have room for LEN + 1 bytes, or LEN * MB_CUR_MAX + 1 bytes
+   when FOLD_CASE is set.  */
+static size_t
+keycompare_mb_filter (char *copy, const char *text, size_t len,
+                      bool const *ignore, bool fold_case, mbstate_t *state)
+{
+  size_t new_len = 0;
+  size_t i = 0;
+
+  while (i < len)
+    {
+      wchar_t wc;
+      mbstate_t state_bak = *state;
+      size_t mblength = mbrtowc (&wc, text + i, len - i, state);
+
+      if (mblength == (size_t) -2 || mblength == (size_t) -1
+          || mblength == 0)
+        {
+          if (mblength != 0)
+            *state = state_bak;
+          if (!ignore)
+            copy[new_len++] = text[i];
+          /* Always consume the byte, or the scan would never end.  */
+          i++;
+          continue;
+        }
+
+      if (ignore
+          && ((ignore == nonprinting && !iswprint (wc))
+              || (ignore == nondictionary
+                  && !iswalnum (wc) && !iswblank (wc))))
+        {
+          i += mblength;
+          continue;
+        }
+
+      if (fold_case)
+        {
+          /* WC is a wide character, so towupper, not toupper.  */
+          wchar_t uwc = towupper (wc);
+
+          if (uwc != wc)
+            {
+              char mbc[MB_LEN_MAX];
+              mbstate_t state_wc;
+              size_t uplen;
+
+              memset (&state_wc, '\0', sizeof (mbstate_t));
+              uplen = wcrtomb (mbc, uwc, &state_wc);
+              assert (uplen != (size_t) -1 && uplen != 0);
+
+              memcpy (copy + new_len, mbc, uplen);
+              new_len += uplen;
+              i += mblength;
+              continue;
+            }
+        }
+
+      memcpy (copy + new_len, text + i, mblength);
+      new_len += mblength;
+      i += mblength;
+    }
+
+  copy[new_len] = '\0';
+  return new_len;
+}
+
+/* Multibyte version of keycompare: compare two lines A and B trying
+   every key in sequence until there are no more keys or a difference is
+   found.  Return a negative, zero or positive value; the sign is inverted
+   for keys with the `reverse' flag.  */
+static int
+keycompare_mb (const struct line *a, const struct line *b)
+{
+  struct keyfield *key = keylist;
+
+  /* For the first iteration only, the key positions have been
+     precomputed for us.  */
+  char *texta = a->keybeg;
+  char *textb = b->keybeg;
+  char *lima = a->keylim;
+  char *limb = b->keylim;
+
+  size_t mblength_a, mblength_b;
+  mbstate_t state_a, state_b;
+
+  int diff;
+
+  memset (&state_a, '\0', sizeof (mbstate_t));
+  memset (&state_b, '\0', sizeof (mbstate_t));
+
+  for (;;)
+    {
+      bool const *ignore = key->ignore;
+      /* The translate table is only used as an "upper-case it" flag.  */
+      bool fold_case = key->translate != NULL;
+
+      /* Find the lengths.  */
+      size_t lena = lima <= texta ? 0 : lima - texta;
+      size_t lenb = limb <= textb ? 0 : limb - textb;
+
+      if (key->skipeblanks)
+        {
+          char *a_end = texta + lena;
+          char *b_end = textb + lenb;
+          a_end -= trailing_blanks (texta, lena);
+          b_end -= trailing_blanks (textb, lenb);
+          lena = a_end - texta;
+          lenb = b_end - textb;
+        }
+
+      /* Actually compare the fields.  */
+      if (key->numeric | key->general_numeric)
+        {
+          /* Temporarily NUL-terminate both keys for the numeric
+             comparison functions.  */
+          char savea = *lima, saveb = *limb;
+
+          *lima = *limb = '\0';
+          if (force_general_numcompare)
+            diff = general_numcompare (texta, textb);
+          else
+            diff = ((key->numeric ? numcompare : general_numcompare)
+                    (texta, textb));
+          *lima = savea, *limb = saveb;
+        }
+      else if (key->month)
+        diff = getmonth (texta, lena) - getmonth (textb, lenb);
+      else
+        {
+          if (ignore || fold_case)
+            {
+              /* Upper-casing may lengthen a character's encoding, so
+                 reserve MB_CUR_MAX bytes per input byte in that case.  */
+              size_t expand = fold_case ? MB_CUR_MAX : 1;
+              size_t room_a = lena * expand + 1;
+              size_t room_b = lenb * expand + 1;
+              char *copy_a = (char *) alloca (room_a + room_b);
+              char *copy_b = copy_a + room_a;
+              size_t new_len_a, new_len_b;
+
+              /* Ignore and/or translate chars before comparing.  */
+              new_len_a = keycompare_mb_filter (copy_a, texta, lena,
+                                                ignore, fold_case, &state_a);
+              new_len_b = keycompare_mb_filter (copy_b, textb, lenb,
+                                                ignore, fold_case, &state_b);
+              diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b);
+            }
+          else if (lena == 0)
+            diff = - NONZERO (lenb);
+          else if (lenb == 0)
+            goto greater;
+          else
+            diff = xmemcoll (texta, lena, textb, lenb);
+        }
+
+      if (diff)
+        goto not_equal;
+
+      key = key->next;
+      if (! key)
+        break;
+
+      /* Find the beginning and limit of the next field.  */
+      if (key->eword != -1)
+        lima = limfield (a, key), limb = limfield (b, key);
+      else
+        lima = a->text + a->length - 1, limb = b->text + b->length - 1;
+
+      if (key->sword != -1)
+        texta = begfield (a, key), textb = begfield (b, key);
+      else
+        {
+          texta = a->text, textb = b->text;
+          if (key->skipsblanks)
+            {
+              while (texta < lima && ismbblank (texta, lima - texta, &mblength_a))
+                texta += mblength_a;
+              while (textb < limb && ismbblank (textb, limb - textb, &mblength_b))
+                textb += mblength_b;
+            }
+        }
+    }
+
+  return 0;
+
+greater:
+  diff = 1;
+not_equal:
+  return key->reverse ? -diff : diff;
+}
+#endif
+
/* Compare two lines A and B, returning negative, zero, or positive
depending on whether A compares less than, equal to, or greater than B. */
@@ -2252,20 +2915,44 @@ main (int argc, char **argv)
{
struct lconv const *lconvp = localeconv ();
- /* If the locale doesn't define a decimal point, or if the decimal
- point is multibyte, use the C decimal point. We don't support
- multibyte decimal points yet. */
decimal_point = *lconvp->decimal_point;
if (! decimal_point || lconvp->decimal_point[1])
- decimal_point = C_DECIMAL_POINT;
+ {
+ decimal_point = C_DECIMAL_POINT;
+ if (lconvp->decimal_point[0] && lconvp->decimal_point[1])
+ force_general_numcompare = 1;
+ }
/* We don't support multibyte thousands separators yet. */
th_sep = *lconvp->thousands_sep;
if (! th_sep || lconvp->thousands_sep[1])
- th_sep = CHAR_MAX + 1;
+ {
+ th_sep = CHAR_MAX + 1;
+ if (lconvp->thousands_sep[0] && lconvp->thousands_sep[1])
+ force_general_numcompare = 1;
+ }
}
#endif
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1)
+ {
+ inittables = inittables_mb;
+ begfield = begfield_mb;
+ limfield = limfield_mb;
+ getmonth = getmonth_mb;
+ keycompare = keycompare_mb;
+ }
+ else
+#endif
+ {
+ inittables = inittables_uni;
+ begfield = begfield_uni;
+ limfield = limfield_uni;
+ keycompare = keycompare_uni;
+ getmonth = getmonth_uni;
+ }
+
have_read_stdin = false;
inittables ();
@@ -2462,13 +3149,47 @@ main (int argc, char **argv)
case 't':
{
- int newtab = optarg[0];
- if (! newtab)
+ char newtab[MB_LEN_MAX + 1];
+ strncpy (newtab, optarg, MB_LEN_MAX);
+ if (! newtab[0])
error (SORT_FAILURE, 0, _("empty tab"));
+#if HAVE_MBRTOWC
+ if (MB_CUR_MAX > 1)
+ {
+ wchar_t wc;
+ mbstate_t state;
+ size_t newtab_length, i;
+
+ memset (&state, '\0', sizeof (mbstate_t));
+ newtab_length = mbrtowc (&wc, newtab, strnlen (newtab, MB_LEN_MAX), &state);
+ switch (newtab_length)
+ {
+ case (size_t) -1:
+ case (size_t) -2:
+ case 0:
+ newtab_length = 1;
+ }
+
+ if (optarg[tab_length])
+ {
+ /* Provoke with `sort -txx'. Complain about
+ "multi-character tab" instead of "multibyte tab", so
+ that the diagnostic's wording does not need to be
+ changed once multibyte characters are supported. */
+ error (SORT_FAILURE, 0, _("multi-character tab `%s'"),
+ optarg);
+ }
+
+ for (i = 0; i < newtab_length; i++)
+ tab[i] = newtab[i];
+ }
+ else
+#endif
+
if (optarg[1])
{
if (strcmp (optarg, "\\0") == 0)
- newtab = '\0';
+ newtab[0] = '\0';
else
{
/* Provoke with `sort -txx'. Complain about
@@ -2479,9 +3200,9 @@ main (int argc, char **argv)
optarg);
}
}
- if (tab != TAB_DEFAULT && tab != newtab)
+ if (tab[0] != TAB_DEFAULT && tab[0] != newtab[0])
error (SORT_FAILURE, 0, _("incompatible tabs"));
- tab = newtab;
+ tab[0] = newtab[0];
}
break;
OpenPOWER on IntegriCloud