diff options
author | tjr <tjr@FreeBSD.org> | 2004-07-02 11:07:42 +0000 |
---|---|---|
committer | tjr <tjr@FreeBSD.org> | 2004-07-02 11:07:42 +0000 |
commit | ceb7da92e59eb53fa5b1ce5fd2ee262afcbc6936 (patch) | |
tree | 291921021f334786c86d788cf0ce23051ebe41ff | |
parent | 64efc9f020477de2e03dfe3491ea1f820979588f (diff) | |
download | FreeBSD-src-ceb7da92e59eb53fa5b1ce5fd2ee262afcbc6936.zip FreeBSD-src-ceb7da92e59eb53fa5b1ce5fd2ee262afcbc6936.tar.gz |
Add support for multi-byte character sets.
Obtained from: Mitsuru Chinen (IBM) via The Fedora Project
-rw-r--r-- | contrib/gnu-sort/src/sort.c | 805 |
1 files changed, 763 insertions, 42 deletions
diff --git a/contrib/gnu-sort/src/sort.c b/contrib/gnu-sort/src/sort.c index b67582a..8a6e897 100644 --- a/contrib/gnu-sort/src/sort.c +++ b/contrib/gnu-sort/src/sort.c @@ -1,3 +1,4 @@ +/* $FreeBSD$ */ /* sort - sort lines of text (with all kinds of options). Copyright (C) 88, 1991-2004 Free Software Foundation, Inc. @@ -23,10 +24,31 @@ #include <config.h> +#include <assert.h> #include <getopt.h> #include <sys/types.h> #include <signal.h> #include <stdio.h> + +/* Solaris 2.5 has a bug: <wchar.h> must be included before <wctype.h>. */ +/* Get mbstate_t, mbrtowc(), wcwidth(). */ +#if HAVE_WCHAR_H +# include <wchar.h> +#endif + +/* Get isw* functions. */ +#if HAVE_WCTYPE_H +# include <wctype.h> +#endif + +/* Get nl_langinfo(). */ +#if HAVE_LANGINFO_CODESET +# include <langinfo.h> +#endif + +/* Include this after wctype.h so that we `#undef' ISPRINT + (from Solaris's euc.h, from widec.h, from wctype.h) before + redefining and using it. */ #include "system.h" #include "error.h" #include "hard-locale.h" @@ -46,6 +68,17 @@ struct rlimit { size_t rlim_cur; }; # define getrlimit(Resource, Rlp) (-1) #endif +/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC + installation; work around this configuration error. */ +#if !defined MB_LEN_MAX || MB_LEN_MAX == 1 +# define MB_LEN_MAX 16 +#endif + +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */ +#if HAVE_MBRTOWC && defined mbstate_t +# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0) +#endif + /* The official name of this program (e.g., no `g' prefix). */ #define PROGRAM_NAME "sort" @@ -91,6 +124,7 @@ enum static char decimal_point; static int th_sep; /* if CHAR_MAX + 1, then there is no thousands separator */ +static int force_general_numcompare = 0; /* Nonzero if the corresponding locales are hard. */ static bool hard_LC_COLLATE; @@ -109,6 +143,28 @@ static bool hard_LC_TIME; #define NONZERO(x) (x != 0) +/* get a multibyte character's byte length. */ +#define GET_BYTELEN_OF_CHAR(LIM, PTR, MBLENGTH, STATE) \ + do \ + { \ + wchar_t wc; \ + mbstate_t state_bak; \ + \ + state_bak = STATE; \ + mblength = mbrtowc (&wc, PTR, LIM - PTR, &STATE); \ + \ + switch (MBLENGTH) \ + { \ + case (size_t)-1: \ + case (size_t)-2: \ + STATE = state_bak; \ + /* Fall through. */ \ + case 0: \ + MBLENGTH = 1; \ + } \ + } \ + while (0) + /* The kind of blanks for '-b' to skip in various options. */ enum blanktype { bl_start, bl_end, bl_both }; @@ -251,7 +307,8 @@ enum { TAB_DEFAULT = CHAR_MAX + 1 }; /* Tab character separating fields. If TAB_DEFAULT, then fields are separated by the empty string between a non-blank character and a blank character. */ -static int tab = TAB_DEFAULT; +static int tab[MB_LEN_MAX + 1] = { TAB_DEFAULT }; +static size_t tab_length = 1; /* Flag to remove consecutive duplicate lines from the output. Only the last of a sequence of equal lines will be output. */ @@ -384,6 +441,46 @@ struct tempnode }; static struct tempnode *volatile temphead; +/* Fucntion pointers. */ +static void +(*inittables) (void); + +static char * +(* begfield) (const struct line *line, const struct keyfield *key); + +static char * +(* limfield) (const struct line *line, const struct keyfield *key); + +static int +(*getmonth) (const char *s, size_t len); + +static int +(* keycompare) (const struct line *a, const struct line *b); + +/* Test for white space multibyte character. + Set LENGTH the byte length of investigated multibyte character. */ +#if HAVE_MBRTOWC +static int +ismbblank (const char *str, size_t len, size_t *length) +{ + size_t mblength; + wchar_t wc; + mbstate_t state; + + memset (&state, '\0', sizeof(mbstate_t)); + mblength = mbrtowc (&wc, str, len, &state); + + if (mblength == (size_t)-1 || mblength == (size_t)-2) + { + *length = 1; + return 0; + } + + *length = (mblength < 1) ? 1 : mblength; + return iswblank (wc); +} +#endif + /* Clean up any remaining temporary files. */ static void @@ -521,7 +618,7 @@ zaptemp (const char *name) } } -#if HAVE_NL_LANGINFO +#if HAVE_LANGINFO_CODESET static int struct_month_cmp (const void *m1, const void *m2) @@ -536,7 +633,7 @@ struct_month_cmp (const void *m1, const void *m2) /* Initialize the character class tables. */ static void -inittables (void) +inittables_uni (void) { int i; @@ -574,6 +671,64 @@ inittables (void) #endif } +#if HAVE_MBRTOWC +static void +inittables_mb (void) +{ + int i, j, k, l; + char *name, *s; + size_t s_len, mblength; + char mbc[MB_LEN_MAX]; + wchar_t wc, pwc; + mbstate_t state_mb, state_wc; + + for (i = 0; i < MONTHS_PER_YEAR; i++) + { + s = (char *) nl_langinfo (ABMON_1 + i); + s_len = strlen (s); + monthtab[i].name = name = (char *) xmalloc (s_len + 1); + monthtab[i].val = i + 1; + + memset (&state_mb, '\0', sizeof (mbstate_t)); + memset (&state_wc, '\0', sizeof (mbstate_t)); + + for (j = 0; j < s_len;) + { + if (!ismbblank (s + j, s_len - j, &mblength)) + break; + j += mblength; + } + + for (k = 0; j < s_len;) + { + mblength = mbrtowc (&wc, (s + j), (s_len - j), &state_mb); + assert (mblength != (size_t)-1 && mblength != (size_t)-2); + if (mblength == 0) + break; + + pwc = towupper (wc); + if (pwc == wc) + { + memcpy (mbc, s + j, mblength); + j += mblength; + } + else + { + j += mblength; + mblength = wcrtomb (mbc, pwc, &state_wc); + assert (mblength != (size_t)0 && mblength != (size_t)-1); + } + + for (l = 0; l < mblength; l++) + name[k++] = mbc[l]; + } + name[k] = '\0'; + } + qsort ((void *) monthtab, MONTHS_PER_YEAR, + sizeof (struct month), struct_month_cmp); +} +#endif + /* Specify the amount of main memory to use when sorting. */ static void specify_sort_size (char const *s) @@ -784,7 +939,7 @@ buffer_linelim (struct buffer const *buf) by KEY in LINE. */ static char * -begfield (const struct line *line, const struct keyfield *key) +begfield_uni (const struct line *line, const struct keyfield *key) { register char *ptr = line->text, *lim = ptr + line->length - 1; register size_t sword = key->sword; @@ -794,10 +949,10 @@ begfield (const struct line *line, const struct keyfield *key) /* The leading field separator itself is included in a field when -t is absent. */ - if (tab != TAB_DEFAULT) + if (tab[0] != TAB_DEFAULT) while (ptr < lim && sword--) { - while (ptr < lim && *ptr != tab) + while (ptr < lim && *ptr != tab[0]) ++ptr; if (ptr < lim) ++ptr; @@ -825,11 +980,70 @@ begfield (const struct line *line, const struct keyfield *key) return ptr; } +#if HAVE_MBRTOWC +static char * +begfield_mb (const struct line *line, const struct keyfield *key) +{ + int i; + char *ptr = line->text, *lim = ptr + line->length - 1; + size_t sword = key->sword; + size_t schar = key->schar; + size_t mblength; + mbstate_t state; + + memset (&state, '\0', sizeof(mbstate_t)); + + if (tab[0] != TAB_DEFAULT) + while (ptr < lim && sword--) + { + while (ptr < lim && memcmp (ptr, tab, tab_length) != 0) + { + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); + ptr += mblength; + } + if (ptr < lim) + { + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); + ptr += mblength; + } + } + else + while (ptr < lim && sword--) + { + while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength)) + ptr += mblength; + if (ptr < lim) + { + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); + ptr += mblength; + } + while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength)) + ptr += mblength; + } + + if (key->skipsblanks) + while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength)) + ptr += mblength; + + for (i = 0; i < schar; i++) + { + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); + + if (ptr + mblength > lim) + break; + else + ptr += mblength; + } + + return ptr; +} +#endif + /* Return the limit of (a pointer to the first character after) the field in LINE specified by KEY. */ static char * -limfield (const struct line *line, const struct keyfield *key) +limfield_uni (const struct line *line, const struct keyfield *key) { register char *ptr = line->text, *lim = ptr + line->length - 1; register size_t eword = key->eword, echar = key->echar; @@ -842,10 +1056,10 @@ limfield (const struct line *line, const struct keyfield *key) `beginning' is the first character following the delimiting TAB. Otherwise, leave PTR pointing at the first `blank' character after the preceding field. */ - if (tab != TAB_DEFAULT) + if (tab[0] != TAB_DEFAULT) while (ptr < lim && eword--) { - while (ptr < lim && *ptr != tab) + while (ptr < lim && *ptr != tab[0]) ++ptr; if (ptr < lim && (eword | echar)) ++ptr; @@ -891,10 +1105,10 @@ limfield (const struct line *line, const struct keyfield *key) */ /* Make LIM point to the end of (one byte past) the current field. */ - if (tab != TAB_DEFAULT) + if (tab[0] != TAB_DEFAULT) { char *newlim; - newlim = memchr (ptr, tab, lim - ptr); + newlim = memchr (ptr, tab[0], lim - ptr); if (newlim) lim = newlim; } @@ -926,15 +1140,137 @@ limfield (const struct line *line, const struct keyfield *key) return ptr; } +#if HAVE_MBRTOWC +static char * +limfield_mb (const struct line *line, const struct keyfield *key) +{ + char *ptr = line->text, *lim = ptr + line->length - 1; + size_t eword = key->eword, echar = key->echar; + int i; + size_t mblength; + mbstate_t state; + + memset (&state, '\0', sizeof(mbstate_t)); + + if (tab[0]) + while (ptr < lim && eword--) + { + while (ptr < lim && memcmp (ptr, tab, tab_length) != 0) + { + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); + ptr += mblength; + } + if (ptr < lim && (eword | echar)) + { + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); + ptr += mblength; + } + } + else + while (ptr < lim && eword--) + { + while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength)) + ptr += mblength; + if (ptr < lim) + { + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); + ptr += mblength; + } + while (ptr < lim && !ismbblank (ptr, lim - ptr, &mblength)) + ptr += mblength; + } + + +# ifdef POSIX_UNSPECIFIED + /* Make LIM point to the end of (one byte past) the current field. */ + if (tab[0]) + { + char *newlim, *p; + + newlim = NULL; + for (p = ptr; p < lim;) + { + if (memcmp (p, tab, tab_length) == 0) + { + newlim = p; + break; + } + + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); + p += mblength; + } + } + else + { + char *newlim; + newlim = ptr; + + while (newlim < lim && ismbblank (newlim, lim - newlim, &mblength)) + newlim += mblength; + if (ptr < lim) + { + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); + ptr += mblength; + } + while (newlim < lim && !ismbblank (newlim, lim - newlim, &mblength)) + newlim += mblength; + lim = newlim; + } +# endif + + /* If we're skipping leading blanks, don't start counting characters + * until after skipping past any leading blanks. */ + if (key->skipsblanks) + while (ptr < lim && ismbblank (ptr, lim - ptr, &mblength)) + ptr += mblength; + + memset (&state, '\0', sizeof(mbstate_t)); + + /* Advance PTR by ECHAR (if possible), but no further than LIM. */ + for (i = 0; i < echar; i++) + { + GET_BYTELEN_OF_CHAR (lim, ptr, mblength, state); + + if (ptr + mblength > lim) + break; + else + ptr += mblength; + } + + return ptr; +} +#endif + /* Return the number of trailing blanks in FIELD, with LEN bytes. */ static size_t trailing_blanks (char const *field, size_t len) { - size_t i; - for (i = len; 0 < i && blanks[UCHAR (field[i - 1])]; i--) - continue; - return len - i; +#if HAVE_MBRTOWC + if (MB_CUR_MAX > 1) + { + size_t blanks = 0; + + while (len) { + size_t mblength; + if (ismbblank (field, len, &mblength)) + blanks++; + else + blanks = 0; + + field += mblength, len -= mblength; + } + + return blanks; + } + else +#endif + { + size_t i; + for (i = len; 0 < i && blanks[UCHAR (field[i - 1])]; i--) + continue; + return len - i; + } } /* Fill BUF reading from FP, moving buf->left bytes from the end @@ -1019,8 +1355,22 @@ fillbuf (struct buffer *buf, register FILE *fp, char const *file) else { if (key->skipsblanks) - while (blanks[UCHAR (*line_start)]) - line_start++; +#if HAVE_MBRTOWC + { + if (MB_CUR_MAX > 1) + { + size_t mblength; + + while (ismbblank (line_start, ptr - line_start, &mblength)) + line_start += mblength; + } + else +#endif + { + while (blanks[UCHAR (*line_start)]) + line_start++; + } + } line->keybeg = line_start; } if (key->skipeblanks) @@ -1128,13 +1478,32 @@ numcompare (register const char *a, register const char *b) register int tmpa, tmpb, tmp; register size_t log_a, log_b; - tmpa = *a; - tmpb = *b; +#if HAVE_MBRTOWC + if (MB_CUR_MAX > 1) + { + size_t mblength; + size_t alen = strnlen (a, MB_LEN_MAX); + size_t blen = strnlen (b, MB_LEN_MAX); + + while (ismbblank (a, alen, &mblength)) + a += mblength, alen -= mblength; + while (ismbblank (b, blen, &mblength)) + b += mblength, blen -= mblength; - while (blanks[UCHAR (tmpa)]) - tmpa = *++a; - while (blanks[UCHAR (tmpb)]) - tmpb = *++b; + tmpa = *a; + tmpb = *b; + } + else +#endif + { + tmpa = *a; + tmpb = *b; + + while (blanks[UCHAR (tmpa)]) + tmpa = *++a; + while (blanks[UCHAR (tmpb)]) + tmpb = *++b; + } if (tmpa == NEGATION_SIGN) { @@ -1268,15 +1637,60 @@ general_numcompare (const char *sa, const char *sb) /* FIXME: maybe add option to try expensive FP conversion only if A and B can't be compared more cheaply/accurately. */ - char *ea; - char *eb; - double a = strtod (sa, &ea); - double b = strtod (sb, &eb); + char *bufa, *ea; + char *bufb, *eb; + double a; + double b; + + char *p; + struct lconv *lconvp = localeconv (); + size_t thousands_sep_len = strlen (lconvp->thousands_sep); + + bufa = (char *) xmalloc (strlen (sa) + 1); + bufb = (char *) xmalloc (strlen (sb) + 1); + strcpy (bufa, sa); + strcpy (bufb, sb); + + if (force_general_numcompare) + { + while (1) + { + a = strtod (bufa, &ea); + if (memcmp (ea, lconvp->thousands_sep, thousands_sep_len) == 0) + { + for (p = ea; *(p + thousands_sep_len) != '\0'; p++) + *p = *(p + thousands_sep_len); + *p = '\0'; + continue; + } + break; + } + + while (1) + { + b = strtod (bufb, &eb); + if (memcmp (eb, lconvp->thousands_sep, thousands_sep_len) == 0) + { + for (p = eb; *(p + thousands_sep_len) != '\0'; p++) + *p = *(p + thousands_sep_len); + *p = '\0'; + continue; + } + break; + } + } + else + { + a = strtod (bufa, &ea); + b = strtod (bufb, &eb); + } /* Put conversion errors at the start of the collating sequence. */ - if (sa == ea) - return sb == eb ? 0 : -1; - if (sb == eb) + free (bufa); + free (bufb); + if (bufa == ea) + return bufb == eb ? 0 : -1; + if (bufb == eb) return 1; /* Sort numbers in the usual way, where -0 == +0. Put NaNs after @@ -1294,7 +1708,7 @@ general_numcompare (const char *sa, const char *sb) Return 0 if the name in S is not recognized. */ static int -getmonth (const char *s, size_t len) +getmonth_uni (const char *s, size_t len) { char *month; register size_t i; @@ -1332,11 +1746,79 @@ getmonth (const char *s, size_t len) return result; } +#if HAVE_MBRTOWC +static int +getmonth_mb (const char *s, size_t len) +{ + char *month; + register size_t i; + register int lo = 0, hi = MONTHS_PER_YEAR, result; + char *tmp; + size_t wclength, mblength; + const char **pp; + const wchar_t **wpp; + wchar_t *month_wcs; + mbstate_t state; + + while (len > 0 && ismbblank (s, len, &mblength)) + { + s += mblength; + len -= mblength; + } + + if (len == 0) + return 0; + + month = (char *) alloca (len + 1); + + tmp = (char *) alloca (len + 1); + memcpy (tmp, s, len); + tmp[len] = '\0'; + pp = (const char **)&tmp; + month_wcs = (wchar_t *) alloca ((len + 1) * sizeof (wchar_t)); + memset (&state, '\0', sizeof(mbstate_t)); + + wclength = mbsrtowcs (month_wcs, pp, len + 1, &state); + assert (wclength != (size_t)-1 && *pp == NULL); + + for (i = 0; i < wclength; i++) + { + month_wcs[i] = towupper(month_wcs[i]); + if (iswblank (month_wcs[i])) + { + month_wcs[i] = L'\0'; + break; + } + } + + wpp = (const wchar_t **)&month_wcs; + + mblength = wcsrtombs (month, wpp, len + 1, &state); + assert (mblength != (-1) && *wpp == NULL); + + do + { + int ix = (lo + hi) / 2; + + if (strncmp (month, monthtab[ix].name, strlen (monthtab[ix].name)) < 0) + hi = ix; + else + lo = ix; + } + while (hi - lo > 1); + + result = (!strncmp (month, monthtab[lo].name, strlen (monthtab[lo].name)) + ? monthtab[lo].val : 0); + + return result; +} +#endif + /* Compare two lines A and B trying every key in sequence until there are no more keys or a difference is found. */ static int -keycompare (const struct line *a, const struct line *b) +keycompare_uni (const struct line *a, const struct line *b) { struct keyfield const *key = keylist; @@ -1507,6 +1989,187 @@ keycompare (const struct line *a, const struct line *b) return key->reverse ? -diff : diff; } +#if HAVE_MBRTOWC +static int +keycompare_mb (const struct line *a, const struct line *b) +{ + struct keyfield *key = keylist; + + /* For the first iteration only, the key positions have been + precomputed for us. */ + char *texta = a->keybeg; + char *textb = b->keybeg; + char *lima = a->keylim; + char *limb = b->keylim; + + size_t mblength_a, mblength_b; + wchar_t wc_a, wc_b; + mbstate_t state_a, state_b; + + int diff; + + memset (&state_a, '\0', sizeof(mbstate_t)); + memset (&state_b, '\0', sizeof(mbstate_t)); + + for (;;) + { + unsigned char *translate = (unsigned char *) key->translate; + bool const *ignore = key->ignore; + + /* Find the lengths. */ + size_t lena = lima <= texta ? 0 : lima - texta; + size_t lenb = limb <= textb ? 0 : limb - textb; + + if (key->skipeblanks) + { + char *a_end = texta + lena; + char *b_end = textb + lenb; + a_end -= trailing_blanks (texta, lena); + b_end -= trailing_blanks (textb, lenb); + lena = a_end - texta; + lenb = b_end - textb; + } + + /* Actually compare the fields. */ + if (key->numeric | key->general_numeric) + { + char savea = *lima, saveb = *limb; + + *lima = *limb = '\0'; + if (force_general_numcompare) + diff = general_numcompare (texta, textb); + else + diff = ((key->numeric ? numcompare : general_numcompare) + (texta, textb)); + *lima = savea, *limb = saveb; + } + else if (key->month) + diff = getmonth (texta, lena) - getmonth (textb, lenb); + else + { + if (ignore || translate) + { + char *copy_a = (char *) alloca (lena + 1 + lenb + 1); + char *copy_b = copy_a + lena + 1; + size_t new_len_a, new_len_b; + size_t i, j; + + /* Ignore and/or translate chars before comparing. */ +# define IGNORE_CHARS(NEW_LEN, LEN, TEXT, COPY, WC, MBLENGTH, STATE) \ + do \ + { \ + wchar_t uwc; \ + char mbc[MB_LEN_MAX]; \ + mbstate_t state_wc; \ + \ + for (NEW_LEN = i = 0; i < LEN;) \ + { \ + mbstate_t state_bak; \ + \ + state_bak = STATE; \ + MBLENGTH = mbrtowc (&WC, TEXT + i, LEN - i, &STATE); \ + \ + if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1 \ + || MBLENGTH == 0) \ + { \ + if (MBLENGTH == (size_t)-2 || MBLENGTH == (size_t)-1) \ + STATE = state_bak; \ + if (!ignore) \ + COPY[NEW_LEN++] = TEXT[i++]; \ + continue; \ + } \ + \ + if (ignore) \ + { \ + if ((ignore == nonprinting && !iswprint (WC)) \ + || (ignore == nondictionary \ + && !iswalnum (WC) && !iswblank (WC))) \ + { \ + i += MBLENGTH; \ + continue; \ + } \ + } \ + \ + if (translate) \ + { \ + \ + uwc = toupper(WC); \ + if (WC == uwc) \ + { \ + memcpy (mbc, TEXT + i, MBLENGTH); \ + i += MBLENGTH; \ + } \ + else \ + { \ + i += MBLENGTH; \ + WC = uwc; \ + memset (&state_wc, '\0', sizeof (mbstate_t)); \ + \ + MBLENGTH = wcrtomb (mbc, WC, &state_wc); \ + assert (MBLENGTH != (size_t)-1 && MBLENGTH != 0); \ + } \ + \ + for (j = 0; j < MBLENGTH; j++) \ + COPY[NEW_LEN++] = mbc[j]; \ + } \ + else \ + for (j = 0; j < MBLENGTH; j++) \ + COPY[NEW_LEN++] = TEXT[i++]; \ + } \ + COPY[NEW_LEN] = '\0'; \ + } \ + while (0) + IGNORE_CHARS (new_len_a, lena, texta, copy_a, + wc_a, mblength_a, state_a); + IGNORE_CHARS (new_len_b, lenb, textb, copy_b, + wc_b, mblength_b, state_b); + diff = xmemcoll (copy_a, new_len_a, copy_b, new_len_b); + } + else if (lena == 0) + diff = - NONZERO (lenb); + else if (lenb == 0) + goto greater; + else + diff = xmemcoll (texta, lena, textb, lenb); + } + + if (diff) + goto not_equal; + + key = key->next; + if (! key) + break; + + /* Find the beginning and limit of the next field. */ + if (key->eword != -1) + lima = limfield (a, key), limb = limfield (b, key); + else + lima = a->text + a->length - 1, limb = b->text + b->length - 1; + + if (key->sword != -1) + texta = begfield (a, key), textb = begfield (b, key); + else + { + texta = a->text, textb = b->text; + if (key->skipsblanks) + { + while (texta < lima && ismbblank (texta, lima - texta, &mblength_a)) + texta += mblength_a; + while (textb < limb && ismbblank (textb, limb - textb, &mblength_b)) + textb += mblength_b; + } + } + } + + return 0; + +greater: + diff = 1; +not_equal: + return key->reverse ? -diff : diff; +} +#endif + /* Compare two lines A and B, returning negative, zero, or positive depending on whether A compares less than, equal to, or greater than B. */ @@ -2252,20 +2915,44 @@ main (int argc, char **argv) { struct lconv const *lconvp = localeconv (); - /* If the locale doesn't define a decimal point, or if the decimal - point is multibyte, use the C decimal point. We don't support - multibyte decimal points yet. */ decimal_point = *lconvp->decimal_point; if (! decimal_point || lconvp->decimal_point[1]) - decimal_point = C_DECIMAL_POINT; + { + decimal_point = C_DECIMAL_POINT; + if (lconvp->decimal_point[0] && lconvp->decimal_point[1]) + force_general_numcompare = 1; + } /* We don't support multibyte thousands separators yet. */ th_sep = *lconvp->thousands_sep; if (! th_sep || lconvp->thousands_sep[1]) - th_sep = CHAR_MAX + 1; + { + th_sep = CHAR_MAX + 1; + if (lconvp->thousands_sep[0] && lconvp->thousands_sep[1]) + force_general_numcompare = 1; + } } #endif +#if HAVE_MBRTOWC + if (MB_CUR_MAX > 1) + { + inittables = inittables_mb; + begfield = begfield_mb; + limfield = limfield_mb; + getmonth = getmonth_mb; + keycompare = keycompare_mb; + } + else +#endif + { + inittables = inittables_uni; + begfield = begfield_uni; + limfield = limfield_uni; + keycompare = keycompare_uni; + getmonth = getmonth_uni; + } + have_read_stdin = false; inittables (); @@ -2462,13 +3149,47 @@ main (int argc, char **argv) case 't': { - int newtab = optarg[0]; - if (! newtab) + char newtab[MB_LEN_MAX + 1]; + strncpy (newtab, optarg, MB_LEN_MAX); + if (! newtab[0]) error (SORT_FAILURE, 0, _("empty tab")); +#if HAVE_MBRTOWC + if (MB_CUR_MAX > 1) + { + wchar_t wc; + mbstate_t state; + size_t newtab_length, i; + + memset (&state, '\0', sizeof (mbstate_t)); + newtab_length = mbrtowc (&wc, newtab, strnlen (newtab, MB_LEN_MAX), &state); + switch (newtab_length) + { + case (size_t) -1: + case (size_t) -2: + case 0: + newtab_length = 1; + } + + if (optarg[tab_length]) + { + /* Provoke with `sort -txx'. Complain about + "multi-character tab" instead of "multibyte tab", so + that the diagnostic's wording does not need to be + changed once multibyte characters are supported. */ + error (SORT_FAILURE, 0, _("multi-character tab `%s'"), + optarg); + } + + for (i = 0; i < newtab_length; i++) + tab[i] = newtab[i]; + } + else +#endif + if (optarg[1]) { if (strcmp (optarg, "\\0") == 0) - newtab = '\0'; + newtab[0] = '\0'; else { /* Provoke with `sort -txx'. Complain about @@ -2479,9 +3200,9 @@ main (int argc, char **argv) optarg); } } - if (tab != TAB_DEFAULT && tab != newtab) + if (tab[0] != TAB_DEFAULT && tab[0] != newtab[0]) error (SORT_FAILURE, 0, _("incompatible tabs")); - tab = newtab; + tab[0] = newtab[0]; } break; |