diff options
-rw-r--r-- | lib/libc/locale/Symbol.map | 3 | ||||
-rw-r--r-- | lib/libc/locale/collate.c | 760 | ||||
-rw-r--r-- | lib/libc/locale/collate.h | 106 | ||||
-rw-r--r-- | lib/libc/locale/collcmp.c | 9 | ||||
-rw-r--r-- | lib/libc/locale/setrunelocale.c | 6 | ||||
-rw-r--r-- | lib/libc/string/strcoll.c | 112 | ||||
-rw-r--r-- | lib/libc/string/strxfrm.c | 66 | ||||
-rw-r--r-- | lib/libc/string/wcsxfrm.c | 84 |
8 files changed, 775 insertions, 371 deletions
diff --git a/lib/libc/locale/Symbol.map b/lib/libc/locale/Symbol.map index b2f2a35..87f2221 100644 --- a/lib/libc/locale/Symbol.map +++ b/lib/libc/locale/Symbol.map @@ -214,4 +214,7 @@ FBSDprivate_1.0 { __detect_path_locale; __collate_load_error; __collate_range_cmp; + __collate_load_tables_l; + __collate_lookup; + }; diff --git a/lib/libc/locale/collate.c b/lib/libc/locale/collate.c index 56513f4..0e43e1f 100644 --- a/lib/libc/locale/collate.c +++ b/lib/libc/locale/collate.c @@ -1,4 +1,5 @@ /*- + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua> * at Electronni Visti IA, Kiev, Ukraine. * All rights reserved. @@ -28,50 +29,39 @@ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. + * + * Adapted to xlocale by John Marino <draco@marino.st> */ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); #include "namespace.h" -#include <arpa/inet.h> #include <stdio.h> #include <stdlib.h> #include <string.h> +#include <wchar.h> #include <errno.h> #include <unistd.h> -#include <sysexits.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/mman.h> #include "un-namespace.h" #include "collate.h" #include "setlocale.h" #include "ldpart.h" -#include "libc_private.h" - -/* - * To avoid modifying the original (single-threaded) code too much, we'll just - * define the old globals as fields inside the table. - * - * We also modify the collation table test functions to search the thread-local - * table first and the global table second. 
- */ -#define __collate_substitute_nontrivial (table->__collate_substitute_nontrivial) -#define __collate_substitute_table_ptr (table->__collate_substitute_table_ptr) -#define __collate_char_pri_table_ptr (table->__collate_char_pri_table_ptr) -#define __collate_chain_pri_table (table->__collate_chain_pri_table) -int __collate_load_error; - - struct xlocale_collate __xlocale_global_collate = { - {{0}, "C"}, 1, 0 + {{0}, "C"}, 1, 0, 0, 0 }; - struct xlocale_collate __xlocale_C_collate = { - {{0}, "C"}, 1, 0 +struct xlocale_collate __xlocale_C_collate = { + {{0}, "C"}, 1, 0, 0, 0 }; -void __collate_err(int ex, const char *f) __dead2; +#include "libc_private.h" int __collate_load_tables_l(const char *encoding, struct xlocale_collate *table); @@ -80,14 +70,14 @@ static void destruct_collate(void *t) { struct xlocale_collate *table = t; - if (__collate_chain_pri_table) { - free(__collate_chain_pri_table); + if (table->map && (table->maplen > 0)) { + (void) munmap(table->map, table->maplen); } free(t); } void * -__collate_load(const char *encoding, locale_t unused) +__collate_load(const char *encoding, __unused locale_t unused) { if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) { return &__xlocale_C_collate; @@ -110,18 +100,19 @@ int __collate_load_tables(const char *encoding) { int ret = __collate_load_tables_l(encoding, &__xlocale_global_collate); - __collate_load_error = __xlocale_global_collate.__collate_load_error; return ret; } int __collate_load_tables_l(const char *encoding, struct xlocale_collate *table) { - FILE *fp; - int i, saverr, chains; - uint32_t u32; - char strbuf[STR_LEN], buf[PATH_MAX]; - void *TMP_substitute_table, *TMP_char_pri_table, *TMP_chain_pri_table; + int i, chains, z; + char buf[PATH_MAX]; + char *TMP; + char *map; + collate_info_t *info; + struct stat sbuf; + int fd; /* 'encoding' must be already checked. 
*/ if (strcmp(encoding, "C") == 0 || strcmp(encoding, "POSIX") == 0) { @@ -129,217 +120,582 @@ __collate_load_tables_l(const char *encoding, struct xlocale_collate *table) return (_LDP_CACHE); } - /* 'PathLocale' must be already set & checked. */ - /* Range checking not needed, encoding has fixed size */ - (void)strcpy(buf, _PathLocale); - (void)strcat(buf, "/"); - (void)strcat(buf, encoding); - (void)strcat(buf, "/LC_COLLATE"); - if ((fp = fopen(buf, "re")) == NULL) - return (_LDP_ERROR); + (void) snprintf(buf, sizeof (buf), "%s/%s/LC_COLLATE", + _PathLocale, encoding); - if (fread(strbuf, sizeof(strbuf), 1, fp) != 1) { - saverr = errno; - (void)fclose(fp); - errno = saverr; + if ((fd = _open(buf, O_RDONLY)) < 0) return (_LDP_ERROR); - } - chains = -1; - if (strcmp(strbuf, COLLATE_VERSION) == 0) - chains = 0; - else if (strcmp(strbuf, COLLATE_VERSION1_2) == 0) - chains = 1; - if (chains < 0) { - (void)fclose(fp); - errno = EFTYPE; + if (_fstat(fd, &sbuf) < 0) { + (void) _close(fd); return (_LDP_ERROR); } - if (chains) { - if (fread(&u32, sizeof(u32), 1, fp) != 1) { - saverr = errno; - (void)fclose(fp); - errno = saverr; - return (_LDP_ERROR); - } - if ((chains = (int)ntohl(u32)) < 1) { - (void)fclose(fp); - errno = EFTYPE; - return (_LDP_ERROR); - } - } else - chains = TABLE_SIZE; - - if ((TMP_substitute_table = - malloc(sizeof(__collate_substitute_table))) == NULL) { - saverr = errno; - (void)fclose(fp); - errno = saverr; + if (sbuf.st_size < (COLLATE_STR_LEN + sizeof (info))) { + (void) _close(fd); + errno = EINVAL; return (_LDP_ERROR); } - if ((TMP_char_pri_table = - malloc(sizeof(__collate_char_pri_table))) == NULL) { - saverr = errno; - free(TMP_substitute_table); - (void)fclose(fp); - errno = saverr; + map = mmap(NULL, sbuf.st_size, PROT_READ, MAP_PRIVATE, fd, 0); + (void) _close(fd); + if ((TMP = map) == NULL) { return (_LDP_ERROR); } - if ((TMP_chain_pri_table = - malloc(sizeof(*__collate_chain_pri_table) * chains)) == NULL) { - saverr = errno; - 
free(TMP_substitute_table); - free(TMP_char_pri_table); - (void)fclose(fp); - errno = saverr; + + if (strncmp(TMP, COLLATE_VERSION, COLLATE_STR_LEN) != 0) { + (void) munmap(map, sbuf.st_size); + errno = EINVAL; return (_LDP_ERROR); } + TMP += COLLATE_STR_LEN; -#define FREAD(a, b, c, d) \ -{ \ - if (fread(a, b, c, d) != c) { \ - saverr = errno; \ - free(TMP_substitute_table); \ - free(TMP_char_pri_table); \ - free(TMP_chain_pri_table); \ - (void)fclose(d); \ - errno = saverr; \ - return (_LDP_ERROR); \ - } \ -} + info = (void *)TMP; + TMP += sizeof (*info); - FREAD(TMP_substitute_table, sizeof(__collate_substitute_table), 1, fp); - FREAD(TMP_char_pri_table, sizeof(__collate_char_pri_table), 1, fp); - FREAD(TMP_chain_pri_table, - sizeof(*__collate_chain_pri_table), chains, fp); - (void)fclose(fp); - - if (__collate_substitute_table_ptr != NULL) - free(__collate_substitute_table_ptr); - __collate_substitute_table_ptr = TMP_substitute_table; - if (__collate_char_pri_table_ptr != NULL) - free(__collate_char_pri_table_ptr); - __collate_char_pri_table_ptr = TMP_char_pri_table; - for (i = 0; i < UCHAR_MAX + 1; i++) { - __collate_char_pri_table[i].prim = - ntohl(__collate_char_pri_table[i].prim); - __collate_char_pri_table[i].sec = - ntohl(__collate_char_pri_table[i].sec); + if ((info->directive_count < 1) || + (info->directive_count >= COLL_WEIGHTS_MAX) || + ((chains = info->chain_count) < 0)) { + (void) munmap(map, sbuf.st_size); + errno = EINVAL; + return (_LDP_ERROR); } - if (__collate_chain_pri_table != NULL) - free(__collate_chain_pri_table); - __collate_chain_pri_table = TMP_chain_pri_table; - for (i = 0; i < chains; i++) { - __collate_chain_pri_table[i].prim = - ntohl(__collate_chain_pri_table[i].prim); - __collate_chain_pri_table[i].sec = - ntohl(__collate_chain_pri_table[i].sec); + + i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) + + (sizeof (collate_chain_t) * chains) + + (sizeof (collate_large_t) * info->large_count); + for (z = 0; z < (info->directive_count); 
z++) { + i += sizeof (collate_subst_t) * info->subst_count[z]; + } + if (i != (sbuf.st_size - (TMP - map))) { + (void) munmap(map, sbuf.st_size); + errno = EINVAL; + return (_LDP_ERROR); } - __collate_substitute_nontrivial = 0; - for (i = 0; i < UCHAR_MAX + 1; i++) { - if (__collate_substitute_table[i][0] != i || - __collate_substitute_table[i][1] != 0) { - __collate_substitute_nontrivial = 1; - break; + + table->char_pri_table = (void *)TMP; + TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1); + + for (z = 0; z < info->directive_count; z++) { + if (info->subst_count[z] > 0) { + table->subst_table[z] = (void *)TMP; + TMP += info->subst_count[z] * sizeof (collate_subst_t); + } else { + table->subst_table[z] = NULL; } } + + if (chains > 0) { + table->chain_pri_table = (void *)TMP; + TMP += chains * sizeof (collate_chain_t); + } else + table->chain_pri_table = NULL; + if (info->large_count > 0) + table->large_pri_table = (void *)TMP; + else + table->large_pri_table = NULL; + + table->info = info; table->__collate_load_error = 0; return (_LDP_LOADED); } -u_char * -__collate_substitute(struct xlocale_collate *table, const u_char *s) +/* + * Note: for performance reasons, we have expanded bsearch here. This avoids + * function call overhead with each comparison. 
+ */ + +static int32_t * +substsearch(struct xlocale_collate *table, const wchar_t key, int pass) +{ + collate_subst_t *p; + int n = table->info->subst_count[pass]; + + if (n == 0) + return (NULL); + + if (pass >= table->info->directive_count) + return (NULL); + + if (!(key & COLLATE_SUBST_PRIORITY)) + return (NULL); + + p = table->subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY); + return (p->pri); +} + +static collate_chain_t * +chainsearch(struct xlocale_collate *table, const wchar_t *key, int *len) { - int dest_len, len, nlen; - int delta = strlen(s); - u_char *dest_str = NULL; - - if (s == NULL || *s == '\0') - return (__collate_strdup("")); - delta += delta / 8; - dest_str = malloc(dest_len = delta); - if (dest_str == NULL) - __collate_err(EX_OSERR, __func__); - len = 0; - while (*s) { - nlen = len + strlen(__collate_substitute_table[*s]); - if (dest_len <= nlen) { - dest_str = reallocf(dest_str, dest_len = nlen + delta); - if (dest_str == NULL) - __collate_err(EX_OSERR, __func__); + int low; + int high; + int next, compar, l; + collate_chain_t *p; + collate_chain_t *tab; + + if (table->info->chain_count == 0) + return (NULL); + + low = 0; + high = table->info->chain_count - 1; + tab = table->chain_pri_table; + + while (low <= high) { + next = (low + high) / 2; + p = tab + next; + compar = *key - *p->str; + if (compar == 0) { + l = wcsnlen(p->str, COLLATE_STR_LEN); + compar = wcsncmp(key, p->str, l); + if (compar == 0) { + *len = l; + return (p); + } } - (void)strcpy(dest_str + len, __collate_substitute_table[*s++]); - len = nlen; + if (compar > 0) + low = next + 1; + else + high = next - 1; } - return (dest_str); + return (NULL); +} + +static collate_large_t * +largesearch(struct xlocale_collate *table, const wchar_t key) +{ + int low = 0; + int high = table->info->large_count - 1; + int next, compar; + collate_large_t *p; + collate_large_t *tab = table->large_pri_table; + + if (table->info->large_count == 0) + return (NULL); + + while (low <= high) { + 
next = (low + high) / 2; + p = tab + next; + compar = key - p->val; + if (compar == 0) + return (p); + if (compar > 0) + low = next + 1; + else + high = next - 1; + } + return (NULL); } void -__collate_lookup(struct xlocale_collate *table, const u_char *t, int *len, int *prim, int *sec) +_collate_lookup(struct xlocale_collate *table, const wchar_t *t, int *len, + int *pri, int which, const int **state) { - struct __collate_st_chain_pri *p2; + collate_chain_t *p2; + collate_large_t *match; + int p, l; + const int *sptr; + + /* + * If this is the "last" pass for the UNDEFINED, then + * we just return the priority itself. + */ + if (which >= table->info->directive_count) { + *pri = *t; + *len = 1; + *state = NULL; + return; + } + /* + * If we have remaining substitution data from a previous + * call, consume it first. + */ + if ((sptr = *state) != NULL) { + *pri = *sptr; + sptr++; + *state = *sptr ? sptr : NULL; + *len = 0; + return; + } + + /* No active substitutions */ *len = 1; - *prim = *sec = 0; - for (p2 = __collate_chain_pri_table; p2->str[0] != '\0'; p2++) { - if (*t == p2->str[0] && - strncmp(t, p2->str, strlen(p2->str)) == 0) { - *len = strlen(p2->str); - *prim = p2->prim; - *sec = p2->sec; - return; + + /* + * Check for composites such as diphthongs that collate as a + * single element (aka chains or collating-elements). + */ + if (((p2 = chainsearch(table, t, &l)) != NULL) && + ((p = p2->pri[which]) >= 0)) { + + *len = l; + *pri = p; + + } else if (*t <= UCHAR_MAX) { + + /* + * Character is a small (8-bit) character. + * We just look these up directly for speed. + */ + *pri = table->char_pri_table[*t].pri[which]; + + } else if ((table->info->large_count > 0) && + ((match = largesearch(table, *t)) != NULL)) { + + /* + * Character was found in the extended table. + */ + *pri = match->pri.pri[which]; + + } else { + /* + * Character lacks a specific definition. 
+ */ + if (table->info->directive[which] & DIRECTIVE_UNDEFINED) { + /* Mask off sign bit to prevent ordering confusion. */ + *pri = (*t & COLLATE_MAX_PRIORITY); + } else { + *pri = table->info->undef_pri[which]; + } + /* No substitutions for undefined characters! */ + return; + } + + /* + * Try substituting (expanding) the character. We are + * currently doing this *after* the chain compression. I + * think it should not matter, but this way might be slightly + * faster. + * + * We do this after the priority search, as this will help us + * to identify a single key value. In order for this to work, + * it's important that the priority assigned to a given element + * to be substituted be unique for that level. The localedef + * code ensures this for us. + */ + if ((sptr = substsearch(table, *pri, which)) != NULL) { + if ((*pri = *sptr) != 0) { + sptr++; + *state = *sptr ? sptr : NULL; } } - *prim = __collate_char_pri_table[*t].prim; - *sec = __collate_char_pri_table[*t].sec; + } -u_char * -__collate_strdup(u_char *s) +/* + * This is the meaty part of wcsxfrm & strxfrm. Note that it does + * NOT NULL terminate. That is left to the caller. 
+ */ +size_t +_collate_wxfrm(struct xlocale_collate *table, const wchar_t *src, wchar_t *xf, + size_t room) { - u_char *t = strdup(s); + int pri; + int len; + const wchar_t *t; + wchar_t *tr = NULL; + int direc; + int pass; + const int32_t *state; + size_t want = 0; + size_t need = 0; + + for (pass = 0; pass <= table->info->directive_count; pass++) { + + state = NULL; + + if (pass != 0) { + /* insert level separator from the previous pass */ + if (room) { + *xf++ = 1; + room--; + } + want++; + } - if (t == NULL) - __collate_err(EX_OSERR, __func__); - return (t); + /* special pass for undefined */ + if (pass == table->info->directive_count) { + direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; + } else { + direc = table->info->directive[pass]; + } + + t = src; + + if (direc & DIRECTIVE_BACKWARD) { + wchar_t *bp, *fp, c; + if (tr) + free(tr); + if ((tr = wcsdup(t)) == NULL) { + errno = ENOMEM; + goto fail; + } + bp = tr; + fp = tr + wcslen(tr) - 1; + while (bp < fp) { + c = *bp; + *bp++ = *fp; + *fp-- = c; + } + t = (const wchar_t *)tr; + } + + if (direc & DIRECTIVE_POSITION) { + while (*t || state) { + _collate_lookup(table, t, &len, &pri, pass, &state); + t += len; + if (pri <= 0) { + if (pri < 0) { + errno = EINVAL; + goto fail; + } + pri = COLLATE_MAX_PRIORITY; + } + if (room) { + *xf++ = pri; + room--; + } + want++; + need = want; + } + } else { + while (*t || state) { + _collate_lookup(table, t, &len, &pri, pass, &state); + t += len; + if (pri <= 0) { + if (pri < 0) { + errno = EINVAL; + goto fail; + } + continue; + } + if (room) { + *xf++ = pri; + room--; + } + want++; + need = want; + } + } + } + if (tr) + free(tr); + return (need); + +fail: + if (tr) + free(tr); + return ((size_t)(-1)); } -void -__collate_err(int ex, const char *f) +/* + * In the non-POSIX case, we transform each character into a string of + * characters representing the character's priority. Since char is usually + * signed, we are limited by 7 bits per byte. 
To avoid zero, we need to add + * XFRM_OFFSET, so we can't use a full 7 bits. For simplicity, we choose 6 + * bits per byte. + * + * It turns out that we sometimes have real priorities that are + * 31-bits wide. (But: be careful using priorities where the high + * order bit is set -- i.e. the priority is negative. The sort order + * may be surprising!) + * + * TODO: This would be a good area to optimize somewhat. It turns out + * that real priorities *except for the last UNDEFINED pass* are generally + * very small. We need the localedef code to precalculate the max + * priority for us, and ideally also give us a mask, and then we could + * severely limit what we expand to. + */ +#define XFRM_BYTES 6 +#define XFRM_OFFSET ('0') /* make all printable characters */ +#define XFRM_SHIFT 6 +#define XFRM_MASK ((1 << XFRM_SHIFT) - 1) +#define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */ + +static int +xfrm(struct xlocale_collate *table, unsigned char *p, int pri, int pass) { - const char *s; - int serrno = errno; - - s = _getprogname(); - _write(STDERR_FILENO, s, strlen(s)); - _write(STDERR_FILENO, ": ", 2); - s = f; - _write(STDERR_FILENO, s, strlen(s)); - _write(STDERR_FILENO, ": ", 2); - s = strerror(serrno); - _write(STDERR_FILENO, s, strlen(s)); - _write(STDERR_FILENO, "\n", 1); - exit(ex); + /* we use unsigned to ensure zero fill on right shift */ + uint32_t val = (uint32_t)table->info->pri_count[pass]; + int nc = 0; + + while (val) { + *p = (pri & XFRM_MASK) + XFRM_OFFSET; + pri >>= XFRM_SHIFT; + val >>= XFRM_SHIFT; + p++; + nc++; + } + return (nc); } -#ifdef COLLATE_DEBUG -void -__collate_print_tables() +size_t +_collate_sxfrm(struct xlocale_collate *table, const wchar_t *src, char *xf, + size_t room) +{ + int pri; + int len; + const wchar_t *t; + wchar_t *tr = NULL; + int direc; + int pass; + const int32_t *state; + size_t want = 0; + size_t need = 0; + int b; + uint8_t buf[XFRM_BYTES]; + + for (pass = 0; pass <= table->info->directive_count; pass++) 
{ + + state = NULL; + + if (pass != 0) { + /* insert level separator from the previous pass */ + if (room) { + *xf++ = XFRM_SEP; + room--; + } + want++; + } + + /* special pass for undefined */ + if (pass == table->info->directive_count) { + direc = DIRECTIVE_FORWARD | DIRECTIVE_UNDEFINED; + } else { + direc = table->info->directive[pass]; + } + + t = src; + + if (direc & DIRECTIVE_BACKWARD) { + wchar_t *bp, *fp, c; + if (tr) + free(tr); + if ((tr = wcsdup(t)) == NULL) { + errno = ENOMEM; + goto fail; + } + bp = tr; + fp = tr + wcslen(tr) - 1; + while (bp < fp) { + c = *bp; + *bp++ = *fp; + *fp-- = c; + } + t = (const wchar_t *)tr; + } + + if (direc & DIRECTIVE_POSITION) { + while (*t || state) { + + _collate_lookup(table, t, &len, &pri, pass, &state); + t += len; + if (pri <= 0) { + if (pri < 0) { + errno = EINVAL; + goto fail; + } + pri = COLLATE_MAX_PRIORITY; + } + + b = xfrm(table, buf, pri, pass); + want += b; + if (room) { + while (b) { + b--; + if (room) { + *xf++ = buf[b]; + room--; + } + } + } + need = want; + } + } else { + while (*t || state) { + _collate_lookup(table, t, &len, &pri, pass, &state); + t += len; + if (pri <= 0) { + if (pri < 0) { + errno = EINVAL; + goto fail; + } + continue; + } + + b = xfrm(table, buf, pri, pass); + want += b; + if (room) { + + while (b) { + b--; + if (room) { + *xf++ = buf[b]; + room--; + } + } + } + need = want; + } + } + } + if (tr) + free(tr); + return (need); + +fail: + if (tr) + free(tr); + return ((size_t)(-1)); +} + +/* + * __collate_equiv_value returns the primary collation value for the given + * collating symbol specified by str and len. Zero or negative is returned + * if the collating symbol was not found. This function is used by bracket + * code in the TRE regex library. 
+ */ +int +__collate_equiv_value(locale_t locale, const wchar_t *str, size_t len) { - int i; - struct __collate_st_chain_pri *p2; - - printf("Substitute table:\n"); - for (i = 0; i < UCHAR_MAX + 1; i++) - if (i != *__collate_substitute_table[i]) - printf("\t'%c' --> \"%s\"\n", i, - __collate_substitute_table[i]); - printf("Chain priority table:\n"); - for (p2 = __collate_chain_pri_table; p2->str[0] != '\0'; p2++) - printf("\t\"%s\" : %d %d\n", p2->str, p2->prim, p2->sec); - printf("Char priority table:\n"); - for (i = 0; i < UCHAR_MAX + 1; i++) - printf("\t'%c' : %d %d\n", i, __collate_char_pri_table[i].prim, - __collate_char_pri_table[i].sec); + int32_t e; + + if (len < 1 || len >= COLLATE_STR_LEN) + return (-1); + + FIX_LOCALE(locale); + struct xlocale_collate *table = + (struct xlocale_collate*)locale->components[XLC_COLLATE]; + + if (table->__collate_load_error) + return ((len == 1 && *str <= UCHAR_MAX) ? *str : -1); + + if (len == 1) { + e = -1; + if (*str <= UCHAR_MAX) + e = table->char_pri_table[*str].pri[0]; + else if (table->info->large_count > 0) { + collate_large_t *match_large; + match_large = largesearch(table, *str); + if (match_large) + e = match_large->pri.pri[0]; + } + if (e == 0) + return (1); + return (e > 0 ? e : 0); + } + if (table->info->chain_count > 0) { + wchar_t name[COLLATE_STR_LEN]; + collate_chain_t *match_chain; + int clen; + + wcsncpy (name, str, len); + name[len] = 0; + match_chain = chainsearch(table, name, &clen); + if (match_chain) { + e = match_chain->pri[0]; + if (e == 0) + return (1); + return (e < 0 ? 
-e : e); + } + } + return (0); } -#endif diff --git a/lib/libc/locale/collate.h b/lib/libc/locale/collate.h index ad034d4..31aa57a 100644 --- a/lib/libc/locale/collate.h +++ b/lib/libc/locale/collate.h @@ -40,42 +40,98 @@ #include <limits.h> #include "xlocale_private.h" -#define STR_LEN 10 -#define TABLE_SIZE 100 -#define COLLATE_VERSION "1.0\n" -#define COLLATE_VERSION1_2 "1.2\n" +/* + * Work around buildworld bootstrapping from older systems whose limits.h + * sets COLL_WEIGHTS_MAX to 0. + */ +#if COLL_WEIGHTS_MAX == 0 +#undef COLL_WEIGHTS_MAX +#define COLL_WEIGHTS_MAX 10 +#endif -struct __collate_st_char_pri { - int prim, sec; -}; -struct __collate_st_chain_pri { - u_char str[STR_LEN]; - int prim, sec; -}; +#define COLLATE_STR_LEN 24 /* should be 64-bit multiple */ +#define COLLATE_VERSION "BSD 1.0\n" + +#define COLLATE_MAX_PRIORITY (0x7fffffff) /* max signed value */ +#define COLLATE_SUBST_PRIORITY (0x40000000) /* bit indicates subst table */ -#define __collate_substitute_table (*__collate_substitute_table_ptr) -#define __collate_char_pri_table (*__collate_char_pri_table_ptr) +#define DIRECTIVE_UNDEF 0x00 +#define DIRECTIVE_FORWARD 0x01 +#define DIRECTIVE_BACKWARD 0x02 +#define DIRECTIVE_POSITION 0x04 +#define DIRECTIVE_UNDEFINED 0x08 /* special last weight for UNDEFINED */ + +#define DIRECTIVE_DIRECTION_MASK (DIRECTIVE_FORWARD | DIRECTIVE_BACKWARD) + +/* + * The collate file format is as follows: + * + * char version[COLLATE_STR_LEN]; // must be COLLATE_VERSION + * collate_info_t info; // see below, includes padding + * collate_char_pri_t char_data[256]; // 8 bit char values + * collate_subst_t subst[*]; // 0 or more substitutions + * collate_chain_pri_t chains[*]; // 0 or more chains + * collate_large_pri_t large[*]; // extended char priorities + * + * Note that all structures must be 32-bit aligned, as each structure + * contains 32-bit member fields. The entire file is mmap'd, so it's + * critical that alignment be observed. 
It is not generally safe to + * use any 64-bit values in the structures. + */ + +typedef struct collate_info { + uint8_t directive_count; + uint8_t directive[COLL_WEIGHTS_MAX]; + int32_t pri_count[COLL_WEIGHTS_MAX]; + int32_t flags; + int32_t chain_count; + int32_t large_count; + int32_t subst_count[COLL_WEIGHTS_MAX]; + int32_t undef_pri[COLL_WEIGHTS_MAX]; +} collate_info_t; + +typedef struct collate_char { + int32_t pri[COLL_WEIGHTS_MAX]; +} collate_char_t; + +typedef struct collate_chain { + wchar_t str[COLLATE_STR_LEN]; + int32_t pri[COLL_WEIGHTS_MAX]; +} collate_chain_t; + +typedef struct collate_large { + int32_t val; + collate_char_t pri; +} collate_large_t; + +typedef struct collate_subst { + int32_t key; + int32_t pri[COLLATE_STR_LEN]; +} collate_subst_t; struct xlocale_collate { struct xlocale_component header; int __collate_load_error; - int __collate_substitute_nontrivial; + char * map; + size_t maplen; - u_char (*__collate_substitute_table_ptr)[UCHAR_MAX + 1][STR_LEN]; - struct __collate_st_char_pri (*__collate_char_pri_table_ptr)[UCHAR_MAX + 1]; - struct __collate_st_chain_pri *__collate_chain_pri_table; + collate_info_t *info; + collate_char_t *char_pri_table; + collate_large_t *large_pri_table; + collate_chain_t *chain_pri_table; + collate_subst_t *subst_table[COLL_WEIGHTS_MAX]; }; - __BEGIN_DECLS -u_char *__collate_strdup(u_char *); -u_char *__collate_substitute(struct xlocale_collate *, const u_char *); int __collate_load_tables(const char *); -void __collate_lookup(struct xlocale_collate *, const u_char *, int *, int *, int *); -int __collate_range_cmp(struct xlocale_collate *, int, int); -#ifdef COLLATE_DEBUG -void __collate_print_tables(void); -#endif +int __collate_equiv_value(locale_t, const wchar_t *, size_t); +void _collate_lookup(struct xlocale_collate *,const wchar_t *, int *, int *, + int, const int **); +int __collate_range_cmp(struct xlocale_collate *, wchar_t, wchar_t); +size_t _collate_wxfrm(struct xlocale_collate *, const wchar_t *, 
wchar_t *, + size_t); +size_t _collate_sxfrm(struct xlocale_collate *, const wchar_t *, char *, + size_t); __END_DECLS #endif /* !_COLLATE_H_ */ diff --git a/lib/libc/locale/collcmp.c b/lib/libc/locale/collcmp.c index aa17afd..102fbfb 100644 --- a/lib/libc/locale/collcmp.c +++ b/lib/libc/locale/collcmp.c @@ -33,6 +33,7 @@ __FBSDID("$FreeBSD$"); #include <string.h> +#include <wchar.h> #include <xlocale.h> #include "collate.h" @@ -40,13 +41,15 @@ __FBSDID("$FreeBSD$"); * Compare two characters using collate */ -int __collate_range_cmp(struct xlocale_collate *table, int c1, int c2) +int __collate_range_cmp(struct xlocale_collate *table, wchar_t c1, wchar_t c2) { - static char s1[2], s2[2]; + wchar_t s1[2], s2[2]; s1[0] = c1; + s1[1] = 0; s2[0] = c2; + s2[1] = 0; struct _xlocale l = {{0}}; l.components[XLC_COLLATE] = (struct xlocale_component *)table; - return (strcoll_l(s1, s2, &l)); + return (wcscoll_l(s1, s2, &l)); } diff --git a/lib/libc/locale/setrunelocale.c b/lib/libc/locale/setrunelocale.c index 0a0943f..56918b5 100644 --- a/lib/libc/locale/setrunelocale.c +++ b/lib/libc/locale/setrunelocale.c @@ -67,12 +67,6 @@ extern _RuneLocale *_Read_RuneMagi(FILE *); static int __setrunelocale(struct xlocale_ctype *l, const char *); -#define __collate_substitute_nontrivial (table->__collate_substitute_nontrivial) -#define __collate_substitute_table_ptr (table->__collate_substitute_table_ptr) -#define __collate_char_pri_table_ptr (table->__collate_char_pri_table_ptr) -#define __collate_chain_pri_table (table->__collate_chain_pri_table) - - static void destruct_ctype(void *v) { diff --git a/lib/libc/string/strcoll.c b/lib/libc/string/strcoll.c index a918fca..5bad40c 100644 --- a/lib/libc/string/strcoll.c +++ b/lib/libc/string/strcoll.c @@ -1,4 +1,5 @@ /*- + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua> * at Electronni Visti IA, Kiev, Ukraine. * All rights reserved. 
@@ -35,63 +36,82 @@ __FBSDID("$FreeBSD$"); #include <stdlib.h> #include <string.h> +#include <errno.h> +#include <wchar.h> #include "collate.h" -#include <stdio.h> +/* + * In order to properly handle multibyte locales, it's easiest to just + * convert to wide characters and then use wcscoll. However if an + * error occurs, we gracefully fall back to simple strcmp. Caller + * should check errno. + */ int strcoll_l(const char *s, const char *s2, locale_t locale) { - int len, len2, prim, prim2, sec, sec2, ret, ret2; - const char *t, *t2; - char *tt, *tt2; + int ret; + wchar_t *t1 = NULL, *t2 = NULL; + wchar_t *w1 = NULL, *w2 = NULL; + const char *cs1, *cs2; + mbstate_t mbs1; + mbstate_t mbs2; + size_t sz1, sz2; + + memset(&mbs1, 0, sizeof (mbstate_t)); + memset(&mbs2, 0, sizeof (mbstate_t)); + + /* + * The mbsrtowcs_l function can set the src pointer to null upon + * failure, so it should act on a copy to avoid: + * - sending null pointer to strcmp + * - having strcoll/strcoll_l change *s or *s2 to null + */ + cs1 = s; + cs2 = s2; + FIX_LOCALE(locale); struct xlocale_collate *table = (struct xlocale_collate*)locale->components[XLC_COLLATE]; if (table->__collate_load_error) - return strcmp(s, s2); - - len = len2 = 1; - ret = ret2 = 0; - if (table->__collate_substitute_nontrivial) { - t = tt = __collate_substitute(table, s); - t2 = tt2 = __collate_substitute(table, s2); - } else { - tt = tt2 = NULL; - t = s; - t2 = s2; - } - while(*t && *t2) { - prim = prim2 = 0; - while(*t && !prim) { - __collate_lookup(table, t, &len, &prim, &sec); - t += len; - } - while(*t2 && !prim2) { - __collate_lookup(table, t2, &len2, &prim2, &sec2); - t2 += len2; - } - if(!prim || !prim2) - break; - if(prim != prim2) { - ret = prim - prim2; - goto end; - } - if(!ret2) - ret2 = sec - sec2; - } - if(!*t && *t2) - ret = -(int)((u_char)*t2); - else if(*t && !*t2) - ret = (u_char)*t; - else if(!*t && !*t2) - ret = ret2; - end: - free(tt); - free(tt2); - - return ret; + goto error; + + sz1 = strlen(s) 
+ 1; + sz2 = strlen(s2) + 1; + + /* + * Simple assumption: conversion to wide format is strictly + * reducing, i.e. a single byte (or multibyte character) + * cannot result in multiple wide characters. + */ + if ((t1 = malloc(sz1 * sizeof (wchar_t))) == NULL) + goto error; + w1 = t1; + if ((t2 = malloc(sz2 * sizeof (wchar_t))) == NULL) + goto error; + w2 = t2; + + if ((mbsrtowcs_l(w1, &cs1, sz1, &mbs1, locale)) == (size_t)-1) + goto error; + + if ((mbsrtowcs_l(w2, &cs2, sz2, &mbs2, locale)) == (size_t)-1) + goto error; + + ret = wcscoll_l(w1, w2, locale); + if (t1) + free(t1); + if (t2) + free(t2); + + return (ret); + +error: + if (t1) + free(t1); + if (t2) + free(t2); + return (strcmp(s, s2)); } int diff --git a/lib/libc/string/strxfrm.c b/lib/libc/string/strxfrm.c index b758b0c..8b25b0e 100644 --- a/lib/libc/string/strxfrm.c +++ b/lib/libc/string/strxfrm.c @@ -1,4 +1,5 @@ /*- + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua> * at Electronni Visti IA, Kiev, Ukraine. * All rights reserved. @@ -35,6 +36,8 @@ __FBSDID("$FreeBSD$"); #include <stdlib.h> #include <string.h> +#include <errno.h> +#include <wchar.h> #include "collate.h" size_t @@ -48,9 +51,10 @@ strxfrm(char * __restrict dest, const char * __restrict src, size_t len) size_t strxfrm_l(char * __restrict dest, const char * __restrict src, size_t len, locale_t locale) { - int prim, sec, l; size_t slen; - char *s, *ss; + size_t xlen; + wchar_t *wcs = NULL; + FIX_LOCALE(locale); struct xlocale_collate *table = (struct xlocale_collate*)locale->components[XLC_COLLATE]; @@ -58,32 +62,44 @@ strxfrm_l(char * __restrict dest, const char * __restrict src, size_t len, local if (!*src) { if (len > 0) *dest = '\0'; - return 0; + return (0); } + /* + * The conversion from multibyte to wide character strings is + * strictly reducing (one byte of an mbs cannot expand to more + * than one wide character.) 
+ */ + slen = strlen(src); + if (table->__collate_load_error) - return strlcpy(dest, src, len); - - slen = 0; - prim = sec = 0; - ss = s = __collate_substitute(table, src); - while (*s) { - while (*s && !prim) { - __collate_lookup(table, s, &l, &prim, &sec); - s += l; - } - if (prim) { - if (len > 1) { - *dest++ = (char)prim; - len--; - } - slen++; - prim = 0; - } + goto error; + + if ((wcs = malloc((slen + 1) * sizeof (wchar_t))) == NULL) + goto error; + + if (mbstowcs_l(wcs, src, slen + 1, locale) == (size_t)-1) + goto error; + + if ((xlen = _collate_sxfrm(table, wcs, dest, len)) == (size_t)-1) + goto error; + + if (wcs) + free(wcs); + + if (len > xlen) { + dest[xlen] = 0; + } else if (len) { + dest[len-1] = 0; } - free(ss); - if (len > 0) - *dest = '\0'; - return slen; + return (xlen); + +error: + /* errno should be set to ENOMEM if malloc failed */ + if (wcs) + free(wcs); + (void) strlcpy(dest, src, len); + + return (slen); } diff --git a/lib/libc/string/wcsxfrm.c b/lib/libc/string/wcsxfrm.c index cea667e..3d6c960 100644 --- a/lib/libc/string/wcsxfrm.c +++ b/lib/libc/string/wcsxfrm.c @@ -1,4 +1,5 @@ /*- + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 1995 Alex Tatmanjants <alex@elvisti.kiev.ua> * at Electronni Visti IA, Kiev, Ukraine. * All rights reserved. @@ -31,9 +32,6 @@ */ #include <sys/cdefs.h> -#if 0 -__FBSDID("FreeBSD: src/lib/libc/string/strxfrm.c,v 1.15 2002/09/06 11:24:06 tjr Exp "); -#endif __FBSDID("$FreeBSD$"); #include <stdlib.h> @@ -41,18 +39,10 @@ __FBSDID("$FreeBSD$"); #include <wchar.h> #include "collate.h" -static char *__mbsdup(const wchar_t *); - -/* - * Placeholder wcsxfrm() implementation. See wcscoll.c for a description of - * the logic used. 
- */ size_t wcsxfrm_l(wchar_t * __restrict dest, const wchar_t * __restrict src, size_t len, locale_t locale) { - int prim, sec, l; size_t slen; - char *mbsrc, *s, *ss; FIX_LOCALE(locale); struct xlocale_collate *table = (struct xlocale_collate*)locale->components[XLC_COLLATE]; @@ -63,67 +53,33 @@ wcsxfrm_l(wchar_t * __restrict dest, const wchar_t * __restrict src, size_t len, return (0); } - if (table->__collate_load_error || MB_CUR_MAX > 1) { - slen = wcslen(src); - if (len > 0) { - if (slen < len) - wcscpy(dest, src); - else { - wcsncpy(dest, src, len - 1); - dest[len - 1] = L'\0'; - } - } - return (slen); + if ((table->__collate_load_error) || + ((slen = _collate_wxfrm(table, src, dest, len)) == (size_t)-1)) { + goto error; } - mbsrc = __mbsdup(src); - slen = 0; - prim = sec = 0; - ss = s = __collate_substitute(table, mbsrc); - while (*s != '\0') { - while (*s != '\0' && prim == 0) { - __collate_lookup(table, s, &l, &prim, &sec); - s += l; - } - if (prim != 0) { - if (len > 1) { - *dest++ = (wchar_t)prim; - len--; - } - slen++; - prim = 0; - } + /* Add null termination at the correct location. */ + if (len > slen) { + dest[slen] = 0; + } else if (len) { + dest[len-1] = 0; } - free(ss); - free(mbsrc); - if (len != 0) - *dest = L'\0'; return (slen); + +error: + slen = wcslen(src); + if (slen < len) + (void) wcscpy(dest, src); + else { + (void) wcsncpy(dest, src, len - 1); + dest[len - 1] = L'\0'; + } + return (slen); } + size_t wcsxfrm(wchar_t * __restrict dest, const wchar_t * __restrict src, size_t len) { return wcsxfrm_l(dest, src, len, __get_locale()); } - -static char * -__mbsdup(const wchar_t *ws) -{ - static const mbstate_t initial; - mbstate_t st; - const wchar_t *wcp; - size_t len; - char *mbs; - - wcp = ws; - st = initial; - if ((len = wcsrtombs(NULL, &wcp, 0, &st)) == (size_t)-1) - return (NULL); - if ((mbs = malloc(len + 1)) == NULL) - return (NULL); - st = initial; - wcsrtombs(mbs, &ws, len + 1, &st); - - return (mbs); -} |