diff options
author | tjr <tjr@FreeBSD.org> | 2004-04-04 11:00:42 +0000 |
---|---|---|
committer | tjr <tjr@FreeBSD.org> | 2004-04-04 11:00:42 +0000 |
commit | d98eda520cd6501a40b6421303d87c2f5bfb3dda (patch) | |
tree | fff9cdb8ba54eada6a5f35792d8264fd7f99a0a9 /lib | |
parent | dd0763020dc3b8519399d22fa8eab091999dfa82 (diff) | |
download | FreeBSD-src-d98eda520cd6501a40b6421303d87c2f5bfb3dda.zip FreeBSD-src-d98eda520cd6501a40b6421303d87c2f5bfb3dda.tar.gz |
Reimplement the GB18030 encoding method using the new-style (mbrtowc()/
wcrtomb()) interface.
Diffstat (limited to 'lib')
-rw-r--r-- | lib/libc/locale/gb18030.c | 260 |
1 files changed, 137 insertions, 123 deletions
diff --git a/lib/libc/locale/gb18030.c b/lib/libc/locale/gb18030.c index bfc8275..dca718f 100644 --- a/lib/libc/locale/gb18030.c +++ b/lib/libc/locale/gb18030.c @@ -1,8 +1,6 @@ -/* - * Copyright (c) 2003 - * The Regents of the University of California. All rights reserved. - * - * This code is contributed to Robin Hu <huxw@knight.6test.edu.cn> +/*- + * Copyright (c) 2002-2004 Tim J. Robbins + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -12,18 +10,11 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) @@ -32,135 +23,158 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ +/* + * PRC National Standard GB 18030-2000 encoding of Chinese text. + * + * See gb18030(5) for details. + */ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); -#include <rune.h> -#include <stddef.h> -#include <stdio.h> +#include <errno.h> +#include <runetype.h> #include <stdlib.h> -#include <sys/types.h> +#include <wchar.h> + +extern size_t (*__mbrtowc)(wchar_t * __restrict, const char * __restrict, + size_t, mbstate_t * __restrict); +extern size_t (*__wcrtomb)(char * __restrict, wchar_t, mbstate_t * __restrict); -rune_t _GB18030_sgetrune(const char *, size_t, char const **); -int _GB18030_sputrune(rune_t, char *, size_t, char **); +int _GB18030_init(_RuneLocale *); +size_t _GB18030_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t, + mbstate_t * __restrict); +size_t _GB18030_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict); int -_GB18030_init(rl) - _RuneLocale *rl; +_GB18030_init(_RuneLocale *rl) { - rl->sgetrune = _GB18030_sgetrune; - rl->sputrune = _GB18030_sputrune; + + __mbrtowc = _GB18030_mbrtowc; + __wcrtomb = _GB18030_wcrtomb; _CurrentRuneLocale = rl; __mb_cur_max = 4; - return (0); -} - -static inline int -_gb18030_check_string(s_, n) - const char* s_; - int n; -{ - const unsigned char* s = s_; - if ((s[0]>=0x81&&s[0]<=0xfe)) { - if (n<2) goto bad_string; - if ((s[1]>=0x40&&s[1]<=0x7e)||(s[1]>=0x80&&s[1]<=0xfe)) - return 2; - if ((s[1]>=0x30&&s[1]<=0x39)) { - if (n<4) goto bad_string; - if ((s[2]>=0x81&&s[2]<=0xfe) && (s[3]>=0x30&&s[3]<=0x39)) - return 4; - else - goto bad_string; - } - } else { - return 1; - } - bad_string: - return -1; -} -static inline int -_gb18030_check_rune(r) - rune_t r; -{ - if (r&0xff000000) { - return 4; - } - if (r&0xff00) { - return 2; - } - return 1; + return (0); } -rune_t -_GB18030_sgetrune(string, n, result) - const char *string; - size_t n; - char const **result; +size_t +_GB18030_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, + size_t n, mbstate_t * __restrict ps __unused) { - rune_t rune = 0; - int len; + wchar_t wch; + int ch, len; - len = _gb18030_check_string(string, n); + if (s == NULL) + /* Reset to initial shift state (no-op) */ + return (0); + if (n == 0) + /* Incomplete multibyte sequence */ + return ((size_t)-2); - if (len == -1) { - if (result) - *result = string; - return (_INVALID_RUNE); - } - - while (--len >= 0) - rune = (rune << 8) | ((u_int)(*string++) & 0xff); + /* + * Single byte: [00-7f] + * Two byte: [81-fe][40-7e,80-fe] + * Four byte: [81-fe][30-39][81-fe][30-39] + */ + ch = (unsigned char)*s++; + if (ch <= 0x7f) { + len = 1; + wch = ch; + } else if (ch >= 0x81 && ch <= 0xfe) { + wch = ch; + if (n < 2) + return ((size_t)-2); + ch = (unsigned char)*s++; + if ((ch >= 0x40 && ch <= 0x7e) || (ch >= 0x80 && ch <= 0xfe)) { + wch = (wch << 8) | ch; + len = 2; + } else if (ch >= 0x30 && ch <= 0x39) { + /* + * Strip high bit off the wide character we will + * eventually output so that it is positive when + * cast to wint_t on 32-bit twos-complement machines. + */ + wch = ((wch & 0x7f) << 8) | ch; + if (n < 3) + return ((size_t)-2); + ch = (unsigned char)*s++; + if (ch < 0x81 || ch > 0xfe) + goto ilseq; + wch = (wch << 8) | ch; + if (n < 4) + return ((size_t)-2); + ch = (unsigned char)*s++; + if (ch < 0x30 || ch > 0x39) + goto ilseq; + wch = (wch << 8) | ch; + len = 4; + } else + goto ilseq; + } else + goto ilseq; - rune &= 0x7fffffff; - if (result) - *result = string; - return rune; + if (pwc != NULL) + *pwc = wch; + return (wch == L'\0' ? 0 : len); +ilseq: + errno = EILSEQ; + return ((size_t)-1); } -int -_GB18030_sputrune(c, string, n, result) - rune_t c; - char *string, **result; - size_t n; +size_t +_GB18030_wcrtomb(char * __restrict s, wchar_t wc, + mbstate_t * __restrict ps __unused) { - int len; - len = _gb18030_check_rune(c); + size_t len; + int c; + + if (s == NULL) + /* Reset to initial shift state (no-op) */ + return (1); + + if ((wc & ~0x7fffffff) != 0) + goto ilseq; + if (wc & 0x7f000000) { + /* Replace high bit that mbrtowc() removed. */ + wc |= 0x80000000; + c = (wc >> 24) & 0xff; + if (c < 0x81 || c > 0xfe) + goto ilseq; + *s++ = c; + c = (wc >> 16) & 0xff; + if (c < 0x30 || c > 0x39) + goto ilseq; + *s++ = c; + c = (wc >> 8) & 0xff; + if (c < 0x81 || c > 0xfe) + goto ilseq; + *s++ = c; + c = wc & 0xff; + if (c < 0x30 || c > 0x39) + goto ilseq; + *s++ = c; + len = 4; + } else if (wc & 0x00ff0000) + goto ilseq; + else if (wc & 0x0000ff00) { + c = (wc >> 8) & 0xff; + if (c < 0x81 || c > 0xfe) + goto ilseq; + *s++ = c; + c = wc & 0xff; + if (c < 0x40 || c == 0x7f || c == 0xff) + goto ilseq; + *s++ = c; + len = 2; + } else if (wc <= 0x7f) { + *s++ = wc; + len = 1; + } else + goto ilseq; - switch (len) { - case 1: - if (n >= 1) { - *string = c & 0xff; - if (result) - *result = string + 1; - return (1); - } - break; - case 2: - if (n >= 2) { - string[0] = (c >> 8) & 0xff; - string[1] = c & 0xff; - if (result) - *result = string + 2; - return (2); - } - break; - case 4: - if (n >= 4) { - string[0] = ((c >>24) & 0xff) | 0x80; - string[1] = (c >>16) & 0xff; - string[2] = (c >>8) & 0xff; - string[3] = c & 0xff; - if (result) - *result = string + 4; - return (4); - } - break; - default: - break; - } - if (result) - *result = string; - return (0); + return (len); +ilseq: + errno = EILSEQ; + return ((size_t)-1); } |