/*- * Copyright (c) 2003, 2005 Ryuichiro Imura * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include "iconv_converter_if.h" /* * "UCS" converter */ #define KICONV_UCS_COMBINE 0x1 #define KICONV_UCS_FROM_UTF8 0x2 #define KICONV_UCS_TO_UTF8 0x4 #define KICONV_UCS_FROM_LE 0x8 #define KICONV_UCS_TO_LE 0x10 #define KICONV_UCS_FROM_UTF16 0x20 #define KICONV_UCS_TO_UTF16 0x40 #define KICONV_UCS_UCS4 0x80 #define ENCODING_UTF16 "UTF-16BE" #define ENCODING_UTF8 "UTF-8" static struct { const char *name; int from_flag, to_flag; } unicode_family[] = { { "UTF-8", KICONV_UCS_FROM_UTF8, KICONV_UCS_TO_UTF8 }, { "UCS-2LE", KICONV_UCS_FROM_LE, KICONV_UCS_TO_LE }, { "UTF-16BE", KICONV_UCS_FROM_UTF16, KICONV_UCS_TO_UTF16 }, { "UTF-16LE", KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE, KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE }, { NULL, 0, 0 } }; static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen); static u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, size_t dstlen); static uint32_t encode_surrogate(uint32_t code); static uint32_t decode_surrogate(const u_char *ucs); #ifdef MODULE_DEPEND MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2); #endif /* * UCS converter instance */ struct iconv_ucs { KOBJ_FIELDS; int convtype; struct iconv_cspair * d_csp; struct iconv_cspair * d_cspf; void * f_ctp; void * t_ctp; void * ctype; }; static int iconv_ucs_open(struct iconv_converter_class *dcp, struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp) { struct iconv_ucs *dp; int i; const char *from, *to; dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK); to = csp->cp_to; from = cspf ? cspf->cp_from : csp->cp_from; dp->convtype = 0; if (cspf) dp->convtype |= KICONV_UCS_COMBINE; for (i = 0; unicode_family[i].name; i++) { if (strcasecmp(from, unicode_family[i].name) == 0) dp->convtype |= unicode_family[i].from_flag; if (strcasecmp(to, unicode_family[i].name) == 0) dp->convtype |= unicode_family[i].to_flag; } if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0) dp->convtype |= KICONV_UCS_UCS4; else dp->convtype &= ~KICONV_UCS_UCS4; dp->f_ctp = dp->t_ctp = NULL; if (dp->convtype & KICONV_UCS_COMBINE) { if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 && (dp->convtype & KICONV_UCS_FROM_LE) == 0) { iconv_open(ENCODING_UNICODE, from, &dp->f_ctp); } if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 && (dp->convtype & KICONV_UCS_TO_LE) == 0) { iconv_open(to, ENCODING_UNICODE, &dp->t_ctp); } } dp->ctype = NULL; if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8)) iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype); dp->d_csp = csp; if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) { if (cspf) { dp->d_cspf = cspf; cspf->cp_refcount++; } else csp->cp_refcount++; } if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE)) csp->cp_refcount++; *dpp = (void*)dp; return 0; } static int iconv_ucs_close(void *data) { struct iconv_ucs *dp = data; if (dp->f_ctp) iconv_close(dp->f_ctp); if (dp->t_ctp) iconv_close(dp->t_ctp); if (dp->ctype) iconv_close(dp->ctype); if (dp->d_cspf) dp->d_cspf->cp_refcount--; else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) dp->d_csp->cp_refcount--; if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE)) dp->d_csp->cp_refcount--; kobj_delete((struct kobj*)data, M_ICONV); return 0; } static int iconv_ucs_conv(void *d2p, const char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft, int convchar, int casetype) { struct iconv_ucs *dp = (struct iconv_ucs*)d2p; int ret = 0, i; size_t in, on, ir, or, inlen, outlen, ucslen; const char *src, *p; char *dst; u_char ucs[4], *q; uint32_t code; if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL) return 0; ir = in = *inbytesleft; or = on = *outbytesleft; src = *inbuf; dst = *outbuf; while (ir > 0 && or > 0) { /* * The first half of conversion. * (convert any code into ENCODING_UNICODE) */ code = 0; p = src; if (dp->convtype & KICONV_UCS_FROM_UTF8) { /* convert UTF-8 to ENCODING_UNICODE */ inlen = 0; code = utf8_to_ucs4(p, &inlen, ir); if (code == 0) { ret = -1; break; } if (casetype == KICONV_FROM_LOWER && dp->ctype) { code = towlower(code, dp->ctype); } else if (casetype == KICONV_FROM_UPPER && dp->ctype) { code = towupper(code, dp->ctype); } if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) { /* reserved for utf-16 surrogate pair */ /* invalid unicode */ ret = -1; break; } if (inlen == 4) { if (dp->convtype & KICONV_UCS_UCS4) { ucslen = 4; code = encode_surrogate(code); } else { /* can't handle with ucs-2 */ ret = -1; break; } } else { ucslen = 2; } /* save UCS-4 into ucs[] */ for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--) *q++ = (code >> (i << 3)) & 0xff; } else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) { /* convert local code to ENCODING_UNICODE */ ucslen = 4; inlen = ir; q = ucs; ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q, &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER)); if (ret) break; inlen = ir - inlen; ucslen = 4 - ucslen; } else { /* src code is a proper subset of ENCODING_UNICODE */ q = ucs; if (dp->convtype & KICONV_UCS_FROM_LE) { *q = *(p + 1); *(q + 1) = *p; p += 2; } else { *q = *p++; *(q + 1) = *p++; } if ((*q & 0xfc) == 0xd8) { if (dp->convtype & KICONV_UCS_UCS4 && dp->convtype & KICONV_UCS_FROM_UTF16) { inlen = ucslen = 4; } else { /* invalid unicode */ ret = -1; break; } } else { inlen = ucslen = 2; } if (ir < inlen) { ret = -1; break; } if (ucslen == 4) { q += 2; if (dp->convtype & KICONV_UCS_FROM_LE) { *q = *(p + 1); *(q + 1) = *p; } else { *q = *p++; *(q + 1) = *p; } if ((*q & 0xfc) != 0xdc) { /* invalid unicode */ ret = -1; break; } } } /* * The second half of conversion. * (convert ENCODING_UNICODE into any code) */ p = ucs; if (dp->convtype & KICONV_UCS_TO_UTF8) { q = (u_char *)dst; if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) { /* decode surrogate pair */ code = decode_surrogate(p); } else { code = (ucs[0] << 8) | ucs[1]; } if (casetype == KICONV_LOWER && dp->ctype) { code = towlower(code, dp->ctype); } else if (casetype == KICONV_UPPER && dp->ctype) { code = towupper(code, dp->ctype); } outlen = 0; if (ucs4_to_utf8(code, q, &outlen, or) == NULL) { ret = -1; break; } src += inlen; ir -= inlen; dst += outlen; or -= outlen; } else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) { ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst, &or, casetype & (KICONV_LOWER | KICONV_UPPER)); if (ret) break; src += inlen; ir -= inlen; } else { /* dst code is a proper subset of ENCODING_UNICODE */ if (or < ucslen) { ret = -1; break; } src += inlen; ir -= inlen; or -= ucslen; if (dp->convtype & KICONV_UCS_TO_LE) { *dst++ = *(p + 1); *dst++ = *p; p += 2; } else { *dst++ = *p++; *dst++ = *p++; } if (ucslen == 4) { if ((dp->convtype & KICONV_UCS_UCS4) == 0 || (dp->convtype & KICONV_UCS_TO_UTF16) == 0) { ret = -1; break; } if (dp->convtype & KICONV_UCS_TO_LE) { *dst++ = *(p + 1); *dst++ = *p; } else { *dst++ = *p++; *dst++ = *p; } } } if (convchar == 1) break; } *inbuf += in - ir; *outbuf += on - or; *inbytesleft -= in - ir; *outbytesleft -= on - or; return (ret); } static int iconv_ucs_init(struct iconv_converter_class *dcp) { int error; error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8); if (error) return (error); error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE); if (error) return (error); return (0); } static int iconv_ucs_done(struct iconv_converter_class *dcp) { return (0); } static const char * iconv_ucs_name(struct iconv_converter_class *dcp) { return (ENCODING_UNICODE); } static kobj_method_t iconv_ucs_methods[] = { KOBJMETHOD(iconv_converter_open, iconv_ucs_open), KOBJMETHOD(iconv_converter_close, iconv_ucs_close), KOBJMETHOD(iconv_converter_conv, iconv_ucs_conv), KOBJMETHOD(iconv_converter_init, iconv_ucs_init), KOBJMETHOD(iconv_converter_done, iconv_ucs_done), KOBJMETHOD(iconv_converter_name, iconv_ucs_name), {0, 0} }; KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs)); static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen) { size_t i, w = 0; uint32_t ucs4 = 0; /* * get leading 1 byte from utf-8 */ if ((*src & 0x80) == 0) { /* * leading 1 bit is "0" * utf-8: 0xxxxxxx * ucs-4: 00000000 00000000 00000000 0xxxxxxx */ w = 1; /* get trailing 7 bits */ ucs4 = *src & 0x7f; } else if ((*src & 0xe0) == 0xc0) { /* * leading 3 bits are "110" * utf-8: 110xxxxx 10yyyyyy * ucs-4: 00000000 00000000 00000xxx xxyyyyyy */ w = 2; /* get trailing 5 bits */ ucs4 = *src & 0x1f; } else if ((*src & 0xf0) == 0xe0) { /* * leading 4 bits are "1110" * utf-8: 1110xxxx 10yyyyyy 10zzzzzz * ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz */ w = 3; /* get trailing 4 bits */ ucs4 = *src & 0x0f; } else if ((*src & 0xf8) == 0xf0) { /* * leading 5 bits are "11110" * utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz * ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz */ w = 4; /* get trailing 3 bits */ ucs4 = *src & 0x07; } else { /* out of utf-16 range or having illegal bits */ return (0); } if (srclen < w) return (0); /* * get left parts from utf-8 */ for (i = 1 ; i < w ; i++) { if ((*(src + i) & 0xc0) != 0x80) { /* invalid: leading 2 bits are not "10" */ return (0); } /* concatenate trailing 6 bits into ucs4 */ ucs4 <<= 6; ucs4 |= *(src + i) & 0x3f; } *utf8width = w; return (ucs4); } static u_char * ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen) { u_char lead, *p; size_t i, w; /* * determine utf-8 width and leading bits */ if (ucs4 < 0x80) { w = 1; lead = 0; /* "0" */ } else if (ucs4 < 0x800) { w = 2; lead = 0xc0; /* "11" */ } else if (ucs4 < 0x10000) { w = 3; lead = 0xe0; /* "111" */ } else if (ucs4 < 0x200000) { w = 4; lead = 0xf0; /* "1111" */ } else { return (NULL); } if (dstlen < w) return (NULL); /* * construct utf-8 */ p = dst; for (i = w - 1 ; i >= 1 ; i--) { /* get trailing 6 bits and put it with leading bit as "1" */ *(p + i) = (ucs4 & 0x3f) | 0x80; ucs4 >>= 6; } *p = ucs4 | lead; *utf8width = w; return (p); } static uint32_t encode_surrogate(register uint32_t code) { return ((((code - 0x10000) << 6) & 0x3ff0000) | ((code - 0x10000) & 0x3ff) | 0xd800dc00); } static uint32_t decode_surrogate(register const u_char *ucs) { return ((((ucs[0] & 0x3) << 18) | (ucs[1] << 10) | ((ucs[2] & 0x3) << 8) | ucs[3]) + 0x10000); }