summaryrefslogtreecommitdiffstats
path: root/sys/libkern
diff options
context:
space:
mode:
authorkevlo <kevlo@FreeBSD.org>2011-11-18 03:05:20 +0000
committerkevlo <kevlo@FreeBSD.org>2011-11-18 03:05:20 +0000
commit1a26b28a9b13239c0dcae74ac1f94a5969fd2b3c (patch)
tree91e05f3019c9e61d84d991446d61ad595184fef3 /sys/libkern
parent3985e4e2af3096b00750b1a32f61f92d109f9372 (diff)
downloadFreeBSD-src-1a26b28a9b13239c0dcae74ac1f94a5969fd2b3c.zip
FreeBSD-src-1a26b28a9b13239c0dcae74ac1f94a5969fd2b3c.tar.gz
Add unicode support to msdosfs and smbfs; original patches from imura,
bug fixes by Kuan-Chung Chiu <buganini at gmail dot com>. Tested by me in production for several days at work.
Diffstat (limited to 'sys/libkern')
-rw-r--r--sys/libkern/iconv.c12
-rw-r--r--sys/libkern/iconv_ucs.c540
2 files changed, 552 insertions, 0 deletions
diff --git a/sys/libkern/iconv.c b/sys/libkern/iconv.c
index 9b46db6..92b04c2 100644
--- a/sys/libkern/iconv.c
+++ b/sys/libkern/iconv.c
@@ -377,6 +377,18 @@ iconv_sysctl_cslist(SYSCTL_HANDLER_ARGS)
SYSCTL_PROC(_kern_iconv, OID_AUTO, cslist, CTLFLAG_RD | CTLTYPE_OPAQUE,
NULL, 0, iconv_sysctl_cslist, "S,xlat", "registered charset pairs");
+/*
+ * Register the charset pair (to, from) with the converter class whose
+ * name is "converter".  Returns EINVAL when the converter lookup fails;
+ * otherwise returns the result of iconv_register_cspair().
+ */
+int
+iconv_add(const char *converter, const char *to, const char *from)
+{
+	struct iconv_converter_class *cls;
+	struct iconv_cspair *pair;
+	int error;
+
+	error = iconv_lookupconv(converter, &cls);
+	if (error != 0)
+		return (EINVAL);
+
+	/* The pair handed back through "pair" is not used any further here. */
+	return (iconv_register_cspair(to, from, cls, NULL, &pair));
+}
+
/*
* Add new charset pair
*/
diff --git a/sys/libkern/iconv_ucs.c b/sys/libkern/iconv_ucs.c
new file mode 100644
index 0000000..c6f2823
--- /dev/null
+++ b/sys/libkern/iconv_ucs.c
@@ -0,0 +1,540 @@
+/*-
+ * Copyright (c) 2003, 2005 Ryuichiro Imura
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/iconv.h>
+
+#include "iconv_converter_if.h"
+
+/*
+ * "UCS" converter
+ */
+
+/*
+ * Bits stored in iconv_ucs.convtype.
+ */
+#define	KICONV_UCS_COMBINE	0x01	/* open was given a second pair (cspf) */
+#define	KICONV_UCS_FROM_UTF8	0x02	/* source charset is UTF-8 */
+#define	KICONV_UCS_TO_UTF8	0x04	/* destination charset is UTF-8 */
+#define	KICONV_UCS_FROM_LE	0x08	/* source code units are little-endian */
+#define	KICONV_UCS_TO_LE	0x10	/* destination code units are little-endian */
+#define	KICONV_UCS_FROM_UTF16	0x20	/* source is UTF-16 (surrogate pairs allowed) */
+#define	KICONV_UCS_TO_UTF16	0x40	/* destination is UTF-16 (surrogate pairs allowed) */
+#define	KICONV_UCS_UCS4		0x80	/* 4-byte surrogate pairs enabled; set in open */
+
+/* Charset names used when matching and when opening helper converters. */
+#define	ENCODING_UTF16		"UTF-16BE"
+#define	ENCODING_UTF8		"UTF-8"
+
+/*
+ * Unicode-family charset names recognised by iconv_ucs_open(), together
+ * with the convtype bits to set when a name appears as the source or as
+ * the destination charset.  A NULL name terminates the table.
+ */
+static struct {
+	const char	*name;		/* charset name */
+	int		 from_flag;	/* bits to set when name is the source */
+	int		 to_flag;	/* bits to set when name is the destination */
+} unicode_family[] = {
+	{ "UTF-8",
+	    KICONV_UCS_FROM_UTF8,
+	    KICONV_UCS_TO_UTF8 },
+	{ "UCS-2LE",
+	    KICONV_UCS_FROM_LE,
+	    KICONV_UCS_TO_LE },
+	{ "UTF-16BE",
+	    KICONV_UCS_FROM_UTF16,
+	    KICONV_UCS_TO_UTF16 },
+	{ "UTF-16LE",
+	    KICONV_UCS_FROM_UTF16 | KICONV_UCS_FROM_LE,
+	    KICONV_UCS_TO_UTF16 | KICONV_UCS_TO_LE },
+	{ NULL, 0, 0 }
+};
+
+/* Forward declarations of the UTF-8 / surrogate helpers used below. */
+static uint32_t	 utf8_to_ucs4(const char *src, size_t *utf8width,
+		    size_t srclen);
+static u_char	*ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width,
+		    size_t dstlen);
+static uint32_t	 encode_surrogate(uint32_t code);
+static uint32_t	 decode_surrogate(const u_char *ucs);
+
+#ifdef MODULE_DEPEND
+MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
+#endif
+
+/*
+ * State of one opened UCS converter instance.
+ */
+struct iconv_ucs {
+	KOBJ_FIELDS;
+	int			 convtype;	/* KICONV_UCS_* bits */
+	struct iconv_cspair	*d_csp;		/* the "csp" pair given to open */
+	struct iconv_cspair	*d_cspf;	/* the "cspf" pair, when open took a reference on it */
+	void			*f_ctp;		/* helper handle: source charset -> ENCODING_UNICODE */
+	void			*t_ctp;		/* helper handle: ENCODING_UNICODE -> destination charset */
+	void			*ctype;		/* handle passed to towlower()/towupper() */
+};
+
+/*
+ * Create a converter instance for the charset pair "csp", optionally
+ * combined with a second pair "cspf" that supplies the source charset,
+ * and hand the instance back through *dpp.  Always returns 0.
+ *
+ * The source and destination names are matched against unicode_family[]
+ * to build dp->convtype.  Reference counts taken here on csp/cspf are
+ * dropped again by iconv_ucs_close() using the same convtype tests.
+ */
+static int
+iconv_ucs_open(struct iconv_converter_class *dcp,
+	struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
+{
+	struct iconv_ucs *dp;
+	int i;
+	const char *from, *to;
+
+	dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
+	to = csp->cp_to;
+	/* With a combined pair the source charset comes from cspf. */
+	from = cspf ? cspf->cp_from : csp->cp_from;
+
+	dp->convtype = 0;
+
+	if (cspf)
+		dp->convtype |= KICONV_UCS_COMBINE;
+	for (i = 0; unicode_family[i].name; i++) {
+		if (strcmp(from, unicode_family[i].name) == 0)
+			dp->convtype |= unicode_family[i].from_flag;
+		if (strcmp(to, unicode_family[i].name) == 0)
+			dp->convtype |= unicode_family[i].to_flag;
+	}
+	/* Surrogate pairs are only enabled when ENCODING_UNICODE is UTF-16BE. */
+	if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
+		dp->convtype |= KICONV_UCS_UCS4;
+	else
+		dp->convtype &= ~KICONV_UCS_UCS4;
+
+	/*
+	 * For a combined pair, open helper converters for each side whose
+	 * convtype has neither the UTF8 nor the LE bit.  The results of
+	 * iconv_open() are not checked; iconv_ucs_conv() only uses a helper
+	 * when its handle is non-NULL.
+	 */
+	dp->f_ctp = dp->t_ctp = NULL;
+	if (dp->convtype & KICONV_UCS_COMBINE) {
+		if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
+		    (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
+			iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
+		}
+		if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
+		    (dp->convtype & KICONV_UCS_TO_LE) == 0) {
+			iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
+		}
+	}
+
+	/* Case-mapping handle, needed only when one side is UTF-8. */
+	dp->ctype = NULL;
+	if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
+		iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);
+
+	dp->d_csp = csp;
+	/*
+	 * iconv_ucs_close() tests d_cspf unconditionally, so give it a
+	 * defined value on every path instead of relying on the allocator.
+	 */
+	dp->d_cspf = NULL;
+	if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
+		if (cspf) {
+			dp->d_cspf = cspf;
+			cspf->cp_refcount++;
+		} else
+			csp->cp_refcount++;
+	}
+	if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
+		csp->cp_refcount++;
+	*dpp = (void*)dp;
+	return 0;
+}
+
+/*
+ * Tear down an instance created by iconv_ucs_open(): close whichever
+ * helper handles were opened, drop the charset-pair references that
+ * open took, and free the object.  Always returns 0.
+ */
+static int
+iconv_ucs_close(void *data)
+{
+	struct iconv_ucs *inst = data;
+	const int from_mask = KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE;
+	const int to_mask = KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE;
+
+	if (inst->f_ctp != NULL)
+		iconv_close(inst->f_ctp);
+	if (inst->t_ctp != NULL)
+		iconv_close(inst->t_ctp);
+	if (inst->ctype != NULL)
+		iconv_close(inst->ctype);
+
+	/* Source-side reference: held on cspf if recorded, else on csp. */
+	if (inst->d_cspf != NULL)
+		inst->d_cspf->cp_refcount--;
+	else if ((inst->convtype & from_mask) != 0)
+		inst->d_csp->cp_refcount--;
+
+	/* Destination-side reference is always held on csp. */
+	if ((inst->convtype & to_mask) != 0)
+		inst->d_csp->cp_refcount--;
+
+	kobj_delete((struct kobj *)data, M_ICONV);
+	return (0);
+}
+
+/*
+ * Convert characters from *inbuf into *outbuf one at a time.
+ *
+ * Each character is first turned into big-endian 16-bit code units held
+ * in ucs[] (two bytes, or four for a surrogate pair) and then turned into
+ * the destination encoding.  Which path is taken on each side is decided
+ * by dp->convtype and by whether a helper handle (f_ctp/t_ctp) exists.
+ *
+ * convchar == 1 stops after a single character.  casetype selects case
+ * mapping: KICONV_FROM_LOWER/KICONV_FROM_UPPER on the source side and
+ * KICONV_LOWER/KICONV_UPPER on the destination side.
+ *
+ * Returns 0 on success (also when any buffer pointer is NULL), -1 on
+ * invalid or unrepresentable input or when a buffer is too short, or the
+ * non-zero result of iconv_convchr_case().  On return *inbuf, *outbuf,
+ * *inbytesleft and *outbytesleft reflect only the characters that were
+ * converted completely.
+ */
+static int
+iconv_ucs_conv(void *d2p, const char **inbuf,
+	size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
+	int convchar, int casetype)
+{
+	struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
+	int ret = 0, i;
+	size_t in, on, ir, or, inlen, outlen, ucslen;
+	const char *src, *p;
+	char *dst;
+	u_char ucs[4], *q;
+	uint32_t code;
+
+	if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
+		return 0;
+	ir = in = *inbytesleft;
+	or = on = *outbytesleft;
+	src = *inbuf;
+	dst = *outbuf;
+
+	while (ir > 0 && or > 0) {
+
+		/*
+		 * The first half of conversion.
+		 * (convert any code into ENCODING_UNICODE)
+		 */
+		code = 0;
+		p = src;
+		if (dp->convtype & KICONV_UCS_FROM_UTF8) {
+			/* convert UTF-8 to ENCODING_UNICODE */
+			inlen = 0;
+			code = utf8_to_ucs4(p, &inlen, ir);
+			if (code == 0) {
+				ret = -1;
+				break;
+			}
+
+			if (casetype == KICONV_FROM_LOWER && dp->ctype) {
+				code = towlower(code, dp->ctype);
+			} else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
+				code = towupper(code, dp->ctype);
+			}
+
+			if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
+				/* reserved for utf-16 surrogate pair */
+				/* invalid unicode */
+				ret = -1;
+				break;
+			}
+
+			if (inlen == 4) {
+				if (code < 0x10000) {
+					/*
+					 * Overlong 4-byte form.  encode_surrogate()
+					 * subtracts 0x10000 and would wrap around.
+					 */
+					ret = -1;
+					break;
+				}
+				if (dp->convtype & KICONV_UCS_UCS4) {
+					ucslen = 4;
+					code = encode_surrogate(code);
+				} else {
+					/* can't handle with ucs-2 */
+					ret = -1;
+					break;
+				}
+			} else {
+				ucslen = 2;
+			}
+
+			/* save UCS-4 into ucs[], most significant byte first */
+			for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
+				*q++ = (code >> (i << 3)) & 0xff;
+
+		} else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
+			/* convert local code to ENCODING_UNICODE */
+			ucslen = 4;
+			inlen = ir;
+			q = ucs;
+			ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
+			    &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
+			if (ret)
+				break;
+			/* turn "bytes left" into "bytes used" */
+			inlen = ir - inlen;
+			ucslen = 4 - ucslen;
+
+		} else {
+			/* src code is a proper subset of ENCODING_UNICODE */
+			if (ir < 2) {
+				/* not enough input for one 16-bit code unit */
+				ret = -1;
+				break;
+			}
+			q = ucs;
+			if (dp->convtype & KICONV_UCS_FROM_LE) {
+				*q = *(p + 1);
+				*(q + 1) = *p;
+				p += 2;
+			} else {
+				*q = *p++;
+				*(q + 1) = *p++;
+			}
+			if ((*q & 0xfc) == 0xd8) {
+				/* high surrogate: a low surrogate must follow */
+				if (dp->convtype & KICONV_UCS_UCS4 &&
+				    dp->convtype & KICONV_UCS_FROM_UTF16) {
+					inlen = ucslen = 4;
+				} else {
+					/* invalid unicode */
+					ret = -1;
+					break;
+				}
+			} else {
+				inlen = ucslen = 2;
+			}
+			if (ir < inlen) {
+				ret = -1;
+				break;
+			}
+			if (ucslen == 4) {
+				q += 2;
+				if (dp->convtype & KICONV_UCS_FROM_LE) {
+					*q = *(p + 1);
+					*(q + 1) = *p;
+				} else {
+					*q = *p++;
+					*(q + 1) = *p;
+				}
+				if ((*q & 0xfc) != 0xdc) {
+					/* invalid unicode */
+					ret = -1;
+					break;
+				}
+			}
+		}
+
+		/*
+		 * The second half of conversion.
+		 * (convert ENCODING_UNICODE into any code)
+		 */
+		p = ucs;
+		if (dp->convtype & KICONV_UCS_TO_UTF8) {
+			q = (u_char *)dst;
+			if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
+				/* decode surrogate pair */
+				code = decode_surrogate(p);
+			} else {
+				code = (ucs[0] << 8) | ucs[1];
+			}
+
+			if (casetype == KICONV_LOWER && dp->ctype) {
+				code = towlower(code, dp->ctype);
+			} else if (casetype == KICONV_UPPER && dp->ctype) {
+				code = towupper(code, dp->ctype);
+			}
+
+			outlen = 0;
+			if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
+				ret = -1;
+				break;
+			}
+
+			src += inlen;
+			ir -= inlen;
+			dst += outlen;
+			or -= outlen;
+
+		} else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
+			/* the helper advances dst and decrements or itself */
+			ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
+			    &or, casetype & (KICONV_LOWER | KICONV_UPPER));
+			if (ret)
+				break;
+
+			src += inlen;
+			ir -= inlen;
+
+		} else {
+			/* dst code is a proper subset of ENCODING_UNICODE */
+			if (or < ucslen) {
+				ret = -1;
+				break;
+			}
+			/*
+			 * Reject a surrogate pair the destination cannot
+			 * carry before consuming input or producing output,
+			 * so the counters stay accurate on failure.
+			 */
+			if (ucslen == 4 &&
+			    ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
+			    (dp->convtype & KICONV_UCS_TO_UTF16) == 0)) {
+				ret = -1;
+				break;
+			}
+			src += inlen;
+			ir -= inlen;
+			or -= ucslen;
+			if (dp->convtype & KICONV_UCS_TO_LE) {
+				*dst++ = *(p + 1);
+				*dst++ = *p;
+				p += 2;
+			} else {
+				*dst++ = *p++;
+				*dst++ = *p++;
+			}
+			if (ucslen == 4) {
+				/* second code unit of the surrogate pair */
+				if (dp->convtype & KICONV_UCS_TO_LE) {
+					*dst++ = *(p + 1);
+					*dst++ = *p;
+				} else {
+					*dst++ = *p++;
+					*dst++ = *p;
+				}
+			}
+		}
+
+		if (convchar == 1)
+			break;
+	}
+
+	*inbuf += in - ir;
+	*outbuf += on - or;
+	*inbytesleft -= in - ir;
+	*outbytesleft -= on - or;
+	return (ret);
+}
+
+/*
+ * Register the two charset pairs served by this converter:
+ * UTF-8 -> ENCODING_UNICODE and ENCODING_UNICODE -> UTF-8.
+ * Returns 0 on success or the first non-zero result of iconv_add().
+ */
+static int
+iconv_ucs_init(struct iconv_converter_class *dcp)
+{
+	int error;
+
+	error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
+	if (error == 0) {
+		error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8,
+		    ENCODING_UNICODE);
+	}
+	return (error);
+}
+
+/*
+ * "done" method of the converter class; performs no work and always
+ * returns 0.
+ */
+static int
+iconv_ucs_done(struct iconv_converter_class *dcp)
+{
+	(void)dcp;
+	return (0);
+}
+
+/*
+ * "name" method of the converter class; the converter is registered
+ * under the ENCODING_UNICODE name.
+ */
+static const char *
+iconv_ucs_name(struct iconv_converter_class *dcp)
+{
+	const char *name = ENCODING_UNICODE;
+
+	(void)dcp;
+	return (name);
+}
+
+/*
+ * Method table binding the iconv_converter interface to the UCS
+ * implementations in this file.
+ */
+static kobj_method_t iconv_ucs_methods[] = {
+	KOBJMETHOD(iconv_converter_open,	iconv_ucs_open),
+	KOBJMETHOD(iconv_converter_close,	iconv_ucs_close),
+	KOBJMETHOD(iconv_converter_conv,	iconv_ucs_conv),
+	KOBJMETHOD(iconv_converter_init,	iconv_ucs_init),
+	KOBJMETHOD(iconv_converter_done,	iconv_ucs_done),
+	KOBJMETHOD(iconv_converter_name,	iconv_ucs_name),
+	{ 0, 0 }
+};
+
+KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));
+
+/*
+ * Decode one UTF-8 sequence of 1 to 4 bytes starting at src.
+ *
+ * src       - input bytes
+ * utf8width - set to the number of bytes consumed on success; left
+ *             untouched on failure
+ * srclen    - number of bytes available at src
+ *
+ * Returns the decoded code point, or 0 when the input is empty, the lead
+ * byte is not a valid 1..4 byte lead, the sequence is truncated, a
+ * continuation byte is malformed, or the sequence is an overlong
+ * encoding.  A decoded value of 0 (NUL) is therefore indistinguishable
+ * from an error.  Surrogate and upper range checks are left to the
+ * caller.
+ */
+static uint32_t
+utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
+{
+	/* smallest code point that legitimately needs w bytes */
+	static const uint32_t min_code[5] = { 0, 0, 0x80, 0x800, 0x10000 };
+	const unsigned char *s = (const unsigned char *)src;
+	size_t i, w;
+	uint32_t ucs4;
+
+	/* do not look at the lead byte of an empty buffer */
+	if (srclen == 0)
+		return (0);
+
+	/*
+	 * get leading 1 byte from utf-8
+	 */
+	if ((s[0] & 0x80) == 0) {
+		/*
+		 * leading 1 bit is "0"
+		 *  utf-8: 0xxxxxxx
+		 *  ucs-4: 00000000 00000000 00000000 0xxxxxxx
+		 */
+		w = 1;
+		ucs4 = s[0] & 0x7f;
+	} else if ((s[0] & 0xe0) == 0xc0) {
+		/*
+		 * leading 3 bits are "110"
+		 *  utf-8: 110xxxxx 10yyyyyy
+		 *  ucs-4: 00000000 00000000 00000xxx xxyyyyyy
+		 */
+		w = 2;
+		ucs4 = s[0] & 0x1f;
+	} else if ((s[0] & 0xf0) == 0xe0) {
+		/*
+		 * leading 4 bits are "1110"
+		 *  utf-8: 1110xxxx 10yyyyyy 10zzzzzz
+		 *  ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz
+		 */
+		w = 3;
+		ucs4 = s[0] & 0x0f;
+	} else if ((s[0] & 0xf8) == 0xf0) {
+		/*
+		 * leading 5 bits are "11110"
+		 *  utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
+		 *  ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz
+		 */
+		w = 4;
+		ucs4 = s[0] & 0x07;
+	} else {
+		/* out of utf-16 range or having illegal bits */
+		return (0);
+	}
+
+	if (srclen < w)
+		return (0);
+
+	/*
+	 * get left parts from utf-8
+	 */
+	for (i = 1 ; i < w ; i++) {
+		if ((s[i] & 0xc0) != 0x80) {
+			/* invalid: leading 2 bits are not "10" */
+			return (0);
+		}
+		/* concatenate trailing 6 bits into ucs4 */
+		ucs4 = (ucs4 << 6) | (s[i] & 0x3f);
+	}
+
+	/* reject overlong forms such as 0xc0 0xaf for '/' */
+	if (ucs4 < min_code[w])
+		return (0);
+
+	*utf8width = w;
+	return (ucs4);
+}
+
+/*
+ * Encode the code point ucs4 as UTF-8 into dst.
+ *
+ * utf8width receives the number of bytes written; dstlen is the space
+ * available at dst.  Returns a pointer to the first byte written, or
+ * NULL when ucs4 is 0x200000 or larger or when dst is too small (nothing
+ * is written and *utf8width is left untouched in that case).
+ */
+static u_char *
+ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
+{
+	u_char *out, prefix;
+	size_t width, pos;
+
+	/* pick the sequence length and the marker bits of the lead byte */
+	if (ucs4 < 0x80) {
+		width = 1;
+		prefix = 0x00;
+	} else if (ucs4 < 0x800) {
+		width = 2;
+		prefix = 0xc0;
+	} else if (ucs4 < 0x10000) {
+		width = 3;
+		prefix = 0xe0;
+	} else if (ucs4 < 0x200000) {
+		width = 4;
+		prefix = 0xf0;
+	} else {
+		return (NULL);
+	}
+
+	if (width > dstlen)
+		return (NULL);
+
+	out = (u_char *)dst;
+
+	/* continuation bytes, last one first: 10xxxxxx with 6 payload bits */
+	for (pos = width ; pos > 1 ; pos--) {
+		out[pos - 1] = 0x80 | (ucs4 & 0x3f);
+		ucs4 >>= 6;
+	}
+	/* whatever is left goes into the lead byte */
+	out[0] = prefix | ucs4;
+
+	*utf8width = width;
+	return (out);
+}
+
+/*
+ * Pack a code point at or above 0x10000 into a UTF-16 surrogate pair:
+ * the high surrogate ends up in the upper 16 bits of the result and the
+ * low surrogate in the lower 16 bits.
+ */
+static uint32_t
+encode_surrogate(register uint32_t code)
+{
+	uint32_t offset, high, low;
+
+	offset = code - 0x10000;
+	/* upper 10 bits of the offset, moved to bits 16..25 */
+	high = (offset << 6) & 0x3ff0000;
+	/* lower 10 bits of the offset */
+	low = offset & 0x3ff;
+
+	return (0xd800dc00 | high | low);
+}
+
+/*
+ * Rebuild a code point from a surrogate pair stored as four bytes in
+ * big-endian order (high surrogate in ucs[0..1], low surrogate in
+ * ucs[2..3]).  Only the 10 payload bits of each surrogate are used; the
+ * marker bits are not validated here.
+ */
+static uint32_t
+decode_surrogate(register const u_char *ucs)
+{
+	uint32_t high, low;
+
+	high = ((ucs[0] & 0x3) << 8) | ucs[1];
+	low = ((ucs[2] & 0x3) << 8) | ucs[3];
+
+	return (((high << 10) | low) + 0x10000);
+}
+
OpenPOWER on IntegriCloud