Convert the Big5, EUC, MSKanji and UTF-8 encoding methods to implement

mbrtowc() and wcrtomb() directly. GB18030, GBK and UTF2 are left unconverted; GB18030 will be done eventually, but GBK and UTF2 may just be removed, as they are subsets of GB18030 and UTF-8 respectively.
author: tjr <tjr@FreeBSD.org> 2003-11-02 10:09:33 +0000
committer: tjr <tjr@FreeBSD.org> 2003-11-02 10:09:33 +0000
commit: 1c3a3f7e2621ff295c7e7b931e81655cef49d115 (patch)
tree: 824f8d75e428ea4465ffd6a2314d3084178e9224 /lib/libc/locale/utf8.c
parent: 69c81d4892abdd3dea4d40d0a43d101eeb956109 (diff)
download: FreeBSD-src-1c3a3f7e2621ff295c7e7b931e81655cef49d115.zip
FreeBSD-src-1c3a3f7e2621ff295c7e7b931e81655cef49d115.tar.gz
1 files changed, 69 insertions, 71 deletions
diff --git a/lib/libc/locale/utf8.c b/lib/libc/locale/utf8.c
index c22d3d6..10f937b 100644
--- a/lib/libc/locale/utf8.c
+++ b/lib/libc/locale/utf8.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2002 Tim J. Robbins
+ * Copyright (c) 2002, 2003 Tim J. Robbins
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -27,37 +27,46 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
-#include <rune.h>
+#include <errno.h>
+#include <runetype.h>
 #include <stddef.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <wchar.h>
 
-rune_t	_UTF8_sgetrune(const char *, size_t, char const **);
-int	_UTF8_sputrune(rune_t, char *, size_t, char **);
+extern size_t (*__mbrtowc)(wchar_t * __restrict, const char * __restrict,
+    size_t, mbstate_t * __restrict);
+extern size_t (*__wcrtomb)(char * __restrict, wchar_t, mbstate_t * __restrict);
+
+size_t  _UTF8_mbrtowc(wchar_t * __restrict, const char * __restrict, size_t,
+	    mbstate_t * __restrict);
+size_t  _UTF8_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict);
 
 int
 _UTF8_init(_RuneLocale *rl)
 {
 
-	rl->sgetrune = _UTF8_sgetrune;
-	rl->sputrune = _UTF8_sputrune;
+	__mbrtowc = _UTF8_mbrtowc;
+	__wcrtomb = _UTF8_wcrtomb;
 	_CurrentRuneLocale = rl;
 	__mb_cur_max = 6;
 
 	return (0);
 }
 
-rune_t
-_UTF8_sgetrune(const char *string, size_t n, const char **result)
+size_t
+_UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
+    mbstate_t * __restrict ps __unused)
 {
-	int ch, len, mask;
-	rune_t lbound, wch;
+	int ch, i, len, mask;
+	wchar_t lbound, wch;
 
-	if (n < 1) {
-		if (result != NULL)
-			*result = string;
-		return (_INVALID_RUNE);
-	}
+	if (s == NULL)
+		/* Reset to initial shift state (no-op) */
+		return (0);
+	if (n == 0)
+		/* Incomplete multibyte sequence */
+		return ((size_t)-2);
 
 	/*
 	 * Determine the number of octets that make up this character from
@@ -70,7 +79,7 @@ _UTF8_sgetrune(const char *string, size_t n, const char **result)
 	 * character. This enforces a 1-to-1 mapping between character
 	 * codes and their multibyte representations.
 	 */
-	ch = (unsigned char)*string;
+	ch = (unsigned char)*s;
 	if ((ch & 0x80) == 0) {
 		mask = 0x7f;
 		len = 1;
@@ -99,106 +108,95 @@ _UTF8_sgetrune(const char *string, size_t n, const char **result)
 		/*
 		 * Malformed input; input is not UTF-8.
 		 */
-		if (result != NULL)
-			*result = string + 1;
-		return (_INVALID_RUNE);
+		errno = EILSEQ;
+		return ((size_t)-1);
 	}
 
-	if (n < len) {
-		/*
-		 * Truncated or partial input.
-		 */
-		if (result != NULL)
-			*result = string;
-		return (_INVALID_RUNE);
-	}
+	if (n < (size_t)len)
+		/* Incomplete multibyte sequence */
+		return ((size_t)-2);
 
 	/*
 	 * Decode the octet sequence representing the character in chunks
 	 * of 6 bits, most significant first.
 	 */
-	wch = (unsigned char)*string++ & mask;
-	while (--len != 0) {
-		if ((*string & 0xc0) != 0x80) {
+	wch = (unsigned char)*s++ & mask;
+	i = len;
+	while (--i != 0) {
+		if ((*s & 0xc0) != 0x80) {
 			/*
 			 * Malformed input; bad characters in the middle
 			 * of a character.
 			 */
-			wch = _INVALID_RUNE;
-			if (result != NULL)
-				*result = string + 1;
-			return (_INVALID_RUNE);
+			errno = EILSEQ;
+			return ((size_t)-1);
 		}
 		wch <<= 6;
-		wch |= *string++ & 0x3f;
+		wch |= *s++ & 0x3f;
 	}
-	if (wch != _INVALID_RUNE && wch < lbound)
+	if (wch < lbound) {
 		/*
 		 * Malformed input; redundant encoding.
 		 */
-		wch = _INVALID_RUNE;
-	if (result != NULL)
-		*result = string;
-	return (wch);
+		errno = EILSEQ;
+		return ((size_t)-1);
+	}
+	if (pwc != NULL)
+		*pwc = wch;
+	return (wch == L'\0' ? 0 : i);
 }
 
-int
-_UTF8_sputrune(rune_t c, char *string, size_t n, char **result)
+size_t
+_UTF8_wcrtomb(char * __restrict s, wchar_t wc,
+    mbstate_t * __restrict ps __unused)
 {
 	unsigned char lead;
 	int i, len;
 
+	if (s == NULL)
+		/* Reset to initial shift state (no-op) */
+		return (1);
+
 	/*
 	 * Determine the number of octets needed to represent this character.
 	 * We always output the shortest sequence possible. Also specify the
 	 * first few bits of the first octet, which contains the information
 	 * about the sequence length.
 	 */
-	if ((c & ~0x7f) == 0) {
+	if ((wc & ~0x7f) == 0) {
 		lead = 0;
 		len = 1;
-	} else if ((c & ~0x7ff) == 0) {
+	} else if ((wc & ~0x7ff) == 0) {
 		lead = 0xc0;
 		len = 2;
-	} else if ((c & ~0xffff) == 0) {
+	} else if ((wc & ~0xffff) == 0) {
 		lead = 0xe0;
 		len = 3;
-	} else if ((c & ~0x1fffff) == 0) {
+	} else if ((wc & ~0x1fffff) == 0) {
 		lead = 0xf0;
 		len = 4;
-	} else if ((c & ~0x3ffffff) == 0) {
+	} else if ((wc & ~0x3ffffff) == 0) {
 		lead = 0xf8;
 		len = 5;
-	} else if ((c & ~0x7fffffff) == 0) {
+	} else if ((wc & ~0x7fffffff) == 0) {
 		lead = 0xfc;
 		len = 6;
 	} else {
-		/*
-		 * Wide character code is out of range.
-		 */
-		if (result != NULL)
-			*result = NULL;
-		return (0);
+		errno = EILSEQ;
+		return ((size_t)-1);
 	}
 
-	if (n < len) {
-		if (result != NULL)
-			*result = NULL;
-	} else {
-		/*
-		 * Output the octets representing the character in chunks
-		 * of 6 bits, least significant last. The first octet is
-		 * a special case because it contains the sequence length
-		 * information.
-		 */
-		for (i = len - 1; i > 0; i--) {
-			string[i] = (c & 0x3f) | 0x80;
-			c >>= 6;
-		}
-		*string = (c & 0xff) | lead;
-		if (result != NULL)
-			*result = string + len;
+	/*
+	 * Output the octets representing the character in chunks
+	 * of 6 bits, least significant last. The first octet is
+	 * a special case because it contains the sequence length
+	 * information.
+	 */
+	for (i = len - 1; i > 0; i--) {
+		s[i] = (wc & 0x3f) | 0x80;
+		wc >>= 6;
 	}
+	*s = (wc & 0xff) | lead;
 
 	return (len);
 }
author	tjr <tjr@FreeBSD.org>	2003-11-02 10:09:33 +0000
committer	tjr <tjr@FreeBSD.org>	2003-11-02 10:09:33 +0000
commit	1c3a3f7e2621ff295c7e7b931e81655cef49d115 (patch)
tree	824f8d75e428ea4465ffd6a2314d3084178e9224 /lib/libc/locale/utf8.c
parent	69c81d4892abdd3dea4d40d0a43d101eeb956109 (diff)
download	FreeBSD-src-1c3a3f7e2621ff295c7e7b931e81655cef49d115.zip FreeBSD-src-1c3a3f7e2621ff295c7e7b931e81655cef49d115.tar.gz