Use conversion state objects to store the accumulated wide character,

low bound, and the number of bytes remaining instead of storing the raw byte sequence and deriving them every time mbrtowc() is called. This is much faster -- about twice as fast in some crude benchmarks.
author: tjr <tjr@FreeBSD.org> 2004-05-17 12:32:40 +0000
committer: tjr <tjr@FreeBSD.org> 2004-05-17 12:32:40 +0000
commit: aee8349a0cd7f865941348a59879239207c8a21e (patch)
tree: 314729e4e801b26971a9b1a60a738b91641469f0 /lib/libc/locale/utf8.c
parent: b40b6a2d84571484579ca9193ffb9b9502c28389 (diff)
download: FreeBSD-src-aee8349a0cd7f865941348a59879239207c8a21e.zip
FreeBSD-src-aee8349a0cd7f865941348a59879239207c8a21e.tar.gz
1 files changed, 67 insertions, 63 deletions
diff --git a/lib/libc/locale/utf8.c b/lib/libc/locale/utf8.c
index d754a19..c0fbcf4 100644
--- a/lib/libc/locale/utf8.c
+++ b/lib/libc/locale/utf8.c
@@ -40,8 +40,9 @@ int	_UTF8_mbsinit(const mbstate_t *);
 size_t	_UTF8_wcrtomb(char * __restrict, wchar_t, mbstate_t * __restrict);
 
 typedef struct {
-	int	count;
-	u_char	bytes[6];
+	wchar_t	ch;
+	int	want;
+	wchar_t	lbound;
 } _UTF8State;
 
 int
@@ -61,7 +62,7 @@ int
 _UTF8_mbsinit(const mbstate_t *ps)
 {
 
-	return (ps == NULL || ((const _UTF8State *)ps)->count == 0);
+	return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
 }
 
 size_t
@@ -69,13 +70,12 @@ _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
     mbstate_t * __restrict ps)
 {
 	_UTF8State *us;
-	int ch, i, len, mask, ocount;
+	int ch, i, mask, want;
 	wchar_t lbound, wch;
-	size_t ncopy;
 
 	us = (_UTF8State *)ps;
 
-	if (us->count < 0 || us->count > sizeof(us->bytes)) {
+	if (us->want < 0 || us->want > 6) {
 		errno = EINVAL;
 		return ((size_t)-1);
 	}
@@ -86,72 +86,69 @@ _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
 		pwc = NULL;
 	}
 
-	ncopy = MIN(MIN(n, MB_CUR_MAX), sizeof(us->bytes) - us->count);
-	memcpy(us->bytes + us->count, s, ncopy);
-	ocount = us->count;
-	us->count += ncopy;
-	s = (char *)us->bytes;
-	n = us->count;
-
 	if (n == 0)
 		/* Incomplete multibyte sequence */
 		return ((size_t)-2);
 
-	/*
-	 * Determine the number of octets that make up this character from
-	 * the first octet, and a mask that extracts the interesting bits of
-	 * the first octet.
-	 *
-	 * We also specify a lower bound for the character code to detect
-	 * redundant, non-"shortest form" encodings. For example, the
-	 * sequence C0 80 is _not_ a legal representation of the null
-	 * character. This enforces a 1-to-1 mapping between character
-	 * codes and their multibyte representations.
-	 */
-	ch = (unsigned char)*s;
-	if ((ch & 0x80) == 0) {
-		mask = 0x7f;
-		len = 1;
-		lbound = 0;
-	} else if ((ch & 0xe0) == 0xc0) {
-		mask = 0x1f;
-		len = 2;
-		lbound = 0x80;
-	} else if ((ch & 0xf0) == 0xe0) {
-		mask = 0x0f;
-		len = 3;
-		lbound = 0x800;
-	} else if ((ch & 0xf8) == 0xf0) {
-		mask = 0x07;
-		len = 4;
-		lbound = 0x10000;
-	} else if ((ch & 0xfc) == 0xf8) {
-		mask = 0x03;
-		len = 5;
-		lbound = 0x200000;
-	} else if ((ch & 0xfc) == 0xfc) {
-		mask = 0x01;
-		len = 6;
-		lbound = 0x4000000;
-	} else {
+	if (us->want == 0) {
 		/*
-		 * Malformed input; input is not UTF-8.
+		 * Determine the number of octets that make up this character
+		 * from the first octet, and a mask that extracts the
+		 * interesting bits of the first octet. We already know
+		 * the character is at least two bytes long.
+		 *
+		 * We also specify a lower bound for the character code to
+		 * detect redundant, non-"shortest form" encodings. For
+		 * example, the sequence C0 80 is _not_ a legal representation
+		 * of the null character. This enforces a 1-to-1 mapping
+		 * between character codes and their multibyte representations.
 		 */
-		errno = EILSEQ;
-		return ((size_t)-1);
+		ch = (unsigned char)*s;
+		if ((ch & 0x80) == 0) {
+			mask = 0x7f;
+			want = 1;
+			lbound = 0;
+		} else if ((ch & 0xe0) == 0xc0) {
+			mask = 0x1f;
+			want = 2;
+			lbound = 0x80;
+		} else if ((ch & 0xf0) == 0xe0) {
+			mask = 0x0f;
+			want = 3;
+			lbound = 0x800;
+		} else if ((ch & 0xf8) == 0xf0) {
+			mask = 0x07;
+			want = 4;
+			lbound = 0x10000;
+		} else if ((ch & 0xfc) == 0xf8) {
+			mask = 0x03;
+			want = 5;
+			lbound = 0x200000;
+		} else if ((ch & 0xfc) == 0xfc) {
+			mask = 0x01;
+			want = 6;
+			lbound = 0x4000000;
+		} else {
+			/*
+			 * Malformed input; input is not UTF-8.
+			 */
+			errno = EILSEQ;
+			return ((size_t)-1);
+		}
+	} else {
+		want = us->want;
+		lbound = us->lbound;
 	}
 
-	if (n < (size_t)len)
-		/* Incomplete multibyte sequence */
-		return ((size_t)-2);
-
 	/*
 	 * Decode the octet sequence representing the character in chunks
 	 * of 6 bits, most significant first.
 	 */
-	wch = (unsigned char)*s++ & mask;
-	i = len;
-	while (--i != 0) {
+	if (us->want == 0)
+		wch = (unsigned char)*s++ & mask;
+	else
+		wch = us->ch;
+	for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
 		if ((*s & 0xc0) != 0x80) {
 			/*
 			 * Malformed input; bad characters in the middle
@@ -163,6 +160,13 @@ _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
 		wch <<= 6;
 		wch |= *s++ & 0x3f;
 	}
+	if (i < want) {
+		/* Incomplete multibyte sequence. */
+		us->want = want - i;
+		us->lbound = lbound;
+		us->ch = wch;
+		return ((size_t)-2);
+	}
 	if (wch < lbound) {
 		/*
 		 * Malformed input; redundant encoding.
@@ -172,8 +176,8 @@ _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
 	}
 	if (pwc != NULL)
 		*pwc = wch;
-	us->count = 0;
-	return (wch == L'\0' ? 0 : len - ocount);
+	us->want = 0;
+	return (wch == L'\0' ? 0 : want);
 }
 
 size_t
@@ -185,7 +189,7 @@ _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps)
 
 	us = (_UTF8State *)ps;
 
-	if (us->count < 0 || us->count > sizeof(us->bytes)) {
+	if (us->want != 0) {
 		errno = EINVAL;
 		return ((size_t)-1);
 	}
author	tjr <tjr@FreeBSD.org>	2004-05-17 12:32:40 +0000
committer	tjr <tjr@FreeBSD.org>	2004-05-17 12:32:40 +0000
commit	aee8349a0cd7f865941348a59879239207c8a21e (patch)
tree	314729e4e801b26971a9b1a60a738b91641469f0 /lib/libc/locale/utf8.c
parent	b40b6a2d84571484579ca9193ffb9b9502c28389 (diff)
download	FreeBSD-src-aee8349a0cd7f865941348a59879239207c8a21e.zip FreeBSD-src-aee8349a0cd7f865941348a59879239207c8a21e.tar.gz