From 05bd8275f3829e7d1f55e8faabd64095eab3c291 Mon Sep 17 00:00:00 2001 From: ed Date: Tue, 21 May 2013 19:59:37 +0000 Subject: Add . The header, part of C11, adds a small number of utility functions for 16/32-bit "universal" characters, which may or may not be UTF-16/32. As our wchar_t is already ISO 10646, simply add light-weight wrappers around wcrtomb() and mbrtowc(). While there, also add (non-yet-standard) _l functions, similar to the ones we already have for the other locale-dependent functions. Reviewed by: theraven --- include/Makefile | 2 +- include/stdatomic.h | 2 - include/uchar.h | 60 +++++++++ include/xlocale/Makefile | 2 +- include/xlocale/_uchar.h | 46 +++++++ lib/libc/locale/Makefile.inc | 8 +- lib/libc/locale/Symbol.map | 8 ++ lib/libc/locale/c16rtomb.c | 81 ++++++++++++ lib/libc/locale/c32rtomb.c | 59 +++++++++ lib/libc/locale/mbrtoc16.c | 89 ++++++++++++++ lib/libc/locale/mbrtoc32.c | 53 ++++++++ lib/libc/locale/mbrtowc.3 | 96 ++++++++++----- lib/libc/locale/wcrtomb.3 | 60 +++++---- lib/libc/locale/xlocale_private.h | 8 ++ sys/sys/_types.h | 6 + tools/regression/lib/libc/locale/Makefile | 4 +- tools/regression/lib/libc/locale/test-c16rtomb.c | 115 +++++++++++++++++ tools/regression/lib/libc/locale/test-mbrtoc16.c | 150 +++++++++++++++++++++++ 18 files changed, 792 insertions(+), 57 deletions(-) create mode 100644 include/uchar.h create mode 100644 include/xlocale/_uchar.h create mode 100644 lib/libc/locale/c16rtomb.c create mode 100644 lib/libc/locale/c32rtomb.c create mode 100644 lib/libc/locale/mbrtoc16.c create mode 100644 lib/libc/locale/mbrtoc32.c create mode 100644 tools/regression/lib/libc/locale/test-c16rtomb.c create mode 100644 tools/regression/lib/libc/locale/test-mbrtoc16.c diff --git a/include/Makefile b/include/Makefile index 1cd70aa..a6aaa64 100644 --- a/include/Makefile +++ b/include/Makefile @@ -23,7 +23,7 @@ INCS= a.out.h ar.h assert.h bitstring.h complex.h cpio.h _ctype.h ctype.h \ stdnoreturn.h stdio.h stdlib.h string.h stringlist.h \ strings.h sysexits.h tar.h termios.h tgmath.h \ time.h timeconv.h timers.h ttyent.h \ - ulimit.h unistd.h utime.h utmpx.h uuid.h varargs.h \ + uchar.h ulimit.h unistd.h utime.h utmpx.h uuid.h varargs.h \ wchar.h wctype.h wordexp.h xlocale.h .PATH: ${.CURDIR}/../contrib/libc-vis diff --git a/include/stdatomic.h b/include/stdatomic.h index 44f73b5..4d03bb9 100644 --- a/include/stdatomic.h +++ b/include/stdatomic.h @@ -145,10 +145,8 @@ typedef _Atomic(long) atomic_long; typedef _Atomic(unsigned long) atomic_ulong; typedef _Atomic(long long) atomic_llong; typedef _Atomic(unsigned long long) atomic_ullong; -#if 0 typedef _Atomic(__char16_t) atomic_char16_t; typedef _Atomic(__char32_t) atomic_char32_t; -#endif typedef _Atomic(__wchar_t) atomic_wchar_t; typedef _Atomic(__int_least8_t) atomic_int_least8_t; typedef _Atomic(__uint_least8_t) atomic_uint_least8_t; diff --git a/include/uchar.h b/include/uchar.h new file mode 100644 index 0000000..34c919f --- /dev/null +++ b/include/uchar.h @@ -0,0 +1,60 @@ +/*- + * Copyright (c) 2013 Ed Schouten + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _UCHAR_H_ +#define _UCHAR_H_ + +#include +#include + +#ifndef _MBSTATE_T_DECLARED +typedef __mbstate_t mbstate_t; +#define _MBSTATE_T_DECLARED +#endif + +#ifndef _SIZE_T_DECLARED +typedef __size_t size_t; +#define _SIZE_T_DECLARED +#endif + +typedef __char16_t char16_t; +typedef __char32_t char32_t; + +__BEGIN_DECLS +size_t c16rtomb(char * __restrict, char16_t, mbstate_t * __restrict); +size_t c32rtomb(char * __restrict, char32_t, mbstate_t * __restrict); +size_t mbrtoc16(char16_t * __restrict, const char * __restrict, size_t, + mbstate_t * __restrict); +size_t mbrtoc32(char32_t * __restrict, const char * __restrict, size_t, + mbstate_t * __restrict); +#if __BSD_VISIBLE || defined(_XLOCALE_H_) +#include +#endif +__END_DECLS + +#endif /* !_UCHAR_H_ */ diff --git a/include/xlocale/Makefile b/include/xlocale/Makefile index e45ddca..a35db50 100644 --- a/include/xlocale/Makefile +++ b/include/xlocale/Makefile @@ -2,7 +2,7 @@ NO_OBJ= INCS= _ctype.h _inttypes.h _langinfo.h _locale.h _monetary.h _stdio.h\ - _stdlib.h _string.h _time.h _wchar.h + _stdlib.h _string.h _time.h _uchar.h _wchar.h INCSDIR=${INCLUDEDIR}/xlocale .include diff --git a/include/xlocale/_uchar.h b/include/xlocale/_uchar.h new file mode 100644 index 0000000..4256c87 --- /dev/null +++ b/include/xlocale/_uchar.h @@ -0,0 +1,46 @@ +/*- + * Copyright (c) 2013 Ed Schouten + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _LOCALE_T_DEFINED +#define _LOCALE_T_DEFINED +typedef struct _xlocale *locale_t; +#endif + +#ifndef _XLOCALE_UCHAR_H_ +#define _XLOCALE_UCHAR_H_ + +size_t c16rtomb_l(char * __restrict, char16_t, mbstate_t * __restrict, + locale_t); +size_t c32rtomb_l(char * __restrict, char32_t, mbstate_t * __restrict, + locale_t); +size_t mbrtoc16_l(char16_t * __restrict, const char * __restrict, size_t, + mbstate_t * __restrict, locale_t); +size_t mbrtoc32_l(char32_t * __restrict, const char * __restrict, size_t, + mbstate_t * __restrict, locale_t); + +#endif /* _XLOCALE_UCHAR_H_ */ diff --git a/lib/libc/locale/Makefile.inc b/lib/libc/locale/Makefile.inc index f2161be..ffef228 100644 --- a/lib/libc/locale/Makefile.inc +++ b/lib/libc/locale/Makefile.inc @@ -4,11 +4,11 @@ # locale sources .PATH: ${.CURDIR}/${LIBC_ARCH}/locale ${.CURDIR}/locale -SRCS+= ascii.c big5.c btowc.c collate.c collcmp.c euc.c fix_grouping.c \ - gb18030.c gb2312.c gbk.c ctype.c isctype.c iswctype.c \ +SRCS+= ascii.c big5.c btowc.c c16rtomb.c c32rtomb.c collate.c collcmp.c euc.c \ + fix_grouping.c gb18030.c gb2312.c gbk.c ctype.c isctype.c iswctype.c \ ldpart.c lmessages.c lmonetary.c lnumeric.c localeconv.c mblen.c \ mbrlen.c \ - mbrtowc.c mbsinit.c mbsnrtowcs.c \ + mbrtoc16.c mbrtoc32.c mbrtowc.c mbsinit.c mbsnrtowcs.c \ mbsrtowcs.c mbtowc.c mbstowcs.c \ mskanji.c nextwctype.c nl_langinfo.c nomacros.c none.c rpmatch.c \ rune.c \ @@ -72,7 +72,9 @@ MLINKS+=iswalnum_l.3 iswalpha_l.3 iswalnum_l.3 iswcntrl_l.3 \ iswalnum_l.3 iswspecial_l.3 iswalnum_l.3 nextwctype_l.3 \ iswalnum_l.3 towctrans_l.3 iswalnum_l.3 wctrans_l.3 MLINKS+=isxdigit.3 ishexnumber.3 +MLINKS+=mbrtowc.3 mbrtoc16.3 mbrtowc.3 mbrtoc32.3 MLINKS+=mbsrtowcs.3 mbsnrtowcs.3 +MLINKS+=wcrtomb.3 c16rtomb.3 wcrtomb.3 c32rtomb.3 MLINKS+=wcsrtombs.3 wcsnrtombs.3 MLINKS+=wcstod.3 wcstof.3 wcstod.3 wcstold.3 MLINKS+=wcstol.3 wcstoul.3 wcstol.3 wcstoll.3 wcstol.3 wcstoull.3 \ diff --git a/lib/libc/locale/Symbol.map b/lib/libc/locale/Symbol.map index f6367da..b2f2a35 100644 --- a/lib/libc/locale/Symbol.map +++ b/lib/libc/locale/Symbol.map @@ -199,6 +199,14 @@ FBSD_1.3 { __istype_l; __runes_for_locale; _ThreadRuneLocale; + c16rtomb; + c16rtomb_l; + c32rtomb; + c32rtomb_l; + mbrtoc16; + mbrtoc16_l; + mbrtoc32; + mbrtoc32_l; }; FBSDprivate_1.0 { diff --git a/lib/libc/locale/c16rtomb.c b/lib/libc/locale/c16rtomb.c new file mode 100644 index 0000000..8bad982 --- /dev/null +++ b/lib/libc/locale/c16rtomb.c @@ -0,0 +1,81 @@ +/*- + * Copyright (c) 2013 Ed Schouten + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include "xlocale_private.h" + +typedef struct { + char16_t lead_surrogate; + mbstate_t c32_mbstate; +} _Char16State; + +size_t +c16rtomb_l(char * __restrict s, char16_t c16, mbstate_t * __restrict ps, + locale_t locale) +{ + _Char16State *cs; + char32_t c32; + + FIX_LOCALE(locale); + if (ps == NULL) + ps = &locale->c16rtomb; + cs = (_Char16State *)ps; + + /* If s is a null pointer, the value of parameter c16 is ignored. */ + if (s == NULL) { + c32 = 0; + } else if (cs->lead_surrogate >= 0xd800 && + cs->lead_surrogate <= 0xdbff) { + /* We should see a trail surrogate now. */ + if (c16 < 0xdc00 || c16 > 0xdfff) { + errno = EILSEQ; + return ((size_t)-1); + } + c32 = 0x10000 + ((cs->lead_surrogate & 0x3ff) << 10 | + (c16 & 0x3ff)); + } else if (c16 >= 0xd800 && c16 <= 0xdbff) { + /* Store lead surrogate for next invocation. */ + cs->lead_surrogate = c16; + return (0); + } else { + /* Regular character. */ + c32 = c16; + } + cs->lead_surrogate = 0; + + return (c32rtomb_l(s, c32, &cs->c32_mbstate, locale)); +} + +size_t +c16rtomb(char * __restrict s, char16_t c16, mbstate_t * __restrict ps) +{ + + return (c16rtomb_l(s, c16, ps, __get_locale())); +} diff --git a/lib/libc/locale/c32rtomb.c b/lib/libc/locale/c32rtomb.c new file mode 100644 index 0000000..c5f008f --- /dev/null +++ b/lib/libc/locale/c32rtomb.c @@ -0,0 +1,59 @@ +/*- + * Copyright (c) 2013 Ed Schouten + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include "xlocale_private.h" + +size_t +c32rtomb_l(char * __restrict s, char32_t c32, mbstate_t * __restrict ps, + locale_t locale) +{ + + /* Unicode Standard 5.0, D90: ill-formed characters. */ + if ((c32 >= 0xd800 && c32 <= 0xdfff) || c32 > 0x10ffff) { + errno = EILSEQ; + return ((size_t)-1); + } + + FIX_LOCALE(locale); + if (ps == NULL) + ps = &locale->c32rtomb; + + /* Assume wchar_t uses UTF-32. */ + return (wcrtomb_l(s, c32, ps, locale)); +} + +size_t +c32rtomb(char * __restrict s, char32_t c32, mbstate_t * __restrict ps) +{ + + return (c32rtomb_l(s, c32, ps, __get_locale())); +} diff --git a/lib/libc/locale/mbrtoc16.c b/lib/libc/locale/mbrtoc16.c new file mode 100644 index 0000000..a47514c --- /dev/null +++ b/lib/libc/locale/mbrtoc16.c @@ -0,0 +1,89 @@ +/*- + * Copyright (c) 2013 Ed Schouten + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include "xlocale_private.h" + +typedef struct { + char16_t trail_surrogate; + mbstate_t c32_mbstate; +} _Char16State; + +size_t +mbrtoc16_l(char16_t * __restrict pc16, const char * __restrict s, size_t n, + mbstate_t * __restrict ps, locale_t locale) +{ + _Char16State *cs; + char32_t c32; + ssize_t len; + + FIX_LOCALE(locale); + if (ps == NULL) + ps = &locale->mbrtoc16; + cs = (_Char16State *)ps; + + /* + * Call straight into mbrtoc32_l() if we don't need to return a + * character value. According to the spec, if s is a null + * pointer, the value of parameter pc16 is also ignored. + */ + if (pc16 == NULL || s == NULL) { + cs->trail_surrogate = 0; + return (mbrtoc32_l(NULL, s, n, &cs->c32_mbstate, locale)); + } + + /* Return the trail surrogate from the previous invocation. */ + if (cs->trail_surrogate >= 0xdc00 && cs->trail_surrogate <= 0xdfff) { + *pc16 = cs->trail_surrogate; + cs->trail_surrogate = 0; + return ((size_t)-3); + } + + len = mbrtoc32_l(&c32, s, n, &cs->c32_mbstate, locale); + if (len >= 0) { + if (c32 < 0x10000) { + /* Fits in one UTF-16 character. */ + *pc16 = c32; + } else { + /* Split up in a surrogate pair. */ + c32 -= 0x10000; + *pc16 = 0xd800 | (c32 >> 10); + cs->trail_surrogate = 0xdc00 | (c32 & 0x3ff); + } + } + return (len); +} + +size_t +mbrtoc16(char16_t * __restrict pc16, const char * __restrict s, size_t n, + mbstate_t * __restrict ps) +{ + + return (mbrtoc16_l(pc16, s, n, ps, __get_locale())); +} diff --git a/lib/libc/locale/mbrtoc32.c b/lib/libc/locale/mbrtoc32.c new file mode 100644 index 0000000..ee9e47d --- /dev/null +++ b/lib/libc/locale/mbrtoc32.c @@ -0,0 +1,53 @@ +/*- + * Copyright (c) 2013 Ed Schouten + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include "xlocale_private.h" + +size_t +mbrtoc32_l(char32_t * __restrict pc32, const char * __restrict s, size_t n, + mbstate_t * __restrict ps, locale_t locale) +{ + + FIX_LOCALE(locale); + if (ps == NULL) + ps = &locale->mbrtoc32; + + /* Assume wchar_t uses UTF-32. */ + return (mbrtowc_l(pc32, s, n, ps, locale)); +} + +size_t +mbrtoc32(char32_t * __restrict pc32, const char * __restrict s, size_t n, + mbstate_t * __restrict ps) +{ + + return (mbrtoc32_l(pc32, s, n, ps, __get_locale())); +} diff --git a/lib/libc/locale/mbrtowc.3 b/lib/libc/locale/mbrtowc.3 index 10160d1..22b26cd 100644 --- a/lib/libc/locale/mbrtowc.3 +++ b/lib/libc/locale/mbrtowc.3 @@ -24,11 +24,13 @@ .\" .\" $FreeBSD$ .\" -.Dd April 8, 2004 +.Dd May 21, 2013 .Dt MBRTOWC 3 .Os .Sh NAME -.Nm mbrtowc +.Nm mbrtowc , +.Nm mbrtoc16 , +.Nm mbrtoc32 .Nd "convert a character to a wide-character code (restartable)" .Sh LIBRARY .Lb libc @@ -36,35 +38,51 @@ .In wchar.h .Ft size_t .Fo mbrtowc -.Fa "wchar_t * restrict pwc" "const char * restrict s" "size_t n" +.Fa "wchar_t * restrict pc" "const char * restrict s" "size_t n" +.Fa "mbstate_t * restrict ps" +.Fc +.In uchar.h +.Ft size_t +.Fo mbrtoc16 +.Fa "char16_t * restrict pc" "const char * restrict s" "size_t n" +.Fa "mbstate_t * restrict ps" +.Fc +.Ft size_t +.Fo mbrtoc32 +.Fa "char32_t * restrict pc" "const char * restrict s" "size_t n" .Fa "mbstate_t * restrict ps" .Fc .Sh DESCRIPTION The -.Fn mbrtowc -function inspects at most +.Fn mbrtowc , +.Fn mbrtoc16 +and +.Fn mbrtoc32 +functions inspect at most .Fa n bytes pointed to by .Fa s to determine the number of bytes needed to complete the next multibyte character. If a character can be completed, and -.Fa pwc +.Fa pc is not .Dv NULL , the wide character which is represented by .Fa s is stored in the -.Vt wchar_t +.Vt wchar_t , +.Vt char16_t +or +.Vt char32_t it points to. .Pp If .Fa s is .Dv NULL , -.Fn mbrtowc -behaves as if -.Fa pwc +these functions behave as if +.Fa pc was .Dv NULL , .Fa s @@ -81,15 +99,24 @@ argument, is used to keep track of the shift state. If it is .Dv NULL , -.Fn mbrtowc -uses an internal, static +these functions use an internal, static .Vt mbstate_t object, which is initialized to the initial conversion state at program startup. +.Pp +As a single +.Vt char16_t +is not large enough to represent certain multibyte characters, the +.Fn mbrtoc16 +function may need to be invoked multiple times to convert a single +multibyte character sequence. .Sh RETURN VALUES The -.Fn mbrtowc -functions returns: +.Fn mbrtowc , +.Fn mbrtoc16 +and +.Fn mbrtoc32 +functions return: .Bl -tag -width indent .It 0 The next @@ -100,10 +127,13 @@ represent the null wide character .It >0 The next .Fa n -or fewer bytes -represent a valid character, -.Fn mbrtowc -returns the number of bytes used to complete the multibyte character. +or fewer bytes represent a valid character, these functions +return the number of bytes used to complete the multibyte character. +.It Po Vt size_t Pc Ns \-1 +An encoding error has occurred. +The next +.Fa n +or fewer bytes do not contribute to a valid multibyte character. .It Po Vt size_t Pc Ns \-2 The next .Fa n @@ -111,16 +141,23 @@ contribute to, but do not complete, a valid multibyte character sequence, and all .Fa n bytes have been processed. -.It Po Vt size_t Pc Ns \-1 -An encoding error has occurred. -The next -.Fa n -or fewer bytes do not contribute to a valid multibyte character. +.El +.Pp +The +.Fn mbrtoc16 +function also returns: +.Bl -tag -width indent +.It Po Vt size_t Pc Ns \-3 +The next character resulting from a previous call has been stored. +No bytes from the input have been consumed. .El .Sh ERRORS The -.Fn mbrtowc -function will fail if: +.Fn mbrtowc , +.Fn mbrtoc16 +and +.Fn mbrtoc32 +functions will fail if: .Bl -tag -width Er .It Bq Er EILSEQ An invalid multibyte sequence was detected. @@ -134,6 +171,9 @@ The conversion state is invalid. .Xr wcrtomb 3 .Sh STANDARDS The -.Fn mbrtowc -function conforms to -.St -isoC-99 . +.Fn mbrtowc , +.Fn mbrtoc16 +and +.Fn mbrtoc32 +functions conform to +.St -isoC-2011 . diff --git a/lib/libc/locale/wcrtomb.3 b/lib/libc/locale/wcrtomb.3 index c89614e..bc74174 100644 --- a/lib/libc/locale/wcrtomb.3 +++ b/lib/libc/locale/wcrtomb.3 @@ -24,24 +24,34 @@ .\" .\" $FreeBSD$ .\" -.Dd April 8, 2004 +.Dd May 21, 2013 .Dt WCRTOMB 3 .Os .Sh NAME -.Nm wcrtomb +.Nm wcrtomb , +.Nm c16rtomb , +.Nm c32rtomb .Nd "convert a wide-character code to a character (restartable)" .Sh LIBRARY .Lb libc .Sh SYNOPSIS .In wchar.h .Ft size_t -.Fn wcrtomb "char * restrict s" "wchar_t wc" "mbstate_t * restrict ps" +.Fn wcrtomb "char * restrict s" "wchar_t c" "mbstate_t * restrict ps" +.In uchar.h +.Ft size_t +.Fn c16rtomb "char * restrict s" "char16_t c" "mbstate_t * restrict ps" +.Ft size_t +.Fn c32rtomb "char * restrict s" "char32_t c" "mbstate_t * restrict ps" .Sh DESCRIPTION The -.Fn wcrtomb -function stores a multibyte sequence representing the +.Fn wcrtomb , +.Fn c16rtomb +and +.Fn c32rtomb +functions store a multibyte sequence representing the wide character -.Fa wc , +.Fa c , including any necessary shift sequences, to the character array .Fa s , @@ -53,11 +63,10 @@ If .Fa s is .Dv NULL , -.Fn wcrtomb -behaves as if +these functions behave as if .Fa s pointed to an internal buffer and -.Fa wc +.Fa c was a null wide character (L'\e0'). .Pp The @@ -67,26 +76,32 @@ argument, is used to keep track of the shift state. If it is .Dv NULL , -.Fn wcrtomb -uses an internal, static +these functions use an internal, static .Vt mbstate_t object, which is initialized to the initial conversion state at program startup. +.Pp +As certain multibyte characters may only be represented by a series of +16-bit characters, the +.Fn c16rtomb +may need to invoked multiple times before a multibyte sequence is +returned. .Sh RETURN VALUES -The -.Fn wcrtomb -functions returns the length (in bytes) of the multibyte sequence +These functions return the length (in bytes) of the multibyte sequence needed to represent -.Fa wc , +.Fa c , or .Po Vt size_t Pc Ns \-1 if -.Fa wc +.Fa c is not a valid wide character code. .Sh ERRORS The -.Fn wcrtomb -function will fail if: +.Fn wcrtomb , +.Fn c16rtomb +and +.Fn c32rtomb +functions will fail if: .Bl -tag -width Er .It Bq Er EILSEQ An invalid wide character code was specified. @@ -100,6 +115,9 @@ The conversion state is invalid. .Xr wctomb 3 .Sh STANDARDS The -.Fn wcrtomb -function conforms to -.St -isoC-99 . +.Fn wcrtomb , +.Fn c16rtomb +and +.Fn c32rtomb +functions conform to +.St -isoC-2011 . diff --git a/lib/libc/locale/xlocale_private.h b/lib/libc/locale/xlocale_private.h index 813f719..502b548 100644 --- a/lib/libc/locale/xlocale_private.h +++ b/lib/libc/locale/xlocale_private.h @@ -109,6 +109,10 @@ struct _xlocale { __mbstate_t mblen; /** Persistent state used by mbrlen() calls. */ __mbstate_t mbrlen; + /** Persistent state used by mbrtoc16() calls. */ + __mbstate_t mbrtoc16; + /** Persistent state used by mbrtoc32() calls. */ + __mbstate_t mbrtoc32; /** Persistent state used by mbrtowc() calls. */ __mbstate_t mbrtowc; /** Persistent state used by mbsnrtowcs() calls. */ @@ -117,6 +121,10 @@ struct _xlocale { __mbstate_t mbsrtowcs; /** Persistent state used by mbtowc() calls. */ __mbstate_t mbtowc; + /** Persistent state used by c16rtomb() calls. */ + __mbstate_t c16rtomb; + /** Persistent state used by c32rtomb() calls. */ + __mbstate_t c32rtomb; /** Persistent state used by wcrtomb() calls. */ __mbstate_t wcrtomb; /** Persistent state used by wcsnrtombs() calls. */ diff --git a/sys/sys/_types.h b/sys/sys/_types.h index 27ecaf4..e47749f 100644 --- a/sys/sys/_types.h +++ b/sys/sys/_types.h @@ -89,6 +89,12 @@ typedef int __ct_rune_t; /* arg type for ctype funcs */ typedef __ct_rune_t __rune_t; /* rune_t (see above) */ typedef __ct_rune_t __wint_t; /* wint_t (see above) */ +/* Clang already provides these types as built-ins, but only in C++ mode. */ +#if !defined(__clang__) || !defined(__cplusplus) +typedef __uint_least16_t __char16_t; +typedef __uint_least32_t __char32_t; +#endif + typedef __uint32_t __dev_t; /* device number */ typedef __uint32_t __fixpt_t; /* fixed point number */ diff --git a/tools/regression/lib/libc/locale/Makefile b/tools/regression/lib/libc/locale/Makefile index 5fb9a94..ebbd537 100644 --- a/tools/regression/lib/libc/locale/Makefile +++ b/tools/regression/lib/libc/locale/Makefile @@ -14,7 +14,9 @@ TESTS= test-mbrtowc \ test-wcstombs \ test-mblen \ test-iswctype \ - test-towctrans + test-towctrans \ + test-c16rtomb \ + test-mbrtoc16 .PHONY: tests tests: ${TESTS} diff --git a/tools/regression/lib/libc/locale/test-c16rtomb.c b/tools/regression/lib/libc/locale/test-c16rtomb.c new file mode 100644 index 0000000..eb88946 --- /dev/null +++ b/tools/regression/lib/libc/locale/test-c16rtomb.c @@ -0,0 +1,115 @@ +/*- + * Copyright (c) 2002 Tim J. Robbins + * All rights reserved. + * + * Copyright (c) 2013 Ed Schouten + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * Test program for c16rtomb() as specified by ISO/IEC 9899:2011. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include + +int +main(int argc, char *argv[]) +{ + mbstate_t s; + char buf[MB_LEN_MAX + 1]; + + /* + * C/POSIX locale. + */ + + printf("1..1\n"); + + /* + * If the buffer argument is NULL, c16 is implicitly 0, + * c16rtomb() resets its internal state. + */ + assert(c16rtomb(NULL, L'\0', NULL) == 1); + assert(c16rtomb(NULL, 0xdc00, NULL) == 1); + + /* Null wide character. */ + memset(&s, 0, sizeof(s)); + memset(buf, 0xcc, sizeof(buf)); + assert(c16rtomb(buf, 0, &s) == 1); + assert((unsigned char)buf[0] == 0 && (unsigned char)buf[1] == 0xcc); + + /* Latin letter A, internal state. */ + assert(c16rtomb(NULL, L'\0', NULL) == 1); + assert(c16rtomb(NULL, L'A', NULL) == 1); + + /* Latin letter A. */ + memset(&s, 0, sizeof(s)); + memset(buf, 0xcc, sizeof(buf)); + assert(c16rtomb(buf, L'A', &s) == 1); + assert((unsigned char)buf[0] == 'A' && (unsigned char)buf[1] == 0xcc); + + /* Unicode character 'Pile of poo'. */ + memset(&s, 0, sizeof(s)); + memset(buf, 0xcc, sizeof(buf)); + assert(c16rtomb(buf, 0xd83d, &s) == 0); + assert(c16rtomb(buf, 0xdca9, &s) == (size_t)-1); + assert(errno == EILSEQ); + + /* + * UTF-8. + */ + + assert(strcmp(setlocale(LC_CTYPE, "en_US.UTF-8"), "en_US.UTF-8") == 0); + + /* Unicode character 'Pile of poo'. */ + memset(&s, 0, sizeof(s)); + memset(buf, 0xcc, sizeof(buf)); + assert(c16rtomb(buf, 0xd83d, &s) == 0); + assert(c16rtomb(buf, 0xdca9, &s) == 4); + assert((unsigned char)buf[0] == 0xf0 && (unsigned char)buf[1] == 0x9f && + (unsigned char)buf[2] == 0x92 && (unsigned char)buf[3] == 0xa9 && + (unsigned char)buf[4] == 0xcc); + + /* Invalid code; 'Pile of poo' without the trail surrogate. */ + memset(&s, 0, sizeof(s)); + memset(buf, 0xcc, sizeof(buf)); + assert(c16rtomb(buf, 0xd83d, &s) == 0); + assert(c16rtomb(buf, L'A', &s) == (size_t)-1); + assert(errno == EILSEQ); + + /* Invalid code; 'Pile of poo' without the lead surrogate. */ + memset(&s, 0, sizeof(s)); + memset(buf, 0xcc, sizeof(buf)); + assert(c16rtomb(buf, 0xdca9, &s) == (size_t)-1); + assert(errno == EILSEQ); + + printf("ok 1 - c16rtomb()\n"); +} diff --git a/tools/regression/lib/libc/locale/test-mbrtoc16.c b/tools/regression/lib/libc/locale/test-mbrtoc16.c new file mode 100644 index 0000000..88e8091 --- /dev/null +++ b/tools/regression/lib/libc/locale/test-mbrtoc16.c @@ -0,0 +1,150 @@ +/*- + * Copyright (c) 2002 Tim J. Robbins + * All rights reserved. + * + * Copyright (c) 2013 Ed Schouten + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * Test program for mbrtoc16() as specified by ISO/IEC 9899:2011. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include + +int +main(int argc, char *argv[]) +{ + mbstate_t s; + size_t len; + char16_t c16; + + /* + * C/POSIX locale. + */ + + printf("1..1\n"); + + /* Null wide character, internal state. */ + assert(mbrtoc16(&c16, "", 1, NULL) == 0); + assert(c16 == 0); + + /* Null wide character. */ + memset(&s, 0, sizeof(s)); + assert(mbrtoc16(&c16, "", 1, &s) == 0); + assert(c16 == 0); + + /* Latin letter A, internal state. */ + assert(mbrtoc16(NULL, 0, 0, NULL) == 0); + assert(mbrtoc16(&c16, "A", 1, NULL) == 1); + assert(c16 == L'A'); + + /* Latin letter A. */ + memset(&s, 0, sizeof(s)); + assert(mbrtoc16(&c16, "A", 1, &s) == 1); + assert(c16 == L'A'); + + /* Incomplete character sequence. */ + c16 = L'z'; + memset(&s, 0, sizeof(s)); + assert(mbrtoc16(&c16, "", 0, &s) == (size_t)-2); + assert(c16 == L'z'); + + /* Check that mbrtoc16() doesn't access the buffer when n == 0. */ + c16 = L'z'; + memset(&s, 0, sizeof(s)); + assert(mbrtoc16(&c16, "", 0, &s) == (size_t)-2); + assert(c16 == L'z'); + + /* + * UTF-8. + */ + + assert(strcmp(setlocale(LC_CTYPE, "en_US.UTF-8"), "en_US.UTF-8") == 0); + + /* Null wide character, internal state. */ + assert(mbrtoc16(NULL, 0, 0, NULL) == 0); + assert(mbrtoc16(&c16, "", 1, NULL) == 0); + assert(c16 == 0); + + /* Null wide character. */ + memset(&s, 0, sizeof(s)); + assert(mbrtoc16(&c16, "", 1, &s) == 0); + assert(c16 == 0); + + /* Latin letter A, internal state. */ + assert(mbrtoc16(NULL, 0, 0, NULL) == 0); + assert(mbrtoc16(&c16, "A", 1, NULL) == 1); + assert(c16 == L'A'); + + /* Latin letter A. */ + memset(&s, 0, sizeof(s)); + assert(mbrtoc16(&c16, "A", 1, &s) == 1); + assert(c16 == L'A'); + + /* Incomplete character sequence (zero length). */ + c16 = L'z'; + memset(&s, 0, sizeof(s)); + assert(mbrtoc16(&c16, "", 0, &s) == (size_t)-2); + assert(c16 == L'z'); + + /* Incomplete character sequence (truncated double-byte). */ + memset(&s, 0, sizeof(s)); + c16 = 0; + assert(mbrtoc16(&c16, "\xc3", 1, &s) == (size_t)-2); + + /* Same as above, but complete. */ + memset(&s, 0, sizeof(s)); + c16 = 0; + assert(mbrtoc16(&c16, "\xc3\x84", 2, &s) == 2); + assert(c16 == 0xc4); + + /* Test restarting behaviour. */ + memset(&s, 0, sizeof(s)); + c16 = 0; + assert(mbrtoc16(&c16, "\xc3", 1, &s) == (size_t)-2); + assert(c16 == 0); + assert(mbrtoc16(&c16, "\xb7", 1, &s) == 1); + assert(c16 == 0xf7); + + /* Surrogate pair. */ + memset(&s, 0, sizeof(s)); + c16 = 0; + assert(mbrtoc16(&c16, "\xf0\x9f\x92\xa9", 4, &s) == 4); + assert(c16 == 0xd83d); + assert(mbrtoc16(&c16, "", 0, &s) == (size_t)-3); + assert(c16 == 0xdca9); + + printf("ok 1 - mbrtoc16()\n"); + + return (0); +} -- cgit v1.1