diff options
Diffstat (limited to 'libarchive/test/test_archive_string_conversion.c')
-rw-r--r-- | libarchive/test/test_archive_string_conversion.c | 629 |
1 files changed, 629 insertions, 0 deletions
diff --git a/libarchive/test/test_archive_string_conversion.c b/libarchive/test/test_archive_string_conversion.c new file mode 100644 index 0000000..8b833ea --- /dev/null +++ b/libarchive/test/test_archive_string_conversion.c @@ -0,0 +1,629 @@ +/*- + * Copyright (c) 2011 Michihiro NAKAJIMA + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include "test.h" +__FBSDID("$FreeBSD$"); + +#include <locale.h> + +#define __LIBARCHIVE_TEST +#include "archive_string.h" + +/* +Execute the following to rebuild the data for this program: + tail -n +36 test_archive_string_conversion.c | /bin/sh +# +# This requires http://unicode.org/Public/UNIDATA/NormalizationTest.txt +# +if="NormalizationTest.txt" +if [ ! -f ${if} ]; then + echo "Not found: \"${if}\"" + exit 0 +fi +of=test_archive_string_conversion.txt.Z +echo "\$FreeBSD\$" > ${of}.uu +awk -F ';' '$0 ~/^[0-9A-F]+/ {printf "%s;%s\n", $2, $3}' ${if} | compress | uuencode ${of} >> ${of}.uu +exit 1 +*/ + +static int +unicode_to_utf8(char *p, uint32_t uc) +{ + char *_p = p; + + /* Translate code point to UTF8 */ + if (uc <= 0x7f) { + *p++ = (char)uc; + } else if (uc <= 0x7ff) { + *p++ = 0xc0 | ((uc >> 6) & 0x1f); + *p++ = 0x80 | (uc & 0x3f); + } else if (uc <= 0xffff) { + *p++ = 0xe0 | ((uc >> 12) & 0x0f); + *p++ = 0x80 | ((uc >> 6) & 0x3f); + *p++ = 0x80 | (uc & 0x3f); + } else { + *p++ = 0xf0 | ((uc >> 18) & 0x07); + *p++ = 0x80 | ((uc >> 12) & 0x3f); + *p++ = 0x80 | ((uc >> 6) & 0x3f); + *p++ = 0x80 | (uc & 0x3f); + } + return ((int)(p - _p)); +} + +static void +archive_be16enc(void *pp, uint16_t u) +{ + unsigned char *p = (unsigned char *)pp; + + p[0] = (u >> 8) & 0xff; + p[1] = u & 0xff; +} + +static int +unicode_to_utf16be(char *p, uint32_t uc) +{ + char *utf16 = p; + + if (uc > 0xffff) { + /* We have a code point that won't fit into a + * wchar_t; convert it to a surrogate pair. */ + uc -= 0x10000; + archive_be16enc(utf16, ((uc >> 10) & 0x3ff) + 0xD800); + archive_be16enc(utf16+2, (uc & 0x3ff) + 0xDC00); + return (4); + } else { + archive_be16enc(utf16, uc); + return (2); + } +} + +static void +archive_le16enc(void *pp, uint16_t u) +{ + unsigned char *p = (unsigned char *)pp; + + p[0] = u & 0xff; + p[1] = (u >> 8) & 0xff; +} + +static size_t +unicode_to_utf16le(char *p, uint32_t uc) +{ + char *utf16 = p; + + if (uc > 0xffff) { + /* We have a code point that won't fit into a + * wchar_t; convert it to a surrogate pair. */ + uc -= 0x10000; + archive_le16enc(utf16, ((uc >> 10) & 0x3ff) + 0xD800); + archive_le16enc(utf16+2, (uc & 0x3ff) + 0xDC00); + return (4); + } else { + archive_le16enc(utf16, uc); + return (2); + } +} + +static int +wc_size(void) +{ + return (sizeof(wchar_t)); +} + +static int +unicode_to_wc(wchar_t *wp, uint32_t uc) +{ + if (wc_size() == 4) { + *wp = (wchar_t)uc; + return (1); + } + if (uc > 0xffff) { + /* We have a code point that won't fit into a + * wchar_t; convert it to a surrogate pair. */ + uc -= 0x10000; + *wp++ = (wchar_t)(((uc >> 10) & 0x3ff) + 0xD800); + *wp = (wchar_t)((uc & 0x3ff) + 0xDC00); + return (2); + } else { + *wp = (wchar_t)uc; + return (1); + } +} + +/* + * Note: U+2000 - U+2FFF, U+F900 - U+FAFF and U+2F800 - U+2FAFF are not + * converted to NFD on Mac OS. + * see also http://developer.apple.com/library/mac/#qa/qa2001/qa1173.html + */ +static int +scan_unicode_pattern(char *out, wchar_t *wout, char *u16be, char *u16le, + const char *pattern, int exclude_mac_nfd) +{ + unsigned uc = 0; + const char *p = pattern; + char *op = out; + wchar_t *owp = wout; + char *op16be = u16be; + char *op16le = u16le; + + for (;;) { + if (*p >= '0' && *p <= '9') + uc = (uc << 4) + (*p - '0'); + else if (*p >= 'A' && *p <= 'F') + uc = (uc << 4) + (*p - 'A' + 0x0a); + else { + if (exclude_mac_nfd) { + /* + * These are not converted to NFD on Mac OS. + */ + if ((uc >= 0x2000 && uc <= 0x2FFF) || + (uc >= 0xF900 && uc <= 0xFAFF) || + (uc >= 0x2F800 && uc <= 0x2FAFF)) + return (-1); + /* + * Those code points are not converted to + * NFD on Mac OS. I do not know the reason + * because it is undocumented. + * NFC NFD + * 1109A ==> 11099 110BA + * 1109C ==> 1109B 110BA + * 110AB ==> 110A5 110BA + */ + if (uc == 0x1109A || uc == 0x1109C || + uc == 0x110AB) + return (-1); + } + op16be += unicode_to_utf16be(op16be, uc); + op16le += unicode_to_utf16le(op16le, uc); + owp += unicode_to_wc(owp, uc); + op += unicode_to_utf8(op, uc); + if (!*p) { + *op16be++ = 0; + *op16be = 0; + *op16le++ = 0; + *op16le = 0; + *owp = L'\0'; + *op = '\0'; + break; + } + uc = 0; + } + p++; + } + return (0); +} + +static int +is_wc_unicode(void) +{ +#if defined(_WIN32) && !defined(__CYGWIN__) + return (1); +#else + return (0); +#endif +} + +/* + * A conversion test that we correctly normalize UTF-8 and UTF-16BE characters. + * On Mac OS, the characters to be Form D. + * On other platforms, the characters to be Form C. + */ +static void +test_archive_string_normalization(void) +{ + struct archive *a, *a2; + struct archive_entry *ae; + struct archive_string utf8; + struct archive_mstring mstr; + struct archive_string_conv *f_sconv8, *t_sconv8; + struct archive_string_conv *f_sconv16be, *f_sconv16le; + FILE *fp; + char buff[512]; + static const char reffile[] = "test_archive_string_conversion.txt.Z"; + ssize_t size; + int line = 0; + int locale_is_utf8, wc_is_unicode; + + locale_is_utf8 = (NULL != setlocale(LC_ALL, "en_US.UTF-8")); + wc_is_unicode = is_wc_unicode(); + /* If it doesn't exist, just warn and return. */ + if (!locale_is_utf8 && !wc_is_unicode) { + skipping("invalid encoding tests require a suitable locale;" + " en_US.UTF-8 not available on this system"); + return; + } + + archive_string_init(&utf8); + memset(&mstr, 0, sizeof(mstr)); + + /* + * Extract a test pattern file. + */ + extract_reference_file(reffile); + assert((a = archive_read_new()) != NULL); + assertEqualIntA(a, ARCHIVE_OK, archive_read_support_filter_all(a)); + assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_raw(a)); + assertEqualIntA(a, ARCHIVE_OK, + archive_read_open_filename(a, reffile, 512)); + + assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae)); + assert((fp = fopen("testdata.txt", "w")) != NULL); + while ((size = archive_read_data(a, buff, 512)) > 0) + fwrite(buff, 1, size, fp); + fclose(fp); + + /* Open a test pattern file. */ + assert((fp = fopen("testdata.txt", "r")) != NULL); + + /* + * Create string conversion objects. + */ + assertA(NULL != (f_sconv8 = + archive_string_conversion_from_charset(a, "UTF-8", 0))); + assertA(NULL != (f_sconv16be = + archive_string_conversion_from_charset(a, "UTF-16BE", 0))); + assertA(NULL != (f_sconv16le = + archive_string_conversion_from_charset(a, "UTF-16LE", 0))); + assert((a2 = archive_write_new()) != NULL); + assertA(NULL != (t_sconv8 = + archive_string_conversion_to_charset(a2, "UTF-8", 0))); + if (f_sconv8 == NULL || f_sconv16be == NULL || f_sconv16le == NULL || + t_sconv8 == NULL || fp == NULL) { + /* We cannot continue this test. */ + if (fp != NULL) + fclose(fp); + assertEqualInt(ARCHIVE_OK, archive_read_free(a)); + return; + } + + /* + * Read test data. + * Test data format: + * <NFC Unicode pattern> ';' <NFD Unicode pattern> '\n' + * Unicode pattern format: + * [0-9A-F]{4,5}([ ][0-9A-F]{4,5}){0,} + */ + while (fgets(buff, sizeof(buff), fp) != NULL) { + char nfc[80], nfd[80]; + char utf8_nfc[80], utf8_nfd[80]; + char utf16be_nfc[80], utf16be_nfd[80]; + char utf16le_nfc[80], utf16le_nfd[80]; + wchar_t wc_nfc[40], wc_nfd[40]; + char *e, *p; + + line++; + if (buff[0] == '#') + continue; + p = strchr(buff, ';'); + if (p == NULL) + continue; + *p++ = '\0'; + /* Copy an NFC pattern */ + strncpy(nfc, buff, sizeof(nfc)-1); + nfc[sizeof(nfc)-1] = '\0'; + e = p; + p = strchr(p, '\n'); + if (p == NULL) + continue; + *p = '\0'; + /* Copy an NFD pattern */ + strncpy(nfd, e, sizeof(nfd)-1); + nfd[sizeof(nfd)-1] = '\0'; + + /* + * Convert an NFC pattern to UTF-8 bytes. + */ +#if defined(__APPLE__) + if (scan_unicode_pattern(utf8_nfc, wc_nfc, utf16be_nfc, utf16le_nfc, + nfc, 1) != 0) + continue; +#else + scan_unicode_pattern(utf8_nfc, wc_nfc, utf16be_nfc, utf16le_nfc, + nfc, 0); +#endif + + /* + * Convert an NFD pattern to UTF-8 bytes. + */ + scan_unicode_pattern(utf8_nfd, wc_nfd, utf16be_nfd, utf16le_nfd, + nfd, 0); + + if (locale_is_utf8) { +#if defined(__APPLE__) + /* + * Normalize an NFC string for import. + */ + assertEqualInt(0, archive_strcpy_in_locale( + &utf8, utf8_nfc, f_sconv8)); + failure("NFC(%s) should be converted to NFD(%s):%d", + nfc, nfd, line); + assertEqualUTF8String(utf8_nfd, utf8.s); + + /* + * Normalize an NFD string for import. + */ + assertEqualInt(0, archive_strcpy_in_locale( + &utf8, utf8_nfd, f_sconv8)); + failure("NFD(%s) should not be any changed:%d", + nfd, line); + assertEqualUTF8String(utf8_nfd, utf8.s); + + /* + * Copy an NFD string for export. + */ + assertEqualInt(0, archive_strcpy_in_locale( + &utf8, utf8_nfd, t_sconv8)); + failure("NFD(%s) should not be any changed:%d", + nfd, line); + assertEqualUTF8String(utf8_nfd, utf8.s); + + /* + * Normalize an NFC string in UTF-16BE for import. + */ + assertEqualInt(0, archive_strncpy_in_locale( + &utf8, utf16be_nfc, 100000, f_sconv16be)); + failure("NFC(%s) should be converted to NFD(%s):%d", + nfc, nfd, line); + assertEqualUTF8String(utf8_nfd, utf8.s); + + /* + * Normalize an NFC string in UTF-16LE for import. + */ + assertEqualInt(0, archive_strncpy_in_locale( + &utf8, utf16le_nfc, 100000, f_sconv16le)); + failure("NFC(%s) should be converted to NFD(%s):%d", + nfc, nfd, line); + assertEqualUTF8String(utf8_nfd, utf8.s); +#else + /* + * Normalize an NFD string for import. + */ + assertEqualInt(0, archive_strcpy_in_locale( + &utf8, utf8_nfd, f_sconv8)); + failure("NFD(%s) should be converted to NFC(%s):%d", + nfd, nfc, line); + assertEqualUTF8String(utf8_nfc, utf8.s); + + /* + * Normalize an NFC string for import. + */ + assertEqualInt(0, archive_strcpy_in_locale( + &utf8, utf8_nfc, f_sconv8)); + failure("NFC(%s) should not be any changed:%d", + nfc, line); + assertEqualUTF8String(utf8_nfc, utf8.s); + + /* + * Copy an NFC string for export. + */ + assertEqualInt(0, archive_strcpy_in_locale( + &utf8, utf8_nfc, t_sconv8)); + failure("NFC(%s) should not be any changed:%d", + nfc, line); + assertEqualUTF8String(utf8_nfc, utf8.s); + + /* + * Normalize an NFD string in UTF-16BE for import. + */ + assertEqualInt(0, archive_strncpy_in_locale( + &utf8, utf16be_nfd, 100000, f_sconv16be)); + failure("NFD(%s) should be converted to NFC(%s):%d", + nfd, nfc, line); + assertEqualUTF8String(utf8_nfc, utf8.s); + + /* + * Normalize an NFD string in UTF-16LE for import. + */ + assertEqualInt(0, archive_strncpy_in_locale( + &utf8, utf16le_nfd, 100000, f_sconv16le)); + failure("NFD(%s) should be converted to NFC(%s):%d", + nfd, nfc, line); + assertEqualUTF8String(utf8_nfc, utf8.s); +#endif + } + + /* + * Test for archive_mstring interface. + * In specific, Windows platform UTF-16BE is directly + * converted to/from wide-character to avoid the effect of + * current locale since windows platform cannot make + * locale UTF-8. + */ + if (locale_is_utf8 || wc_is_unicode) { + const wchar_t *wp; + const char *mp; + size_t mplen; + +#if defined(__APPLE__) + /* + * Normalize an NFD string in UTF-8 for import. + */ + assertEqualInt(0, archive_mstring_copy_mbs_len_l( + &mstr, utf8_nfc, 100000, f_sconv8)); + assertEqualInt(0, + archive_mstring_get_wcs(a, &mstr, &wp)); + failure("UTF-8 NFC(%s) should be converted " + "to WCS NFD(%s):%d", nfc, nfd, line); + assertEqualWString(wc_nfd, wp); + + /* + * Normalize an NFD string in UTF-16BE for import. + */ + assertEqualInt(0, archive_mstring_copy_mbs_len_l( + &mstr, utf16be_nfc, 100000, f_sconv16be)); + assertEqualInt(0, + archive_mstring_get_wcs(a, &mstr, &wp)); + failure("UTF-16BE NFC(%s) should be converted " + "to WCS NFD(%s):%d", nfc, nfd, line); + assertEqualWString(wc_nfd, wp); + + /* + * Normalize an NFD string in UTF-16LE for import. + */ + assertEqualInt(0, archive_mstring_copy_mbs_len_l( + &mstr, utf16le_nfc, 100000, f_sconv16le)); + assertEqualInt(0, + archive_mstring_get_wcs(a, &mstr, &wp)); + failure("UTF-16LE NFC(%s) should be converted " + "to WCS NFD(%s):%d", nfc, nfd, line); + assertEqualWString(wc_nfd, wp); + + /* + * Copy an NFD wide-string for export. + */ + assertEqualInt(0, archive_mstring_copy_wcs( + &mstr, wc_nfd)); + assertEqualInt(0, archive_mstring_get_mbs_l( + &mstr, &mp, &mplen, t_sconv8)); + failure("WCS NFD(%s) should be UTF-8 NFD:%d" + ,nfd, line); + assertEqualUTF8String(utf8_nfd, mp); +#else + /* + * Normalize an NFD string in UTF-8 for import. + */ + assertEqualInt(0, archive_mstring_copy_mbs_len_l( + &mstr, utf8_nfd, 100000, f_sconv8)); + assertEqualInt(0, + archive_mstring_get_wcs(a, &mstr, &wp)); + failure("UTF-8 NFD(%s) should be converted " + "to WCS NFC(%s):%d", nfd, nfc, line); + assertEqualWString(wc_nfc, wp); + + /* + * Normalize an NFD string in UTF-16BE for import. + */ + assertEqualInt(0, archive_mstring_copy_mbs_len_l( + &mstr, utf16be_nfd, 100000, f_sconv16be)); + assertEqualInt(0, + archive_mstring_get_wcs(a, &mstr, &wp)); + failure("UTF-8 NFD(%s) should be converted " + "to WCS NFC(%s):%d", nfd, nfc, line); + assertEqualWString(wc_nfc, wp); + + /* + * Normalize an NFD string in UTF-16LE for import. + */ + assertEqualInt(0, archive_mstring_copy_mbs_len_l( + &mstr, utf16le_nfd, 100000, f_sconv16le)); + assertEqualInt(0, + archive_mstring_get_wcs(a, &mstr, &wp)); + failure("UTF-8 NFD(%s) should be converted " + "to WCS NFC(%s):%d", nfd, nfc, line); + assertEqualWString(wc_nfc, wp); + + /* + * Copy an NFC wide-string for export. + */ + assertEqualInt(0, archive_mstring_copy_wcs( + &mstr, wc_nfc)); + assertEqualInt(0, archive_mstring_get_mbs_l( + &mstr, &mp, &mplen, t_sconv8)); + failure("WCS NFC(%s) should be UTF-8 NFC:%d" + ,nfc, line); + assertEqualUTF8String(utf8_nfc, mp); +#endif + } + } + + archive_string_free(&utf8); + archive_mstring_clean(&mstr); + fclose(fp); + assertEqualInt(ARCHIVE_OK, archive_read_free(a)); + assertEqualInt(ARCHIVE_OK, archive_write_free(a2)); +} + +static void +test_archive_string_canonicalization(void) +{ + struct archive *a; + struct archive_string_conv *sconv; + + setlocale(LC_ALL, "en_US.UTF-8"); + + assert((a = archive_read_new()) != NULL); + + assertA(NULL != (sconv = + archive_string_conversion_to_charset(a, "UTF-8", 1))); + failure("Charset name should be UTF-8"); + assertEqualString("UTF-8", + archive_string_conversion_charset_name(sconv)); + + assertA(NULL != (sconv = + archive_string_conversion_to_charset(a, "UTF8", 1))); + failure("Charset name should be UTF-8"); + assertEqualString("UTF-8", + archive_string_conversion_charset_name(sconv)); + + assertA(NULL != (sconv = + archive_string_conversion_to_charset(a, "utf8", 1))); + failure("Charset name should be UTF-8"); + assertEqualString("UTF-8", + archive_string_conversion_charset_name(sconv)); + + assertA(NULL != (sconv = + archive_string_conversion_to_charset(a, "UTF-16BE", 1))); + failure("Charset name should be UTF-16BE"); + assertEqualString("UTF-16BE", + archive_string_conversion_charset_name(sconv)); + + assertA(NULL != (sconv = + archive_string_conversion_to_charset(a, "UTF16BE", 1))); + failure("Charset name should be UTF-16BE"); + assertEqualString("UTF-16BE", + archive_string_conversion_charset_name(sconv)); + + assertA(NULL != (sconv = + archive_string_conversion_to_charset(a, "utf16be", 1))); + failure("Charset name should be UTF-16BE"); + assertEqualString("UTF-16BE", + archive_string_conversion_charset_name(sconv)); + + assertA(NULL != (sconv = + archive_string_conversion_to_charset(a, "UTF-16LE", 1))); + failure("Charset name should be UTF-16LE"); + assertEqualString("UTF-16LE", + archive_string_conversion_charset_name(sconv)); + + assertA(NULL != (sconv = + archive_string_conversion_to_charset(a, "UTF16LE", 1))); + failure("Charset name should be UTF-16LE"); + assertEqualString("UTF-16LE", + archive_string_conversion_charset_name(sconv)); + + assertA(NULL != (sconv = + archive_string_conversion_to_charset(a, "utf16le", 1))); + failure("Charset name should be UTF-16LE"); + assertEqualString("UTF-16LE", + archive_string_conversion_charset_name(sconv)); + + assertEqualInt(ARCHIVE_OK, archive_read_free(a)); + +} + +DEFINE_TEST(test_archive_string_conversion) +{ + test_archive_string_normalization(); + test_archive_string_canonicalization(); +} |