diff options
author | dim <dim@FreeBSD.org> | 2012-04-14 14:01:31 +0000 |
---|---|---|
committer | dim <dim@FreeBSD.org> | 2012-04-14 14:01:31 +0000 |
commit | 50b73317314e889cf39c7b1d6cbf419fa7502f22 (patch) | |
tree | be1815eb79b42ff482a8562b13c2dcbf0c5dcbee /lib/Basic/ConvertUTF.c | |
parent | dc04cb328508e61aad809d9b53b12f9799a00e7d (diff) | |
download | FreeBSD-src-50b73317314e889cf39c7b1d6cbf419fa7502f22.zip FreeBSD-src-50b73317314e889cf39c7b1d6cbf419fa7502f22.tar.gz |
Vendor import of clang trunk r154661:
http://llvm.org/svn/llvm-project/cfe/trunk@r154661
Diffstat (limited to 'lib/Basic/ConvertUTF.c')
-rw-r--r-- | lib/Basic/ConvertUTF.c | 143 |
1 files changed, 80 insertions, 63 deletions
diff --git a/lib/Basic/ConvertUTF.c b/lib/Basic/ConvertUTF.c index 124e386..e197003 100644 --- a/lib/Basic/ConvertUTF.c +++ b/lib/Basic/ConvertUTF.c @@ -339,67 +339,6 @@ ConversionResult ConvertUTF32toUTF8 ( return result; } -/* --------------------------------------------------------------------- */ - -ConversionResult ConvertUTF8toUTF32 ( - const UTF8** sourceStart, const UTF8* sourceEnd, - UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { - ConversionResult result = conversionOK; - const UTF8* source = *sourceStart; - UTF32* target = *targetStart; - while (source < sourceEnd) { - UTF32 ch = 0; - unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; - if (source + extraBytesToRead >= sourceEnd) { - result = sourceExhausted; break; - } - /* Do this check whether lenient or strict */ - if (!isLegalUTF8(source, extraBytesToRead+1)) { - result = sourceIllegal; - break; - } - /* - * The cases all fall through. See "Note A" below. - */ - switch (extraBytesToRead) { - case 5: ch += *source++; ch <<= 6; - case 4: ch += *source++; ch <<= 6; - case 3: ch += *source++; ch <<= 6; - case 2: ch += *source++; ch <<= 6; - case 1: ch += *source++; ch <<= 6; - case 0: ch += *source++; - } - ch -= offsetsFromUTF8[extraBytesToRead]; - - if (target >= targetEnd) { - source -= (extraBytesToRead+1); /* Back up the source pointer! */ - result = targetExhausted; break; - } - if (ch <= UNI_MAX_LEGAL_UTF32) { - /* - * UTF-16 surrogate values are illegal in UTF-32, and anything - * over Plane 17 (> 0x10FFFF) is illegal. - */ - if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { - if (flags == strictConversion) { - source -= (extraBytesToRead+1); /* return to the illegal value itself */ - result = sourceIllegal; - break; - } else { - *target++ = UNI_REPLACEMENT_CHAR; - } - } else { - *target++ = ch; - } - } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ - result = sourceIllegal; - *target++ = UNI_REPLACEMENT_CHAR; - } - } - *sourceStart = source; - *targetStart = target; - return result; -} #endif /* --------------------------------------------------------------------- */ @@ -448,7 +387,7 @@ static Boolean isLegalUTF8(const UTF8 *source, int length) { */ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { int length = trailingBytesForUTF8[*source]+1; - if (source+length > sourceEnd) { + if (length > sourceEnd - source) { return false; } return isLegalUTF8(source, length); @@ -456,6 +395,22 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { /* --------------------------------------------------------------------- */ +/* + * Exported function to return whether a UTF-8 string is legal or not. + * This is not used here; it's just exported. + */ +Boolean isLegalUTF8String(const UTF8 *source, const UTF8 *sourceEnd) { + while (source != sourceEnd) { + int length = trailingBytesForUTF8[*source] + 1; + if (length > sourceEnd - source || !isLegalUTF8(source, length)) + return false; + source += length; + } + return true; +} + +/* --------------------------------------------------------------------- */ + ConversionResult ConvertUTF8toUTF16 ( const UTF8** sourceStart, const UTF8* sourceEnd, UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { @@ -465,7 +420,7 @@ ConversionResult ConvertUTF8toUTF16 ( while (source < sourceEnd) { UTF32 ch = 0; unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; - if (source + extraBytesToRead >= sourceEnd) { + if (extraBytesToRead >= sourceEnd - source) { result = sourceExhausted; break; } /* Do this check whether lenient or strict */ @@ -527,6 +482,68 @@ ConversionResult ConvertUTF8toUTF16 ( return result; } +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF8toUTF32 ( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF8* source = *sourceStart; + UTF32* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; + if (extraBytesToRead >= sourceEnd - source) { + result = sourceExhausted; break; + } + /* Do this check whether lenient or strict */ + if (!isLegalUTF8(source, extraBytesToRead+1)) { + result = sourceIllegal; + break; + } + /* + * The cases all fall through. See "Note A" below. + */ + switch (extraBytesToRead) { + case 5: ch += *source++; ch <<= 6; + case 4: ch += *source++; ch <<= 6; + case 3: ch += *source++; ch <<= 6; + case 2: ch += *source++; ch <<= 6; + case 1: ch += *source++; ch <<= 6; + case 0: ch += *source++; + } + ch -= offsetsFromUTF8[extraBytesToRead]; + + if (target >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up the source pointer! */ + result = targetExhausted; break; + } + if (ch <= UNI_MAX_LEGAL_UTF32) { + /* + * UTF-16 surrogate values are illegal in UTF-32, and anything + * over Plane 17 (> 0x10FFFF) is illegal. + */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + source -= (extraBytesToRead+1); /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + *target++ = ch; + } + } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */ + result = sourceIllegal; + *target++ = UNI_REPLACEMENT_CHAR; + } + } + *sourceStart = source; + *targetStart = target; + return result; +} + /* --------------------------------------------------------------------- Note A. |