summaryrefslogtreecommitdiffstats
path: root/contrib/llvm/tools/clang/lib/Basic/ConvertUTF.c
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/llvm/tools/clang/lib/Basic/ConvertUTF.c')
-rw-r--r--contrib/llvm/tools/clang/lib/Basic/ConvertUTF.c143
1 files changed, 80 insertions, 63 deletions
diff --git a/contrib/llvm/tools/clang/lib/Basic/ConvertUTF.c b/contrib/llvm/tools/clang/lib/Basic/ConvertUTF.c
index 124e386..e197003 100644
--- a/contrib/llvm/tools/clang/lib/Basic/ConvertUTF.c
+++ b/contrib/llvm/tools/clang/lib/Basic/ConvertUTF.c
@@ -339,67 +339,6 @@ ConversionResult ConvertUTF32toUTF8 (
return result;
}
-/* --------------------------------------------------------------------- */
-
-ConversionResult ConvertUTF8toUTF32 (
- const UTF8** sourceStart, const UTF8* sourceEnd,
- UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
- ConversionResult result = conversionOK;
- const UTF8* source = *sourceStart;
- UTF32* target = *targetStart;
- while (source < sourceEnd) {
- UTF32 ch = 0;
- unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
- if (source + extraBytesToRead >= sourceEnd) {
- result = sourceExhausted; break;
- }
- /* Do this check whether lenient or strict */
- if (!isLegalUTF8(source, extraBytesToRead+1)) {
- result = sourceIllegal;
- break;
- }
- /*
- * The cases all fall through. See "Note A" below.
- */
- switch (extraBytesToRead) {
- case 5: ch += *source++; ch <<= 6;
- case 4: ch += *source++; ch <<= 6;
- case 3: ch += *source++; ch <<= 6;
- case 2: ch += *source++; ch <<= 6;
- case 1: ch += *source++; ch <<= 6;
- case 0: ch += *source++;
- }
- ch -= offsetsFromUTF8[extraBytesToRead];
-
- if (target >= targetEnd) {
- source -= (extraBytesToRead+1); /* Back up the source pointer! */
- result = targetExhausted; break;
- }
- if (ch <= UNI_MAX_LEGAL_UTF32) {
- /*
- * UTF-16 surrogate values are illegal in UTF-32, and anything
- * over Plane 17 (> 0x10FFFF) is illegal.
- */
- if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
- if (flags == strictConversion) {
- source -= (extraBytesToRead+1); /* return to the illegal value itself */
- result = sourceIllegal;
- break;
- } else {
- *target++ = UNI_REPLACEMENT_CHAR;
- }
- } else {
- *target++ = ch;
- }
- } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
- result = sourceIllegal;
- *target++ = UNI_REPLACEMENT_CHAR;
- }
- }
- *sourceStart = source;
- *targetStart = target;
- return result;
-}
#endif
/* --------------------------------------------------------------------- */
@@ -448,7 +387,7 @@ static Boolean isLegalUTF8(const UTF8 *source, int length) {
*/
Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
int length = trailingBytesForUTF8[*source]+1;
- if (source+length > sourceEnd) {
+ if (length > sourceEnd - source) {
return false;
}
return isLegalUTF8(source, length);
@@ -456,6 +395,22 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
/* --------------------------------------------------------------------- */
+/*
+ * Exported function to return whether a UTF-8 string is legal or not.
+ * This is not used here; it's just exported.
+ */
+Boolean isLegalUTF8String(const UTF8 *source, const UTF8 *sourceEnd) {
+ while (source != sourceEnd) {
+ int length = trailingBytesForUTF8[*source] + 1;
+ if (length > sourceEnd - source || !isLegalUTF8(source, length))
+ return false;
+ source += length;
+ }
+ return true;
+}
+
+/* --------------------------------------------------------------------- */
+
ConversionResult ConvertUTF8toUTF16 (
const UTF8** sourceStart, const UTF8* sourceEnd,
UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
@@ -465,7 +420,7 @@ ConversionResult ConvertUTF8toUTF16 (
while (source < sourceEnd) {
UTF32 ch = 0;
unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
- if (source + extraBytesToRead >= sourceEnd) {
+ if (extraBytesToRead >= sourceEnd - source) {
result = sourceExhausted; break;
}
/* Do this check whether lenient or strict */
@@ -527,6 +482,68 @@ ConversionResult ConvertUTF8toUTF16 (
return result;
}
+/* --------------------------------------------------------------------- */
+
+ConversionResult ConvertUTF8toUTF32 (
+ const UTF8** sourceStart, const UTF8* sourceEnd,
+ UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
+ ConversionResult result = conversionOK;
+ const UTF8* source = *sourceStart;
+ UTF32* target = *targetStart;
+ while (source < sourceEnd) {
+ UTF32 ch = 0;
+ unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
+ if (extraBytesToRead >= sourceEnd - source) {
+ result = sourceExhausted; break;
+ }
+ /* Do this check whether lenient or strict */
+ if (!isLegalUTF8(source, extraBytesToRead+1)) {
+ result = sourceIllegal;
+ break;
+ }
+ /*
+ * The cases all fall through. See "Note A" below.
+ */
+ switch (extraBytesToRead) {
+ case 5: ch += *source++; ch <<= 6;
+ case 4: ch += *source++; ch <<= 6;
+ case 3: ch += *source++; ch <<= 6;
+ case 2: ch += *source++; ch <<= 6;
+ case 1: ch += *source++; ch <<= 6;
+ case 0: ch += *source++;
+ }
+ ch -= offsetsFromUTF8[extraBytesToRead];
+
+ if (target >= targetEnd) {
+ source -= (extraBytesToRead+1); /* Back up the source pointer! */
+ result = targetExhausted; break;
+ }
+ if (ch <= UNI_MAX_LEGAL_UTF32) {
+ /*
+ * UTF-16 surrogate values are illegal in UTF-32, and anything
+ * over Plane 17 (> 0x10FFFF) is illegal.
+ */
+ if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
+ if (flags == strictConversion) {
+ source -= (extraBytesToRead+1); /* return to the illegal value itself */
+ result = sourceIllegal;
+ break;
+ } else {
+ *target++ = UNI_REPLACEMENT_CHAR;
+ }
+ } else {
+ *target++ = ch;
+ }
+ } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
+ result = sourceIllegal;
+ *target++ = UNI_REPLACEMENT_CHAR;
+ }
+ }
+ *sourceStart = source;
+ *targetStart = target;
+ return result;
+}
+
/* ---------------------------------------------------------------------
Note A.
OpenPOWER on IntegriCloud