diff options
Diffstat (limited to 'contrib/llvm/tools/clang/lib/Lex/LiteralSupport.cpp')
-rw-r--r-- | contrib/llvm/tools/clang/lib/Lex/LiteralSupport.cpp | 394 |
1 files changed, 246 insertions, 148 deletions
diff --git a/contrib/llvm/tools/clang/lib/Lex/LiteralSupport.cpp b/contrib/llvm/tools/clang/lib/Lex/LiteralSupport.cpp index 2c96c4d..70183fd 100644 --- a/contrib/llvm/tools/clang/lib/Lex/LiteralSupport.cpp +++ b/contrib/llvm/tools/clang/lib/Lex/LiteralSupport.cpp @@ -16,8 +16,8 @@ #include "clang/Lex/Preprocessor.h" #include "clang/Lex/LexDiagnostic.h" #include "clang/Basic/TargetInfo.h" -#include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/Support/ErrorHandling.h" using namespace clang; /// HexDigitValue - Return the value of the specified hex digit, or -1 if it's @@ -29,12 +29,31 @@ static int HexDigitValue(char C) { return -1; } +static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) { + switch (kind) { + default: llvm_unreachable("Unknown token type!"); + case tok::char_constant: + case tok::string_literal: + case tok::utf8_string_literal: + return Target.getCharWidth(); + case tok::wide_char_constant: + case tok::wide_string_literal: + return Target.getWCharWidth(); + case tok::utf16_char_constant: + case tok::utf16_string_literal: + return Target.getChar16Width(); + case tok::utf32_char_constant: + case tok::utf32_string_literal: + return Target.getChar32Width(); + } +} + /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in /// either a character or a string literal. static unsigned ProcessCharEscape(const char *&ThisTokBuf, const char *ThisTokEnd, bool &HadError, - FullSourceLoc Loc, bool IsWide, - Diagnostic *Diags, const TargetInfo &Target) { + FullSourceLoc Loc, unsigned CharWidth, + DiagnosticsEngine *Diags) { // Skip the '\' char. ++ThisTokBuf; @@ -99,9 +118,6 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf, } // See if any bits will be truncated when evaluated as a character. - unsigned CharWidth = - IsWide ? Target.getWCharWidth() : Target.getCharWidth(); - if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) { Overflow = true; ResultChar &= ~0U >> (32-CharWidth); @@ -129,9 +145,6 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf, ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7'); // Check for overflow. Reject '\777', but not L'\777'. - unsigned CharWidth = - IsWide ? Target.getWCharWidth() : Target.getCharWidth(); - if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) { if (Diags) Diags->Report(Loc, diag::warn_octal_escape_too_large); @@ -167,7 +180,7 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf, /// return the UTF32. static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, uint32_t &UcnVal, unsigned short &UcnLen, - FullSourceLoc Loc, Diagnostic *Diags, + FullSourceLoc Loc, DiagnosticsEngine *Diags, const LangOptions &Features) { if (!Features.CPlusPlus && !Features.C99 && Diags) Diags->Report(Loc, diag::warn_ucn_not_valid_in_c89); @@ -220,7 +233,8 @@ static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, /// we will likely rework our support for UCN's. static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, char *&ResultBuf, bool &HadError, - FullSourceLoc Loc, bool wide, Diagnostic *Diags, + FullSourceLoc Loc, unsigned CharByteWidth, + DiagnosticsEngine *Diags, const LangOptions &Features) { typedef uint32_t UTF32; UTF32 UcnVal = 0; @@ -231,19 +245,22 @@ static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, return; } - if (wide) { - (void)UcnLen; - assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported"); + assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth) && + "only character widths of 1, 2, or 4 bytes supported"); - if (!Features.ShortWChar) { - // Note: our internal rep of wide char tokens is always little-endian. - *ResultBuf++ = (UcnVal & 0x000000FF); - *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8; - *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16; - *ResultBuf++ = (UcnVal & 0xFF000000) >> 24; - return; - } + (void)UcnLen; + assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported"); + if (CharByteWidth == 4) { + // Note: our internal rep of wide char tokens is always little-endian. + *ResultBuf++ = (UcnVal & 0x000000FF); + *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8; + *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16; + *ResultBuf++ = (UcnVal & 0xFF000000) >> 24; + return; + } + + if (CharByteWidth == 2) { // Convert to UTF16. if (UcnVal < (UTF32)0xFFFF) { *ResultBuf++ = (UcnVal & 0x000000FF); @@ -262,6 +279,9 @@ static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, *ResultBuf++ = (surrogate2 & 0x0000FF00) >> 8; return; } + + assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters"); + // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8. // The conversion below was inspired by: // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c @@ -371,7 +391,7 @@ NumericLiteralParser(const char *begin, const char *end, // Done. } else if (isxdigit(*s) && !(*s == 'e' || *s == 'E')) { PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin), - diag::err_invalid_decimal_digit) << llvm::StringRef(s, 1); + diag::err_invalid_decimal_digit) << StringRef(s, 1); hadError = true; return; } else if (*s == '.') { @@ -434,7 +454,7 @@ NumericLiteralParser(const char *begin, const char *end, continue; // Success. case 'i': case 'I': - if (PP.getLangOptions().Microsoft) { + if (PP.getLangOptions().MicrosoftExt) { if (isFPConstant || isLong || isLongLong) break; // Allow i8, i16, i32, i64, and i128. @@ -498,7 +518,7 @@ NumericLiteralParser(const char *begin, const char *end, PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin), isFPConstant ? diag::err_invalid_suffix_float_constant : diag::err_invalid_suffix_integer_constant) - << llvm::StringRef(SuffixBegin, ThisTokEnd-SuffixBegin); + << StringRef(SuffixBegin, ThisTokEnd-SuffixBegin); hadError = true; return; } @@ -528,7 +548,7 @@ void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) { } // A binary exponent can appear with or with a '.'. If dotted, the // binary exponent is required. - if ((*s == 'p' || *s == 'P') && !PP.getLangOptions().CPlusPlus0x) { + if (*s == 'p' || *s == 'P') { const char *Exponent = s; s++; saw_exponent = true; @@ -542,12 +562,7 @@ void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) { } s = first_non_digit; - // In C++0x, we cannot support hexadecmial floating literals because - // they conflict with user-defined literals, so we warn in previous - // versions of C++ by default. - if (PP.getLangOptions().CPlusPlus) - PP.Diag(TokLoc, diag::ext_hexconstant_cplusplus); - else if (!PP.getLangOptions().HexFloats) + if (!PP.getLangOptions().HexFloats) PP.Diag(TokLoc, diag::ext_hexconstant_invalid); } else if (saw_period) { PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), @@ -569,7 +584,7 @@ void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) { // Done. } else if (isxdigit(*s)) { PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), - diag::err_invalid_binary_digit) << llvm::StringRef(s, 1); + diag::err_invalid_binary_digit) << StringRef(s, 1); hadError = true; } // Other suffixes will be diagnosed by the caller. @@ -599,7 +614,7 @@ void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) { // the code is using an incorrect base. if (isxdigit(*s) && *s != 'e' && *s != 'E') { PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), - diag::err_invalid_octal_digit) << llvm::StringRef(s, 1); + diag::err_invalid_octal_digit) << StringRef(s, 1); hadError = true; return; } @@ -688,7 +703,6 @@ bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) { llvm::APFloat::opStatus NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) { using llvm::APFloat; - using llvm::StringRef; unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin); return Result.convertFromString(StringRef(ThisTokBegin, n), @@ -696,14 +710,51 @@ NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) { } +/// character-literal: [C++0x lex.ccon] +/// ' c-char-sequence ' +/// u' c-char-sequence ' +/// U' c-char-sequence ' +/// L' c-char-sequence ' +/// c-char-sequence: +/// c-char +/// c-char-sequence c-char +/// c-char: +/// any member of the source character set except the single-quote ', +/// backslash \, or new-line character +/// escape-sequence +/// universal-character-name +/// escape-sequence: [C++0x lex.ccon] +/// simple-escape-sequence +/// octal-escape-sequence +/// hexadecimal-escape-sequence +/// simple-escape-sequence: +/// one of \' \" \? \\ \a \b \f \n \r \t \v +/// octal-escape-sequence: +/// \ octal-digit +/// \ octal-digit octal-digit +/// \ octal-digit octal-digit octal-digit +/// hexadecimal-escape-sequence: +/// \x hexadecimal-digit +/// hexadecimal-escape-sequence hexadecimal-digit +/// universal-character-name: +/// \u hex-quad +/// \U hex-quad hex-quad +/// hex-quad: +/// hex-digit hex-digit hex-digit hex-digit +/// CharLiteralParser::CharLiteralParser(const char *begin, const char *end, - SourceLocation Loc, Preprocessor &PP) { + SourceLocation Loc, Preprocessor &PP, + tok::TokenKind kind) { // At this point we know that the character matches the regex "L?'.*'". HadError = false; - // Determine if this is a wide character. - IsWide = begin[0] == 'L'; - if (IsWide) ++begin; + Kind = kind; + + // Determine if this is a wide or UTF character. + if (Kind == tok::wide_char_constant || Kind == tok::utf16_char_constant || + Kind == tok::utf32_char_constant) { + ++begin; + } // Skip over the entry quote. assert(begin[0] == '\'' && "Invalid token lexed"); @@ -730,8 +781,9 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, // Is this a Universal Character Name escape? if (begin[0] != '\\') // If this is a normal character, consume it. - ResultChar = *begin++; + ResultChar = (unsigned char)*begin++; else { // Otherwise, this is an escape character. + unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo()); // Check for UCN. if (begin[1] == 'u' || begin[1] == 'U') { uint32_t utf32 = 0; @@ -742,19 +794,22 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, HadError = 1; } ResultChar = utf32; + if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) { + PP.Diag(Loc, diag::warn_ucn_escape_too_large); + ResultChar &= ~0U >> (32-CharWidth); + } } else { // Otherwise, this is a non-UCN escape character. Process it. ResultChar = ProcessCharEscape(begin, end, HadError, FullSourceLoc(Loc,PP.getSourceManager()), - IsWide, - &PP.getDiagnostics(), PP.getTargetInfo()); + CharWidth, &PP.getDiagnostics()); } } // If this is a multi-character constant (e.g. 'abc'), handle it. These are // implementation defined (C99 6.4.4.4p10). if (NumCharsSoFar) { - if (IsWide) { + if (!isAscii()) { // Emulate GCC's (unintentional?) behavior: L'ab' -> L'b'. LitVal = 0; } else { @@ -776,8 +831,8 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, if (NumCharsSoFar > 1) { // Warn about discarding the top bits for multi-char wide-character // constants (L'abcd'). - if (IsWide) - PP.Diag(Loc, diag::warn_extraneous_wide_char_constant); + if (!isAscii()) + PP.Diag(Loc, diag::warn_extraneous_char_constant); else if (NumCharsSoFar != 4) PP.Diag(Loc, diag::ext_multichar_character_literal); else @@ -789,47 +844,62 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, // Transfer the value from APInt to uint64_t Value = LitVal.getZExtValue(); - if (IsWide && PP.getLangOptions().ShortWChar && Value > 0xFFFF) - PP.Diag(Loc, diag::warn_ucn_escape_too_large); - // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1") // if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple // character constants are not sign extended in the this implementation: // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC. - if (!IsWide && NumCharsSoFar == 1 && (Value & 128) && + if (isAscii() && NumCharsSoFar == 1 && (Value & 128) && PP.getLangOptions().CharIsSigned) Value = (signed char)Value; } -/// string-literal: [C99 6.4.5] -/// " [s-char-sequence] " -/// L" [s-char-sequence] " +/// string-literal: [C++0x lex.string] +/// encoding-prefix " [s-char-sequence] " +/// encoding-prefix R raw-string +/// encoding-prefix: +/// u8 +/// u +/// U +/// L /// s-char-sequence: /// s-char /// s-char-sequence s-char /// s-char: -/// any source character except the double quote ", -/// backslash \, or newline character -/// escape-character -/// universal-character-name -/// escape-character: [C99 6.4.4.4] -/// \ escape-code +/// any member of the source character set except the double-quote ", +/// backslash \, or new-line character +/// escape-sequence /// universal-character-name -/// escape-code: -/// character-escape-code -/// octal-escape-code -/// hex-escape-code -/// character-escape-code: one of -/// n t b r f v a -/// \ ' " ? -/// octal-escape-code: -/// octal-digit -/// octal-digit octal-digit -/// octal-digit octal-digit octal-digit -/// hex-escape-code: -/// x hex-digit -/// hex-escape-code hex-digit +/// raw-string: +/// " d-char-sequence ( r-char-sequence ) d-char-sequence " +/// r-char-sequence: +/// r-char +/// r-char-sequence r-char +/// r-char: +/// any member of the source character set, except a right parenthesis ) +/// followed by the initial d-char-sequence (which may be empty) +/// followed by a double quote ". +/// d-char-sequence: +/// d-char +/// d-char-sequence d-char +/// d-char: +/// any member of the basic source character set except: +/// space, the left parenthesis (, the right parenthesis ), +/// the backslash \, and the control characters representing horizontal +/// tab, vertical tab, form feed, and newline. +/// escape-sequence: [C++0x lex.ccon] +/// simple-escape-sequence +/// octal-escape-sequence +/// hexadecimal-escape-sequence +/// simple-escape-sequence: +/// one of \' \" \? \\ \a \b \f \n \r \t \v +/// octal-escape-sequence: +/// \ octal-digit +/// \ octal-digit octal-digit +/// \ octal-digit octal-digit octal-digit +/// hexadecimal-escape-sequence: +/// \x hexadecimal-digit +/// hexadecimal-escape-sequence hexadecimal-digit /// universal-character-name: /// \u hex-quad /// \U hex-quad hex-quad @@ -841,8 +911,8 @@ StringLiteralParser(const Token *StringToks, unsigned NumStringToks, Preprocessor &PP, bool Complain) : SM(PP.getSourceManager()), Features(PP.getLangOptions()), Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() : 0), - MaxTokenLength(0), SizeBound(0), wchar_tByteWidth(0), - ResultPtr(ResultBuf.data()), hadError(false), AnyWide(false), Pascal(false) { + MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown), + ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) { init(StringToks, NumStringToks); } @@ -862,7 +932,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ MaxTokenLength = StringToks[0].getLength(); assert(StringToks[0].getLength() >= 2 && "literal token is invalid!"); SizeBound = StringToks[0].getLength()-2; // -2 for "". - AnyWide = StringToks[0].is(tok::wide_string_literal); + Kind = StringToks[0].getKind(); hadError = false; @@ -883,8 +953,18 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ if (StringToks[i].getLength() > MaxTokenLength) MaxTokenLength = StringToks[i].getLength(); - // Remember if we see any wide strings. - AnyWide |= StringToks[i].is(tok::wide_string_literal); + // Remember if we see any wide or utf-8/16/32 strings. + // Also check for illegal concatenations. + if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) { + if (isAscii()) { + Kind = StringToks[i].getKind(); + } else { + if (Diags) + Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM), + diag::err_unsupported_string_concat); + hadError = true; + } + } } // Include space for the null terminator. @@ -892,19 +972,14 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ // TODO: K&R warning: "traditional C rejects string constant concatenation" - // Get the width in bytes of wchar_t. If no wchar_t strings are used, do not - // query the target. As such, wchar_tByteWidth is only valid if AnyWide=true. - wchar_tByteWidth = ~0U; - if (AnyWide) { - wchar_tByteWidth = Target.getWCharWidth(); - assert((wchar_tByteWidth & 7) == 0 && "Assumes wchar_t is byte multiple!"); - wchar_tByteWidth /= 8; - } + // Get the width in bytes of char/wchar_t/char16_t/char32_t + CharByteWidth = getCharWidth(Kind, Target); + assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple"); + CharByteWidth /= 8; // The output buffer size needs to be large enough to hold wide characters. // This is a worst-case assumption which basically corresponds to L"" "long". - if (AnyWide) - SizeBound *= wchar_tByteWidth; + SizeBound *= CharByteWidth; // Size the temporary buffer to hold the result string data. ResultBuf.resize(SizeBound); @@ -929,78 +1004,82 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features, &StringInvalid); if (StringInvalid) { - hadError = 1; + hadError = true; continue; } const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1; // Skip end quote. - bool wide = false; // TODO: Input character set mapping support. - // Skip L marker for wide strings. - if (ThisTokBuf[0] == 'L') { - wide = true; + // Skip marker for wide or unicode strings. + if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') { ++ThisTokBuf; + // Skip 8 of u8 marker for utf8 strings. + if (ThisTokBuf[0] == '8') + ++ThisTokBuf; } - assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?"); - ++ThisTokBuf; - - // Check if this is a pascal string - if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd && - ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') { + // Check for raw string + if (ThisTokBuf[0] == 'R') { + ThisTokBuf += 2; // skip R" - // If the \p sequence is found in the first token, we have a pascal string - // Otherwise, if we already have a pascal string, ignore the first \p - if (i == 0) { + const char *Prefix = ThisTokBuf; + while (ThisTokBuf[0] != '(') ++ThisTokBuf; - Pascal = true; - } else if (Pascal) - ThisTokBuf += 2; - } + ++ThisTokBuf; // skip '(' + + // remove same number of characters from the end + if (ThisTokEnd >= ThisTokBuf + (ThisTokBuf - Prefix)) + ThisTokEnd -= (ThisTokBuf - Prefix); + + // Copy the string over + CopyStringFragment(StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf)); + } else { + assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?"); + ++ThisTokBuf; // skip " + + // Check if this is a pascal string + if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd && + ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') { - while (ThisTokBuf != ThisTokEnd) { - // Is this a span of non-escape characters? - if (ThisTokBuf[0] != '\\') { - const char *InStart = ThisTokBuf; - do { + // If the \p sequence is found in the first token, we have a pascal string + // Otherwise, if we already have a pascal string, ignore the first \p + if (i == 0) { ++ThisTokBuf; - } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\'); - - // Copy the character span over. - unsigned Len = ThisTokBuf-InStart; - if (!AnyWide) { - memcpy(ResultPtr, InStart, Len); - ResultPtr += Len; - } else { - // Note: our internal rep of wide char tokens is always little-endian. - for (; Len; --Len, ++InStart) { - *ResultPtr++ = InStart[0]; - // Add zeros at the end. - for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i) - *ResultPtr++ = 0; - } - } - continue; - } - // Is this a Universal Character Name escape? - if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') { - EncodeUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr, - hadError, FullSourceLoc(StringToks[i].getLocation(),SM), - wide, Diags, Features); - continue; + Pascal = true; + } else if (Pascal) + ThisTokBuf += 2; } - // Otherwise, this is a non-UCN escape character. Process it. - unsigned ResultChar = - ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError, - FullSourceLoc(StringToks[i].getLocation(), SM), - AnyWide, Diags, Target); - // Note: our internal rep of wide char tokens is always little-endian. - *ResultPtr++ = ResultChar & 0xFF; + while (ThisTokBuf != ThisTokEnd) { + // Is this a span of non-escape characters? + if (ThisTokBuf[0] != '\\') { + const char *InStart = ThisTokBuf; + do { + ++ThisTokBuf; + } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\'); + + // Copy the character span over. + CopyStringFragment(StringRef(InStart, ThisTokBuf - InStart)); + continue; + } + // Is this a Universal Character Name escape? + if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') { + EncodeUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr, + hadError, FullSourceLoc(StringToks[i].getLocation(),SM), + CharByteWidth, Diags, Features); + continue; + } + // Otherwise, this is a non-UCN escape character. Process it. + unsigned ResultChar = + ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError, + FullSourceLoc(StringToks[i].getLocation(), SM), + CharByteWidth*8, Diags); + + // Note: our internal rep of wide char tokens is always little-endian. + *ResultPtr++ = ResultChar & 0xFF; - if (AnyWide) { - for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i) + for (unsigned i = 1, e = CharByteWidth; i != e; ++i) *ResultPtr++ = ResultChar >> i*8; } } @@ -1008,8 +1087,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ if (Pascal) { ResultBuf[0] = ResultPtr-&ResultBuf[0]-1; - if (AnyWide) - ResultBuf[0] /= wchar_tByteWidth; + ResultBuf[0] /= CharByteWidth; // Verify that pascal strings aren't too large. if (GetStringLength() > 256) { @@ -1018,7 +1096,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ diag::err_pascal_string_too_long) << SourceRange(StringToks[0].getLocation(), StringToks[NumStringToks-1].getLocation()); - hadError = 1; + hadError = true; return; } } else if (Diags) { @@ -1036,6 +1114,25 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ } +/// copyStringFragment - This function copies from Start to End into ResultPtr. +/// Performs widening for multi-byte characters. +void StringLiteralParser::CopyStringFragment(StringRef Fragment) { + // Copy the character span over. + if (CharByteWidth == 1) { + memcpy(ResultPtr, Fragment.data(), Fragment.size()); + ResultPtr += Fragment.size(); + } else { + // Note: our internal rep of wide char tokens is always little-endian. + for (StringRef::iterator I=Fragment.begin(), E=Fragment.end(); I!=E; ++I) { + *ResultPtr++ = *I; + // Add zeros at the end. + for (unsigned i = 1, e = CharByteWidth; i != e; ++i) + *ResultPtr++ = 0; + } + } +} + + /// getOffsetOfStringByte - This function returns the offset of the /// specified byte of the string data represented by Token. This handles /// advancing over escape sequences in the string. @@ -1052,7 +1149,8 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok, if (StringInvalid) return 0; - assert(SpellingPtr[0] != 'L' && "Doesn't handle wide strings yet"); + assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' && + SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet"); const char *SpellingStart = SpellingPtr; @@ -1077,7 +1175,7 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok, bool HadError = false; ProcessCharEscape(SpellingPtr, SpellingEnd, HadError, FullSourceLoc(Tok.getLocation(), SM), - false, Diags, Target); + CharByteWidth*8, Diags); assert(!HadError && "This method isn't valid on erroneous strings"); --ByteNo; } |