summaryrefslogtreecommitdiffstats
path: root/lib/Lex/LiteralSupport.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Lex/LiteralSupport.cpp')
-rw-r--r--lib/Lex/LiteralSupport.cpp464
1 files changed, 340 insertions, 124 deletions
diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp
index 70183fd..c1d228b 100644
--- a/lib/Lex/LiteralSupport.cpp
+++ b/lib/Lex/LiteralSupport.cpp
@@ -16,6 +16,7 @@
#include "clang/Lex/Preprocessor.h"
#include "clang/Lex/LexDiagnostic.h"
#include "clang/Basic/TargetInfo.h"
+#include "clang/Basic/ConvertUTF.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/ErrorHandling.h"
using namespace clang;
@@ -178,15 +179,16 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
/// return the UTF32.
-static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
+static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
+ const char *ThisTokEnd,
uint32_t &UcnVal, unsigned short &UcnLen,
FullSourceLoc Loc, DiagnosticsEngine *Diags,
- const LangOptions &Features) {
+ const LangOptions &Features,
+ bool in_char_string_literal = false) {
if (!Features.CPlusPlus && !Features.C99 && Diags)
Diags->Report(Loc, diag::warn_ucn_not_valid_in_c89);
- // Save the beginning of the string (for error diagnostics).
- const char *ThisTokBegin = ThisTokBuf;
+ const char *UcnBegin = ThisTokBuf;
// Skip the '\u' char's.
ThisTokBuf += 2;
@@ -208,22 +210,43 @@ static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
if (UcnLenSave) {
if (Diags) {
SourceLocation L =
- Lexer::AdvanceToTokenCharacter(Loc, ThisTokBuf-ThisTokBegin,
+ Lexer::AdvanceToTokenCharacter(Loc, UcnBegin - ThisTokBegin,
Loc.getManager(), Features);
- Diags->Report(FullSourceLoc(L, Loc.getManager()),
- diag::err_ucn_escape_incomplete);
+ Diags->Report(L, diag::err_ucn_escape_incomplete);
}
return false;
}
- // Check UCN constraints (C99 6.4.3p2).
- if ((UcnVal < 0xa0 &&
- (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60 )) // $, @, `
- || (UcnVal >= 0xD800 && UcnVal <= 0xDFFF)
- || (UcnVal > 0x10FFFF)) /* the maximum legal UTF32 value */ {
+
+ // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
+ if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
+ UcnVal > 0x10FFFF) { // maximum legal UTF32 value
if (Diags)
Diags->Report(Loc, diag::err_ucn_escape_invalid);
return false;
}
+
+ // C++11 allows UCNs that refer to control characters and basic source
+ // characters inside character and string literals
+ if (UcnVal < 0xa0 &&
+ (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) { // $, @, `
+ bool IsError = (!Features.CPlusPlus0x || !in_char_string_literal);
+ if (Diags) {
+ SourceLocation UcnBeginLoc =
+ Lexer::AdvanceToTokenCharacter(Loc, UcnBegin - ThisTokBegin,
+ Loc.getManager(), Features);
+ char BasicSCSChar = UcnVal;
+ if (UcnVal >= 0x20 && UcnVal < 0x7f)
+ Diags->Report(UcnBeginLoc, IsError ? diag::err_ucn_escape_basic_scs :
+ diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)
+ << StringRef(&BasicSCSChar, 1);
+ else
+ Diags->Report(UcnBeginLoc, IsError ? diag::err_ucn_control_character :
+ diag::warn_cxx98_compat_literal_ucn_control_character);
+ }
+ if (IsError)
+ return false;
+ }
+
return true;
}
@@ -231,7 +254,8 @@ static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
/// StringLiteralParser. When we decide to implement UCN's for identifiers,
/// we will likely rework our support for UCN's.
-static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
+static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
+ const char *ThisTokEnd,
char *&ResultBuf, bool &HadError,
FullSourceLoc Loc, unsigned CharByteWidth,
DiagnosticsEngine *Diags,
@@ -239,8 +263,8 @@ static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
typedef uint32_t UTF32;
UTF32 UcnVal = 0;
unsigned short UcnLen = 0;
- if (!ProcessUCNEscape(ThisTokBuf, ThisTokEnd, UcnVal, UcnLen, Loc, Diags,
- Features)) {
+ if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
+ Loc, Diags, Features, true)) {
HadError = 1;
return;
}
@@ -252,31 +276,30 @@ static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
if (CharByteWidth == 4) {
- // Note: our internal rep of wide char tokens is always little-endian.
- *ResultBuf++ = (UcnVal & 0x000000FF);
- *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
- *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16;
- *ResultBuf++ = (UcnVal & 0xFF000000) >> 24;
+ // FIXME: Make the type of the result buffer correct instead of
+ // using reinterpret_cast.
+ UTF32 *ResultPtr = reinterpret_cast<UTF32*>(ResultBuf);
+ *ResultPtr = UcnVal;
+ ResultBuf += 4;
return;
}
if (CharByteWidth == 2) {
- // Convert to UTF16.
+ // FIXME: Make the type of the result buffer correct instead of
+ // using reinterpret_cast.
+ UTF16 *ResultPtr = reinterpret_cast<UTF16*>(ResultBuf);
+
if (UcnVal < (UTF32)0xFFFF) {
- *ResultBuf++ = (UcnVal & 0x000000FF);
- *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
+ *ResultPtr = UcnVal;
+ ResultBuf += 2;
return;
}
- if (Diags) Diags->Report(Loc, diag::warn_ucn_escape_too_large);
- typedef uint16_t UTF16;
+ // Convert to UTF16.
UcnVal -= 0x10000;
- UTF16 surrogate1 = 0xD800 + (UcnVal >> 10);
- UTF16 surrogate2 = 0xDC00 + (UcnVal & 0x3FF);
- *ResultBuf++ = (surrogate1 & 0x000000FF);
- *ResultBuf++ = (surrogate1 & 0x0000FF00) >> 8;
- *ResultBuf++ = (surrogate2 & 0x000000FF);
- *ResultBuf++ = (surrogate2 & 0x0000FF00) >> 8;
+ *ResultPtr = 0xD800 + (UcnVal >> 10);
+ *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
+ ResultBuf += 4;
return;
}
@@ -323,6 +346,10 @@ static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
/// decimal-constant integer-suffix
/// octal-constant integer-suffix
/// hexadecimal-constant integer-suffix
+/// user-defined-integer-literal: [C++11 lex.ext]
+/// decimal-literal ud-suffix
+/// octal-literal ud-suffix
+/// hexadecimal-literal ud-suffix
/// decimal-constant:
/// nonzero-digit
/// decimal-constant digit
@@ -372,6 +399,7 @@ NumericLiteralParser(const char *begin, const char *end,
s = DigitsBegin = begin;
saw_exponent = false;
saw_period = false;
+ saw_ud_suffix = false;
isLong = false;
isUnsigned = false;
isLongLong = false;
@@ -454,7 +482,7 @@ NumericLiteralParser(const char *begin, const char *end,
continue; // Success.
case 'i':
case 'I':
- if (PP.getLangOptions().MicrosoftExt) {
+ if (PP.getLangOpts().MicrosoftExt) {
if (isFPConstant || isLong || isLongLong) break;
// Allow i8, i16, i32, i64, and i128.
@@ -509,13 +537,20 @@ NumericLiteralParser(const char *begin, const char *end,
isImaginary = true;
continue; // Success.
}
- // If we reached here, there was an error.
+ // If we reached here, there was an error or a ud-suffix.
break;
}
- // Report an error if there are any.
if (s != ThisTokEnd) {
- PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
+ if (PP.getLangOpts().CPlusPlus0x && s == SuffixBegin && *s == '_') {
+ // We have a ud-suffix! By C++11 [lex.ext]p10, ud-suffixes not starting
+ // with an '_' are ill-formed.
+ saw_ud_suffix = true;
+ return;
+ }
+
+ // Report an error if there are any.
+ PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, SuffixBegin-begin),
isFPConstant ? diag::err_invalid_suffix_float_constant :
diag::err_invalid_suffix_integer_constant)
<< StringRef(SuffixBegin, ThisTokEnd-SuffixBegin);
@@ -539,13 +574,24 @@ void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
radix = 16;
DigitsBegin = s;
s = SkipHexDigits(s);
+ bool noSignificand = (s == DigitsBegin);
if (s == ThisTokEnd) {
// Done.
} else if (*s == '.') {
s++;
saw_period = true;
+ const char *floatDigitsBegin = s;
s = SkipHexDigits(s);
+ noSignificand &= (floatDigitsBegin == s);
+ }
+
+ if (noSignificand) {
+ PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), \
+ diag::err_hexconstant_requires_digits);
+ hadError = true;
+ return;
}
+
// A binary exponent can appear with or with a '.'. If dotted, the
// binary exponent is required.
if (*s == 'p' || *s == 'P') {
@@ -562,7 +608,7 @@ void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
}
s = first_non_digit;
- if (!PP.getLangOptions().HexFloats)
+ if (!PP.getLangOpts().HexFloats)
PP.Diag(TokLoc, diag::ext_hexconstant_invalid);
} else if (saw_period) {
PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
@@ -710,7 +756,11 @@ NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
}
-/// character-literal: [C++0x lex.ccon]
+/// user-defined-character-literal: [C++11 lex.ext]
+/// character-literal ud-suffix
+/// ud-suffix:
+/// identifier
+/// character-literal: [C++11 lex.ccon]
/// ' c-char-sequence '
/// u' c-char-sequence '
/// U' c-char-sequence '
@@ -723,7 +773,7 @@ NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
/// backslash \, or new-line character
/// escape-sequence
/// universal-character-name
-/// escape-sequence: [C++0x lex.ccon]
+/// escape-sequence:
/// simple-escape-sequence
/// octal-escape-sequence
/// hexadecimal-escape-sequence
@@ -736,7 +786,7 @@ NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
/// hexadecimal-escape-sequence:
/// \x hexadecimal-digit
/// hexadecimal-escape-sequence hexadecimal-digit
-/// universal-character-name:
+/// universal-character-name: [C++11 lex.charset]
/// \u hex-quad
/// \U hex-quad hex-quad
/// hex-quad:
@@ -745,14 +795,15 @@ NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
SourceLocation Loc, Preprocessor &PP,
tok::TokenKind kind) {
- // At this point we know that the character matches the regex "L?'.*'".
+ // At this point we know that the character matches the regex "(L|u|U)?'.*'".
HadError = false;
Kind = kind;
- // Determine if this is a wide or UTF character.
- if (Kind == tok::wide_char_constant || Kind == tok::utf16_char_constant ||
- Kind == tok::utf32_char_constant) {
+ const char *TokBegin = begin;
+
+ // Skip over wide character determinant.
+ if (Kind != tok::char_constant) {
++begin;
}
@@ -760,6 +811,20 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
assert(begin[0] == '\'' && "Invalid token lexed");
++begin;
+ // Remove an optional ud-suffix.
+ if (end[-1] != '\'') {
+ const char *UDSuffixEnd = end;
+ do {
+ --end;
+ } while (end[-1] != '\'');
+ UDSuffixBuf.assign(end, UDSuffixEnd);
+ UDSuffixOffset = end - TokBegin;
+ }
+
+ // Trim the ending quote.
+ assert(end != begin && "Invalid token lexed");
+ --end;
+
// FIXME: The "Value" is an uint64_t so we can handle char literals of
// up to 64-bits.
// FIXME: This extensively assumes that 'char' is 8-bits.
@@ -771,76 +836,129 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
"Assumes sizeof(wchar) on target is <= 64");
- // This is what we will use for overflow detection
- llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
+ SmallVector<uint32_t,4> codepoint_buffer;
+ codepoint_buffer.resize(end-begin);
+ uint32_t *buffer_begin = &codepoint_buffer.front();
+ uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
+
+ // Unicode escapes representing characters that cannot be correctly
+ // represented in a single code unit are disallowed in character literals
+ // by this implementation.
+ uint32_t largest_character_for_kind;
+ if (tok::wide_char_constant == Kind) {
+ largest_character_for_kind = 0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
+ } else if (tok::utf16_char_constant == Kind) {
+ largest_character_for_kind = 0xFFFF;
+ } else if (tok::utf32_char_constant == Kind) {
+ largest_character_for_kind = 0x10FFFF;
+ } else {
+ largest_character_for_kind = 0x7Fu;
+ }
- unsigned NumCharsSoFar = 0;
- bool Warned = false;
- while (begin[0] != '\'') {
- uint64_t ResultChar;
-
- // Is this a Universal Character Name escape?
- if (begin[0] != '\\') // If this is a normal character, consume it.
- ResultChar = (unsigned char)*begin++;
- else { // Otherwise, this is an escape character.
- unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
- // Check for UCN.
- if (begin[1] == 'u' || begin[1] == 'U') {
- uint32_t utf32 = 0;
- unsigned short UcnLen = 0;
- if (!ProcessUCNEscape(begin, end, utf32, UcnLen,
- FullSourceLoc(Loc, PP.getSourceManager()),
- &PP.getDiagnostics(), PP.getLangOptions())) {
- HadError = 1;
+ while (begin!=end) {
+ // Is this a span of non-escape characters?
+ if (begin[0] != '\\') {
+ char const *start = begin;
+ do {
+ ++begin;
+ } while (begin != end && *begin != '\\');
+
+ char const *tmp_in_start = start;
+ uint32_t *tmp_out_start = buffer_begin;
+ ConversionResult res =
+ ConvertUTF8toUTF32(reinterpret_cast<UTF8 const **>(&start),
+ reinterpret_cast<UTF8 const *>(begin),
+ &buffer_begin,buffer_end,strictConversion);
+ if (res!=conversionOK) {
+ // If we see bad encoding for unprefixed character literals, warn and
+ // simply copy the byte values, for compatibility with gcc and
+ // older versions of clang.
+ bool NoErrorOnBadEncoding = isAscii();
+ unsigned Msg = diag::err_bad_character_encoding;
+ if (NoErrorOnBadEncoding)
+ Msg = diag::warn_bad_character_encoding;
+ PP.Diag(Loc, Msg);
+ if (NoErrorOnBadEncoding) {
+ start = tmp_in_start;
+ buffer_begin = tmp_out_start;
+ for ( ; start != begin; ++start, ++buffer_begin)
+ *buffer_begin = static_cast<uint8_t>(*start);
+ } else {
+ HadError = true;
}
- ResultChar = utf32;
- if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
- PP.Diag(Loc, diag::warn_ucn_escape_too_large);
- ResultChar &= ~0U >> (32-CharWidth);
- }
- } else {
- // Otherwise, this is a non-UCN escape character. Process it.
- ResultChar = ProcessCharEscape(begin, end, HadError,
- FullSourceLoc(Loc,PP.getSourceManager()),
- CharWidth, &PP.getDiagnostics());
- }
- }
-
- // If this is a multi-character constant (e.g. 'abc'), handle it. These are
- // implementation defined (C99 6.4.4.4p10).
- if (NumCharsSoFar) {
- if (!isAscii()) {
- // Emulate GCC's (unintentional?) behavior: L'ab' -> L'b'.
- LitVal = 0;
} else {
- // Narrow character literals act as though their value is concatenated
- // in this implementation, but warn on overflow.
- if (LitVal.countLeadingZeros() < 8 && !Warned) {
- PP.Diag(Loc, diag::warn_char_constant_too_large);
- Warned = true;
+ for (; tmp_out_start <buffer_begin; ++tmp_out_start) {
+ if (*tmp_out_start > largest_character_for_kind) {
+ HadError = true;
+ PP.Diag(Loc, diag::err_character_too_large);
+ }
}
- LitVal <<= 8;
}
+
+ continue;
}
+ // Is this a Universal Character Name excape?
+ if (begin[1] == 'u' || begin[1] == 'U') {
+ unsigned short UcnLen = 0;
+ if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
+ FullSourceLoc(Loc, PP.getSourceManager()),
+ &PP.getDiagnostics(), PP.getLangOpts(),
+ true))
+ {
+ HadError = true;
+ } else if (*buffer_begin > largest_character_for_kind) {
+ HadError = true;
+ PP.Diag(Loc,diag::err_character_too_large);
+ }
- LitVal = LitVal + ResultChar;
- ++NumCharsSoFar;
+ ++buffer_begin;
+ continue;
+ }
+ unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
+ uint64_t result =
+ ProcessCharEscape(begin, end, HadError,
+ FullSourceLoc(Loc,PP.getSourceManager()),
+ CharWidth, &PP.getDiagnostics());
+ *buffer_begin++ = result;
}
- // If this is the second character being processed, do special handling.
+ unsigned NumCharsSoFar = buffer_begin-&codepoint_buffer.front();
+
if (NumCharsSoFar > 1) {
- // Warn about discarding the top bits for multi-char wide-character
- // constants (L'abcd').
- if (!isAscii())
+ if (isWide())
PP.Diag(Loc, diag::warn_extraneous_char_constant);
- else if (NumCharsSoFar != 4)
+ else if (isAscii() && NumCharsSoFar == 4)
+ PP.Diag(Loc, diag::ext_four_char_character_literal);
+ else if (isAscii())
PP.Diag(Loc, diag::ext_multichar_character_literal);
else
- PP.Diag(Loc, diag::ext_four_char_character_literal);
+ PP.Diag(Loc, diag::err_multichar_utf_character_literal);
IsMultiChar = true;
} else
IsMultiChar = false;
+ llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
+
+ // Narrow character literals act as though their value is concatenated
+ // in this implementation, but warn on overflow.
+ bool multi_char_too_long = false;
+ if (isAscii() && isMultiChar()) {
+ LitVal = 0;
+ for (size_t i=0;i<NumCharsSoFar;++i) {
+ // check for enough leading zeros to shift into
+ multi_char_too_long |= (LitVal.countLeadingZeros() < 8);
+ LitVal <<= 8;
+ LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
+ }
+ } else if (NumCharsSoFar > 0) {
+ // otherwise just take the last character
+ LitVal = buffer_begin[-1];
+ }
+
+ if (!HadError && multi_char_too_long) {
+ PP.Diag(Loc,diag::warn_char_constant_too_large);
+ }
+
// Transfer the value from APInt to uint64_t
Value = LitVal.getZExtValue();
@@ -849,7 +967,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
// character constants are not sign extended in the this implementation:
// '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
- PP.getLangOptions().CharIsSigned)
+ PP.getLangOpts().CharIsSigned)
Value = (signed char)Value;
}
@@ -909,7 +1027,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
StringLiteralParser::
StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
Preprocessor &PP, bool Complain)
- : SM(PP.getSourceManager()), Features(PP.getLangOptions()),
+ : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() : 0),
MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
@@ -985,7 +1103,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
ResultBuf.resize(SizeBound);
// Likewise, but for each string piece.
- llvm::SmallString<512> TokenBuf;
+ SmallString<512> TokenBuf;
TokenBuf.resize(MaxTokenLength);
// Loop over all the strings, getting their spelling, and expanding them to
@@ -994,6 +1112,8 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
Pascal = false;
+ SourceLocation UDSuffixTokLoc;
+
for (unsigned i = 0, e = NumStringToks; i != e; ++i) {
const char *ThisTokBuf = &TokenBuf[0];
// Get the spelling of the token, which eliminates trigraphs, etc. We know
@@ -1008,7 +1128,42 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
continue;
}
- const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1; // Skip end quote.
+ const char *ThisTokBegin = ThisTokBuf;
+ const char *ThisTokEnd = ThisTokBuf+ThisTokLen;
+
+ // Remove an optional ud-suffix.
+ if (ThisTokEnd[-1] != '"') {
+ const char *UDSuffixEnd = ThisTokEnd;
+ do {
+ --ThisTokEnd;
+ } while (ThisTokEnd[-1] != '"');
+
+ StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
+
+ if (UDSuffixBuf.empty()) {
+ UDSuffixBuf.assign(UDSuffix);
+ UDSuffixToken = i;
+ UDSuffixOffset = ThisTokEnd - ThisTokBuf;
+ UDSuffixTokLoc = StringToks[i].getLocation();
+ } else if (!UDSuffixBuf.equals(UDSuffix)) {
+ // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
+ // result of a concatenation involving at least one user-defined-string-
+ // literal, all the participating user-defined-string-literals shall
+ // have the same ud-suffix.
+ if (Diags) {
+ SourceLocation TokLoc = StringToks[i].getLocation();
+ Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
+ << UDSuffixBuf << UDSuffix
+ << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
+ << SourceRange(TokLoc, TokLoc);
+ }
+ hadError = true;
+ }
+ }
+
+ // Strip the end quote.
+ --ThisTokEnd;
+
// TODO: Input character set mapping support.
// Skip marker for wide or unicode strings.
@@ -1028,12 +1183,14 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
++ThisTokBuf;
++ThisTokBuf; // skip '('
- // remove same number of characters from the end
- if (ThisTokEnd >= ThisTokBuf + (ThisTokBuf - Prefix))
- ThisTokEnd -= (ThisTokBuf - Prefix);
+ // Remove same number of characters from the end
+ ThisTokEnd -= ThisTokBuf - Prefix;
+ assert(ThisTokEnd >= ThisTokBuf && "malformed raw string literal");
// Copy the string over
- CopyStringFragment(StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf));
+ if (CopyStringFragment(StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf)))
+ if (DiagnoseBadString(StringToks[i]))
+ hadError = true;
} else {
assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
++ThisTokBuf; // skip "
@@ -1060,13 +1217,16 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
} while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
// Copy the character span over.
- CopyStringFragment(StringRef(InStart, ThisTokBuf - InStart));
+ if (CopyStringFragment(StringRef(InStart, ThisTokBuf - InStart)))
+ if (DiagnoseBadString(StringToks[i]))
+ hadError = true;
continue;
}
// Is this a Universal Character Name escape?
if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
- EncodeUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr,
- hadError, FullSourceLoc(StringToks[i].getLocation(),SM),
+ EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
+ ResultPtr, hadError,
+ FullSourceLoc(StringToks[i].getLocation(), SM),
CharByteWidth, Diags, Features);
continue;
}
@@ -1076,18 +1236,41 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
FullSourceLoc(StringToks[i].getLocation(), SM),
CharByteWidth*8, Diags);
- // Note: our internal rep of wide char tokens is always little-endian.
- *ResultPtr++ = ResultChar & 0xFF;
-
- for (unsigned i = 1, e = CharByteWidth; i != e; ++i)
- *ResultPtr++ = ResultChar >> i*8;
+ if (CharByteWidth == 4) {
+ // FIXME: Make the type of the result buffer correct instead of
+ // using reinterpret_cast.
+ UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultPtr);
+ *ResultWidePtr = ResultChar;
+ ResultPtr += 4;
+ } else if (CharByteWidth == 2) {
+ // FIXME: Make the type of the result buffer correct instead of
+ // using reinterpret_cast.
+ UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultPtr);
+ *ResultWidePtr = ResultChar & 0xFFFF;
+ ResultPtr += 2;
+ } else {
+ assert(CharByteWidth == 1 && "Unexpected char width");
+ *ResultPtr++ = ResultChar & 0xFF;
+ }
}
}
}
if (Pascal) {
- ResultBuf[0] = ResultPtr-&ResultBuf[0]-1;
- ResultBuf[0] /= CharByteWidth;
+ if (CharByteWidth == 4) {
+ // FIXME: Make the type of the result buffer correct instead of
+ // using reinterpret_cast.
+ UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultBuf.data());
+ ResultWidePtr[0] = GetNumStringChars() - 1;
+ } else if (CharByteWidth == 2) {
+ // FIXME: Make the type of the result buffer correct instead of
+ // using reinterpret_cast.
+ UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultBuf.data());
+ ResultWidePtr[0] = GetNumStringChars() - 1;
+ } else {
+ assert(CharByteWidth == 1 && "Unexpected char width");
+ ResultBuf[0] = GetNumStringChars() - 1;
+ }
// Verify that pascal strings aren't too large.
if (GetStringLength() > 256) {
@@ -1116,22 +1299,55 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
/// copyStringFragment - This function copies from Start to End into ResultPtr.
/// Performs widening for multi-byte characters.
-void StringLiteralParser::CopyStringFragment(StringRef Fragment) {
+bool StringLiteralParser::CopyStringFragment(StringRef Fragment) {
+ assert(CharByteWidth==1 || CharByteWidth==2 || CharByteWidth==4);
+ ConversionResult result = conversionOK;
// Copy the character span over.
if (CharByteWidth == 1) {
+ if (!isLegalUTF8String(reinterpret_cast<const UTF8*>(Fragment.begin()),
+ reinterpret_cast<const UTF8*>(Fragment.end())))
+ result = sourceIllegal;
memcpy(ResultPtr, Fragment.data(), Fragment.size());
ResultPtr += Fragment.size();
- } else {
- // Note: our internal rep of wide char tokens is always little-endian.
- for (StringRef::iterator I=Fragment.begin(), E=Fragment.end(); I!=E; ++I) {
- *ResultPtr++ = *I;
- // Add zeros at the end.
- for (unsigned i = 1, e = CharByteWidth; i != e; ++i)
- *ResultPtr++ = 0;
- }
+ } else if (CharByteWidth == 2) {
+ UTF8 const *sourceStart = (UTF8 const *)Fragment.data();
+ // FIXME: Make the type of the result buffer correct instead of
+ // using reinterpret_cast.
+ UTF16 *targetStart = reinterpret_cast<UTF16*>(ResultPtr);
+ ConversionFlags flags = strictConversion;
+ result = ConvertUTF8toUTF16(
+ &sourceStart,sourceStart + Fragment.size(),
+ &targetStart,targetStart + 2*Fragment.size(),flags);
+ if (result==conversionOK)
+ ResultPtr = reinterpret_cast<char*>(targetStart);
+ } else if (CharByteWidth == 4) {
+ UTF8 const *sourceStart = (UTF8 const *)Fragment.data();
+ // FIXME: Make the type of the result buffer correct instead of
+ // using reinterpret_cast.
+ UTF32 *targetStart = reinterpret_cast<UTF32*>(ResultPtr);
+ ConversionFlags flags = strictConversion;
+ result = ConvertUTF8toUTF32(
+ &sourceStart,sourceStart + Fragment.size(),
+ &targetStart,targetStart + 4*Fragment.size(),flags);
+ if (result==conversionOK)
+ ResultPtr = reinterpret_cast<char*>(targetStart);
}
+ assert((result != targetExhausted)
+ && "ConvertUTF8toUTFXX exhausted target buffer");
+ return result != conversionOK;
}
+bool StringLiteralParser::DiagnoseBadString(const Token &Tok) {
+ // If we see bad encoding for unprefixed string literals, warn and
+ // simply copy the byte values, for compatibility with gcc and older
+ // versions of clang.
+ bool NoErrorOnBadEncoding = isAscii();
+ unsigned Msg = NoErrorOnBadEncoding ? diag::warn_bad_string_encoding :
+ diag::err_bad_string_encoding;
+ if (Diags)
+ Diags->Report(FullSourceLoc(Tok.getLocation(), SM), Msg);
+ return !NoErrorOnBadEncoding;
+}
/// getOffsetOfStringByte - This function returns the offset of the
/// specified byte of the string data represented by Token. This handles
@@ -1139,7 +1355,7 @@ void StringLiteralParser::CopyStringFragment(StringRef Fragment) {
unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
unsigned ByteNo) const {
// Get the spelling of the token.
- llvm::SmallString<32> SpellingBuffer;
+ SmallString<32> SpellingBuffer;
SpellingBuffer.resize(Tok.getLength());
bool StringInvalid = false;
OpenPOWER on IntegriCloud