summary | refs | log | tree | commit | diff | stats
path: root/lib/Lex/LiteralSupport.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Lex/LiteralSupport.cpp')
-rw-r--r-- lib/Lex/LiteralSupport.cpp 275
1 file changed, 174 insertions, 101 deletions
diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp
index fb543d0..16d7b36 100644
--- a/lib/Lex/LiteralSupport.cpp
+++ b/lib/Lex/LiteralSupport.cpp
@@ -33,8 +33,8 @@ static int HexDigitValue(char C) {
/// either a character or a string literal.
static unsigned ProcessCharEscape(const char *&ThisTokBuf,
const char *ThisTokEnd, bool &HadError,
- SourceLocation Loc, bool IsWide,
- Preprocessor &PP, bool Complain) {
+ FullSourceLoc Loc, bool IsWide,
+ Diagnostic *Diags, const TargetInfo &Target) {
// Skip the '\' char.
++ThisTokBuf;
@@ -54,13 +54,13 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
ResultChar = 8;
break;
case 'e':
- if (Complain)
- PP.Diag(Loc, diag::ext_nonstandard_escape) << "e";
+ if (Diags)
+ Diags->Report(Loc, diag::ext_nonstandard_escape) << "e";
ResultChar = 27;
break;
case 'E':
- if (Complain)
- PP.Diag(Loc, diag::ext_nonstandard_escape) << "E";
+ if (Diags)
+ Diags->Report(Loc, diag::ext_nonstandard_escape) << "E";
ResultChar = 27;
break;
case 'f':
@@ -81,8 +81,8 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
case 'x': { // Hex escape.
ResultChar = 0;
if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
- if (Complain)
- PP.Diag(Loc, diag::err_hex_escape_no_digits);
+ if (Diags)
+ Diags->Report(Loc, diag::err_hex_escape_no_digits);
HadError = 1;
break;
}
@@ -99,9 +99,8 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
}
// See if any bits will be truncated when evaluated as a character.
- unsigned CharWidth = IsWide
- ? PP.getTargetInfo().getWCharWidth()
- : PP.getTargetInfo().getCharWidth();
+ unsigned CharWidth =
+ IsWide ? Target.getWCharWidth() : Target.getCharWidth();
if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
Overflow = true;
@@ -109,8 +108,8 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
}
// Check for overflow.
- if (Overflow && Complain) // Too many digits to fit in
- PP.Diag(Loc, diag::warn_hex_escape_too_large);
+ if (Overflow && Diags) // Too many digits to fit in
+ Diags->Report(Loc, diag::warn_hex_escape_too_large);
break;
}
case '0': case '1': case '2': case '3':
@@ -130,13 +129,12 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
// Check for overflow. Reject '\777', but not L'\777'.
- unsigned CharWidth = IsWide
- ? PP.getTargetInfo().getWCharWidth()
- : PP.getTargetInfo().getCharWidth();
+ unsigned CharWidth =
+ IsWide ? Target.getWCharWidth() : Target.getCharWidth();
if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
- if (Complain)
- PP.Diag(Loc, diag::warn_octal_escape_too_large);
+ if (Diags)
+ Diags->Report(Loc, diag::warn_octal_escape_too_large);
ResultChar &= ~0U >> (32-CharWidth);
}
break;
@@ -145,18 +143,20 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
// Otherwise, these are not valid escapes.
case '(': case '{': case '[': case '%':
// GCC accepts these as extensions. We warn about them as such though.
- if (Complain)
- PP.Diag(Loc, diag::ext_nonstandard_escape)
+ if (Diags)
+ Diags->Report(Loc, diag::ext_nonstandard_escape)
<< std::string()+(char)ResultChar;
break;
default:
- if (!Complain)
+ if (Diags == 0)
break;
- if (isgraph(ThisTokBuf[0]))
- PP.Diag(Loc, diag::ext_unknown_escape) << std::string()+(char)ResultChar;
+ if (isgraph(ResultChar))
+ Diags->Report(Loc, diag::ext_unknown_escape)
+ << std::string()+(char)ResultChar;
else
- PP.Diag(Loc, diag::ext_unknown_escape) << "x"+llvm::utohexstr(ResultChar);
+ Diags->Report(Loc, diag::ext_unknown_escape)
+ << "x"+llvm::utohexstr(ResultChar);
break;
}
@@ -164,16 +164,13 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
}
/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
-/// convert the UTF32 to UTF8. This is a subroutine of StringLiteralParser.
-/// When we decide to implement UCN's for character constants and identifiers,
-/// we will likely rework our support for UCN's.
-static void ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
- char *&ResultBuf, bool &HadError,
- SourceLocation Loc, Preprocessor &PP,
- bool wide,
- bool Complain) {
- // FIXME: Add a warning - UCN's are only valid in C++ & C99.
- // FIXME: Handle wide strings.
+/// return the UTF32.
+static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
+ uint32_t &UcnVal, unsigned short &UcnLen,
+ FullSourceLoc Loc, Diagnostic *Diags,
+ const LangOptions &Features) {
+ if (!Features.CPlusPlus && !Features.C99 && Diags)
+ Diags->Report(Loc, diag::warn_ucn_not_valid_in_c89);
// Save the beginning of the string (for error diagnostics).
const char *ThisTokBegin = ThisTokBuf;
@@ -182,49 +179,87 @@ static void ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
ThisTokBuf += 2;
if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
- if (Complain)
- PP.Diag(Loc, diag::err_ucn_escape_no_digits);
- HadError = 1;
- return;
+ if (Diags)
+ Diags->Report(Loc, diag::err_ucn_escape_no_digits);
+ return false;
}
- typedef uint32_t UTF32;
-
- UTF32 UcnVal = 0;
- unsigned short UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
+ UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
unsigned short UcnLenSave = UcnLen;
- for (; ThisTokBuf != ThisTokEnd && UcnLen; ++ThisTokBuf, UcnLen--) {
+ for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
int CharVal = HexDigitValue(ThisTokBuf[0]);
if (CharVal == -1) break;
UcnVal <<= 4;
UcnVal |= CharVal;
}
// If we didn't consume the proper number of digits, there is a problem.
- if (UcnLen) {
- if (Complain)
- PP.Diag(PP.AdvanceToTokenCharacter(Loc, ThisTokBuf-ThisTokBegin),
- diag::err_ucn_escape_incomplete);
- HadError = 1;
- return;
+ if (UcnLenSave) {
+ if (Diags) {
+ SourceLocation L =
+ Lexer::AdvanceToTokenCharacter(Loc, ThisTokBuf-ThisTokBegin,
+ Loc.getManager(), Features);
+ Diags->Report(FullSourceLoc(L, Loc.getManager()),
+ diag::err_ucn_escape_incomplete);
+ }
+ return false;
}
// Check UCN constraints (C99 6.4.3p2).
if ((UcnVal < 0xa0 &&
(UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60 )) // $, @, `
|| (UcnVal >= 0xD800 && UcnVal <= 0xDFFF)
|| (UcnVal > 0x10FFFF)) /* the maximum legal UTF32 value */ {
- if (Complain)
- PP.Diag(Loc, diag::err_ucn_escape_invalid);
+ if (Diags)
+ Diags->Report(Loc, diag::err_ucn_escape_invalid);
+ return false;
+ }
+ return true;
+}
+
+/// EncodeUCNEscape - Read the Universal Character Name, check constraints and
+/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
+/// StringLiteralParser. When we decide to implement UCN's for identifiers,
+/// we will likely rework our support for UCN's.
+static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
+ char *&ResultBuf, bool &HadError,
+ FullSourceLoc Loc, bool wide, Diagnostic *Diags,
+ const LangOptions &Features) {
+ typedef uint32_t UTF32;
+ UTF32 UcnVal = 0;
+ unsigned short UcnLen = 0;
+ if (!ProcessUCNEscape(ThisTokBuf, ThisTokEnd, UcnVal, UcnLen, Loc, Diags,
+ Features)) {
HadError = 1;
return;
}
+
if (wide) {
- (void)UcnLenSave;
- assert(UcnLenSave == 4 &&
- "ProcessUCNEscape - only ucn length of 4 supported");
- // little endian assumed.
- *ResultBuf++ = (UcnVal & 0x000000FF);
- *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
- *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16;
- *ResultBuf++ = (UcnVal & 0xFF000000) >> 24;
+ (void)UcnLen;
+ assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
+
+ if (!Features.ShortWChar) {
+ // Note: our internal rep of wide char tokens is always little-endian.
+ *ResultBuf++ = (UcnVal & 0x000000FF);
+ *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
+ *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16;
+ *ResultBuf++ = (UcnVal & 0xFF000000) >> 24;
+ return;
+ }
+
+ // Convert to UTF16.
+ if (UcnVal < (UTF32)0xFFFF) {
+ *ResultBuf++ = (UcnVal & 0x000000FF);
+ *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
+ return;
+ }
+ if (Diags) Diags->Report(Loc, diag::warn_ucn_escape_too_large);
+
+ typedef uint16_t UTF16;
+ UcnVal -= 0x10000;
+ UTF16 surrogate1 = 0xD800 + (UcnVal >> 10);
+ UTF16 surrogate2 = 0xDC00 + (UcnVal & 0x3FF);
+ *ResultBuf++ = (surrogate1 & 0x000000FF);
+ *ResultBuf++ = (surrogate1 & 0x0000FF00) >> 8;
+ *ResultBuf++ = (surrogate2 & 0x000000FF);
+ *ResultBuf++ = (surrogate2 & 0x0000FF00) >> 8;
return;
}
// Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
@@ -398,6 +433,7 @@ NumericLiteralParser(const char *begin, const char *end,
}
continue; // Success.
case 'i':
+ case 'I':
if (PP.getLangOptions().Microsoft) {
if (isFPConstant || isLong || isLongLong) break;
@@ -410,22 +446,33 @@ NumericLiteralParser(const char *begin, const char *end,
break;
case '1':
if (s + 2 == ThisTokEnd) break;
- if (s[2] == '6') s += 3; // i16 suffix
+ if (s[2] == '6') {
+ s += 3; // i16 suffix
+ isMicrosoftInteger = true;
+ }
else if (s[2] == '2') {
if (s + 3 == ThisTokEnd) break;
- if (s[3] == '8') s += 4; // i128 suffix
+ if (s[3] == '8') {
+ s += 4; // i128 suffix
+ isMicrosoftInteger = true;
+ }
}
- isMicrosoftInteger = true;
break;
case '3':
if (s + 2 == ThisTokEnd) break;
- if (s[2] == '2') s += 3; // i32 suffix
- isMicrosoftInteger = true;
+ if (s[2] == '2') {
+ s += 3; // i32 suffix
+ isLong = true;
+ isMicrosoftInteger = true;
+ }
break;
case '6':
if (s + 2 == ThisTokEnd) break;
- if (s[2] == '4') s += 3; // i64 suffix
- isMicrosoftInteger = true;
+ if (s[2] == '4') {
+ s += 3; // i64 suffix
+ isLongLong = true;
+ isMicrosoftInteger = true;
+ }
break;
default:
break;
@@ -434,7 +481,6 @@ NumericLiteralParser(const char *begin, const char *end,
}
}
// fall through.
- case 'I':
case 'j':
case 'J':
if (isImaginary) break; // Cannot be repeated.
@@ -681,11 +727,29 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
bool Warned = false;
while (begin[0] != '\'') {
uint64_t ResultChar;
+
+ // Is this a Universal Character Name escape?
if (begin[0] != '\\') // If this is a normal character, consume it.
ResultChar = *begin++;
- else // Otherwise, this is an escape character.
- ResultChar = ProcessCharEscape(begin, end, HadError, Loc, IsWide, PP,
- /*Complain=*/true);
+ else { // Otherwise, this is an escape character.
+ // Check for UCN.
+ if (begin[1] == 'u' || begin[1] == 'U') {
+ uint32_t utf32 = 0;
+ unsigned short UcnLen = 0;
+ if (!ProcessUCNEscape(begin, end, utf32, UcnLen,
+ FullSourceLoc(Loc, PP.getSourceManager()),
+ &PP.getDiagnostics(), PP.getLangOptions())) {
+ HadError = 1;
+ }
+ ResultChar = utf32;
+ } else {
+ // Otherwise, this is a non-UCN escape character. Process it.
+ ResultChar = ProcessCharEscape(begin, end, HadError,
+ FullSourceLoc(Loc,PP.getSourceManager()),
+ IsWide,
+ &PP.getDiagnostics(), PP.getTargetInfo());
+ }
+ }
// If this is a multi-character constant (e.g. 'abc'), handle it. These are
// implementation defined (C99 6.4.4.4p10).
@@ -725,6 +789,9 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
// Transfer the value from APInt to uint64_t
Value = LitVal.getZExtValue();
+ if (IsWide && PP.getLangOptions().ShortWChar && Value > 0xFFFF)
+ PP.Diag(Loc, diag::warn_ucn_escape_too_large);
+
// If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
// if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple
// character constants are not sign extended in the this implementation:
@@ -771,7 +838,13 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
///
StringLiteralParser::
StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
- Preprocessor &pp, bool Complain) : PP(pp) {
+ Preprocessor &PP, bool Complain)
+ : SM(PP.getSourceManager()), Features(PP.getLangOptions()),
+ Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() : 0) {
+ init(StringToks, NumStringToks);
+}
+
+void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
// Scan all of the string portions, remember the max individual token length,
// computing a bound on the concatenated string length, and see whether any
// piece is a wide-string. If any of the string portions is a wide-string
@@ -806,7 +879,7 @@ StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
// query the target. As such, wchar_tByteWidth is only valid if AnyWide=true.
wchar_tByteWidth = ~0U;
if (AnyWide) {
- wchar_tByteWidth = PP.getTargetInfo().getWCharWidth();
+ wchar_tByteWidth = Target.getWCharWidth();
assert((wchar_tByteWidth & 7) == 0 && "Assumes wchar_t is byte multiple!");
wchar_tByteWidth /= 8;
}
@@ -835,8 +908,9 @@ StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
// that ThisTokBuf points to a buffer that is big enough for the whole token
// and 'spelled' tokens can only shrink.
bool StringInvalid = false;
- unsigned ThisTokLen = PP.getSpelling(StringToks[i], ThisTokBuf,
- &StringInvalid);
+ unsigned ThisTokLen =
+ Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
+ &StringInvalid);
if (StringInvalid) {
hadError = 1;
continue;
@@ -856,7 +930,7 @@ StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
++ThisTokBuf;
// Check if this is a pascal string
- if (pp.getLangOptions().PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
+ if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
// If the \p sequence is found in the first token, we have a pascal string
@@ -894,15 +968,16 @@ StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
}
// Is this a Universal Character Name escape?
if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
- ProcessUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr,
- hadError, StringToks[i].getLocation(), PP, wide,
- Complain);
+ EncodeUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr,
+ hadError, FullSourceLoc(StringToks[i].getLocation(),SM),
+ wide, Diags, Features);
continue;
}
// Otherwise, this is a non-UCN escape character. Process it.
- unsigned ResultChar = ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
- StringToks[i].getLocation(),
- AnyWide, PP, Complain);
+ unsigned ResultChar =
+ ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
+ FullSourceLoc(StringToks[i].getLocation(), SM),
+ AnyWide, Diags, Target);
// Note: our internal rep of wide char tokens is always little-endian.
*ResultPtr++ = ResultChar & 0xFF;
@@ -920,25 +995,24 @@ StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
ResultBuf[0] /= wchar_tByteWidth;
// Verify that pascal strings aren't too large.
- if (GetStringLength() > 256 && Complain) {
- PP.Diag(StringToks[0].getLocation(), diag::err_pascal_string_too_long)
- << SourceRange(StringToks[0].getLocation(),
- StringToks[NumStringToks-1].getLocation());
+ if (GetStringLength() > 256) {
+ if (Diags)
+ Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM),
+ diag::err_pascal_string_too_long)
+ << SourceRange(StringToks[0].getLocation(),
+ StringToks[NumStringToks-1].getLocation());
hadError = 1;
return;
}
- } else if (Complain) {
+ } else if (Diags) {
// Complain if this string literal has too many characters.
- unsigned MaxChars = PP.getLangOptions().CPlusPlus? 65536
- : PP.getLangOptions().C99 ? 4095
- : 509;
+ unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
if (GetNumStringChars() > MaxChars)
- PP.Diag(StringToks[0].getLocation(), diag::ext_string_too_long)
+ Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM),
+ diag::ext_string_too_long)
<< GetNumStringChars() << MaxChars
- << (PP.getLangOptions().CPlusPlus? 2
- : PP.getLangOptions().C99 ? 1
- : 0)
+ << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
<< SourceRange(StringToks[0].getLocation(),
StringToks[NumStringToks-1].getLocation());
}
@@ -949,19 +1023,17 @@ StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
/// specified byte of the string data represented by Token. This handles
/// advancing over escape sequences in the string.
unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
- unsigned ByteNo,
- Preprocessor &PP,
- bool Complain) {
+ unsigned ByteNo) const {
// Get the spelling of the token.
- llvm::SmallString<16> SpellingBuffer;
+ llvm::SmallString<32> SpellingBuffer;
SpellingBuffer.resize(Tok.getLength());
bool StringInvalid = false;
const char *SpellingPtr = &SpellingBuffer[0];
- unsigned TokLen = PP.getSpelling(Tok, SpellingPtr, &StringInvalid);
- if (StringInvalid) {
+ unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
+ &StringInvalid);
+ if (StringInvalid)
return 0;
- }
assert(SpellingPtr[0] != 'L' && "Doesn't handle wide strings yet");
@@ -987,7 +1059,8 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
// Otherwise, this is an escape character. Advance over it.
bool HadError = false;
ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,
- Tok.getLocation(), false, PP, Complain);
+ FullSourceLoc(Tok.getLocation(), SM),
+ false, Diags, Target);
assert(!HadError && "This method isn't valid on erroneous strings");
--ByteNo;
}
OpenPOWER on IntegriCloud