Diffstat (limited to 'contrib/llvm/tools/clang/lib/Lex/Lexer.cpp')
-rw-r--r--  contrib/llvm/tools/clang/lib/Lex/Lexer.cpp | 230
1 file changed, 210 insertions(+), 20 deletions(-)
diff --git a/contrib/llvm/tools/clang/lib/Lex/Lexer.cpp b/contrib/llvm/tools/clang/lib/Lex/Lexer.cpp
index 917829b..b17198b 100644
--- a/contrib/llvm/tools/clang/lib/Lex/Lexer.cpp
+++ b/contrib/llvm/tools/clang/lib/Lex/Lexer.cpp
@@ -212,6 +212,109 @@ void Lexer::Stringify(llvm::SmallVectorImpl<char> &Str) {
}
}
+//===----------------------------------------------------------------------===//
+// Token Spelling
+//===----------------------------------------------------------------------===//
+
+/// getSpelling() - Return the 'spelling' of this token.  The spelling of a
+/// token is the sequence of characters used to represent the token in the
+/// source file after trigraph expansion and escaped-newline folding.  In
+/// particular, this wants to get the true, uncanonicalized, spelling of
+/// things like digraphs, UCNs, etc.
+std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
+ const LangOptions &Features, bool *Invalid) {
+ assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
+
+ // If this token contains nothing interesting, return it directly.
+ bool CharDataInvalid = false;
+ const char* TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
+ &CharDataInvalid);
+ if (Invalid)
+ *Invalid = CharDataInvalid;
+ if (CharDataInvalid)
+ return std::string();
+
+ if (!Tok.needsCleaning())
+ return std::string(TokStart, TokStart+Tok.getLength());
+
+ std::string Result;
+ Result.reserve(Tok.getLength());
+
+  // Otherwise, this is the hard case: relex the characters into the string.
+ for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength();
+ Ptr != End; ) {
+ unsigned CharSize;
+ Result.push_back(Lexer::getCharAndSizeNoWarn(Ptr, CharSize, Features));
+ Ptr += CharSize;
+ }
+ assert(Result.size() != unsigned(Tok.getLength()) &&
+ "NeedsCleaning flag set on something that didn't need cleaning!");
+ return Result;
+}
+
+/// getSpelling - This method is used to get the spelling of a token into a
+/// preallocated buffer, instead of as an std::string. The caller is required
+/// to allocate enough space for the token, which is guaranteed to be at least
+/// Tok.getLength() bytes long. The actual length of the token is returned.
+///
+/// Note that this method may do two possible things: it may either fill in
+/// the buffer specified with characters, or it may *change the input pointer*
+/// to point to a constant buffer with the data already in it (avoiding a
+/// copy). The caller is not allowed to modify the returned buffer pointer
+/// if an internal buffer is returned.
+unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
+ const SourceManager &SourceMgr,
+ const LangOptions &Features, bool *Invalid) {
+ assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
+
+ const char *TokStart = 0;
+ // NOTE: this has to be checked *before* testing for an IdentifierInfo.
+ if (Tok.is(tok::raw_identifier))
+ TokStart = Tok.getRawIdentifierData();
+ else if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
+ // Just return the string from the identifier table, which is very quick.
+ Buffer = II->getNameStart();
+ return II->getLength();
+ }
+
+ // NOTE: this can be checked even after testing for an IdentifierInfo.
+ if (Tok.isLiteral())
+ TokStart = Tok.getLiteralData();
+
+ if (TokStart == 0) {
+ // Compute the start of the token in the input lexer buffer.
+ bool CharDataInvalid = false;
+ TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
+ if (Invalid)
+ *Invalid = CharDataInvalid;
+ if (CharDataInvalid) {
+ Buffer = "";
+ return 0;
+ }
+ }
+
+ // If this token contains nothing interesting, return it directly.
+ if (!Tok.needsCleaning()) {
+ Buffer = TokStart;
+ return Tok.getLength();
+ }
+
+  // Otherwise, this is the hard case: relex the characters into the string.
+ char *OutBuf = const_cast<char*>(Buffer);
+ for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength();
+ Ptr != End; ) {
+ unsigned CharSize;
+ *OutBuf++ = Lexer::getCharAndSizeNoWarn(Ptr, CharSize, Features);
+ Ptr += CharSize;
+ }
+ assert(unsigned(OutBuf-Buffer) != Tok.getLength() &&
+ "NeedsCleaning flag set on something that didn't need cleaning!");
+
+ return OutBuf-Buffer;
+}
+
+
+
static bool isWhitespace(unsigned char c);
/// MeasureTokenLength - Relex the token at the specified location and return
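As a side note, here is a minimal sketch of how a caller might use the buffer-based getSpelling() overload added above. It is not part of this patch; the helper name, the buffer size, and the assumption that Tok, SM, and Features come from an existing lexing setup are all illustrative.

#include "clang/Basic/SourceManager.h"
#include "clang/Lex/Lexer.h"
#include "llvm/ADT/SmallString.h"

// Illustrative helper (not from this patch): return the spelling of Tok,
// copying into Storage only when the token actually needs cleaning.
static llvm::StringRef getTokenSpelling(const clang::Token &Tok,
                                        const clang::SourceManager &SM,
                                        const clang::LangOptions &Features,
                                        llvm::SmallString<64> &Storage) {
  // The buffer must be at least Tok.getLength() bytes.  getSpelling() either
  // fills it in, or repoints Ptr at a read-only internal buffer (identifier
  // table, literal data, or the source buffer itself) to avoid the copy.
  Storage.resize(Tok.getLength());
  const char *Ptr = Storage.data();
  unsigned Len = clang::Lexer::getSpelling(Tok, Ptr, SM, Features);
  return llvm::StringRef(Ptr, Len);
}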
@@ -242,7 +345,8 @@ unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
return 0;
// Create a lexer starting at the beginning of this token.
- Lexer TheLexer(Loc, LangOpts, Buffer.begin(), StrData, Buffer.end());
+ Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
+ Buffer.begin(), StrData, Buffer.end());
TheLexer.SetCommentRetentionState(true);
Token TheTok;
TheLexer.LexFromRawLexer(TheTok);
@@ -253,6 +357,9 @@ SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
const SourceManager &SM,
const LangOptions &LangOpts) {
std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
+ if (LocInfo.first.isInvalid())
+ return Loc;
+
bool Invalid = false;
llvm::StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
if (Invalid)
@@ -261,6 +368,9 @@ SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
// Back up from the current location until we hit the beginning of a line
// (or the buffer). We'll relex from that point.
const char *BufStart = Buffer.data();
+ if (LocInfo.second >= Buffer.size())
+ return Loc;
+
const char *StrData = BufStart+LocInfo.second;
if (StrData[0] == '\n' || StrData[0] == '\r')
return Loc;
@@ -371,10 +481,9 @@ Lexer::ComputePreamble(const llvm::MemoryBuffer *Buffer, unsigned MaxLines) {
// we don't have an identifier table available. Instead, just look at
// the raw identifier to recognize and categorize preprocessor directives.
TheLexer.LexFromRawLexer(TheTok);
- if (TheTok.getKind() == tok::identifier && !TheTok.needsCleaning()) {
- const char *IdStart = Buffer->getBufferStart()
- + TheTok.getLocation().getRawEncoding() - 1;
- llvm::StringRef Keyword(IdStart, TheTok.getLength());
+ if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
+ llvm::StringRef Keyword(TheTok.getRawIdentifierData(),
+ TheTok.getLength());
PreambleDirectiveKind PDK
= llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
.Case("include", PDK_Skipped)
@@ -443,6 +552,83 @@ Lexer::ComputePreamble(const llvm::MemoryBuffer *Buffer, unsigned MaxLines) {
: TheTok.isAtStartOfLine());
}
+
+/// AdvanceToTokenCharacter - Given a location that specifies the start of a
+/// token, return a new location that specifies a character within the token.
+SourceLocation Lexer::AdvanceToTokenCharacter(SourceLocation TokStart,
+ unsigned CharNo,
+ const SourceManager &SM,
+ const LangOptions &Features) {
+ // Figure out how many physical characters away the specified instantiation
+ // character is. This needs to take into consideration newlines and
+ // trigraphs.
+ bool Invalid = false;
+ const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
+
+ // If they request the first char of the token, we're trivially done.
+ if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
+ return TokStart;
+
+ unsigned PhysOffset = 0;
+
+ // The usual case is that tokens don't contain anything interesting. Skip
+ // over the uninteresting characters. If a token only consists of simple
+ // chars, this method is extremely fast.
+ while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
+ if (CharNo == 0)
+ return TokStart.getFileLocWithOffset(PhysOffset);
+ ++TokPtr, --CharNo, ++PhysOffset;
+ }
+
+ // If we have a character that may be a trigraph or escaped newline, use a
+ // lexer to parse it correctly.
+ for (; CharNo; --CharNo) {
+ unsigned Size;
+ Lexer::getCharAndSizeNoWarn(TokPtr, Size, Features);
+ TokPtr += Size;
+ PhysOffset += Size;
+ }
+
+ // Final detail: if we end up on an escaped newline, we want to return the
+ // location of the actual byte of the token. For example foo\<newline>bar
+ // advanced by 3 should return the location of b, not of \\. One compounding
+ // detail of this is that the escape may be made by a trigraph.
+ if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
+ PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
+
+ return TokStart.getFileLocWithOffset(PhysOffset);
+}
+
+/// \brief Computes the source location just past the end of the
+/// token at this source location.
+///
+/// This routine can be used to produce a source location that
+/// points just past the end of the token referenced by \p Loc, and
+/// is generally used when a diagnostic needs to point just after a
+/// token where it expected something different from what it received. If
+/// the returned source location would not be meaningful (e.g., if
+/// it points into a macro), this routine returns an invalid
+/// source location.
+///
+/// \param Offset an offset from the end of the token to which the returned
+/// source location should refer. The default offset (0) produces a source
+/// location pointing just past the end of the token; an offset of 1 produces
+/// a source location pointing to the last character in the token, etc.
+SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
+ const SourceManager &SM,
+ const LangOptions &Features) {
+ if (Loc.isInvalid() || !Loc.isFileID())
+ return SourceLocation();
+
+ unsigned Len = Lexer::MeasureTokenLength(Loc, SM, Features);
+ if (Len > Offset)
+ Len = Len - Offset;
+ else
+ return Loc;
+
+ return AdvanceToTokenCharacter(Loc, Len, SM, Features);
+}
+
//===----------------------------------------------------------------------===//
// Character information.
//===----------------------------------------------------------------------===//
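For illustration, a hedged sketch of getLocForEndOfToken() as it might be used for fix-it placement. PP, Tok, and DiagID are assumed to come from the caller; none of this is code from the patch itself.

#include "clang/Basic/Diagnostic.h"
#include "clang/Lex/Lexer.h"
#include "clang/Lex/Preprocessor.h"

// Illustrative only: attach a "missing ';'" fix-it just past the end of Tok.
static void suggestSemiAfter(clang::Preprocessor &PP, const clang::Token &Tok,
                             unsigned DiagID) {
  clang::SourceLocation EndLoc =
      clang::Lexer::getLocForEndOfToken(Tok.getLocation(), /*Offset=*/0,
                                        PP.getSourceManager(),
                                        PP.getLangOptions());
  // getLocForEndOfToken() returns an invalid location for macro locations,
  // so bail out rather than emit a misplaced fix-it.
  if (EndLoc.isInvalid())
    return;
  PP.Diag(EndLoc, DiagID) << clang::FixItHint::CreateInsertion(EndLoc, ";");
}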
@@ -584,10 +770,8 @@ static inline bool isNumberBody(unsigned char c) {
/// lexer buffer was all instantiated at a single point, perform the mapping.
/// This is currently only used for _Pragma implementation, so it is the slow
/// path of the hot getSourceLocation method. Do not allow it to be inlined.
-static DISABLE_INLINE SourceLocation GetMappedTokenLoc(Preprocessor &PP,
- SourceLocation FileLoc,
- unsigned CharNo,
- unsigned TokLen);
+static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
+ Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
SourceLocation FileLoc,
unsigned CharNo, unsigned TokLen) {
@@ -869,19 +1053,17 @@ void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
if (C != '\\' && C != '?' && (C != '$' || !Features.DollarIdents)) {
FinishIdentifier:
const char *IdStart = BufferPtr;
- FormTokenWithChars(Result, CurPtr, tok::identifier);
+ FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
+ Result.setRawIdentifierData(IdStart);
// If we are in raw mode, return this identifier raw. There is no need to
// look up identifier information or attempt to macro expand it.
- if (LexingRawMode) return;
-
- // Fill in Result.IdentifierInfo, looking up the identifier in the
- // identifier table.
- IdentifierInfo *II = PP->LookUpIdentifierInfo(Result, IdStart);
+ if (LexingRawMode)
+ return;
- // Change the kind of this identifier to the appropriate token kind, e.g.
- // turning "for" into a keyword.
- Result.setKind(II->getTokenID());
+ // Fill in Result.IdentifierInfo and update the token kind,
+ // looking up the identifier in the identifier table.
+ IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
// Finally, now that we know we have an identifier, pass this off to the
// preprocessor, which may macro expand it or something.
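To show what the raw_identifier change above means for clients, here is a sketch under an assumed setup (FID, SM, and LangOpts exist elsewhere; this is not code from the patch): in raw mode the lexer now produces tok::raw_identifier tokens whose text is reached through getRawIdentifierData(), and only non-raw lexing fills in the IdentifierInfo via LookUpIdentifierInfo(Result).

#include "clang/Basic/SourceManager.h"
#include "clang/Lex/Lexer.h"
#include "llvm/Support/raw_ostream.h"

// Sketch: raw-lex a file and inspect identifier text without consulting an
// identifier table.
static void dumpRawIdentifiers(clang::FileID FID,
                               const clang::SourceManager &SM,
                               const clang::LangOptions &LangOpts) {
  clang::Lexer RawLex(FID, SM.getBuffer(FID), SM, LangOpts);
  clang::Token Tok;
  do {
    RawLex.LexFromRawLexer(Tok);
    if (Tok.is(clang::tok::raw_identifier)) {
      // With this patch, raw mode hands back the identifier's characters
      // directly, much as ComputePreamble() consumes them above.
      llvm::StringRef Text(Tok.getRawIdentifierData(), Tok.getLength());
      llvm::errs() << Text << "\n";
    }
  } while (Tok.isNot(clang::tok::eof));
}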
@@ -980,7 +1162,7 @@ void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) {
if (C == 0 && PP && PP->isCodeCompletionFile(FileLoc))
PP->CodeCompleteNaturalLanguage();
else if (!isLexingRawMode() && !Features.AsmPreprocessor)
- Diag(BufferPtr, diag::err_unterminated_string);
+ Diag(BufferPtr, diag::warn_unterminated_string);
FormTokenWithChars(Result, CurPtr-1, tok::unknown);
return;
}
@@ -1059,7 +1241,7 @@ void Lexer::LexCharConstant(Token &Result, const char *CurPtr) {
if (C == 0 && PP && PP->isCodeCompletionFile(FileLoc))
PP->CodeCompleteNaturalLanguage();
else if (!isLexingRawMode() && !Features.AsmPreprocessor)
- Diag(BufferPtr, diag::err_unterminated_char);
+ Diag(BufferPtr, diag::warn_unterminated_char);
FormTokenWithChars(Result, CurPtr-1, tok::unknown);
return;
} else if (C == 0) {
@@ -2141,6 +2323,10 @@ LexNextToken:
// If this is actually a '<<<<<<<' version control conflict marker,
// recognize it as such and recover nicely.
goto LexNextToken;
+ } else if (Features.CUDA && After == '<') {
+ Kind = tok::lesslessless;
+ CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
+ SizeTmp2, Result);
} else {
CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
Kind = tok::lessless;
@@ -2172,6 +2358,10 @@ LexNextToken:
} else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
// If this is '>>>>>>>' and we're in a conflict marker, ignore it.
goto LexNextToken;
+ } else if (Features.CUDA && After == '>') {
+ Kind = tok::greatergreatergreater;
+ CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
+ SizeTmp2, Result);
} else {
CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
Kind = tok::greatergreater;
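The two CUDA hunks above let the lexer form single tok::lesslessless and tok::greatergreatergreater tokens when Features.CUDA is set. Purely for illustration (this CUDA source is invented, not part of the patch), this is the kernel-launch syntax those tokens represent:

// With Features.CUDA, '<<<' and '>>>' below each lex as one token instead
// of tok::lessless + tok::less (and tok::greatergreater + tok::greater).
__global__ void scale(float *Data, float Factor) {
  Data[threadIdx.x] *= Factor;
}

void launch(float *Data, float Factor) {
  scale<<<128, 256>>>(Data, Factor);   // 128 blocks of 256 threads each
}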