diff options
Diffstat (limited to 'contrib/llvm/tools/clang/lib/Lex/Lexer.cpp')
-rw-r--r-- | contrib/llvm/tools/clang/lib/Lex/Lexer.cpp | 230 |
1 files changed, 210 insertions, 20 deletions
diff --git a/contrib/llvm/tools/clang/lib/Lex/Lexer.cpp b/contrib/llvm/tools/clang/lib/Lex/Lexer.cpp index 917829b..b17198b 100644 --- a/contrib/llvm/tools/clang/lib/Lex/Lexer.cpp +++ b/contrib/llvm/tools/clang/lib/Lex/Lexer.cpp @@ -212,6 +212,109 @@ void Lexer::Stringify(llvm::SmallVectorImpl<char> &Str) { } } +//===----------------------------------------------------------------------===// +// Token Spelling +//===----------------------------------------------------------------------===// + +/// getSpelling() - Return the 'spelling' of this token. The spelling of a +/// token are the characters used to represent the token in the source file +/// after trigraph expansion and escaped-newline folding. In particular, this +/// wants to get the true, uncanonicalized, spelling of things like digraphs +/// UCNs, etc. +std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr, + const LangOptions &Features, bool *Invalid) { + assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); + + // If this token contains nothing interesting, return it directly. + bool CharDataInvalid = false; + const char* TokStart = SourceMgr.getCharacterData(Tok.getLocation(), + &CharDataInvalid); + if (Invalid) + *Invalid = CharDataInvalid; + if (CharDataInvalid) + return std::string(); + + if (!Tok.needsCleaning()) + return std::string(TokStart, TokStart+Tok.getLength()); + + std::string Result; + Result.reserve(Tok.getLength()); + + // Otherwise, hard case, relex the characters into the string. + for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength(); + Ptr != End; ) { + unsigned CharSize; + Result.push_back(Lexer::getCharAndSizeNoWarn(Ptr, CharSize, Features)); + Ptr += CharSize; + } + assert(Result.size() != unsigned(Tok.getLength()) && + "NeedsCleaning flag set on something that didn't need cleaning!"); + return Result; +} + +/// getSpelling - This method is used to get the spelling of a token into a +/// preallocated buffer, instead of as an std::string. The caller is required +/// to allocate enough space for the token, which is guaranteed to be at least +/// Tok.getLength() bytes long. The actual length of the token is returned. +/// +/// Note that this method may do two possible things: it may either fill in +/// the buffer specified with characters, or it may *change the input pointer* +/// to point to a constant buffer with the data already in it (avoiding a +/// copy). The caller is not allowed to modify the returned buffer pointer +/// if an internal buffer is returned. +unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, + const SourceManager &SourceMgr, + const LangOptions &Features, bool *Invalid) { + assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); + + const char *TokStart = 0; + // NOTE: this has to be checked *before* testing for an IdentifierInfo. + if (Tok.is(tok::raw_identifier)) + TokStart = Tok.getRawIdentifierData(); + else if (const IdentifierInfo *II = Tok.getIdentifierInfo()) { + // Just return the string from the identifier table, which is very quick. + Buffer = II->getNameStart(); + return II->getLength(); + } + + // NOTE: this can be checked even after testing for an IdentifierInfo. + if (Tok.isLiteral()) + TokStart = Tok.getLiteralData(); + + if (TokStart == 0) { + // Compute the start of the token in the input lexer buffer. + bool CharDataInvalid = false; + TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid); + if (Invalid) + *Invalid = CharDataInvalid; + if (CharDataInvalid) { + Buffer = ""; + return 0; + } + } + + // If this token contains nothing interesting, return it directly. + if (!Tok.needsCleaning()) { + Buffer = TokStart; + return Tok.getLength(); + } + + // Otherwise, hard case, relex the characters into the string. + char *OutBuf = const_cast<char*>(Buffer); + for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength(); + Ptr != End; ) { + unsigned CharSize; + *OutBuf++ = Lexer::getCharAndSizeNoWarn(Ptr, CharSize, Features); + Ptr += CharSize; + } + assert(unsigned(OutBuf-Buffer) != Tok.getLength() && + "NeedsCleaning flag set on something that didn't need cleaning!"); + + return OutBuf-Buffer; +} + + + static bool isWhitespace(unsigned char c); /// MeasureTokenLength - Relex the token at the specified location and return @@ -242,7 +345,8 @@ unsigned Lexer::MeasureTokenLength(SourceLocation Loc, return 0; // Create a lexer starting at the beginning of this token. - Lexer TheLexer(Loc, LangOpts, Buffer.begin(), StrData, Buffer.end()); + Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, + Buffer.begin(), StrData, Buffer.end()); TheLexer.SetCommentRetentionState(true); Token TheTok; TheLexer.LexFromRawLexer(TheTok); @@ -253,6 +357,9 @@ SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) { std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); + if (LocInfo.first.isInvalid()) + return Loc; + bool Invalid = false; llvm::StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); if (Invalid) @@ -261,6 +368,9 @@ SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc, // Back up from the current location until we hit the beginning of a line // (or the buffer). We'll relex from that point. const char *BufStart = Buffer.data(); + if (LocInfo.second >= Buffer.size()) + return Loc; + const char *StrData = BufStart+LocInfo.second; if (StrData[0] == '\n' || StrData[0] == '\r') return Loc; @@ -371,10 +481,9 @@ Lexer::ComputePreamble(const llvm::MemoryBuffer *Buffer, unsigned MaxLines) { // we don't have an identifier table available. Instead, just look at // the raw identifier to recognize and categorize preprocessor directives. TheLexer.LexFromRawLexer(TheTok); - if (TheTok.getKind() == tok::identifier && !TheTok.needsCleaning()) { - const char *IdStart = Buffer->getBufferStart() - + TheTok.getLocation().getRawEncoding() - 1; - llvm::StringRef Keyword(IdStart, TheTok.getLength()); + if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) { + llvm::StringRef Keyword(TheTok.getRawIdentifierData(), + TheTok.getLength()); PreambleDirectiveKind PDK = llvm::StringSwitch<PreambleDirectiveKind>(Keyword) .Case("include", PDK_Skipped) @@ -443,6 +552,83 @@ Lexer::ComputePreamble(const llvm::MemoryBuffer *Buffer, unsigned MaxLines) { : TheTok.isAtStartOfLine()); } + +/// AdvanceToTokenCharacter - Given a location that specifies the start of a +/// token, return a new location that specifies a character within the token. +SourceLocation Lexer::AdvanceToTokenCharacter(SourceLocation TokStart, + unsigned CharNo, + const SourceManager &SM, + const LangOptions &Features) { + // Figure out how many physical characters away the specified instantiation + // character is. This needs to take into consideration newlines and + // trigraphs. + bool Invalid = false; + const char *TokPtr = SM.getCharacterData(TokStart, &Invalid); + + // If they request the first char of the token, we're trivially done. + if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr))) + return TokStart; + + unsigned PhysOffset = 0; + + // The usual case is that tokens don't contain anything interesting. Skip + // over the uninteresting characters. If a token only consists of simple + // chars, this method is extremely fast. + while (Lexer::isObviouslySimpleCharacter(*TokPtr)) { + if (CharNo == 0) + return TokStart.getFileLocWithOffset(PhysOffset); + ++TokPtr, --CharNo, ++PhysOffset; + } + + // If we have a character that may be a trigraph or escaped newline, use a + // lexer to parse it correctly. + for (; CharNo; --CharNo) { + unsigned Size; + Lexer::getCharAndSizeNoWarn(TokPtr, Size, Features); + TokPtr += Size; + PhysOffset += Size; + } + + // Final detail: if we end up on an escaped newline, we want to return the + // location of the actual byte of the token. For example foo\<newline>bar + // advanced by 3 should return the location of b, not of \\. One compounding + // detail of this is that the escape may be made by a trigraph. + if (!Lexer::isObviouslySimpleCharacter(*TokPtr)) + PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr; + + return TokStart.getFileLocWithOffset(PhysOffset); +} + +/// \brief Computes the source location just past the end of the +/// token at this source location. +/// +/// This routine can be used to produce a source location that +/// points just past the end of the token referenced by \p Loc, and +/// is generally used when a diagnostic needs to point just after a +/// token where it expected something different that it received. If +/// the returned source location would not be meaningful (e.g., if +/// it points into a macro), this routine returns an invalid +/// source location. +/// +/// \param Offset an offset from the end of the token, where the source +/// location should refer to. The default offset (0) produces a source +/// location pointing just past the end of the token; an offset of 1 produces +/// a source location pointing to the last character in the token, etc. +SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset, + const SourceManager &SM, + const LangOptions &Features) { + if (Loc.isInvalid() || !Loc.isFileID()) + return SourceLocation(); + + unsigned Len = Lexer::MeasureTokenLength(Loc, SM, Features); + if (Len > Offset) + Len = Len - Offset; + else + return Loc; + + return AdvanceToTokenCharacter(Loc, Len, SM, Features); +} + //===----------------------------------------------------------------------===// // Character information. //===----------------------------------------------------------------------===// @@ -584,10 +770,8 @@ static inline bool isNumberBody(unsigned char c) { /// lexer buffer was all instantiated at a single point, perform the mapping. /// This is currently only used for _Pragma implementation, so it is the slow /// path of the hot getSourceLocation method. Do not allow it to be inlined. -static DISABLE_INLINE SourceLocation GetMappedTokenLoc(Preprocessor &PP, - SourceLocation FileLoc, - unsigned CharNo, - unsigned TokLen); +static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc( + Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen); static SourceLocation GetMappedTokenLoc(Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen) { @@ -869,19 +1053,17 @@ void Lexer::LexIdentifier(Token &Result, const char *CurPtr) { if (C != '\\' && C != '?' && (C != '$' || !Features.DollarIdents)) { FinishIdentifier: const char *IdStart = BufferPtr; - FormTokenWithChars(Result, CurPtr, tok::identifier); + FormTokenWithChars(Result, CurPtr, tok::raw_identifier); + Result.setRawIdentifierData(IdStart); // If we are in raw mode, return this identifier raw. There is no need to // look up identifier information or attempt to macro expand it. - if (LexingRawMode) return; - - // Fill in Result.IdentifierInfo, looking up the identifier in the - // identifier table. - IdentifierInfo *II = PP->LookUpIdentifierInfo(Result, IdStart); + if (LexingRawMode) + return; - // Change the kind of this identifier to the appropriate token kind, e.g. - // turning "for" into a keyword. - Result.setKind(II->getTokenID()); + // Fill in Result.IdentifierInfo and update the token kind, + // looking up the identifier in the identifier table. + IdentifierInfo *II = PP->LookUpIdentifierInfo(Result); // Finally, now that we know we have an identifier, pass this off to the // preprocessor, which may macro expand it or something. @@ -980,7 +1162,7 @@ void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) { if (C == 0 && PP && PP->isCodeCompletionFile(FileLoc)) PP->CodeCompleteNaturalLanguage(); else if (!isLexingRawMode() && !Features.AsmPreprocessor) - Diag(BufferPtr, diag::err_unterminated_string); + Diag(BufferPtr, diag::warn_unterminated_string); FormTokenWithChars(Result, CurPtr-1, tok::unknown); return; } @@ -1059,7 +1241,7 @@ void Lexer::LexCharConstant(Token &Result, const char *CurPtr) { if (C == 0 && PP && PP->isCodeCompletionFile(FileLoc)) PP->CodeCompleteNaturalLanguage(); else if (!isLexingRawMode() && !Features.AsmPreprocessor) - Diag(BufferPtr, diag::err_unterminated_char); + Diag(BufferPtr, diag::warn_unterminated_char); FormTokenWithChars(Result, CurPtr-1, tok::unknown); return; } else if (C == 0) { @@ -2141,6 +2323,10 @@ LexNextToken: // If this is actually a '<<<<<<<' version control conflict marker, // recognize it as such and recover nicely. goto LexNextToken; + } else if (Features.CUDA && After == '<') { + Kind = tok::lesslessless; + CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), + SizeTmp2, Result); } else { CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); Kind = tok::lessless; @@ -2172,6 +2358,10 @@ LexNextToken: } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) { // If this is '>>>>>>>' and we're in a conflict marker, ignore it. goto LexNextToken; + } else if (Features.CUDA && After == '>') { + Kind = tok::greatergreatergreater; + CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), + SizeTmp2, Result); } else { CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); Kind = tok::greatergreater; |