diff options
author | rdivacky <rdivacky@FreeBSD.org> | 2009-10-14 18:03:49 +0000 |
---|---|---|
committer | rdivacky <rdivacky@FreeBSD.org> | 2009-10-14 18:03:49 +0000 |
commit | 9092c3e0fa01f3139b016d05d267a89e3b07747a (patch) | |
tree | 137ebebcae16fb0ce7ab4af456992bbd8d22fced /lib/Lex/Lexer.cpp | |
parent | 4981926bf654fe5a2c3893f24ca44106b217e71e (diff) | |
download | FreeBSD-src-9092c3e0fa01f3139b016d05d267a89e3b07747a.zip FreeBSD-src-9092c3e0fa01f3139b016d05d267a89e3b07747a.tar.gz |
Update clang to r84119.
Diffstat (limited to 'lib/Lex/Lexer.cpp')
-rw-r--r-- | lib/Lex/Lexer.cpp | 436 |
1 files changed, 266 insertions, 170 deletions
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp index 6f1043a..c8b9a5d 100644 --- a/lib/Lex/Lexer.cpp +++ b/lib/Lex/Lexer.cpp @@ -39,7 +39,7 @@ static void InitCharacterInfo(); // Token Class Implementation //===----------------------------------------------------------------------===// -/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier. +/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier. bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const { if (IdentifierInfo *II = getIdentifierInfo()) return II->getObjCKeywordID() == objcKey; @@ -57,35 +57,36 @@ tok::ObjCKeywordKind Token::getObjCKeywordID() const { // Lexer Class Implementation //===----------------------------------------------------------------------===// -void Lexer::InitLexer(const char *BufStart, const char *BufPtr, +void Lexer::InitLexer(const char *BufStart, const char *BufPtr, const char *BufEnd) { InitCharacterInfo(); - + BufferStart = BufStart; BufferPtr = BufPtr; BufferEnd = BufEnd; - + assert(BufEnd[0] == 0 && "We assume that the input buffer has a null character at the end" " to simplify lexing!"); - - Is_PragmaLexer = false; + Is_PragmaLexer = false; + IsEofCodeCompletion = false; + // Start of the file is a start of line. IsAtStartOfLine = true; - + // We are not after parsing a #. ParsingPreprocessorDirective = false; - + // We are not after parsing #include. ParsingFilename = false; - + // We are not in raw mode. Raw mode disables diagnostics and interpretation // of tokens (e.g. identifiers, thus disabling macro expansion). It is used // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block // or otherwise skipping over tokens. LexingRawMode = false; - + // Default to not keeping comments. ExtendedTokenMode = 0; } @@ -98,14 +99,18 @@ Lexer::Lexer(FileID FID, Preprocessor &PP) : PreprocessorLexer(&PP, FID), FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)), Features(PP.getLangOptions()) { - + const llvm::MemoryBuffer *InputFile = PP.getSourceManager().getBuffer(FID); InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(), InputFile->getBufferEnd()); - + // Default to keeping comments if the preprocessor wants them. SetCommentRetentionState(PP.getCommentRetentionState()); + + // If the input file is truncated, the EOF is a code-completion token. + if (PP.getSourceManager().isTruncatedFile(FID)) + IsEofCodeCompletion = true; } /// Lexer constructor - Create a new raw lexer object. This object is only @@ -116,7 +121,7 @@ Lexer::Lexer(SourceLocation fileloc, const LangOptions &features, : FileLoc(fileloc), Features(features) { InitLexer(BufStart, BufPtr, BufEnd); - + // We *are* in raw mode. LexingRawMode = true; } @@ -128,9 +133,9 @@ Lexer::Lexer(FileID FID, const SourceManager &SM, const LangOptions &features) : FileLoc(SM.getLocForStartOfFile(FID)), Features(features) { const llvm::MemoryBuffer *FromFile = SM.getBuffer(FID); - InitLexer(FromFile->getBufferStart(), FromFile->getBufferStart(), + InitLexer(FromFile->getBufferStart(), FromFile->getBufferStart(), FromFile->getBufferEnd()); - + // We *are* in raw mode. LexingRawMode = true; } @@ -150,7 +155,7 @@ Lexer::Lexer(FileID FID, const SourceManager &SM, const LangOptions &features) /// interface that could handle this stuff. This would pull GetMappedTokenLoc /// out of the critical path of the lexer! /// -Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, +Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, SourceLocation InstantiationLocStart, SourceLocation InstantiationLocEnd, unsigned TokLen, Preprocessor &PP) { @@ -159,12 +164,12 @@ Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, // Create the lexer as if we were going to lex the file normally. FileID SpellingFID = SM.getFileID(SpellingLoc); Lexer *L = new Lexer(SpellingFID, PP); - + // Now that the lexer is created, change the start/end locations so that we // just lex the subsection of the file that we want. This is lexing from a // scratch buffer. const char *StrData = SM.getCharacterData(SpellingLoc); - + L->BufferPtr = StrData; L->BufferEnd = StrData+TokLen; assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!"); @@ -174,11 +179,11 @@ Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, L->FileLoc = SM.createInstantiationLoc(SM.getLocForStartOfFile(SpellingFID), InstantiationLocStart, InstantiationLocEnd, TokLen); - + // Ensure that the lexer thinks it is inside a directive, so that end \n will // return an EOM token. L->ParsingPreprocessorDirective = true; - + // This lexer really is for _Pragma. L->Is_PragmaLexer = true; return L; @@ -220,7 +225,7 @@ unsigned Lexer::MeasureTokenLength(SourceLocation Loc, const LangOptions &LangOpts) { // TODO: this could be special cased for common tokens like identifiers, ')', // etc to make this faster, if it mattered. Just look at StrData[0] to handle - // all obviously single-char tokens. This could use + // all obviously single-char tokens. This could use // Lexer::isObviouslySimpleCharacter for example to handle identifiers or // something. @@ -233,6 +238,7 @@ unsigned Lexer::MeasureTokenLength(SourceLocation Loc, // Create a lexer starting at the beginning of this token. Lexer TheLexer(Loc, LangOpts, Buffer.first, StrData, Buffer.second); + TheLexer.SetCommentRetentionState(true); Token TheTok; TheLexer.LexFromRawLexer(TheTok); return TheTok.getLength(); @@ -242,8 +248,6 @@ unsigned Lexer::MeasureTokenLength(SourceLocation Loc, // Character information. //===----------------------------------------------------------------------===// -static unsigned char CharInfo[256]; - enum { CHAR_HORZ_WS = 0x01, // ' ', '\t', '\f', '\v'. Note, no '\0' CHAR_VERT_WS = 0x02, // '\r', '\n' @@ -253,25 +257,98 @@ enum { CHAR_PERIOD = 0x20 // . }; +// Statically initialize CharInfo table based on ASCII character set +// Reference: FreeBSD 7.2 /usr/share/misc/ascii +static const unsigned char CharInfo[256] = +{ +// 0 NUL 1 SOH 2 STX 3 ETX +// 4 EOT 5 ENQ 6 ACK 7 BEL + 0 , 0 , 0 , 0 , + 0 , 0 , 0 , 0 , +// 8 BS 9 HT 10 NL 11 VT +//12 NP 13 CR 14 SO 15 SI + 0 , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS, + CHAR_HORZ_WS, CHAR_VERT_WS, 0 , 0 , +//16 DLE 17 DC1 18 DC2 19 DC3 +//20 DC4 21 NAK 22 SYN 23 ETB + 0 , 0 , 0 , 0 , + 0 , 0 , 0 , 0 , +//24 CAN 25 EM 26 SUB 27 ESC +//28 FS 29 GS 30 RS 31 US + 0 , 0 , 0 , 0 , + 0 , 0 , 0 , 0 , +//32 SP 33 ! 34 " 35 # +//36 $ 37 % 38 & 39 ' + CHAR_HORZ_WS, 0 , 0 , 0 , + 0 , 0 , 0 , 0 , +//40 ( 41 ) 42 * 43 + +//44 , 45 - 46 . 47 / + 0 , 0 , 0 , 0 , + 0 , 0 , CHAR_PERIOD , 0 , +//48 0 49 1 50 2 51 3 +//52 4 53 5 54 6 55 7 + CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , + CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , +//56 8 57 9 58 : 59 ; +//60 < 61 = 62 > 63 ? + CHAR_NUMBER , CHAR_NUMBER , 0 , 0 , + 0 , 0 , 0 , 0 , +//64 @ 65 A 66 B 67 C +//68 D 69 E 70 F 71 G + 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , + CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , +//72 H 73 I 74 J 75 K +//76 L 77 M 78 N 79 O + CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , + CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , +//80 P 81 Q 82 R 83 S +//84 T 85 U 86 V 87 W + CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , + CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , +//88 X 89 Y 90 Z 91 [ +//92 \ 93 ] 94 ^ 95 _ + CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 0 , + 0 , 0 , 0 , CHAR_UNDER , +//96 ` 97 a 98 b 99 c +//100 d 101 e 102 f 103 g + 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , + CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , +//104 h 105 i 106 j 107 k +//108 l 109 m 110 n 111 o + CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , + CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , +//112 p 113 q 114 r 115 s +//116 t 117 u 118 v 119 w + CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , + CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , +//120 x 121 y 122 z 123 { +//124 | 125 } 126 ~ 127 DEL + CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 0 , + 0 , 0 , 0 , 0 +}; + static void InitCharacterInfo() { static bool isInited = false; if (isInited) return; - isInited = true; - - // Intiialize the CharInfo table. - // TODO: statically initialize this. - CharInfo[(int)' '] = CharInfo[(int)'\t'] = - CharInfo[(int)'\f'] = CharInfo[(int)'\v'] = CHAR_HORZ_WS; - CharInfo[(int)'\n'] = CharInfo[(int)'\r'] = CHAR_VERT_WS; - - CharInfo[(int)'_'] = CHAR_UNDER; - CharInfo[(int)'.'] = CHAR_PERIOD; - for (unsigned i = 'a'; i <= 'z'; ++i) - CharInfo[i] = CharInfo[i+'A'-'a'] = CHAR_LETTER; + // check the statically-initialized CharInfo table + assert(CHAR_HORZ_WS == CharInfo[(int)' ']); + assert(CHAR_HORZ_WS == CharInfo[(int)'\t']); + assert(CHAR_HORZ_WS == CharInfo[(int)'\f']); + assert(CHAR_HORZ_WS == CharInfo[(int)'\v']); + assert(CHAR_VERT_WS == CharInfo[(int)'\n']); + assert(CHAR_VERT_WS == CharInfo[(int)'\r']); + assert(CHAR_UNDER == CharInfo[(int)'_']); + assert(CHAR_PERIOD == CharInfo[(int)'.']); + for (unsigned i = 'a'; i <= 'z'; ++i) { + assert(CHAR_LETTER == CharInfo[i]); + assert(CHAR_LETTER == CharInfo[i+'A'-'a']); + } for (unsigned i = '0'; i <= '9'; ++i) - CharInfo[i] = CHAR_NUMBER; + assert(CHAR_NUMBER == CharInfo[i]); + isInited = true; } + /// isIdentifierBody - Return true if this is the body character of an /// identifier, which is [a-zA-Z0-9_]. static inline bool isIdentifierBody(unsigned char c) { @@ -294,7 +371,7 @@ static inline bool isWhitespace(unsigned char c) { /// isNumberBody - Return true if this is the body character of an /// preprocessing number, which is [a-zA-Z0-9_.]. static inline bool isNumberBody(unsigned char c) { - return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ? + return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ? true : false; } @@ -315,22 +392,22 @@ static SourceLocation GetMappedTokenLoc(Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen) { assert(FileLoc.isMacroID() && "Must be an instantiation"); - + // Otherwise, we're lexing "mapped tokens". This is used for things like // _Pragma handling. Combine the instantiation location of FileLoc with the // spelling location. SourceManager &SM = PP.getSourceManager(); - + // Create a new SLoc which is expanded from Instantiation(FileLoc) but whose // characters come from spelling(FileLoc)+Offset. SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc); SpellingLoc = SpellingLoc.getFileLocWithOffset(CharNo); - + // Figure out the expansion loc range, which is the range covered by the // original _Pragma(...) sequence. std::pair<SourceLocation,SourceLocation> II = SM.getImmediateInstantiationRange(FileLoc); - + return SM.createInstantiationLoc(SpellingLoc, II.first, II.second, TokLen); } @@ -346,7 +423,7 @@ SourceLocation Lexer::getSourceLocation(const char *Loc, unsigned CharNo = Loc-BufferStart; if (FileLoc.isFileID()) return FileLoc.getFileLocWithOffset(CharNo); - + // Otherwise, this is the _Pragma lexer case, which pretends that all of the // tokens are lexed from where the _Pragma was defined. assert(PP && "This doesn't work on raw lexers"); @@ -387,13 +464,13 @@ static char GetTrigraphCharForLetter(char Letter) { static char DecodeTrigraphChar(const char *CP, Lexer *L) { char Res = GetTrigraphCharForLetter(*CP); if (!Res || !L) return Res; - + if (!L->getFeatures().Trigraphs) { if (!L->isLexingRawMode()) L->Diag(CP-2, diag::trigraph_ignored); return 0; } - + if (!L->isLexingRawMode()) L->Diag(CP-2, diag::trigraph_converted) << std::string()+Res; return Res; @@ -401,12 +478,12 @@ static char DecodeTrigraphChar(const char *CP, Lexer *L) { /// getEscapedNewLineSize - Return the size of the specified escaped newline, /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a -/// trigraph equivalent on entry to this function. +/// trigraph equivalent on entry to this function. unsigned Lexer::getEscapedNewLineSize(const char *Ptr) { unsigned Size = 0; while (isWhitespace(Ptr[Size])) { ++Size; - + if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r') continue; @@ -414,10 +491,10 @@ unsigned Lexer::getEscapedNewLineSize(const char *Ptr) { if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') && Ptr[Size-1] != Ptr[Size]) ++Size; - + return Size; - } - + } + // Not an escaped newline, must be a \t or something else. return 0; } @@ -438,7 +515,7 @@ const char *Lexer::SkipEscapedNewLines(const char *P) { } else { return P; } - + unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape); if (NewLineSize == 0) return P; P = AfterEscape+NewLineSize; @@ -472,7 +549,7 @@ char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size, Slash: // Common case, backslash-char where the char is not whitespace. if (!isWhitespace(Ptr[0])) return '\\'; - + // See if we have optional whitespace characters between the slash and // newline. if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { @@ -482,18 +559,18 @@ Slash: // Warn if there was whitespace between the backslash and newline. if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode()) Diag(Ptr, diag::backslash_newline_space); - + // Found backslash<whitespace><newline>. Parse the char after it. Size += EscapedNewLineSize; Ptr += EscapedNewLineSize; // Use slow version to accumulate a correct size field. return getCharAndSizeSlow(Ptr, Size, Tok); } - + // Otherwise, this is not an escaped newline, just return the slash. return '\\'; } - + // If this is a trigraph, process it. if (Ptr[0] == '?' && Ptr[1] == '?') { // If this is actually a legal trigraph (not something like "??x"), emit @@ -508,7 +585,7 @@ Slash: return C; } } - + // If this is neither, return a single character. ++Size; return *Ptr; @@ -530,21 +607,21 @@ char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size, Slash: // Common case, backslash-char where the char is not whitespace. if (!isWhitespace(Ptr[0])) return '\\'; - + // See if we have optional whitespace characters followed by a newline. if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { // Found backslash<whitespace><newline>. Parse the char after it. Size += EscapedNewLineSize; Ptr += EscapedNewLineSize; - + // Use slow version to accumulate a correct size field. return getCharAndSizeSlowNoWarn(Ptr, Size, Features); } - + // Otherwise, this is not an escaped newline, just return the slash. return '\\'; } - + // If this is a trigraph, process it. if (Features.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') { // If this is actually a legal trigraph (not something like "??x"), return @@ -556,7 +633,7 @@ Slash: return C; } } - + // If this is neither, return a single character. ++Size; return *Ptr; @@ -582,34 +659,34 @@ void Lexer::LexIdentifier(Token &Result, const char *CurPtr) { FinishIdentifier: const char *IdStart = BufferPtr; FormTokenWithChars(Result, CurPtr, tok::identifier); - + // If we are in raw mode, return this identifier raw. There is no need to // look up identifier information or attempt to macro expand it. if (LexingRawMode) return; - + // Fill in Result.IdentifierInfo, looking up the identifier in the // identifier table. IdentifierInfo *II = PP->LookUpIdentifierInfo(Result, IdStart); - + // Change the kind of this identifier to the appropriate token kind, e.g. // turning "for" into a keyword. Result.setKind(II->getTokenID()); - + // Finally, now that we know we have an identifier, pass this off to the // preprocessor, which may macro expand it or something. if (II->isHandleIdentifierCase()) PP->HandleIdentifier(Result); return; } - + // Otherwise, $,\,? in identifier found. Enter slower path. - + C = getCharAndSize(CurPtr, Size); while (1) { if (C == '$') { // If we hit a $ and they are not supported in identifiers, we are done. if (!Features.DollarIdents) goto FinishIdentifier; - + // Otherwise, emit a diagnostic and continue. if (!isLexingRawMode()) Diag(CurPtr, diag::ext_dollar_in_identifier); @@ -645,7 +722,7 @@ void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { PrevCh = C; C = getCharAndSize(CurPtr, Size); } - + // If we fell out, check for a sign, due to 1e+12. If we have one, continue. if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); @@ -653,7 +730,7 @@ void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { // If we have a hex FP constant, continue. if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); - + // Update the location of token as well as BufferPtr. const char *TokStart = BufferPtr; FormTokenWithChars(Result, CurPtr, tok::numeric_constant); @@ -664,7 +741,7 @@ void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { /// either " or L". void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) { const char *NulCharacter = 0; // Does this string contain the \0 character? - + char C = getAndAdvanceChar(CurPtr, Result); while (C != '"') { // Skip escaped characters. @@ -682,7 +759,7 @@ void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) { } C = getAndAdvanceChar(CurPtr, Result); } - + // If a nul character existed in the string, warn about it. if (NulCharacter && !isLexingRawMode()) Diag(NulCharacter, diag::null_in_string); @@ -716,11 +793,11 @@ void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) { } C = getAndAdvanceChar(CurPtr, Result); } - + // If a nul character existed in the string, warn about it. if (NulCharacter && !isLexingRawMode()) Diag(NulCharacter, diag::null_in_string); - + // Update the location of token as well as BufferPtr. const char *TokStart = BufferPtr; FormTokenWithChars(Result, CurPtr, tok::angle_string_literal); @@ -745,7 +822,7 @@ void Lexer::LexCharConstant(Token &Result, const char *CurPtr) { // FIXME: UCN's. C = getAndAdvanceChar(CurPtr, Result); } - + if (C && C != '\n' && C != '\r' && CurPtr[0] == '\'') { ++CurPtr; } else { @@ -767,7 +844,7 @@ void Lexer::LexCharConstant(Token &Result, const char *CurPtr) { C = getAndAdvanceChar(CurPtr, Result); } while (C != '\''); } - + if (NulCharacter && !isLexingRawMode()) Diag(NulCharacter, diag::null_in_char); @@ -789,17 +866,17 @@ bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) { // Skip horizontal whitespace very aggressively. while (isHorizontalWhitespace(Char)) Char = *++CurPtr; - + // Otherwise if we have something other than whitespace, we're done. if (Char != '\n' && Char != '\r') break; - + if (ParsingPreprocessorDirective) { // End of preprocessor directive line, let LexTokenInternal handle this. BufferPtr = CurPtr; return false; } - + // ok, but handle newline. // The returned token is at the start of the line. Result.setFlag(Token::StartOfLine); @@ -818,7 +895,7 @@ bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) { FormTokenWithChars(Result, CurPtr, tok::unknown); return true; } - + BufferPtr = CurPtr; return false; } @@ -832,12 +909,12 @@ bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) { // extension warning. if (!Features.BCPLComment && !isLexingRawMode()) { Diag(BufferPtr, diag::ext_bcpl_comment); - + // Mark them enabled so we only emit one warning for this translation // unit. Features.BCPLComment = true; } - + // Scan over the body of the comment. The common case, when scanning, is that // the comment contains normal ascii characters with nothing interesting in // them. As such, optimize for this case with the inner loop. @@ -847,7 +924,7 @@ bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) { // FIXME: Speedup BCPL comment lexing. Just scan for a \n or \r character. // If we find a \n character, scan backwards, checking to see if it's an // escaped newline, like we do for block comments. - + // Skip over characters in the fast loop. while (C != 0 && // Potentially EOF. C != '\\' && // Potentially escaped newline. @@ -858,7 +935,7 @@ bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) { // If this is a newline, we're done. if (C == '\n' || C == '\r') break; // Found the newline? Break out! - + // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to // properly decode the character. Read it in raw mode to avoid emitting // diagnostics about things like trigraphs. If we see an escaped newline, @@ -876,7 +953,7 @@ bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) { --CurPtr; C = 'x'; // doesn't matter what this is. } - + // If we read multiple characters, and one of those characters was a \r or // \n, then we had an escaped newline within the comment. Emit diagnostic // unless the next line is also a // comment. @@ -892,21 +969,21 @@ bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) { if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/') break; } - + if (!isLexingRawMode()) Diag(OldPtr-1, diag::ext_multi_line_bcpl_comment); break; } } - + if (CurPtr == BufferEnd+1) { --CurPtr; break; } } while (C != '\n' && C != '\r'); // Found but did not consume the newline. if (PP) - PP->HandleComment(SourceRange(getSourceLocation(BufferPtr), + PP->HandleComment(SourceRange(getSourceLocation(BufferPtr), getSourceLocation(CurPtr))); - + // If we are returning comments as tokens, return this comment as a token. if (inKeepCommentMode()) return SaveBCPLComment(Result, CurPtr); @@ -917,14 +994,14 @@ bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) { BufferPtr = CurPtr; return false; } - + // Otherwise, eat the \n character. We don't care if this is a \n\r or // \r\n sequence. This is an efficiency hack (because we know the \n can't // contribute to another token), it isn't needed for correctness. Note that // this is ok even in KeepWhitespaceMode, because we would have returned the /// comment above in that mode. ++CurPtr; - + // The next returned token is at the start of the line. Result.setFlag(Token::StartOfLine); // No leading whitespace seen so far. @@ -939,17 +1016,17 @@ bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) { // If we're not in a preprocessor directive, just return the // comment // directly. FormTokenWithChars(Result, CurPtr, tok::comment); - + if (!ParsingPreprocessorDirective) return true; - + // If this BCPL-style comment is in a macro definition, transmogrify it into // a C-style block comment. std::string Spelling = PP->getSpelling(Result); assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not bcpl comment?"); Spelling[1] = '*'; // Change prefix to "/*". Spelling += "*/"; // add suffix. - + Result.setKind(tok::comment); PP->CreateString(&Spelling[0], Spelling.size(), Result, Result.getLocation()); @@ -959,13 +1036,13 @@ bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) { /// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline /// character (either \n or \r) is part of an escaped newline sequence. Issue a /// diagnostic if so. We know that the newline is inside of a block comment. -static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, +static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L) { assert(CurPtr[0] == '\n' || CurPtr[0] == '\r'); - + // Back up off the newline. --CurPtr; - + // If this is a two-character newline sequence, skip the other character. if (CurPtr[0] == '\n' || CurPtr[0] == '\r') { // \n\n or \r\r -> not escaped newline. @@ -974,7 +1051,7 @@ static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, // \n\r or \r\n -> skip the newline. --CurPtr; } - + // If we have horizontal whitespace, skip over it. We allow whitespace // between the slash and newline. bool HasSpace = false; @@ -982,7 +1059,7 @@ static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, --CurPtr; HasSpace = true; } - + // If we have a slash, we know this is an escaped newline. if (*CurPtr == '\\') { if (CurPtr[-1] != '*') return false; @@ -991,7 +1068,7 @@ static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' || CurPtr[-3] != '*') return false; - + // This is the trigraph ending the comment. Emit a stern warning! CurPtr -= 2; @@ -1005,15 +1082,15 @@ static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, if (!L->isLexingRawMode()) L->Diag(CurPtr, diag::trigraph_ends_block_comment); } - + // Warn about having an escaped newline between the */ characters. if (!L->isLexingRawMode()) L->Diag(CurPtr, diag::escaped_newline_block_comment_end); - + // If there was space between the backslash and newline, warn about it. if (HasSpace && !L->isLexingRawMode()) L->Diag(CurPtr, diag::backslash_newline_space); - + return true; } @@ -1049,23 +1126,23 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) { if (!isLexingRawMode()) Diag(BufferPtr, diag::err_unterminated_block_comment); --CurPtr; - + // KeepWhitespaceMode should return this broken comment as a token. Since // it isn't a well formed comment, just return it as an 'unknown' token. if (isKeepWhitespaceMode()) { FormTokenWithChars(Result, CurPtr, tok::unknown); return true; } - + BufferPtr = CurPtr; return false; } - + // Check to see if the first character after the '/*' is another /. If so, // then this slash does not end the block comment, it is part of it. if (C == '/') C = *CurPtr++; - + while (1) { // Skip over all non-interesting characters until we find end of buffer or a // (probably ending) '/' character. @@ -1073,7 +1150,7 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) { // While not aligned to a 16-byte boundary. while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0) C = *CurPtr++; - + if (C == '/') goto FoundSlash; #ifdef __SSE2__ @@ -1084,13 +1161,13 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) { CurPtr += 16; #elif __ALTIVEC__ __vector unsigned char Slashes = { - '/', '/', '/', '/', '/', '/', '/', '/', + '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/', '/' }; while (CurPtr+16 <= BufferEnd && !vec_any_eq(*(vector unsigned char*)CurPtr, Slashes)) CurPtr += 16; -#else +#else // Scan for '/' quickly. Many block comments are very large. while (CurPtr[0] != '/' && CurPtr[1] != '/' && @@ -1100,20 +1177,20 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) { CurPtr += 4; } #endif - + // It has to be one of the bytes scanned, increment to it and read one. C = *CurPtr++; } - + // Loop to scan the remainder. while (C != '/' && C != '\0') C = *CurPtr++; - + FoundSlash: if (C == '/') { if (CurPtr[-2] == '*') // We found the final */. We're done! break; - + if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) { if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) { // We found the final */, though it had an escaped newline between the @@ -1135,22 +1212,22 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) { // after the /*, but this would involve lexing a lot of what really is the // comment, which surely would confuse the parser. --CurPtr; - + // KeepWhitespaceMode should return this broken comment as a token. Since // it isn't a well formed comment, just return it as an 'unknown' token. if (isKeepWhitespaceMode()) { FormTokenWithChars(Result, CurPtr, tok::unknown); return true; } - + BufferPtr = CurPtr; return false; } C = *CurPtr++; } - - if (PP) - PP->HandleComment(SourceRange(getSourceLocation(BufferPtr), + + if (PP) + PP->HandleComment(SourceRange(getSourceLocation(BufferPtr), getSourceLocation(CurPtr))); // If we are returning comments as tokens, return this comment as a token. @@ -1208,11 +1285,11 @@ std::string Lexer::ReadToEndOfLine() { // Okay, we found the end of the line. First, back up past the \0, \r, \n. assert(CurPtr[-1] == Char && "Trigraphs for newline?"); BufferPtr = CurPtr-1; - + // Next, lex the character, which should handle the EOM transition. Lex(Tmp); assert(Tmp.is(tok::eom) && "Unexpected token!"); - + // Finally, we're done, return the string we found. return Result; } @@ -1232,12 +1309,12 @@ bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { ParsingPreprocessorDirective = false; // Update the location of token as well as BufferPtr. FormTokenWithChars(Result, CurPtr, tok::eom); - + // Restore comment saving mode, in case it was disabled for directive. SetCommentRetentionState(PP->getCommentRetentionState()); return true; // Have a token. - } - + } + // If we are in raw mode, return this event as an EOF token. Let the caller // that put us in raw mode handle the event. if (isLexingRawMode()) { @@ -1246,23 +1323,44 @@ bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { FormTokenWithChars(Result, BufferEnd, tok::eof); return true; } - - // Otherwise, issue diagnostics for unterminated #if and missing newline. + // Otherwise, check if we are code-completing, then issue diagnostics for + // unterminated #if and missing newline. + + if (IsEofCodeCompletion) { + bool isIntendedFile = true; + if (PP && FileLoc.isFileID()) { + SourceManager &SM = PP->getSourceManager(); + isIntendedFile = SM.isTruncatedFile(SM.getFileID(FileLoc)); + } + + if (isIntendedFile) { + // We're at the end of the file, but we've been asked to consider the + // end of the file to be a code-completion token. Return the + // code-completion token. + Result.startToken(); + FormTokenWithChars(Result, CurPtr, tok::code_completion); + + // Only do the eof -> code_completion translation once. + IsEofCodeCompletion = false; + return true; + } + } + // If we are in a #if directive, emit an error. while (!ConditionalStack.empty()) { PP->Diag(ConditionalStack.back().IfLoc, diag::err_pp_unterminated_conditional); ConditionalStack.pop_back(); } - + // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue // a pedwarn. if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) Diag(BufferEnd, diag::ext_no_newline_eof) << CodeModificationHint::CreateInsertion(getSourceLocation(BufferEnd), "\n"); - + BufferPtr = CurPtr; // Finally, let the preprocessor handle this. @@ -1275,27 +1373,27 @@ bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { /// lexer. unsigned Lexer::isNextPPTokenLParen() { assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?"); - + // Switch to 'skipping' mode. This will ensure that we can lex a token // without emitting diagnostics, disables macro expansion, and will cause EOF // to return an EOF token instead of popping the include stack. LexingRawMode = true; - + // Save state that can be changed while lexing so that we can restore it. const char *TmpBufferPtr = BufferPtr; bool inPPDirectiveMode = ParsingPreprocessorDirective; - + Token Tok; Tok.startToken(); LexTokenInternal(Tok); - + // Restore state that may have changed. BufferPtr = TmpBufferPtr; ParsingPreprocessorDirective = inPPDirectiveMode; - + // Restore the lexer back to non-skipping mode. LexingRawMode = false; - + if (Tok.is(tok::eof)) return 2; return Tok.is(tok::l_paren); @@ -1304,17 +1402,15 @@ unsigned Lexer::isNextPPTokenLParen() { /// LexTokenInternal - This implements a simple C family lexer. It is an /// extremely performance critical piece of code. This assumes that the buffer -/// has a null character at the end of the file. Return true if an error -/// occurred and compilation should terminate, false if normal. This returns a -/// preprocessing token, not a normal token, as such, it is an internal -/// interface. It assumes that the Flags of result have been cleared before -/// calling this. +/// has a null character at the end of the file. This returns a preprocessing +/// token, not a normal token, as such, it is an internal interface. It assumes +/// that the Flags of result have been cleared before calling this. void Lexer::LexTokenInternal(Token &Result) { LexNextToken: // New token, can't need cleaning yet. Result.clearFlag(Token::NeedsCleaning); Result.setIdentifierInfo(0); - + // CurPtr - Cache BufferPtr in an automatic variable. const char *CurPtr = BufferPtr; @@ -1323,7 +1419,7 @@ LexNextToken: ++CurPtr; while ((*CurPtr == ' ') || (*CurPtr == '\t')) ++CurPtr; - + // If we are keeping whitespace and other tokens, just return what we just // skipped. The next lexer invocation will return the token after the // whitespace. @@ -1331,17 +1427,17 @@ LexNextToken: FormTokenWithChars(Result, CurPtr, tok::unknown); return; } - + BufferPtr = CurPtr; Result.setFlag(Token::LeadingSpace); } - + unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below. - + // Read a character, advancing over it. char Char = getAndAdvanceChar(CurPtr, Result); tok::TokenKind Kind; - + switch (Char) { case 0: // Null. // Found end of file? @@ -1354,13 +1450,13 @@ LexNextToken: assert(PPCache && "Raw buffer::LexEndOfFile should return a token"); return PPCache->Lex(Result); } - + if (!isLexingRawMode()) Diag(CurPtr-1, diag::null_in_file); Result.setFlag(Token::LeadingSpace); if (SkipWhitespace(Result, CurPtr)) return; // KeepWhitespaceMode - + goto LexNextToken; // GCC isn't tail call eliminating. case '\n': case '\r': @@ -1369,13 +1465,13 @@ LexNextToken: if (ParsingPreprocessorDirective) { // Done parsing the "line". ParsingPreprocessorDirective = false; - + // Restore comment saving mode, in case it was disabled for directive. SetCommentRetentionState(PP->getCommentRetentionState()); - + // Since we consumed a newline, we are back at the start of a line. IsAtStartOfLine = true; - + Kind = tok::eom; break; } @@ -1383,7 +1479,7 @@ LexNextToken: Result.setFlag(Token::StartOfLine); // No leading whitespace seen so far. Result.clearFlag(Token::LeadingSpace); - + if (SkipWhitespace(Result, CurPtr)) return; // KeepWhitespaceMode goto LexNextToken; // GCC isn't tail call eliminating. @@ -1398,7 +1494,7 @@ LexNextToken: SkipIgnoredUnits: CurPtr = BufferPtr; - + // If the next token is obviously a // or /* */ comment, skip it efficiently // too (without going through the big switch stmt). if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() && @@ -1420,7 +1516,7 @@ LexNextToken: // Notify MIOpt that we read a non-whitespace/non-comment token. MIOpt.ReadToken(); return LexNumericConstant(Result, CurPtr); - + case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz"). // Notify MIOpt that we read a non-whitespace/non-comment token. MIOpt.ReadToken(); @@ -1435,7 +1531,7 @@ LexNextToken: if (Char == '\'') return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); // FALL THROUGH, treating L like the start of an identifier. - + // C99 6.4.2: Identifiers. case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N': @@ -1458,10 +1554,10 @@ LexNextToken: MIOpt.ReadToken(); return LexIdentifier(Result, CurPtr); } - + Kind = tok::unknown; break; - + // C99 6.4.4: Character Constants. case '\'': // Notify MIOpt that we read a non-whitespace/non-comment token. @@ -1527,7 +1623,7 @@ LexNextToken: Kind = tok::amp; } break; - case '*': + case '*': if (getCharAndSize(CurPtr, SizeTmp) == '=') { Kind = tok::starequal; CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); @@ -1552,7 +1648,7 @@ LexNextToken: if (Char == '-') { // -- CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); Kind = tok::minusminus; - } else if (Char == '>' && Features.CPlusPlus && + } else if (Char == '>' && Features.CPlusPlus && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->* CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), SizeTmp2, Result); @@ -1593,20 +1689,20 @@ LexNextToken: getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*') { if (SkipBCPLComment(Result, ConsumeChar(CurPtr, SizeTmp, Result))) return; // KeepCommentMode - + // It is common for the tokens immediately after a // comment to be // whitespace (indentation for the next line). Instead of going through // the big switch, handle it efficiently now. goto SkipIgnoredUnits; } } - + if (Char == '*') { // /**/ comment. if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result))) return; // KeepCommentMode goto LexNextToken; // GCC isn't tail call eliminating. } - + if (Char == '=') { CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); Kind = tok::slashequal; @@ -1642,7 +1738,7 @@ LexNextToken: if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) { FormTokenWithChars(Result, CurPtr, tok::hash); PP->HandleDirective(Result); - + // As an optimization, if the preprocessor didn't switch lexers, tail // recurse. if (PP->isCurrentLexer(this)) { @@ -1655,10 +1751,10 @@ LexNextToken: } goto LexNextToken; // GCC isn't tail call eliminating. } - + return PP->Lex(Result); } - + Kind = tok::hash; } } else { @@ -1695,7 +1791,7 @@ LexNextToken: if (Char == '=') { CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); Kind = tok::greaterequal; - } else if (Char == '>' && + } else if (Char == '>' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '=') { CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), SizeTmp2, Result); @@ -1736,7 +1832,7 @@ LexNextToken: } else if (Features.CPlusPlus && Char == ':') { Kind = tok::coloncolon; CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); - } else { + } else { Kind = tok::colon; } break; @@ -1748,7 +1844,7 @@ LexNextToken: if (Char == '=') { Kind = tok::equalequal; CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); - } else { + } else { Kind = tok::equal; } break; @@ -1773,7 +1869,7 @@ LexNextToken: if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) { FormTokenWithChars(Result, CurPtr, tok::hash); PP->HandleDirective(Result); - + // As an optimization, if the preprocessor didn't switch lexers, tail // recurse. if (PP->isCurrentLexer(this)) { @@ -1788,7 +1884,7 @@ LexNextToken: } return PP->Lex(Result); } - + Kind = tok::hash; } break; @@ -1800,7 +1896,7 @@ LexNextToken: else Kind = tok::unknown; break; - + case '\\': // FIXME: UCN's. // FALL THROUGH. @@ -1808,7 +1904,7 @@ LexNextToken: Kind = tok::unknown; break; } - + // Notify MIOpt that we read a non-whitespace/non-comment token. MIOpt.ReadToken(); |