1 files changed, 264 insertions, 28 deletions
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp
index 91b14f6..917829b 100644
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -27,7 +27,9 @@
 #include "clang/Lex/Lexer.h"
 #include "clang/Lex/Preprocessor.h"
 #include "clang/Lex/LexDiagnostic.h"
+#include "clang/Lex/CodeCompletionHandler.h"
 #include "clang/Basic/SourceManager.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include <cctype>
@@ -247,6 +249,200 @@ unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
   return TheTok.getLength();
 }
 
+/// Map Loc to the start of the token that contains it by raw-relexing the
+/// line Loc sits on. Loc itself is returned when its buffer is invalid, when
+/// it points at a newline, or when no token covers it (e.g. whitespace).
+SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
+                                          const SourceManager &SM,
+                                          const LangOptions &LangOpts) {
+  std::pair<FileID, unsigned> Decomposed = SM.getDecomposedLoc(Loc);
+  bool BadBuffer = false;
+  llvm::StringRef Contents = SM.getBufferData(Decomposed.first, &BadBuffer);
+  if (BadBuffer)
+    return Loc;
+
+  // A location sitting on a line terminator is returned unchanged.
+  const char *FileStart = Contents.data();
+  const char *Target = FileStart + Decomposed.second;
+  if (*Target == '\n' || *Target == '\r')
+    return Loc;
+
+  // Back up to the start of Target's line (or of the buffer) to relex.
+  const char *LineStart = Target;
+  for (; LineStart != FileStart; --LineStart) {
+    if (*LineStart == '\n' || *LineStart == '\r') {
+      ++LineStart;
+      break;
+    }
+  }
+
+  // Build a lexer over the whole buffer, positioned at the start of that line.
+  SourceLocation FileStartLoc = Loc.getFileLocWithOffset(-Decomposed.second);
+  Lexer RawLexer(FileStartLoc, LangOpts, FileStart, LineStart, Contents.end());
+  RawLexer.SetCommentRetentionState(true);
+
+  // Lex forward until a token ends beyond Target (or the file ends).
+  Token Tok;
+  for (;;) {
+    RawLexer.LexFromRawLexer(Tok);
+    const char *TokEnd = RawLexer.getBufferLocation();
+    if (TokEnd > Target) {
+      // The lexer is now past Target. If this token starts at or before
+      // Target it contains the location, so report where the token begins.
+      if (TokEnd - Tok.getLength() <= Target)
+        return Tok.getLocation();
+      // Otherwise Target was skipped entirely (it lies in whitespace).
+      break;
+    }
+    if (Tok.getKind() == tok::eof)
+      break;
+  }
+
+  // No token covers Loc; hand the original location back.
+  return Loc;
+}
+
+namespace {
+  enum PreambleDirectiveKind {
+    PDK_Skipped,  // Recognized directive that is simply skipped over.
+    PDK_StartIf,  // #if, #ifdef or #ifndef: opens a conditional block.
+    PDK_EndIf,    // #endif: closes one open conditional block.
+    PDK_Unknown   // Any other directive; the preamble stops at its '#'.
+  };
+}
+
+/// Find the end of the preamble of Buffer: the leading run of recognized
+/// preprocessor directives (and comments), lexed in raw mode. A nonzero
+/// MaxLines stops the scan once that many line-starting tokens are seen.
+/// Returns (offset of the first non-preamble token, whether it starts a line).
+std::pair<unsigned, bool>
+Lexer::ComputePreamble(const llvm::MemoryBuffer *Buffer, unsigned MaxLines) {
+  // Create a lexer starting at the beginning of the file. Note that we use a
+  // "fake" file source location at offset 1 so that the lexer will track our
+  // position within the file.
+  const unsigned StartOffset = 1;
+  SourceLocation StartLoc = SourceLocation::getFromRawEncoding(StartOffset);
+  LangOptions LangOpts;
+  Lexer TheLexer(StartLoc, LangOpts, Buffer->getBufferStart(),
+                 Buffer->getBufferStart(), Buffer->getBufferEnd());
+
+  bool InPreprocessorDirective = false;
+  Token TheTok;
+  Token IfStartTok;
+  unsigned IfCount = 0;
+  unsigned Line = 0;
+
+  do {
+    TheLexer.LexFromRawLexer(TheTok);
+
+    if (InPreprocessorDirective) {
+      // If we've hit the end of the file, we're done.
+      if (TheTok.getKind() == tok::eof) {
+        InPreprocessorDirective = false;
+        break;
+      }
+
+      // If we haven't hit the end of the preprocessor directive, skip this
+      // token.
+      if (!TheTok.isAtStartOfLine())
+        continue;
+
+      // We've passed the end of the preprocessor directive, and will look
+      // at this token again below.
+      InPreprocessorDirective = false;
+    }
+
+    // Keep track of the # of lines in the preamble.
+    if (TheTok.isAtStartOfLine()) {
+      ++Line;
+
+      // If we were asked to limit the number of lines in the preamble,
+      // and we're about to exceed that limit, we're done.
+      if (MaxLines && Line >= MaxLines)
+        break;
+    }
+
+    // Comments are okay; skip over them.
+    if (TheTok.getKind() == tok::comment)
+      continue;
+
+    if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
+      // This is the start of a preprocessor directive.
+      Token HashTok = TheTok;
+      InPreprocessorDirective = true;
+      // Figure out which directive this is. Since we're lexing raw tokens,
+      // we don't have an identifier table available. Instead, just look at
+      // the raw identifier to recognize and categorize preprocessor directives.
+      TheLexer.LexFromRawLexer(TheTok);
+      if (TheTok.getKind() == tok::identifier && !TheTok.needsCleaning()) {
+        // Raw encodings here are buffer offsets biased by StartOffset.
+        const char *IdStart = Buffer->getBufferStart()
+          + (TheTok.getLocation().getRawEncoding() - StartOffset);
+        llvm::StringRef Keyword(IdStart, TheTok.getLength());
+        PreambleDirectiveKind PDK
+          = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
+              .Case("include", PDK_Skipped)
+              .Case("__include_macros", PDK_Skipped)
+              .Case("define", PDK_Skipped)
+              .Case("undef", PDK_Skipped)
+              .Case("line", PDK_Skipped)
+              .Case("error", PDK_Skipped)
+              .Case("pragma", PDK_Skipped)
+              .Case("import", PDK_Skipped)
+              .Case("include_next", PDK_Skipped)
+              .Case("warning", PDK_Skipped)
+              .Case("ident", PDK_Skipped)
+              .Case("sccs", PDK_Skipped)
+              .Case("assert", PDK_Skipped)
+              .Case("unassert", PDK_Skipped)
+              .Case("if", PDK_StartIf)
+              .Case("ifdef", PDK_StartIf)
+              .Case("ifndef", PDK_StartIf)
+              .Case("elif", PDK_Skipped)
+              .Case("else", PDK_Skipped)
+              .Case("endif", PDK_EndIf)
+              .Default(PDK_Unknown);
+
+        switch (PDK) {
+        case PDK_Skipped:
+          continue;
+        case PDK_StartIf:
+          if (IfCount == 0)
+            IfStartTok = HashTok;
+          ++IfCount;
+          continue;
+        case PDK_EndIf:
+          // Mismatched #endif. The preamble ends here.
+          if (IfCount == 0)
+            break;
+          --IfCount;
+          continue;
+        case PDK_Unknown:
+          // We don't know what this directive is; stop at the '#'.
+          break;
+        }
+      }
+
+      // We only end up here if we didn't recognize the preprocessor
+      // directive or it was one that can't occur in the preamble at this
+      // point. Roll back the current token to the location of the '#'.
+      InPreprocessorDirective = false;
+      TheTok = HashTok;
+    }
+
+    // We hit a token that we don't recognize as being in the
+    // "preprocessing only" part of the file, so we're no longer in
+    // the preamble.
+    break;
+  } while (true);
+
+  // With an #if still open, the preamble ends at that outermost '#' instead.
+  SourceLocation End = IfCount? IfStartTok.getLocation() : TheTok.getLocation();
+  return std::make_pair(End.getRawEncoding() - StartLoc.getRawEncoding(),
+                        IfCount? IfStartTok.isAtStartOfLine()
+                               : TheTok.isAtStartOfLine());
+}
+
 //===----------------------------------------------------------------------===//
 // Character information.
 //===----------------------------------------------------------------------===//
@@ -476,7 +672,7 @@ static char DecodeTrigraphChar(const char *CP, Lexer *L) {
   }
 
   if (!L->isLexingRawMode())
-    L->Diag(CP-2, diag::trigraph_converted) << std::string()+Res;
+    L->Diag(CP-2, diag::trigraph_converted) << llvm::StringRef(&Res, 1);
   return Res;
 }
 
@@ -647,6 +843,14 @@ Slash:
 // Helper methods for lexing.
 //===----------------------------------------------------------------------===//
 
+/// \brief Indiscriminately skip Bytes bytes of source, clamped to BufferEnd.
+void Lexer::SkipBytes(unsigned Bytes, bool StartOfLine) {
+  const bool Fits = BufferPtr < BufferEnd &&
+                    Bytes < static_cast<size_t>(BufferEnd - BufferPtr);
+  BufferPtr = Fits ? BufferPtr + Bytes : BufferEnd;  // never step past the end
+  IsAtStartOfLine = StartOfLine;
+}
+
 void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
   // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
   unsigned Size;
@@ -716,6 +920,16 @@ FinishIdentifier:
   }
 }
 
+/// isHexaLiteral - Report whether the text at Start begins a hex constant,
+/// i.e. a '0' immediately followed by 'x' or 'X'.
+static bool isHexaLiteral(const char *Start, const LangOptions &Features) {
+  unsigned Len;
+  if (Lexer::getCharAndSizeNoWarn(Start, Len, Features) != '0')
+    return false;
+  // The character after the leading '0' decides: 'x' or 'X' means hex.
+  const char Next = Lexer::getCharAndSizeNoWarn(Start + Len, Len, Features);
+  return Next == 'x' || Next == 'X';
+}
 
 /// LexNumericConstant - Lex the remainder of a integer or floating point
 /// constant. From[-1] is the first character lexed.  Return the end of the
@@ -731,12 +945,16 @@ void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
   }
 
   // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
-  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e'))
-    return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
+  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
+    // If we are in Microsoft mode, don't continue if the constant is hex.
+    // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
+    if (!Features.Microsoft || !isHexaLiteral(BufferPtr, Features))
+      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
+  }
 
   // If we have a hex FP constant, continue.
   if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p') &&
-      (!PP || !PP->getLangOptions().CPlusPlus0x))
+      !Features.CPlusPlus0x)
     return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
 
   // Update the location of token as well as BufferPtr.
@@ -759,7 +977,9 @@ void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) {
     
     if (C == '\n' || C == '\r' ||             // Newline.
         (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
-      if (!isLexingRawMode() && !Features.AsmPreprocessor)
+      if (C == 0 && PP && PP->isCodeCompletionFile(FileLoc))
+        PP->CodeCompleteNaturalLanguage();
+      else if (!isLexingRawMode() && !Features.AsmPreprocessor)
         Diag(BufferPtr, diag::err_unterminated_string);
       FormTokenWithChars(Result, CurPtr-1, tok::unknown);
       return;
@@ -836,7 +1056,9 @@ void Lexer::LexCharConstant(Token &Result, const char *CurPtr) {
       C = getAndAdvanceChar(CurPtr, Result);
     } else if (C == '\n' || C == '\r' ||             // Newline.
                (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
-      if (!isLexingRawMode() && !Features.AsmPreprocessor)
+      if (C == 0 && PP && PP->isCodeCompletionFile(FileLoc))
+        PP->CodeCompleteNaturalLanguage();
+      else if (!isLexingRawMode() && !Features.AsmPreprocessor)
         Diag(BufferPtr, diag::err_unterminated_char);
       FormTokenWithChars(Result, CurPtr-1, tok::unknown);
       return;
@@ -980,7 +1202,13 @@ bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) {
         }
     }
 
-    if (CurPtr == BufferEnd+1) { --CurPtr; break; }
+    if (CurPtr == BufferEnd+1) { 
+      if (PP && PP->isCodeCompletionFile(FileLoc))
+        PP->CodeCompleteNaturalLanguage();
+
+      --CurPtr; 
+      break; 
+    }
   } while (C != '\n' && C != '\r');
 
   // Found but did not consume the newline.  Notify comment handlers about the
@@ -1219,7 +1447,9 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
           Diag(CurPtr-1, diag::warn_nested_block_comment);
       }
     } else if (C == 0 && CurPtr == BufferEnd+1) {
-      if (!isLexingRawMode() && !PP->isCodeCompletionFile(FileLoc))
+      if (PP && PP->isCodeCompletionFile(FileLoc))
+        PP->CodeCompleteNaturalLanguage();
+      else if (!isLexingRawMode())
         Diag(BufferPtr, diag::err_unterminated_block_comment);
       // Note: the user probably forgot a */.  We could continue immediately
       // after the /*, but this would involve lexing a lot of what really is the
@@ -1305,6 +1535,11 @@ std::string Lexer::ReadToEndOfLine() {
 
       // Next, lex the character, which should handle the EOM transition.
       Lex(Tmp);
+      if (Tmp.is(tok::code_completion)) {
+        if (PP && PP->getCodeCompletionHandler())
+          PP->getCodeCompletionHandler()->CodeCompleteNaturalLanguage();
+        Lex(Tmp);
+      }
       assert(Tmp.is(tok::eom) && "Unexpected token!");
 
       // Finally, we're done, return the string we found.
@@ -1318,6 +1553,22 @@ std::string Lexer::ReadToEndOfLine() {
 /// This returns true if Result contains a token, false if PP.Lex should be
 /// called again.
 bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
+  // Check if we are performing code completion.
+  if (PP && PP->isCodeCompletionFile(FileLoc)) {
+    // We're at the end of the file, but we've been asked to consider the
+    // end of the file to be a code-completion token. Return the
+    // code-completion token.
+    Result.startToken();
+    FormTokenWithChars(Result, CurPtr, tok::code_completion);
+    
+    // Only do the eof -> code_completion translation once.
+    PP->SetCodeCompletionPoint(0, 0, 0);
+    
+    // Silence any diagnostics that occur once we hit the code-completion point.
+    PP->getDiagnostics().setSuppressAllDiagnostics(true);
+    return true;
+  }
+
   // If we hit the end of the file while parsing a preprocessor directive,
   // end the preprocessor directive first.  The next token returned will
   // then be the end of file.
@@ -1340,29 +1591,14 @@ bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
     FormTokenWithChars(Result, BufferEnd, tok::eof);
     return true;
   }
-
-  // Otherwise, check if we are code-completing, then issue diagnostics for 
-  // unterminated #if and missing newline.
-
-  if (PP && PP->isCodeCompletionFile(FileLoc)) {
-    // We're at the end of the file, but we've been asked to consider the
-    // end of the file to be a code-completion token. Return the
-    // code-completion token.
-    Result.startToken();
-    FormTokenWithChars(Result, CurPtr, tok::code_completion);
-    
-    // Only do the eof -> code_completion translation once.
-    PP->SetCodeCompletionPoint(0, 0, 0);
-    
-    // Silence any diagnostics that occur once we hit the code-completion point.
-    PP->getDiagnostics().setSuppressAllDiagnostics(true);
-    return true;
-  }
   
+  // Issue diagnostics for unterminated #if and missing newline.
+
   // If we are in a #if directive, emit an error.
   while (!ConditionalStack.empty()) {
-    PP->Diag(ConditionalStack.back().IfLoc,
-             diag::err_pp_unterminated_conditional);
+    if (!PP->isCodeCompletionFile(FileLoc))
+      PP->Diag(ConditionalStack.back().IfLoc,
+               diag::err_pp_unterminated_conditional);
     ConditionalStack.pop_back();
   }