From c72c57c9e9b69944e3e009cd5e209634839581d3 Mon Sep 17 00:00:00 2001
From: dim <dim@FreeBSD.org>
Date: Mon, 8 Apr 2013 18:45:10 +0000
Subject: Vendor import of clang trunk r178860:
 http://llvm.org/svn/llvm-project/cfe/trunk@178860

---
 lib/AST/CommentLexer.cpp | 127 ++++++++++++++++++++---------------------------
 1 file changed, 53 insertions(+), 74 deletions(-)

(limited to 'lib/AST/CommentLexer.cpp')

diff --git a/lib/AST/CommentLexer.cpp b/lib/AST/CommentLexer.cpp
index 31a09f7..1194520 100644
--- a/lib/AST/CommentLexer.cpp
+++ b/lib/AST/CommentLexer.cpp
@@ -1,7 +1,9 @@
 #include "clang/AST/CommentLexer.h"
 #include "clang/AST/CommentCommandTraits.h"
-#include "clang/Basic/ConvertUTF.h"
+#include "clang/Basic/CharInfo.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/ErrorHandling.h"
 
 namespace clang {
@@ -13,34 +15,46 @@ void Token::dump(const Lexer &L, const SourceManager &SM) const {
   llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
 }
 
-namespace {
-bool isHTMLNamedCharacterReferenceCharacter(char C) {
-  return (C >= 'a' && C <= 'z') ||
-         (C >= 'A' && C <= 'Z');
+static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
+  return isLetter(C);
 }
 
-bool isHTMLDecimalCharacterReferenceCharacter(char C) {
-  return C >= '0' && C <= '9';
+static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
+  return isDigit(C);
 }
 
-bool isHTMLHexCharacterReferenceCharacter(char C) {
-  return (C >= '0' && C <= '9') ||
-         (C >= 'a' && C <= 'f') ||
-         (C >= 'A' && C <= 'F');
+static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
+  return isHexDigit(C);
 }
 
+static inline StringRef convertCodePointToUTF8(
+                                      llvm::BumpPtrAllocator &Allocator,
+                                      unsigned CodePoint) {
+  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
+  char *ResolvedPtr = Resolved;
+  if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
+    return StringRef(Resolved, ResolvedPtr - Resolved);
+  else
+    return StringRef();
+}
+
+namespace {
+
 #include "clang/AST/CommentHTMLTags.inc"
+#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
 
 } // unnamed namespace
 
 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
-  return llvm::StringSwitch<StringRef>(Name)
+  // Fast path: common references; no hit leaves Ref empty for the slow path.
+  const StringRef Ref = llvm::StringSwitch<StringRef>(Name)
       .Case("amp", "&")
       .Case("lt", "<")
       .Case("gt", ">")
       .Case("quot", "\"")
       .Case("apos", "\'")
       .Default("");
+  return Ref.empty() ? translateHTMLNamedCharacterReferenceToUTF8(Name) : Ref;
 }
 
 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
@@ -50,13 +64,7 @@ StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
     CodePoint *= 10;
     CodePoint += Name[i] - '0';
   }
-
-  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
-  char *ResolvedPtr = Resolved;
-  if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
-    return StringRef(Resolved, ResolvedPtr - Resolved);
-  else
-    return StringRef();
+  return convertCodePointToUTF8(Allocator, CodePoint);
 }
 
 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
@@ -65,20 +73,9 @@ StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
     CodePoint *= 16;
     const char C = Name[i];
     assert(isHTMLHexCharacterReferenceCharacter(C));
-    if (C >= '0' && C <= '9')
-      CodePoint += Name[i] - '0';
-    else if (C >= 'a' && C <= 'f')
-      CodePoint += Name[i] - 'a' + 10;
-    else
-      CodePoint += Name[i] - 'A' + 10;
+    CodePoint += llvm::hexDigitValue(C);
   }
-
-  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
-  char *ResolvedPtr = Resolved;
-  if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
-    return StringRef(Resolved, ResolvedPtr - Resolved);
-  else
-    return StringRef();
+  return convertCodePointToUTF8(Allocator, CodePoint);
 }
 
 void Lexer::skipLineStartingDecorations() {
@@ -99,7 +96,7 @@ void Lexer::skipLineStartingDecorations() {
       return;
 
     char C = *NewBufferPtr;
-    while (C == ' ' || C == '\t' || C == '\f' || C == '\v') {
+    while (isHorizontalWhitespace(C)) {
       NewBufferPtr++;
       if (NewBufferPtr == CommentEnd)
         return;
@@ -119,8 +116,7 @@ namespace {
 /// Returns pointer to the first newline character in the string.
 const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
-    const char C = *BufferPtr;
-    if (C == '\n' || C == '\r')
+    if (isVerticalWhitespace(*BufferPtr))
       return BufferPtr;
   }
   return BufferEnd;
@@ -169,14 +165,11 @@ const char *skipHexCharacterReference(const char *BufferPtr,
 }
 
 bool isHTMLIdentifierStartingCharacter(char C) {
-  return (C >= 'a' && C <= 'z') ||
-         (C >= 'A' && C <= 'Z');
+  return isLetter(C);
 }
 
 bool isHTMLIdentifierCharacter(char C) {
-  return (C >= 'a' && C <= 'z') ||
-         (C >= 'A' && C <= 'Z') ||
-         (C >= '0' && C <= '9');
+  return isAlphanumeric(C);
 }
 
 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
@@ -205,15 +198,6 @@ const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
   return BufferEnd;
 }
 
-bool isHorizontalWhitespace(char C) {
-  return C == ' ' || C == '\t' || C == '\f' || C == '\v';
-}
-
-bool isWhitespace(char C) {
-  return C == ' ' || C == '\n' || C == '\r' ||
-         C == '\t' || C == '\f' || C == '\v';
-}
-
 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
     if (!isWhitespace(*BufferPtr))
@@ -227,14 +211,11 @@ bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
 }
 
 bool isCommandNameStartCharacter(char C) {
-  return (C >= 'a' && C <= 'z') ||
-         (C >= 'A' && C <= 'Z');
+  return isLetter(C);
 }
 
 bool isCommandNameCharacter(char C) {
-  return (C >= 'a' && C <= 'z') ||
-         (C >= 'A' && C <= 'Z') ||
-         (C >= '0' && C <= '9');
+  return isAlphanumeric(C);
 }
 
 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
@@ -250,12 +231,10 @@ const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
   const char *CurPtr = BufferPtr;
   while (CurPtr != BufferEnd) {
-    char C = *CurPtr;
-    while (C != '\n' && C != '\r') {
+    while (!isVerticalWhitespace(*CurPtr)) {
       CurPtr++;
       if (CurPtr == BufferEnd)
         return BufferEnd;
-      C = *CurPtr;
     }
     // We found a newline, check if it is escaped.
     const char *EscapePtr = CurPtr - 1;
@@ -319,6 +298,11 @@ void Lexer::lexCommentText(Token &T) {
     switch(*TokenPtr) {
       case '\\':
       case '@': {
+        // Commands that start with a backslash and commands that start with
+        // 'at' have equivalent semantics.  But we keep information about the
+        // exact syntax in AST for comments.
+        tok::TokenKind CommandKind =
+            (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
         TokenPtr++;
         if (TokenPtr == CommentEnd) {
           formTextToken(T, TokenPtr);
@@ -379,7 +363,7 @@ void Lexer::lexCommentText(Token &T) {
           setupAndLexVerbatimLine(T, TokenPtr, Info);
           return;
         }
-        formTokenWithChars(T, TokenPtr, tok::command);
+        formTokenWithChars(T, TokenPtr, CommandKind);
         T.setCommandID(Info->getID());
         return;
       }
@@ -415,15 +399,12 @@ void Lexer::lexCommentText(Token &T) {
         return;
 
       default: {
-        while (true) {
-          TokenPtr++;
-          if (TokenPtr == CommentEnd)
-            break;
-          const char C = *TokenPtr;
-          if(C == '\n' || C == '\r' ||
-             C == '\\' || C == '@' || C == '&' || C == '<')
-            break;
-        }
+        size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
+                         find_first_of("\n\r\\@&<");
+        if (End != StringRef::npos)
+          TokenPtr += End;
+        else
+          TokenPtr = CommentEnd;
         formTextToken(T, TokenPtr);
         return;
       }
@@ -446,13 +427,11 @@ void Lexer::setupAndLexVerbatimBlock(Token &T,
   // If there is a newline following the verbatim opening command, skip the
   // newline so that we don't create an tok::verbatim_block_line with empty
   // text content.
-  if (BufferPtr != CommentEnd) {
-    const char C = *BufferPtr;
-    if (C == '\n' || C == '\r') {
-      BufferPtr = skipNewline(BufferPtr, CommentEnd);
-      State = LS_VerbatimBlockBody;
-      return;
-    }
+  if (BufferPtr != CommentEnd &&
+      isVerticalWhitespace(*BufferPtr)) {
+    BufferPtr = skipNewline(BufferPtr, CommentEnd);
+    State = LS_VerbatimBlockBody;
+    return;
   }
 
   State = LS_VerbatimBlockFirstLine;
-- 
cgit v1.1