Diffstat (limited to 'lib/AST/CommentLexer.cpp')
-rw-r--r-- | lib/AST/CommentLexer.cpp | 815 |
1 files changed, 815 insertions, 0 deletions
diff --git a/lib/AST/CommentLexer.cpp b/lib/AST/CommentLexer.cpp
new file mode 100644
index 0000000..b6516ec
--- /dev/null
+++ b/lib/AST/CommentLexer.cpp
@@ -0,0 +1,815 @@
+#include "clang/AST/CommentLexer.h"
+#include "clang/AST/CommentCommandTraits.h"
+#include "clang/Basic/ConvertUTF.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace clang {
+namespace comments {
+
+void Token::dump(const Lexer &L, const SourceManager &SM) const {
+  llvm::errs() << "comments::Token Kind=" << Kind << " ";
+  Loc.dump(SM);
+  llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
+}
+
+namespace {
+bool isHTMLNamedCharacterReferenceCharacter(char C) {
+  return (C >= 'a' && C <= 'z') ||
+         (C >= 'A' && C <= 'Z');
+}
+
+bool isHTMLDecimalCharacterReferenceCharacter(char C) {
+  return C >= '0' && C <= '9';
+}
+
+bool isHTMLHexCharacterReferenceCharacter(char C) {
+  return (C >= '0' && C <= '9') ||
+         (C >= 'a' && C <= 'f') ||
+         (C >= 'A' && C <= 'F');
+}
+} // unnamed namespace
+
+StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
+  return llvm::StringSwitch<StringRef>(Name)
+      .Case("amp", "&")
+      .Case("lt", "<")
+      .Case("gt", ">")
+      .Case("quot", "\"")
+      .Case("apos", "\'")
+      .Default("");
+}
+
+StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
+  unsigned CodePoint = 0;
+  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
+    assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
+    CodePoint *= 10;
+    CodePoint += Name[i] - '0';
+  }
+
+  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
+  char *ResolvedPtr = Resolved;
+  if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
+    return StringRef(Resolved, ResolvedPtr - Resolved);
+  else
+    return StringRef();
+}
+
+StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
+  unsigned CodePoint = 0;
+  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
+    CodePoint *= 16;
+    const char C = Name[i];
+    assert(isHTMLHexCharacterReferenceCharacter(C));
+    if (C >= '0' && C <= '9')
+      CodePoint += Name[i] - '0';
+    else if (C >= 'a' && C <= 'f')
+      CodePoint += Name[i] - 'a' + 10;
+    else
+      CodePoint += Name[i] - 'A' + 10;
+  }
+
+  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
+  char *ResolvedPtr = Resolved;
+  if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
+    return StringRef(Resolved, ResolvedPtr - Resolved);
+  else
+    return StringRef();
+}
+
+void Lexer::skipLineStartingDecorations() {
+  // This function should be called only for C comments
+  assert(CommentState == LCS_InsideCComment);
+
+  if (BufferPtr == CommentEnd)
+    return;
+
+  switch (*BufferPtr) {
+  case ' ':
+  case '\t':
+  case '\f':
+  case '\v': {
+    const char *NewBufferPtr = BufferPtr;
+    NewBufferPtr++;
+    if (NewBufferPtr == CommentEnd)
+      return;
+
+    char C = *NewBufferPtr;
+    while (C == ' ' || C == '\t' || C == '\f' || C == '\v') {
+      NewBufferPtr++;
+      if (NewBufferPtr == CommentEnd)
+        return;
+      C = *NewBufferPtr;
+    }
+    if (C == '*')
+      BufferPtr = NewBufferPtr + 1;
+    break;
+  }
+  case '*':
+    BufferPtr++;
+    break;
+  }
+}
+
+namespace {
+/// Returns pointer to the first newline character in the string.
+const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
+  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
+    const char C = *BufferPtr;
+    if (C == '\n' || C == '\r')
+      return BufferPtr;
+  }
+  return BufferEnd;
+}
+
+const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
+  if (BufferPtr == BufferEnd)
+    return BufferPtr;
+
+  if (*BufferPtr == '\n')
+    BufferPtr++;
+  else {
+    assert(*BufferPtr == '\r');
+    BufferPtr++;
+    if (BufferPtr != BufferEnd && *BufferPtr == '\n')
+      BufferPtr++;
+  }
+  return BufferPtr;
+}
+
+const char *skipNamedCharacterReference(const char *BufferPtr,
+                                        const char *BufferEnd) {
+  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
+    if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
+      return BufferPtr;
+  }
+  return BufferEnd;
+}
+
+const char *skipDecimalCharacterReference(const char *BufferPtr,
+                                          const char *BufferEnd) {
+  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
+    if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
+      return BufferPtr;
+  }
+  return BufferEnd;
+}
+
+const char *skipHexCharacterReference(const char *BufferPtr,
+                                      const char *BufferEnd) {
+  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
+    if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
+      return BufferPtr;
+  }
+  return BufferEnd;
+}
+
+bool isHTMLIdentifierStartingCharacter(char C) {
+  return (C >= 'a' && C <= 'z') ||
+         (C >= 'A' && C <= 'Z');
+}
+
+bool isHTMLIdentifierCharacter(char C) {
+  return (C >= 'a' && C <= 'z') ||
+         (C >= 'A' && C <= 'Z') ||
+         (C >= '0' && C <= '9');
+}
+
+const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
+  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
+    if (!isHTMLIdentifierCharacter(*BufferPtr))
+      return BufferPtr;
+  }
+  return BufferEnd;
+}
+
+/// Skip an HTML string quoted in single or double quotes.  Escaped quotes
+/// inside the string are allowed.
+///
+/// Returns pointer to closing quote.
+const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
+{
+  const char Quote = *BufferPtr;
+  assert(Quote == '\"' || Quote == '\'');
+
+  BufferPtr++;
+  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
+    const char C = *BufferPtr;
+    if (C == Quote && BufferPtr[-1] != '\\')
+      return BufferPtr;
+  }
+  return BufferEnd;
+}
+
+bool isHorizontalWhitespace(char C) {
+  return C == ' ' || C == '\t' || C == '\f' || C == '\v';
+}
+
+bool isWhitespace(char C) {
+  return C == ' ' || C == '\n' || C == '\r' ||
+         C == '\t' || C == '\f' || C == '\v';
+}
+
+const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
+  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
+    if (!isWhitespace(*BufferPtr))
+      return BufferPtr;
+  }
+  return BufferEnd;
+}
+
+bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
+  return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
+}
+
+bool isCommandNameCharacter(char C) {
+  return (C >= 'a' && C <= 'z') ||
+         (C >= 'A' && C <= 'Z') ||
+         (C >= '0' && C <= '9');
+}
+
+const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
+  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
+    if (!isCommandNameCharacter(*BufferPtr))
+      return BufferPtr;
+  }
+  return BufferEnd;
+}
+
+/// Return the one past end pointer for BCPL comments.
+/// Handles newlines escaped with backslash or trigraph for backslash.
+const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
+  const char *CurPtr = BufferPtr;
+  while (CurPtr != BufferEnd) {
+    char C = *CurPtr;
+    while (C != '\n' && C != '\r') {
+      CurPtr++;
+      if (CurPtr == BufferEnd)
+        return BufferEnd;
+      C = *CurPtr;
+    }
+    // We found a newline, check if it is escaped.
+    const char *EscapePtr = CurPtr - 1;
+    while (isHorizontalWhitespace(*EscapePtr))
+      EscapePtr--;
+
+    if (*EscapePtr == '\\' ||
+        (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
+         EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
+      // We found an escaped newline.
+      CurPtr = skipNewline(CurPtr, BufferEnd);
+    } else
+      return CurPtr; // Not an escaped newline.
+  }
+  return BufferEnd;
+}
+
+/// Return the one past end pointer for C comments.
+/// Very dumb, does not handle escaped newlines or trigraphs.
+const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
+  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
+    if (*BufferPtr == '*') {
+      assert(BufferPtr + 1 != BufferEnd);
+      if (*(BufferPtr + 1) == '/')
+        return BufferPtr;
+    }
+  }
+  llvm_unreachable("buffer end hit before '*/' was seen");
+}
+} // unnamed namespace
+
+void Lexer::lexCommentText(Token &T) {
+  assert(CommentState == LCS_InsideBCPLComment ||
+         CommentState == LCS_InsideCComment);
+
+  switch (State) {
+  case LS_Normal:
+    break;
+  case LS_VerbatimBlockFirstLine:
+    lexVerbatimBlockFirstLine(T);
+    return;
+  case LS_VerbatimBlockBody:
+    lexVerbatimBlockBody(T);
+    return;
+  case LS_VerbatimLineText:
+    lexVerbatimLineText(T);
+    return;
+  case LS_HTMLStartTag:
+    lexHTMLStartTag(T);
+    return;
+  case LS_HTMLEndTag:
+    lexHTMLEndTag(T);
+    return;
+  }
+
+  assert(State == LS_Normal);
+
+  const char *TokenPtr = BufferPtr;
+  assert(TokenPtr < CommentEnd);
+  while (TokenPtr != CommentEnd) {
+    switch (*TokenPtr) {
+      case '\\':
+      case '@': {
+        TokenPtr++;
+        if (TokenPtr == CommentEnd) {
+          formTextToken(T, TokenPtr);
+          return;
+        }
+        char C = *TokenPtr;
+        switch (C) {
+        default:
+          break;
+
+        case '\\': case '@': case '&': case '$':
+        case '#':  case '<': case '>': case '%':
+        case '\"': case '.': case ':':
+          // This is one of \\ \@ \& \$ etc escape sequences.
+          TokenPtr++;
+          if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
+            // This is the \:: escape sequence.
+            TokenPtr++;
+          }
+          StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
+          formTokenWithChars(T, TokenPtr, tok::text);
+          T.setText(UnescapedText);
+          return;
+        }
+
+        // Don't make zero-length commands.
+        if (!isCommandNameCharacter(*TokenPtr)) {
+          formTextToken(T, TokenPtr);
+          return;
+        }
+
+        TokenPtr = skipCommandName(TokenPtr, CommentEnd);
+        unsigned Length = TokenPtr - (BufferPtr + 1);
+
+        // Hardcoded support for lexing LaTeX formula commands
+        // \f$ \f[ \f] \f{ \f} as a single command.
+        if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
+          C = *TokenPtr;
+          if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
+            TokenPtr++;
+            Length++;
+          }
+        }
+
+        const StringRef CommandName(BufferPtr + 1, Length);
+        StringRef EndName;
+
+        if (Traits.isVerbatimBlockCommand(CommandName, EndName)) {
+          setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, EndName);
+          return;
+        }
+        if (Traits.isVerbatimLineCommand(CommandName)) {
+          setupAndLexVerbatimLine(T, TokenPtr);
+          return;
+        }
+        formTokenWithChars(T, TokenPtr, tok::command);
+        T.setCommandName(CommandName);
+        return;
+      }
+
+      case '&':
+        lexHTMLCharacterReference(T);
+        return;
+
+      case '<': {
+        TokenPtr++;
+        if (TokenPtr == CommentEnd) {
+          formTextToken(T, TokenPtr);
+          return;
+        }
+        const char C = *TokenPtr;
+        if (isHTMLIdentifierStartingCharacter(C))
+          setupAndLexHTMLStartTag(T);
+        else if (C == '/')
+          setupAndLexHTMLEndTag(T);
+        else
+          formTextToken(T, TokenPtr);
+
+        return;
+      }
+
+      case '\n':
+      case '\r':
+        TokenPtr = skipNewline(TokenPtr, CommentEnd);
+        formTokenWithChars(T, TokenPtr, tok::newline);
+
+        if (CommentState == LCS_InsideCComment)
+          skipLineStartingDecorations();
+        return;
+
+      default: {
+        while (true) {
+          TokenPtr++;
+          if (TokenPtr == CommentEnd)
+            break;
+          const char C = *TokenPtr;
+          if (C == '\n' || C == '\r' ||
+              C == '\\' || C == '@' || C == '&' || C == '<')
+            break;
+        }
+        formTextToken(T, TokenPtr);
+        return;
+      }
+    }
+  }
+}
+
+void Lexer::setupAndLexVerbatimBlock(Token &T,
+                                     const char *TextBegin,
+                                     char Marker, StringRef EndName) {
+  VerbatimBlockEndCommandName.clear();
+  VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
+  VerbatimBlockEndCommandName.append(EndName);
+
+  StringRef Name(BufferPtr + 1, TextBegin - (BufferPtr + 1));
+  formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
+  T.setVerbatimBlockName(Name);
+
+  // If there is a newline following the verbatim opening command, skip the
+  // newline so that we don't create a tok::verbatim_block_line with empty
+  // text content.
+  if (BufferPtr != CommentEnd) {
+    const char C = *BufferPtr;
+    if (C == '\n' || C == '\r') {
+      BufferPtr = skipNewline(BufferPtr, CommentEnd);
+      State = LS_VerbatimBlockBody;
+      return;
+    }
+  }
+
+  State = LS_VerbatimBlockFirstLine;
+}
+
+void Lexer::lexVerbatimBlockFirstLine(Token &T) {
+again:
+  assert(BufferPtr < CommentEnd);
+
+  // FIXME: It would be better to scan the text once, finding either the block
+  // end command or newline.
+  //
+  // Extract current line.
+  const char *Newline = findNewline(BufferPtr, CommentEnd);
+  StringRef Line(BufferPtr, Newline - BufferPtr);
+
+  // Look for end command in current line.
+  size_t Pos = Line.find(VerbatimBlockEndCommandName);
+  const char *TextEnd;
+  const char *NextLine;
+  if (Pos == StringRef::npos) {
+    // Current line is completely verbatim.
+    TextEnd = Newline;
+    NextLine = skipNewline(Newline, CommentEnd);
+  } else if (Pos == 0) {
+    // Current line contains just an end command.
+    const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
+    StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
+    formTokenWithChars(T, End, tok::verbatim_block_end);
+    T.setVerbatimBlockName(Name);
+    State = LS_Normal;
+    return;
+  } else {
+    // There is some text, followed by end command.  Extract text first.
+    TextEnd = BufferPtr + Pos;
+    NextLine = TextEnd;
+    // If there is only whitespace before end command, skip whitespace.
+    if (isWhitespace(BufferPtr, TextEnd)) {
+      BufferPtr = TextEnd;
+      goto again;
+    }
+  }
+
+  StringRef Text(BufferPtr, TextEnd - BufferPtr);
+  formTokenWithChars(T, NextLine, tok::verbatim_block_line);
+  T.setVerbatimBlockText(Text);
+
+  State = LS_VerbatimBlockBody;
+}
+
+void Lexer::lexVerbatimBlockBody(Token &T) {
+  assert(State == LS_VerbatimBlockBody);
+
+  if (CommentState == LCS_InsideCComment)
+    skipLineStartingDecorations();
+
+  lexVerbatimBlockFirstLine(T);
+}
+
+void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin) {
+  const StringRef Name(BufferPtr + 1, TextBegin - BufferPtr - 1);
+  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
+  T.setVerbatimLineName(Name);
+
+  State = LS_VerbatimLineText;
+}
+
+void Lexer::lexVerbatimLineText(Token &T) {
+  assert(State == LS_VerbatimLineText);
+
+  // Extract current line.
+  const char *Newline = findNewline(BufferPtr, CommentEnd);
+  const StringRef Text(BufferPtr, Newline - BufferPtr);
+  formTokenWithChars(T, Newline, tok::verbatim_line_text);
+  T.setVerbatimLineText(Text);
+
+  State = LS_Normal;
+}
+
+void Lexer::lexHTMLCharacterReference(Token &T) {
+  const char *TokenPtr = BufferPtr;
+  assert(*TokenPtr == '&');
+  TokenPtr++;
+  if (TokenPtr == CommentEnd) {
+    formTextToken(T, TokenPtr);
+    return;
+  }
+  const char *NamePtr;
+  bool isNamed = false;
+  bool isDecimal = false;
+  char C = *TokenPtr;
+  if (isHTMLNamedCharacterReferenceCharacter(C)) {
+    NamePtr = TokenPtr;
+    TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
+    isNamed = true;
+  } else if (C == '#') {
+    TokenPtr++;
+    if (TokenPtr == CommentEnd) {
+      formTextToken(T, TokenPtr);
+      return;
+    }
+    C = *TokenPtr;
+    if (isHTMLDecimalCharacterReferenceCharacter(C)) {
+      NamePtr = TokenPtr;
+      TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
+      isDecimal = true;
+    } else if (C == 'x' || C == 'X') {
+      TokenPtr++;
+      NamePtr = TokenPtr;
+      TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
+    } else {
+      formTextToken(T, TokenPtr);
+      return;
+    }
+  } else {
+    formTextToken(T, TokenPtr);
+    return;
+  }
+  if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
+      *TokenPtr != ';') {
+    formTextToken(T, TokenPtr);
+    return;
+  }
+  StringRef Name(NamePtr, TokenPtr - NamePtr);
+  TokenPtr++; // Skip semicolon.
+  StringRef Resolved;
+  if (isNamed)
+    Resolved = resolveHTMLNamedCharacterReference(Name);
+  else if (isDecimal)
+    Resolved = resolveHTMLDecimalCharacterReference(Name);
+  else
+    Resolved = resolveHTMLHexCharacterReference(Name);
+
+  if (Resolved.empty()) {
+    formTextToken(T, TokenPtr);
+    return;
+  }
+  formTokenWithChars(T, TokenPtr, tok::text);
+  T.setText(Resolved);
+  return;
+}
+
+void Lexer::setupAndLexHTMLStartTag(Token &T) {
+  assert(BufferPtr[0] == '<' &&
+         isHTMLIdentifierStartingCharacter(BufferPtr[1]));
+  const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
+
+  StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
+  formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
+  T.setHTMLTagStartName(Name);
+
+  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
+
+  const char C = *BufferPtr;
+  if (BufferPtr != CommentEnd &&
+      (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
+    State = LS_HTMLStartTag;
+}
+
+void Lexer::lexHTMLStartTag(Token &T) {
+  assert(State == LS_HTMLStartTag);
+
+  const char *TokenPtr = BufferPtr;
+  char C = *TokenPtr;
+  if (isHTMLIdentifierCharacter(C)) {
+    TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
+    StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
+    formTokenWithChars(T, TokenPtr, tok::html_ident);
+    T.setHTMLIdent(Ident);
+  } else {
+    switch (C) {
+    case '=':
+      TokenPtr++;
+      formTokenWithChars(T, TokenPtr, tok::html_equals);
+      break;
+    case '\"':
+    case '\'': {
+      const char *OpenQuote = TokenPtr;
+      TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
+      const char *ClosingQuote = TokenPtr;
+      if (TokenPtr != CommentEnd) // Skip closing quote.
+        TokenPtr++;
+      formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
+      T.setHTMLQuotedString(StringRef(OpenQuote + 1,
+                                      ClosingQuote - (OpenQuote + 1)));
+      break;
+    }
+    case '>':
+      TokenPtr++;
+      formTokenWithChars(T, TokenPtr, tok::html_greater);
+      State = LS_Normal;
+      return;
+    case '/':
+      TokenPtr++;
+      if (TokenPtr != CommentEnd && *TokenPtr == '>') {
+        TokenPtr++;
+        formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
+      } else
+        formTextToken(T, TokenPtr);
+
+      State = LS_Normal;
+      return;
+    }
+  }
+
+  // Now look ahead and return to normal state if we don't see any HTML tokens
+  // ahead.
+  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
+  if (BufferPtr == CommentEnd) {
+    State = LS_Normal;
+    return;
+  }
+
+  C = *BufferPtr;
+  if (!isHTMLIdentifierStartingCharacter(C) &&
+      C != '=' && C != '\"' && C != '\'' && C != '>') {
+    State = LS_Normal;
+    return;
+  }
+}
+
+void Lexer::setupAndLexHTMLEndTag(Token &T) {
+  assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
+
+  const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
+  const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
+
+  const char *End = skipWhitespace(TagNameEnd, CommentEnd);
+
+  formTokenWithChars(T, End, tok::html_end_tag);
+  T.setHTMLTagEndName(StringRef(TagNameBegin, TagNameEnd - TagNameBegin));
+
+  if (BufferPtr != CommentEnd && *BufferPtr == '>')
+    State = LS_HTMLEndTag;
+}
+
+void Lexer::lexHTMLEndTag(Token &T) {
+  assert(BufferPtr != CommentEnd && *BufferPtr == '>');
+
+  formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
+  State = LS_Normal;
+}
+
+Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits,
+             SourceLocation FileLoc, const CommentOptions &CommOpts,
+             const char *BufferStart, const char *BufferEnd):
+    Allocator(Allocator), Traits(Traits),
+    BufferStart(BufferStart), BufferEnd(BufferEnd),
+    FileLoc(FileLoc), CommOpts(CommOpts), BufferPtr(BufferStart),
+    CommentState(LCS_BeforeComment), State(LS_Normal) {
+}
+
+void Lexer::lex(Token &T) {
+again:
+  switch (CommentState) {
+  case LCS_BeforeComment:
+    if (BufferPtr == BufferEnd) {
+      formTokenWithChars(T, BufferPtr, tok::eof);
+      return;
+    }
+
+    assert(*BufferPtr == '/');
+    BufferPtr++; // Skip first slash.
+    switch (*BufferPtr) {
+    case '/': { // BCPL comment.
+      BufferPtr++; // Skip second slash.
+
+      if (BufferPtr != BufferEnd) {
+        // Skip Doxygen magic marker, if it is present.
+        // It might be missing because of a typo //< or /*<, or because we
+        // merged this non-Doxygen comment into a bunch of Doxygen comments
+        // around it: /** ... */ /* ... */ /** ... */
+        const char C = *BufferPtr;
+        if (C == '/' || C == '!')
+          BufferPtr++;
+      }
+
+      // Skip less-than symbol that marks trailing comments.
+      // Skip it even if the comment is not a Doxygen one, because //< and /*<
+      // are frequent typos.
+      if (BufferPtr != BufferEnd && *BufferPtr == '<')
+        BufferPtr++;
+
+      CommentState = LCS_InsideBCPLComment;
+      if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
+        State = LS_Normal;
+      CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
+      goto again;
+    }
+    case '*': { // C comment.
+      BufferPtr++; // Skip star.
+
+      // Skip Doxygen magic marker.
+      const char C = *BufferPtr;
+      if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
+        BufferPtr++;
+
+      // Skip less-than symbol that marks trailing comments.
+      if (BufferPtr != BufferEnd && *BufferPtr == '<')
+        BufferPtr++;
+
+      CommentState = LCS_InsideCComment;
+      State = LS_Normal;
+      CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
+      goto again;
+    }
+    default:
+      llvm_unreachable("second character of comment should be '/' or '*'");
+    }
+
+  case LCS_BetweenComments: {
+    // Consecutive comments are extracted only if there is only whitespace
+    // between them.  So we can search for the start of the next comment.
+    const char *EndWhitespace = BufferPtr;
+    while (EndWhitespace != BufferEnd && *EndWhitespace != '/')
+      EndWhitespace++;
+
+    // Turn any whitespace between comments (and there is only whitespace
+    // between them -- guaranteed by comment extraction) into a newline.  We
+    // have two newlines between C comments in total (the first one was
+    // synthesized after a comment).
+    formTokenWithChars(T, EndWhitespace, tok::newline);
+
+    CommentState = LCS_BeforeComment;
+    break;
+  }
+
+  case LCS_InsideBCPLComment:
+  case LCS_InsideCComment:
+    if (BufferPtr != CommentEnd) {
+      lexCommentText(T);
+      break;
+    } else {
+      // Skip C comment closing sequence.
+      if (CommentState == LCS_InsideCComment) {
+        assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
+        BufferPtr += 2;
+        assert(BufferPtr <= BufferEnd);
+
+        // Synthesize a newline just after the C comment, regardless of
+        // whether there is actually a newline.
+        formTokenWithChars(T, BufferPtr, tok::newline);
+
+        CommentState = LCS_BetweenComments;
+        break;
+      } else {
+        // Don't synthesize a newline after a BCPL comment.
+        CommentState = LCS_BetweenComments;
+        goto again;
+      }
+    }
+  }
+}
+
+StringRef Lexer::getSpelling(const Token &Tok,
+                             const SourceManager &SourceMgr,
+                             bool *Invalid) const {
+  SourceLocation Loc = Tok.getLocation();
+  std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
+
+  bool InvalidTemp = false;
+  StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
+  if (InvalidTemp) {
+    *Invalid = true;
+    return StringRef();
+  }
+
+  const char *Begin = File.data() + LocInfo.second;
+  return StringRef(Begin, Tok.getLength());
+}
+
+} // end namespace comments
+} // end namespace clang
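For orientation only (this is not part of the commit above), here is a minimal sketch of how the new lexer might be driven, using just the constructor, Lexer::lex() and Token::dump() signatures introduced in this file. The helper name dumpCommentTokens and the surrounding setup (allocator, command traits, comment options, SourceManager) are assumptions supplied by the caller, not anything defined by this change.

#include "clang/AST/CommentLexer.h"
#include "clang/Basic/SourceManager.h"
#include "llvm/Support/Allocator.h"

// Hypothetical driver (not from the commit): lex one raw comment buffer and
// dump every token until end of input.  All surrounding objects are assumed
// to be created and owned by the caller.
static void dumpCommentTokens(llvm::BumpPtrAllocator &Allocator,
                              const clang::comments::CommandTraits &Traits,
                              clang::SourceLocation FileLoc,
                              const clang::CommentOptions &CommOpts,
                              llvm::StringRef Comment,
                              const clang::SourceManager &SM) {
  clang::comments::Lexer L(Allocator, Traits, FileLoc, CommOpts,
                           Comment.begin(), Comment.end());
  clang::comments::Token Tok;
  do {
    L.lex(Tok);      // Produces text, newline, command, verbatim or HTML tokens.
    Tok.dump(L, SM); // Prints kind, location, length and spelling.
  } while (Tok.getKind() != clang::comments::tok::eof);
}

The loop relies on the lexer's contract shown above: once the buffer is exhausted, lex() keeps returning a tok::eof token, so stopping on the first eof is sufficient.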