diff options
Diffstat (limited to 'include/clang/AST/CommentLexer.h')
-rw-r--r-- | include/clang/AST/CommentLexer.h | 353 |
1 files changed, 353 insertions, 0 deletions
diff --git a/include/clang/AST/CommentLexer.h b/include/clang/AST/CommentLexer.h new file mode 100644 index 0000000..7a24b11 --- /dev/null +++ b/include/clang/AST/CommentLexer.h @@ -0,0 +1,353 @@ +//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines lexer for structured comments and supporting token class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_AST_COMMENT_LEXER_H +#define LLVM_CLANG_AST_COMMENT_LEXER_H + +#include "clang/Basic/SourceManager.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/raw_ostream.h" + +namespace clang { +namespace comments { + +class Lexer; +class TextTokenRetokenizer; +class CommandTraits; + +namespace tok { +enum TokenKind { + eof, + newline, + text, + command, + verbatim_block_begin, + verbatim_block_line, + verbatim_block_end, + verbatim_line_name, + verbatim_line_text, + html_start_tag, // <tag + html_ident, // attr + html_equals, // = + html_quoted_string, // "blah\"blah" or 'blah\'blah' + html_greater, // > + html_slash_greater, // /> + html_end_tag // </tag +}; +} // end namespace tok + +class CommentOptions { +public: + bool Markdown; +}; + +/// \brief Comment token. +class Token { + friend class Lexer; + friend class TextTokenRetokenizer; + + /// The location of the token. + SourceLocation Loc; + + /// The actual kind of the token. + tok::TokenKind Kind; + + /// Length of the token spelling in comment. Can be 0 for synthenized + /// tokens. + unsigned Length; + + /// Contains text value associated with a token. + const char *TextPtr1; + unsigned TextLen1; + +public: + SourceLocation getLocation() const LLVM_READONLY { return Loc; } + void setLocation(SourceLocation SL) { Loc = SL; } + + SourceLocation getEndLocation() const LLVM_READONLY { + if (Length == 0 || Length == 1) + return Loc; + return Loc.getLocWithOffset(Length - 1); + } + + tok::TokenKind getKind() const LLVM_READONLY { return Kind; } + void setKind(tok::TokenKind K) { Kind = K; } + + bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; } + bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; } + + unsigned getLength() const LLVM_READONLY { return Length; } + void setLength(unsigned L) { Length = L; } + + StringRef getText() const LLVM_READONLY { + assert(is(tok::text)); + return StringRef(TextPtr1, TextLen1); + } + + void setText(StringRef Text) { + assert(is(tok::text)); + TextPtr1 = Text.data(); + TextLen1 = Text.size(); + } + + StringRef getCommandName() const LLVM_READONLY { + assert(is(tok::command)); + return StringRef(TextPtr1, TextLen1); + } + + void setCommandName(StringRef Name) { + assert(is(tok::command)); + TextPtr1 = Name.data(); + TextLen1 = Name.size(); + } + + StringRef getVerbatimBlockName() const LLVM_READONLY { + assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); + return StringRef(TextPtr1, TextLen1); + } + + void setVerbatimBlockName(StringRef Name) { + assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); + TextPtr1 = Name.data(); + TextLen1 = Name.size(); + } + + StringRef getVerbatimBlockText() const LLVM_READONLY { + assert(is(tok::verbatim_block_line)); + return StringRef(TextPtr1, TextLen1); + } + + void setVerbatimBlockText(StringRef Text) { + assert(is(tok::verbatim_block_line)); + TextPtr1 = Text.data(); + TextLen1 = Text.size(); + } + + /// Returns the name of verbatim line command. + StringRef getVerbatimLineName() const LLVM_READONLY { + assert(is(tok::verbatim_line_name)); + return StringRef(TextPtr1, TextLen1); + } + + void setVerbatimLineName(StringRef Name) { + assert(is(tok::verbatim_line_name)); + TextPtr1 = Name.data(); + TextLen1 = Name.size(); + } + + StringRef getVerbatimLineText() const LLVM_READONLY { + assert(is(tok::verbatim_line_text)); + return StringRef(TextPtr1, TextLen1); + } + + void setVerbatimLineText(StringRef Text) { + assert(is(tok::verbatim_line_text)); + TextPtr1 = Text.data(); + TextLen1 = Text.size(); + } + + StringRef getHTMLTagStartName() const LLVM_READONLY { + assert(is(tok::html_start_tag)); + return StringRef(TextPtr1, TextLen1); + } + + void setHTMLTagStartName(StringRef Name) { + assert(is(tok::html_start_tag)); + TextPtr1 = Name.data(); + TextLen1 = Name.size(); + } + + StringRef getHTMLIdent() const LLVM_READONLY { + assert(is(tok::html_ident)); + return StringRef(TextPtr1, TextLen1); + } + + void setHTMLIdent(StringRef Name) { + assert(is(tok::html_ident)); + TextPtr1 = Name.data(); + TextLen1 = Name.size(); + } + + StringRef getHTMLQuotedString() const LLVM_READONLY { + assert(is(tok::html_quoted_string)); + return StringRef(TextPtr1, TextLen1); + } + + void setHTMLQuotedString(StringRef Str) { + assert(is(tok::html_quoted_string)); + TextPtr1 = Str.data(); + TextLen1 = Str.size(); + } + + StringRef getHTMLTagEndName() const LLVM_READONLY { + assert(is(tok::html_end_tag)); + return StringRef(TextPtr1, TextLen1); + } + + void setHTMLTagEndName(StringRef Name) { + assert(is(tok::html_end_tag)); + TextPtr1 = Name.data(); + TextLen1 = Name.size(); + } + + void dump(const Lexer &L, const SourceManager &SM) const; +}; + +/// \brief Comment lexer. +class Lexer { +private: + Lexer(const Lexer&); // DO NOT IMPLEMENT + void operator=(const Lexer&); // DO NOT IMPLEMENT + + /// Allocator for strings that are semantic values of tokens and have to be + /// computed (for example, resolved decimal character references). + llvm::BumpPtrAllocator &Allocator; + + const CommandTraits &Traits; + + const char *const BufferStart; + const char *const BufferEnd; + SourceLocation FileLoc; + CommentOptions CommOpts; + + const char *BufferPtr; + + /// One past end pointer for the current comment. For BCPL comments points + /// to newline or BufferEnd, for C comments points to star in '*/'. + const char *CommentEnd; + + enum LexerCommentState { + LCS_BeforeComment, + LCS_InsideBCPLComment, + LCS_InsideCComment, + LCS_BetweenComments + }; + + /// Low-level lexer state, track if we are inside or outside of comment. + LexerCommentState CommentState; + + enum LexerState { + /// Lexing normal comment text + LS_Normal, + + /// Finished lexing verbatim block beginning command, will lex first body + /// line. + LS_VerbatimBlockFirstLine, + + /// Lexing verbatim block body line-by-line, skipping line-starting + /// decorations. + LS_VerbatimBlockBody, + + /// Finished lexing verbatim line beginning command, will lex text (one + /// line). + LS_VerbatimLineText, + + /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes. + LS_HTMLStartTag, + + /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'. + LS_HTMLEndTag + }; + + /// Current lexing mode. + LexerState State; + + /// If State is LS_VerbatimBlock, contains the name of verbatim end + /// command, including command marker. + SmallString<16> VerbatimBlockEndCommandName; + + /// Given a character reference name (e.g., "lt"), return the character that + /// it stands for (e.g., "<"). + StringRef resolveHTMLNamedCharacterReference(StringRef Name) const; + + /// Given a Unicode codepoint as base-10 integer, return the character. + StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const; + + /// Given a Unicode codepoint as base-16 integer, return the character. + StringRef resolveHTMLHexCharacterReference(StringRef Name) const; + + void formTokenWithChars(Token &Result, const char *TokEnd, + tok::TokenKind Kind) { + const unsigned TokLen = TokEnd - BufferPtr; + Result.setLocation(getSourceLocation(BufferPtr)); + Result.setKind(Kind); + Result.setLength(TokLen); +#ifndef NDEBUG + Result.TextPtr1 = "<UNSET>"; + Result.TextLen1 = 7; +#endif + BufferPtr = TokEnd; + } + + void formTextToken(Token &Result, const char *TokEnd) { + StringRef Text(BufferPtr, TokEnd - BufferPtr); + formTokenWithChars(Result, TokEnd, tok::text); + Result.setText(Text); + } + + SourceLocation getSourceLocation(const char *Loc) const { + assert(Loc >= BufferStart && Loc <= BufferEnd && + "Location out of range for this buffer!"); + + const unsigned CharNo = Loc - BufferStart; + return FileLoc.getLocWithOffset(CharNo); + } + + /// Eat string matching regexp \code \s*\* \endcode. + void skipLineStartingDecorations(); + + /// Lex stuff inside comments. CommentEnd should be set correctly. + void lexCommentText(Token &T); + + void setupAndLexVerbatimBlock(Token &T, + const char *TextBegin, + char Marker, StringRef EndName); + + void lexVerbatimBlockFirstLine(Token &T); + + void lexVerbatimBlockBody(Token &T); + + void setupAndLexVerbatimLine(Token &T, const char *TextBegin); + + void lexVerbatimLineText(Token &T); + + void lexHTMLCharacterReference(Token &T); + + void setupAndLexHTMLStartTag(Token &T); + + void lexHTMLStartTag(Token &T); + + void setupAndLexHTMLEndTag(Token &T); + + void lexHTMLEndTag(Token &T); + +public: + Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits, + SourceLocation FileLoc, const CommentOptions &CommOpts, + const char *BufferStart, const char *BufferEnd); + + void lex(Token &T); + + StringRef getSpelling(const Token &Tok, + const SourceManager &SourceMgr, + bool *Invalid = NULL) const; +}; + +} // end namespace comments +} // end namespace clang + +#endif + |