diff options
Diffstat (limited to 'lib/Format')
-rw-r--r-- | lib/Format/CMakeLists.txt | 26 | ||||
-rw-r--r-- | lib/Format/Format.cpp | 1763 | ||||
-rw-r--r-- | lib/Format/Makefile | 13 | ||||
-rw-r--r-- | lib/Format/TokenAnnotator.cpp | 1187 | ||||
-rw-r--r-- | lib/Format/TokenAnnotator.h | 262 | ||||
-rw-r--r-- | lib/Format/UnwrappedLineParser.cpp | 858 | ||||
-rw-r--r-- | lib/Format/UnwrappedLineParser.h | 201 |
7 files changed, 4310 insertions, 0 deletions
diff --git a/lib/Format/CMakeLists.txt b/lib/Format/CMakeLists.txt new file mode 100644 index 0000000..d8630ee --- /dev/null +++ b/lib/Format/CMakeLists.txt @@ -0,0 +1,26 @@ +set(LLVM_LINK_COMPONENTS support) + +add_clang_library(clangFormat + TokenAnnotator.cpp + UnwrappedLineParser.cpp + Format.cpp + ) + +add_dependencies(clangFormat + ClangAttrClasses + ClangAttrList + ClangDeclNodes + ClangDiagnosticCommon + ClangDiagnosticFrontend + ClangStmtNodes + ) + +target_link_libraries(clangFormat + clangBasic + clangFrontend + clangAST + clangASTMatchers + clangRewriteCore + clangRewriteFrontend + clangTooling + ) diff --git a/lib/Format/Format.cpp b/lib/Format/Format.cpp new file mode 100644 index 0000000..101b16f --- /dev/null +++ b/lib/Format/Format.cpp @@ -0,0 +1,1763 @@ +//===--- Format.cpp - Format C++ code -------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements functions declared in Format.h. This will be +/// split into separate files as we go. +/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "format-formatter" + +#include "TokenAnnotator.h" +#include "UnwrappedLineParser.h" +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/OperatorPrecedence.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Format/Format.h" +#include "clang/Frontend/TextDiagnosticPrinter.h" +#include "clang/Lex/Lexer.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/Debug.h" +#include <queue> +#include <string> + +namespace clang { +namespace format { + +FormatStyle getLLVMStyle() { + FormatStyle LLVMStyle; + LLVMStyle.ColumnLimit = 80; + LLVMStyle.MaxEmptyLinesToKeep = 1; + LLVMStyle.PointerBindsToType = false; + LLVMStyle.DerivePointerBinding = false; + LLVMStyle.AccessModifierOffset = -2; + LLVMStyle.Standard = FormatStyle::LS_Cpp03; + LLVMStyle.IndentCaseLabels = false; + LLVMStyle.SpacesBeforeTrailingComments = 1; + LLVMStyle.BinPackParameters = true; + LLVMStyle.AllowAllParametersOfDeclarationOnNextLine = true; + LLVMStyle.ConstructorInitializerAllOnOneLineOrOnePerLine = false; + LLVMStyle.AllowShortIfStatementsOnASingleLine = false; + LLVMStyle.ObjCSpaceBeforeProtocolList = true; + LLVMStyle.PenaltyExcessCharacter = 1000000; + LLVMStyle.PenaltyReturnTypeOnItsOwnLine = 5; + return LLVMStyle; +} + +FormatStyle getGoogleStyle() { + FormatStyle GoogleStyle; + GoogleStyle.ColumnLimit = 80; + GoogleStyle.MaxEmptyLinesToKeep = 1; + GoogleStyle.PointerBindsToType = true; + GoogleStyle.DerivePointerBinding = true; + GoogleStyle.AccessModifierOffset = -1; + GoogleStyle.Standard = FormatStyle::LS_Auto; + GoogleStyle.IndentCaseLabels = true; + GoogleStyle.SpacesBeforeTrailingComments = 2; + GoogleStyle.BinPackParameters = true; + GoogleStyle.AllowAllParametersOfDeclarationOnNextLine = true; + GoogleStyle.ConstructorInitializerAllOnOneLineOrOnePerLine = true; + GoogleStyle.AllowShortIfStatementsOnASingleLine = false; + GoogleStyle.ObjCSpaceBeforeProtocolList = false; + GoogleStyle.PenaltyExcessCharacter = 1000000; + GoogleStyle.PenaltyReturnTypeOnItsOwnLine = 100; + return GoogleStyle; +} + +FormatStyle getChromiumStyle() { + FormatStyle ChromiumStyle = getGoogleStyle(); + ChromiumStyle.AllowAllParametersOfDeclarationOnNextLine = false; + ChromiumStyle.BinPackParameters = false; + ChromiumStyle.Standard = FormatStyle::LS_Cpp03; + ChromiumStyle.DerivePointerBinding = false; + return ChromiumStyle; +} + +static bool isTrailingComment(const AnnotatedToken &Tok) { + return Tok.is(tok::comment) && + (Tok.Children.empty() || Tok.Children[0].MustBreakBefore); +} + +static bool isComparison(const AnnotatedToken &Tok) { + prec::Level Precedence = getPrecedence(Tok); + return Tok.Type == TT_BinaryOperator && + (Precedence == prec::Equality || Precedence == prec::Relational); +} + +// Returns the length of everything up to the first possible line break after +// the ), ], } or > matching \c Tok. +static unsigned getLengthToMatchingParen(const AnnotatedToken &Tok) { + if (Tok.MatchingParen == NULL) + return 0; + AnnotatedToken *End = Tok.MatchingParen; + while (!End->Children.empty() && !End->Children[0].CanBreakBefore) { + End = &End->Children[0]; + } + return End->TotalLength - Tok.TotalLength + 1; +} + +static size_t +calculateColumnLimit(const FormatStyle &Style, bool InPPDirective) { + // In preprocessor directives reserve two chars for trailing " \" + return Style.ColumnLimit - (InPPDirective ? 2 : 0); +} + +/// \brief Manages the whitespaces around tokens and their replacements. +/// +/// This includes special handling for certain constructs, e.g. the alignment of +/// trailing line comments. +class WhitespaceManager { +public: + WhitespaceManager(SourceManager &SourceMgr, const FormatStyle &Style) + : SourceMgr(SourceMgr), Style(Style) {} + + /// \brief Replaces the whitespace in front of \p Tok. Only call once for + /// each \c AnnotatedToken. + void replaceWhitespace(const AnnotatedToken &Tok, unsigned NewLines, + unsigned Spaces, unsigned WhitespaceStartColumn) { + // 2+ newlines mean an empty line separating logic scopes. + if (NewLines >= 2) + alignComments(); + + SourceLocation TokenLoc = Tok.FormatTok.Tok.getLocation(); + bool LineExceedsColumnLimit = Spaces + WhitespaceStartColumn + + Tok.FormatTok.TokenLength > Style.ColumnLimit; + + // Align line comments if they are trailing or if they continue other + // trailing comments. + if (isTrailingComment(Tok)) { + // Remove the comment's trailing whitespace. + if (Tok.FormatTok.Tok.getLength() != Tok.FormatTok.TokenLength) + Replaces.insert(tooling::Replacement( + SourceMgr, TokenLoc.getLocWithOffset(Tok.FormatTok.TokenLength), + Tok.FormatTok.Tok.getLength() - Tok.FormatTok.TokenLength, "")); + + // Align comment with other comments. + if ((Tok.Parent != NULL || !Comments.empty()) && + !LineExceedsColumnLimit) { + StoredComment Comment; + Comment.Tok = Tok.FormatTok; + Comment.Spaces = Spaces; + Comment.NewLines = NewLines; + Comment.MinColumn = + NewLines > 0 ? Spaces : WhitespaceStartColumn + Spaces; + Comment.MaxColumn = Style.ColumnLimit - Tok.FormatTok.TokenLength; + Comment.Untouchable = false; + Comments.push_back(Comment); + return; + } + } + + // If this line does not have a trailing comment, align the stored comments. + if (Tok.Children.empty() && !isTrailingComment(Tok)) + alignComments(); + + if (Tok.Type == TT_BlockComment) { + indentBlockComment(Tok, Spaces, WhitespaceStartColumn, NewLines, false); + } else if (Tok.Type == TT_LineComment && LineExceedsColumnLimit) { + StringRef Line(SourceMgr.getCharacterData(TokenLoc), + Tok.FormatTok.TokenLength); + int StartColumn = Spaces + (NewLines == 0 ? WhitespaceStartColumn : 0); + StringRef Prefix = getLineCommentPrefix(Line); + std::string NewPrefix = std::string(StartColumn, ' ') + Prefix.str(); + splitLineInComment(Tok.FormatTok, Line.substr(Prefix.size()), + StartColumn + Prefix.size(), NewPrefix, + /*InPPDirective=*/ false, + /*CommentHasMoreLines=*/ false); + } + + storeReplacement(Tok.FormatTok, getNewLineText(NewLines, Spaces)); + } + + /// \brief Like \c replaceWhitespace, but additionally adds right-aligned + /// backslashes to escape newlines inside a preprocessor directive. + /// + /// This function and \c replaceWhitespace have the same behavior if + /// \c Newlines == 0. + void replacePPWhitespace(const AnnotatedToken &Tok, unsigned NewLines, + unsigned Spaces, unsigned WhitespaceStartColumn) { + if (Tok.Type == TT_BlockComment) + indentBlockComment(Tok, Spaces, WhitespaceStartColumn, NewLines, true); + + storeReplacement(Tok.FormatTok, + getNewLineText(NewLines, Spaces, WhitespaceStartColumn)); + } + + /// \brief Inserts a line break into the middle of a token. + /// + /// Will break at \p Offset inside \p Tok, putting \p Prefix before the line + /// break and \p Postfix before the rest of the token starts in the next line. + /// + /// \p InPPDirective, \p Spaces, \p WhitespaceStartColumn and \p Style are + /// used to generate the correct line break. + void breakToken(const FormatToken &Tok, unsigned Offset, + unsigned ReplaceChars, StringRef Prefix, StringRef Postfix, + bool InPPDirective, unsigned Spaces, + unsigned WhitespaceStartColumn) { + std::string NewLineText; + if (!InPPDirective) + NewLineText = getNewLineText(1, Spaces); + else + NewLineText = getNewLineText(1, Spaces, WhitespaceStartColumn); + std::string ReplacementText = (Prefix + NewLineText + Postfix).str(); + SourceLocation Location = Tok.Tok.getLocation().getLocWithOffset(Offset); + Replaces.insert(tooling::Replacement(SourceMgr, Location, ReplaceChars, + ReplacementText)); + } + + /// \brief Returns all the \c Replacements created during formatting. + const tooling::Replacements &generateReplacements() { + alignComments(); + return Replaces; + } + + void addUntouchableComment(unsigned Column) { + StoredComment Comment; + Comment.MinColumn = Column; + Comment.MaxColumn = Column; + Comment.Untouchable = true; + Comments.push_back(Comment); + } + +private: + static StringRef getLineCommentPrefix(StringRef Comment) { + const char *KnownPrefixes[] = { "/// ", "///", "// ", "//" }; + for (size_t i = 0; i < llvm::array_lengthof(KnownPrefixes); ++i) + if (Comment.startswith(KnownPrefixes[i])) + return KnownPrefixes[i]; + return ""; + } + + /// \brief Finds a common prefix of lines of a block comment to properly + /// indent (and possibly decorate with '*'s) added lines. + /// + /// The first line is ignored (it's special and starts with /*). The number of + /// lines should be more than one. + static StringRef findCommentLinesPrefix(ArrayRef<StringRef> Lines, + const char *PrefixChars = " *") { + assert(Lines.size() > 1); + StringRef Prefix(Lines[1].data(), Lines[1].find_first_not_of(PrefixChars)); + for (size_t i = 2; i < Lines.size(); ++i) { + for (size_t j = 0; j < Prefix.size() && j < Lines[i].size(); ++j) { + if (Prefix[j] != Lines[i][j]) { + Prefix = Prefix.substr(0, j); + break; + } + } + } + return Prefix; + } + + /// \brief Splits one line in a line or block comment, if it doesn't fit to + /// provided column limit. Removes trailing whitespace in each line. + /// + /// \param Line points to the line contents without leading // or /*. + /// + /// \param StartColumn is the column where the first character of Line will be + /// located after formatting. + /// + /// \param LinePrefix is inserted after each line break. + /// + /// When \param InPPDirective is true, each line break will be preceded by a + /// backslash in the last column to make line breaks inside the comment + /// visually consistent with line breaks outside the comment. This only makes + /// sense for block comments. + /// + /// When \param CommentHasMoreLines is false, no line breaks/trailing + /// backslashes will be inserted after it. + void splitLineInComment(const FormatToken &Tok, StringRef Line, + size_t StartColumn, StringRef LinePrefix, + bool InPPDirective, bool CommentHasMoreLines, + const char *WhiteSpaceChars = " ") { + size_t ColumnLimit = calculateColumnLimit(Style, InPPDirective); + const char *TokenStart = SourceMgr.getCharacterData(Tok.Tok.getLocation()); + + StringRef TrimmedLine = Line.rtrim(); + int TrailingSpaceLength = Line.size() - TrimmedLine.size(); + + // Don't touch leading whitespace. + Line = TrimmedLine.ltrim(); + StartColumn += TrimmedLine.size() - Line.size(); + + while (Line.size() + StartColumn > ColumnLimit) { + // Try to break at the last whitespace before the column limit. + size_t SpacePos = + Line.find_last_of(WhiteSpaceChars, ColumnLimit - StartColumn + 1); + if (SpacePos == StringRef::npos) { + // Try to find any whitespace in the line. + SpacePos = Line.find_first_of(WhiteSpaceChars); + if (SpacePos == StringRef::npos) // No whitespace found, give up. + break; + } + + StringRef NextCut = Line.substr(0, SpacePos).rtrim(); + StringRef RemainingLine = Line.substr(SpacePos).ltrim(); + if (RemainingLine.empty()) + break; + + if (RemainingLine == "*/" && LinePrefix.endswith("* ")) + LinePrefix = LinePrefix.substr(0, LinePrefix.size() - 2); + + Line = RemainingLine; + + size_t ReplaceChars = Line.begin() - NextCut.end(); + breakToken(Tok, NextCut.end() - TokenStart, ReplaceChars, "", LinePrefix, + InPPDirective, 0, NextCut.size() + StartColumn); + StartColumn = LinePrefix.size(); + } + + if (TrailingSpaceLength > 0 || (InPPDirective && CommentHasMoreLines)) { + // Remove trailing whitespace/insert backslash. + 1 is for \n + breakToken(Tok, Line.end() - TokenStart, TrailingSpaceLength + 1, "", "", + InPPDirective, 0, Line.size() + StartColumn); + } + } + + /// \brief Changes indentation of all lines in a block comment by Indent, + /// removes trailing whitespace from each line, splits lines that end up + /// exceeding the column limit. + void indentBlockComment(const AnnotatedToken &Tok, int Indent, + int WhitespaceStartColumn, int NewLines, + bool InPPDirective) { + assert(Tok.Type == TT_BlockComment); + int StartColumn = Indent + (NewLines == 0 ? WhitespaceStartColumn : 0); + const SourceLocation TokenLoc = Tok.FormatTok.Tok.getLocation(); + const int CurrentIndent = SourceMgr.getSpellingColumnNumber(TokenLoc) - 1; + const int IndentDelta = Indent - CurrentIndent; + const StringRef Text(SourceMgr.getCharacterData(TokenLoc), + Tok.FormatTok.TokenLength); + assert(Text.startswith("/*") && Text.endswith("*/")); + + SmallVector<StringRef, 16> Lines; + Text.split(Lines, "\n"); + + if (IndentDelta > 0) { + std::string WhiteSpace(IndentDelta, ' '); + for (size_t i = 1; i < Lines.size(); ++i) { + Replaces.insert(tooling::Replacement( + SourceMgr, TokenLoc.getLocWithOffset(Lines[i].data() - Text.data()), + 0, WhiteSpace)); + } + } else if (IndentDelta < 0) { + std::string WhiteSpace(-IndentDelta, ' '); + // Check that the line is indented enough. + for (size_t i = 1; i < Lines.size(); ++i) { + if (!Lines[i].startswith(WhiteSpace)) + return; + } + for (size_t i = 1; i < Lines.size(); ++i) { + Replaces.insert(tooling::Replacement( + SourceMgr, TokenLoc.getLocWithOffset(Lines[i].data() - Text.data()), + -IndentDelta, "")); + } + } + + // Split long lines in comments. + size_t OldPrefixSize = 0; + std::string NewPrefix; + if (Lines.size() > 1) { + StringRef CurrentPrefix = findCommentLinesPrefix(Lines); + OldPrefixSize = CurrentPrefix.size(); + NewPrefix = (IndentDelta < 0) + ? CurrentPrefix.substr(-IndentDelta).str() + : std::string(IndentDelta, ' ') + CurrentPrefix.str(); + if (CurrentPrefix.endswith("*")) { + NewPrefix += " "; + ++OldPrefixSize; + } + } else if (Tok.Parent == 0) { + NewPrefix = std::string(StartColumn, ' ') + " * "; + } + + StartColumn += 2; + for (size_t i = 0; i < Lines.size(); ++i) { + StringRef Line = Lines[i].substr(i == 0 ? 2 : OldPrefixSize); + splitLineInComment(Tok.FormatTok, Line, StartColumn, NewPrefix, + InPPDirective, i != Lines.size() - 1); + StartColumn = NewPrefix.size(); + } + } + + std::string getNewLineText(unsigned NewLines, unsigned Spaces) { + return std::string(NewLines, '\n') + std::string(Spaces, ' '); + } + + std::string getNewLineText(unsigned NewLines, unsigned Spaces, + unsigned WhitespaceStartColumn) { + std::string NewLineText; + if (NewLines > 0) { + unsigned Offset = + std::min<int>(Style.ColumnLimit - 1, WhitespaceStartColumn); + for (unsigned i = 0; i < NewLines; ++i) { + NewLineText += std::string(Style.ColumnLimit - Offset - 1, ' '); + NewLineText += "\\\n"; + Offset = 0; + } + } + return NewLineText + std::string(Spaces, ' '); + } + + /// \brief Structure to store a comment for later layout and alignment. + struct StoredComment { + FormatToken Tok; + unsigned MinColumn; + unsigned MaxColumn; + unsigned NewLines; + unsigned Spaces; + bool Untouchable; + }; + SmallVector<StoredComment, 16> Comments; + typedef SmallVector<StoredComment, 16>::iterator comment_iterator; + + /// \brief Try to align all stashed comments. + void alignComments() { + unsigned MinColumn = 0; + unsigned MaxColumn = UINT_MAX; + comment_iterator Start = Comments.begin(); + for (comment_iterator I = Start, E = Comments.end(); I != E; ++I) { + if (I->MinColumn > MaxColumn || I->MaxColumn < MinColumn) { + alignComments(Start, I, MinColumn); + MinColumn = I->MinColumn; + MaxColumn = I->MaxColumn; + Start = I; + } else { + MinColumn = std::max(MinColumn, I->MinColumn); + MaxColumn = std::min(MaxColumn, I->MaxColumn); + } + } + alignComments(Start, Comments.end(), MinColumn); + Comments.clear(); + } + + /// \brief Put all the comments between \p I and \p E into \p Column. + void alignComments(comment_iterator I, comment_iterator E, unsigned Column) { + while (I != E) { + if (!I->Untouchable) { + unsigned Spaces = I->Spaces + Column - I->MinColumn; + storeReplacement(I->Tok, getNewLineText(I->NewLines, Spaces)); + } + ++I; + } + } + + /// \brief Stores \p Text as the replacement for the whitespace in front of + /// \p Tok. + void storeReplacement(const FormatToken &Tok, const std::string Text) { + // Don't create a replacement, if it does not change anything. + if (StringRef(SourceMgr.getCharacterData(Tok.WhiteSpaceStart), + Tok.WhiteSpaceLength) == Text) + return; + + Replaces.insert(tooling::Replacement(SourceMgr, Tok.WhiteSpaceStart, + Tok.WhiteSpaceLength, Text)); + } + + SourceManager &SourceMgr; + tooling::Replacements Replaces; + const FormatStyle &Style; +}; + +class UnwrappedLineFormatter { +public: + UnwrappedLineFormatter(const FormatStyle &Style, SourceManager &SourceMgr, + const AnnotatedLine &Line, unsigned FirstIndent, + const AnnotatedToken &RootToken, + WhitespaceManager &Whitespaces, bool StructuralError) + : Style(Style), SourceMgr(SourceMgr), Line(Line), + FirstIndent(FirstIndent), RootToken(RootToken), + Whitespaces(Whitespaces), Count(0) {} + + /// \brief Formats an \c UnwrappedLine. + /// + /// \returns The column after the last token in the last line of the + /// \c UnwrappedLine. + unsigned format(const AnnotatedLine *NextLine) { + // Initialize state dependent on indent. + LineState State; + State.Column = FirstIndent; + State.NextToken = &RootToken; + State.Stack.push_back( + ParenState(FirstIndent, FirstIndent, !Style.BinPackParameters, + /*HasMultiParameterLine=*/ false)); + State.LineContainsContinuedForLoopSection = false; + State.ParenLevel = 0; + State.StartOfStringLiteral = 0; + State.StartOfLineLevel = State.ParenLevel; + + DEBUG({ + DebugTokenState(*State.NextToken); + }); + + // The first token has already been indented and thus consumed. + moveStateToNextToken(State, /*DryRun=*/ false); + + // If everything fits on a single line, just put it there. + unsigned ColumnLimit = Style.ColumnLimit; + if (NextLine && NextLine->InPPDirective && + !NextLine->First.FormatTok.HasUnescapedNewline) + ColumnLimit = getColumnLimit(); + if (Line.Last->TotalLength <= ColumnLimit - FirstIndent) { + while (State.NextToken != NULL) { + addTokenToState(false, false, State); + } + return State.Column; + } + + // If the ObjC method declaration does not fit on a line, we should format + // it with one arg per line. + if (Line.Type == LT_ObjCMethodDecl) + State.Stack.back().BreakBeforeParameter = true; + + // Find best solution in solution space. + return analyzeSolutionSpace(State); + } + +private: + void DebugTokenState(const AnnotatedToken &AnnotatedTok) { + const Token &Tok = AnnotatedTok.FormatTok.Tok; + llvm::errs() << StringRef(SourceMgr.getCharacterData(Tok.getLocation()), + Tok.getLength()); + llvm::errs(); + } + + struct ParenState { + ParenState(unsigned Indent, unsigned LastSpace, bool AvoidBinPacking, + bool HasMultiParameterLine) + : Indent(Indent), LastSpace(LastSpace), FirstLessLess(0), + BreakBeforeClosingBrace(false), QuestionColumn(0), + AvoidBinPacking(AvoidBinPacking), BreakBeforeParameter(false), + HasMultiParameterLine(HasMultiParameterLine), ColonPos(0), + StartOfFunctionCall(0), NestedNameSpecifierContinuation(0), + CallContinuation(0), VariablePos(0) {} + + /// \brief The position to which a specific parenthesis level needs to be + /// indented. + unsigned Indent; + + /// \brief The position of the last space on each level. + /// + /// Used e.g. to break like: + /// functionCall(Parameter, otherCall( + /// OtherParameter)); + unsigned LastSpace; + + /// \brief The position the first "<<" operator encountered on each level. + /// + /// Used to align "<<" operators. 0 if no such operator has been encountered + /// on a level. + unsigned FirstLessLess; + + /// \brief Whether a newline needs to be inserted before the block's closing + /// brace. + /// + /// We only want to insert a newline before the closing brace if there also + /// was a newline after the beginning left brace. + bool BreakBeforeClosingBrace; + + /// \brief The column of a \c ? in a conditional expression; + unsigned QuestionColumn; + + /// \brief Avoid bin packing, i.e. multiple parameters/elements on multiple + /// lines, in this context. + bool AvoidBinPacking; + + /// \brief Break after the next comma (or all the commas in this context if + /// \c AvoidBinPacking is \c true). + bool BreakBeforeParameter; + + /// \brief This context already has a line with more than one parameter. + bool HasMultiParameterLine; + + /// \brief The position of the colon in an ObjC method declaration/call. + unsigned ColonPos; + + /// \brief The start of the most recent function in a builder-type call. + unsigned StartOfFunctionCall; + + /// \brief If a nested name specifier was broken over multiple lines, this + /// contains the start column of the second line. Otherwise 0. + unsigned NestedNameSpecifierContinuation; + + /// \brief If a call expression was broken over multiple lines, this + /// contains the start column of the second line. Otherwise 0. + unsigned CallContinuation; + + /// \brief The column of the first variable name in a variable declaration. + /// + /// Used to align further variables if necessary. + unsigned VariablePos; + + bool operator<(const ParenState &Other) const { + if (Indent != Other.Indent) + return Indent < Other.Indent; + if (LastSpace != Other.LastSpace) + return LastSpace < Other.LastSpace; + if (FirstLessLess != Other.FirstLessLess) + return FirstLessLess < Other.FirstLessLess; + if (BreakBeforeClosingBrace != Other.BreakBeforeClosingBrace) + return BreakBeforeClosingBrace; + if (QuestionColumn != Other.QuestionColumn) + return QuestionColumn < Other.QuestionColumn; + if (AvoidBinPacking != Other.AvoidBinPacking) + return AvoidBinPacking; + if (BreakBeforeParameter != Other.BreakBeforeParameter) + return BreakBeforeParameter; + if (HasMultiParameterLine != Other.HasMultiParameterLine) + return HasMultiParameterLine; + if (ColonPos != Other.ColonPos) + return ColonPos < Other.ColonPos; + if (StartOfFunctionCall != Other.StartOfFunctionCall) + return StartOfFunctionCall < Other.StartOfFunctionCall; + if (NestedNameSpecifierContinuation != + Other.NestedNameSpecifierContinuation) + return NestedNameSpecifierContinuation < + Other.NestedNameSpecifierContinuation; + if (CallContinuation != Other.CallContinuation) + return CallContinuation < Other.CallContinuation; + if (VariablePos != Other.VariablePos) + return VariablePos < Other.VariablePos; + return false; + } + }; + + /// \brief The current state when indenting a unwrapped line. + /// + /// As the indenting tries different combinations this is copied by value. + struct LineState { + /// \brief The number of used columns in the current line. + unsigned Column; + + /// \brief The token that needs to be next formatted. + const AnnotatedToken *NextToken; + + /// \brief \c true if this line contains a continued for-loop section. + bool LineContainsContinuedForLoopSection; + + /// \brief The level of nesting inside (), [], <> and {}. + unsigned ParenLevel; + + /// \brief The \c ParenLevel at the start of this line. + unsigned StartOfLineLevel; + + /// \brief The start column of the string literal, if we're in a string + /// literal sequence, 0 otherwise. + unsigned StartOfStringLiteral; + + /// \brief A stack keeping track of properties applying to parenthesis + /// levels. + std::vector<ParenState> Stack; + + /// \brief Comparison operator to be able to used \c LineState in \c map. + bool operator<(const LineState &Other) const { + if (NextToken != Other.NextToken) + return NextToken < Other.NextToken; + if (Column != Other.Column) + return Column < Other.Column; + if (LineContainsContinuedForLoopSection != + Other.LineContainsContinuedForLoopSection) + return LineContainsContinuedForLoopSection; + if (ParenLevel != Other.ParenLevel) + return ParenLevel < Other.ParenLevel; + if (StartOfLineLevel != Other.StartOfLineLevel) + return StartOfLineLevel < Other.StartOfLineLevel; + if (StartOfStringLiteral != Other.StartOfStringLiteral) + return StartOfStringLiteral < Other.StartOfStringLiteral; + return Stack < Other.Stack; + } + }; + + /// \brief Appends the next token to \p State and updates information + /// necessary for indentation. + /// + /// Puts the token on the current line if \p Newline is \c true and adds a + /// line break and necessary indentation otherwise. + /// + /// If \p DryRun is \c false, also creates and stores the required + /// \c Replacement. + unsigned addTokenToState(bool Newline, bool DryRun, LineState &State) { + const AnnotatedToken &Current = *State.NextToken; + const AnnotatedToken &Previous = *State.NextToken->Parent; + + if (State.Stack.size() == 0 || Current.Type == TT_ImplicitStringLiteral) { + State.Column += State.NextToken->FormatTok.WhiteSpaceLength + + State.NextToken->FormatTok.TokenLength; + if (State.NextToken->Children.empty()) + State.NextToken = NULL; + else + State.NextToken = &State.NextToken->Children[0]; + return 0; + } + + // If we are continuing an expression, we want to indent an extra 4 spaces. + unsigned ContinuationIndent = + std::max(State.Stack.back().LastSpace, State.Stack.back().Indent) + 4; + if (Newline) { + unsigned WhitespaceStartColumn = State.Column; + if (Current.is(tok::r_brace)) { + State.Column = Line.Level * 2; + } else if (Current.is(tok::string_literal) && + State.StartOfStringLiteral != 0) { + State.Column = State.StartOfStringLiteral; + State.Stack.back().BreakBeforeParameter = true; + } else if (Current.is(tok::lessless) && + State.Stack.back().FirstLessLess != 0) { + State.Column = State.Stack.back().FirstLessLess; + } else if (Previous.is(tok::coloncolon)) { + if (State.Stack.back().NestedNameSpecifierContinuation == 0) { + State.Column = ContinuationIndent; + State.Stack.back().NestedNameSpecifierContinuation = State.Column; + } else { + State.Column = State.Stack.back().NestedNameSpecifierContinuation; + } + } else if (Current.isOneOf(tok::period, tok::arrow)) { + if (State.Stack.back().CallContinuation == 0) { + State.Column = ContinuationIndent; + State.Stack.back().CallContinuation = State.Column; + } else { + State.Column = State.Stack.back().CallContinuation; + } + } else if (Current.Type == TT_ConditionalExpr) { + State.Column = State.Stack.back().QuestionColumn; + } else if (Previous.is(tok::comma) && + State.Stack.back().VariablePos != 0) { + State.Column = State.Stack.back().VariablePos; + } else if (Previous.ClosesTemplateDeclaration || + (Current.Type == TT_StartOfName && State.ParenLevel == 0)) { + State.Column = State.Stack.back().Indent; + } else if (Current.Type == TT_ObjCSelectorName) { + if (State.Stack.back().ColonPos > Current.FormatTok.TokenLength) { + State.Column = + State.Stack.back().ColonPos - Current.FormatTok.TokenLength; + } else { + State.Column = State.Stack.back().Indent; + State.Stack.back().ColonPos = + State.Column + Current.FormatTok.TokenLength; + } + } else if (Current.Type == TT_StartOfName || Current.is(tok::question) || + Previous.is(tok::equal) || isComparison(Previous) || + Previous.Type == TT_ObjCMethodExpr) { + State.Column = ContinuationIndent; + } else { + State.Column = State.Stack.back().Indent; + // Ensure that we fall back to indenting 4 spaces instead of just + // flushing continuations left. + if (State.Column == FirstIndent) + State.Column += 4; + } + + if (Current.is(tok::question)) + State.Stack.back().BreakBeforeParameter = true; + if (Previous.isOneOf(tok::comma, tok::semi) && + !State.Stack.back().AvoidBinPacking) + State.Stack.back().BreakBeforeParameter = false; + + if (!DryRun) { + unsigned NewLines = 1; + if (Current.Type == TT_LineComment) + NewLines = + std::max(NewLines, std::min(Current.FormatTok.NewlinesBefore, + Style.MaxEmptyLinesToKeep + 1)); + if (!Line.InPPDirective) + Whitespaces.replaceWhitespace(Current, NewLines, State.Column, + WhitespaceStartColumn); + else + Whitespaces.replacePPWhitespace(Current, NewLines, State.Column, + WhitespaceStartColumn); + } + + State.Stack.back().LastSpace = State.Column; + State.StartOfLineLevel = State.ParenLevel; + + // Any break on this level means that the parent level has been broken + // and we need to avoid bin packing there. + for (unsigned i = 0, e = State.Stack.size() - 1; i != e; ++i) { + State.Stack[i].BreakBeforeParameter = true; + } + if (Current.isOneOf(tok::period, tok::arrow)) + State.Stack.back().BreakBeforeParameter = true; + + // If we break after {, we should also break before the corresponding }. + if (Previous.is(tok::l_brace)) + State.Stack.back().BreakBeforeClosingBrace = true; + + if (State.Stack.back().AvoidBinPacking) { + // If we are breaking after '(', '{', '<', this is not bin packing + // unless AllowAllParametersOfDeclarationOnNextLine is false. + if ((Previous.isNot(tok::l_paren) && Previous.isNot(tok::l_brace)) || + (!Style.AllowAllParametersOfDeclarationOnNextLine && + Line.MustBeDeclaration)) + State.Stack.back().BreakBeforeParameter = true; + } + } else { + if (Current.is(tok::equal) && + (RootToken.is(tok::kw_for) || State.ParenLevel == 0) && + State.Stack.back().VariablePos == 0) { + State.Stack.back().VariablePos = State.Column; + // Move over * and & if they are bound to the variable name. + const AnnotatedToken *Tok = &Previous; + while (Tok && + State.Stack.back().VariablePos >= Tok->FormatTok.TokenLength) { + State.Stack.back().VariablePos -= Tok->FormatTok.TokenLength; + if (Tok->SpacesRequiredBefore != 0) + break; + Tok = Tok->Parent; + } + if (Previous.PartOfMultiVariableDeclStmt) + State.Stack.back().LastSpace = State.Stack.back().VariablePos; + } + + unsigned Spaces = State.NextToken->SpacesRequiredBefore; + + if (!DryRun) + Whitespaces.replaceWhitespace(Current, 0, Spaces, State.Column); + + if (Current.Type == TT_ObjCSelectorName && + State.Stack.back().ColonPos == 0) { + if (State.Stack.back().Indent + Current.LongestObjCSelectorName > + State.Column + Spaces + Current.FormatTok.TokenLength) + State.Stack.back().ColonPos = + State.Stack.back().Indent + Current.LongestObjCSelectorName; + else + State.Stack.back().ColonPos = + State.Column + Spaces + Current.FormatTok.TokenLength; + } + + if (Current.Type != TT_LineComment && + (Previous.isOneOf(tok::l_paren, tok::l_brace) || + State.NextToken->Parent->Type == TT_TemplateOpener)) + State.Stack.back().Indent = State.Column + Spaces; + if (Previous.is(tok::comma) && !isTrailingComment(Current)) + State.Stack.back().HasMultiParameterLine = true; + + State.Column += Spaces; + if (Current.is(tok::l_paren) && Previous.isOneOf(tok::kw_if, tok::kw_for)) + // Treat the condition inside an if as if it was a second function + // parameter, i.e. let nested calls have an indent of 4. + State.Stack.back().LastSpace = State.Column + 1; // 1 is length of "(". + else if (Previous.is(tok::comma)) + State.Stack.back().LastSpace = State.Column; + else if ((Previous.Type == TT_BinaryOperator || + Previous.Type == TT_ConditionalExpr || + Previous.Type == TT_CtorInitializerColon) && + getPrecedence(Previous) != prec::Assignment) + State.Stack.back().LastSpace = State.Column; + else if (Previous.Type == TT_InheritanceColon) + State.Stack.back().Indent = State.Column; + else if (Previous.ParameterCount > 1 && + (Previous.isOneOf(tok::l_paren, tok::l_square, tok::l_brace) || + Previous.Type == TT_TemplateOpener)) + // If this function has multiple parameters, indent nested calls from + // the start of the first parameter. + State.Stack.back().LastSpace = State.Column; + } + + return moveStateToNextToken(State, DryRun); + } + + /// \brief Mark the next token as consumed in \p State and modify its stacks + /// accordingly. + unsigned moveStateToNextToken(LineState &State, bool DryRun) { + const AnnotatedToken &Current = *State.NextToken; + assert(State.Stack.size()); + + if (Current.Type == TT_InheritanceColon) + State.Stack.back().AvoidBinPacking = true; + if (Current.is(tok::lessless) && State.Stack.back().FirstLessLess == 0) + State.Stack.back().FirstLessLess = State.Column; + if (Current.is(tok::question)) + State.Stack.back().QuestionColumn = State.Column; + if (Current.isOneOf(tok::period, tok::arrow) && + Line.Type == LT_BuilderTypeCall && State.ParenLevel == 0) + State.Stack.back().StartOfFunctionCall = + Current.LastInChainOfCalls ? 0 : State.Column; + if (Current.Type == TT_CtorInitializerColon) { + if (Style.ConstructorInitializerAllOnOneLineOrOnePerLine) + State.Stack.back().AvoidBinPacking = true; + State.Stack.back().BreakBeforeParameter = false; + } + + // In ObjC method declaration we align on the ":" of parameters, but we need + // to ensure that we indent parameters on subsequent lines by at least 4. + if (Current.Type == TT_ObjCMethodSpecifier) + State.Stack.back().Indent += 4; + + // Insert scopes created by fake parenthesis. + for (unsigned i = 0, e = Current.FakeLParens; i != e; ++i) { + ParenState NewParenState = State.Stack.back(); + NewParenState.Indent = std::max(State.Column, State.Stack.back().Indent); + NewParenState.BreakBeforeParameter = false; + State.Stack.push_back(NewParenState); + } + + // If we encounter an opening (, [, { or <, we add a level to our stacks to + // prepare for the following tokens. + if (Current.isOneOf(tok::l_paren, tok::l_square, tok::l_brace) || + State.NextToken->Type == TT_TemplateOpener) { + unsigned NewIndent; + bool AvoidBinPacking; + if (Current.is(tok::l_brace)) { + NewIndent = 2 + State.Stack.back().LastSpace; + AvoidBinPacking = false; + } else { + NewIndent = 4 + std::max(State.Stack.back().LastSpace, + State.Stack.back().StartOfFunctionCall); + AvoidBinPacking = + !Style.BinPackParameters || State.Stack.back().AvoidBinPacking; + } + State.Stack.push_back( + ParenState(NewIndent, State.Stack.back().LastSpace, AvoidBinPacking, + State.Stack.back().HasMultiParameterLine)); + ++State.ParenLevel; + } + + // If this '[' opens an ObjC call, determine whether all parameters fit into + // one line and put one per line if they don't. + if (Current.is(tok::l_square) && Current.Type == TT_ObjCMethodExpr && + Current.MatchingParen != NULL) { + if (getLengthToMatchingParen(Current) + State.Column > getColumnLimit()) + State.Stack.back().BreakBeforeParameter = true; + } + + // If we encounter a closing ), ], } or >, we can remove a level from our + // stacks. + if (Current.isOneOf(tok::r_paren, tok::r_square) || + (Current.is(tok::r_brace) && State.NextToken != &RootToken) || + State.NextToken->Type == TT_TemplateCloser) { + State.Stack.pop_back(); + --State.ParenLevel; + } + + // Remove scopes created by fake parenthesis. + for (unsigned i = 0, e = Current.FakeRParens; i != e; ++i) { + unsigned VariablePos = State.Stack.back().VariablePos; + State.Stack.pop_back(); + State.Stack.back().VariablePos = VariablePos; + } + + if (Current.is(tok::string_literal)) { + State.StartOfStringLiteral = State.Column; + } else if (Current.isNot(tok::comment)) { + State.StartOfStringLiteral = 0; + } + + State.Column += Current.FormatTok.TokenLength; + + if (State.NextToken->Children.empty()) + State.NextToken = NULL; + else + State.NextToken = &State.NextToken->Children[0]; + + return breakProtrudingToken(Current, State, DryRun); + } + + /// \brief If the current token sticks out over the end of the line, break + /// it if possible. + unsigned breakProtrudingToken(const AnnotatedToken &Current, LineState &State, + bool DryRun) { + if (Current.isNot(tok::string_literal)) + return 0; + // Only break up default narrow strings. + const char *LiteralData = Current.FormatTok.Tok.getLiteralData(); + if (!LiteralData || *LiteralData != '"') + return 0; + + unsigned Penalty = 0; + unsigned TailOffset = 0; + unsigned TailLength = Current.FormatTok.TokenLength; + unsigned StartColumn = State.Column - Current.FormatTok.TokenLength; + unsigned OffsetFromStart = 0; + while (StartColumn + TailLength > getColumnLimit()) { + StringRef Text = StringRef(LiteralData + TailOffset, TailLength); + if (StartColumn + OffsetFromStart + 1 > getColumnLimit()) + break; + StringRef::size_type SplitPoint = getSplitPoint( + Text, getColumnLimit() - StartColumn - OffsetFromStart - 1); + if (SplitPoint == StringRef::npos) + break; + assert(SplitPoint != 0); + // +2, because 'Text' starts after the opening quotes, and does not + // include the closing quote we need to insert. + unsigned WhitespaceStartColumn = + StartColumn + OffsetFromStart + SplitPoint + 2; + State.Stack.back().LastSpace = StartColumn; + if (!DryRun) { + Whitespaces.breakToken(Current.FormatTok, TailOffset + SplitPoint + 1, + 0, "\"", "\"", Line.InPPDirective, StartColumn, + WhitespaceStartColumn); + } + TailOffset += SplitPoint + 1; + TailLength -= SplitPoint + 1; + OffsetFromStart = 1; + Penalty += Style.PenaltyExcessCharacter; + for (unsigned i = 0, e = State.Stack.size(); i != e; ++i) + State.Stack[i].BreakBeforeParameter = true; + } + State.Column = StartColumn + TailLength; + return Penalty; + } + + StringRef::size_type + getSplitPoint(StringRef Text, StringRef::size_type Offset) { + StringRef::size_type SpaceOffset = Text.rfind(' ', Offset); + if (SpaceOffset != StringRef::npos && SpaceOffset != 0) + return SpaceOffset; + StringRef::size_type SlashOffset = Text.rfind('/', Offset); + if (SlashOffset != StringRef::npos && SlashOffset != 0) + return SlashOffset; + StringRef::size_type Split = getStartOfCharacter(Text, Offset); + if (Split != StringRef::npos && Split > 1) + // Do not split at 0. + return Split - 1; + return StringRef::npos; + } + + StringRef::size_type + getStartOfCharacter(StringRef Text, StringRef::size_type Offset) { + StringRef::size_type NextEscape = Text.find('\\'); + while (NextEscape != StringRef::npos && NextEscape < Offset) { + StringRef::size_type SequenceLength = + getEscapeSequenceLength(Text.substr(NextEscape)); + if (Offset < NextEscape + SequenceLength) + return NextEscape; + NextEscape = Text.find('\\', NextEscape + SequenceLength); + } + return Offset; + } + + unsigned getEscapeSequenceLength(StringRef Text) { + assert(Text[0] == '\\'); + if (Text.size() < 2) + return 1; + + switch (Text[1]) { + case 'u': + return 6; + case 'U': + return 10; + case 'x': + return getHexLength(Text); + default: + if (Text[1] >= '0' && Text[1] <= '7') + return getOctalLength(Text); + return 2; + } + } + + unsigned getHexLength(StringRef Text) { + unsigned I = 2; // Point after '\x'. + while (I < Text.size() && ((Text[I] >= '0' && Text[I] <= '9') || + (Text[I] >= 'a' && Text[I] <= 'f') || + (Text[I] >= 'A' && Text[I] <= 'F'))) { + ++I; + } + return I; + } + + unsigned getOctalLength(StringRef Text) { + unsigned I = 1; + while (I < Text.size() && I < 4 && (Text[I] >= '0' && Text[I] <= '7')) { + ++I; + } + return I; + } + + unsigned getColumnLimit() { + return calculateColumnLimit(Style, Line.InPPDirective); + } + + /// \brief An edge in the solution space from \c Previous->State to \c State, + /// inserting a newline dependent on the \c NewLine. + struct StateNode { + StateNode(const LineState &State, bool NewLine, StateNode *Previous) + : State(State), NewLine(NewLine), Previous(Previous) {} + LineState State; + bool NewLine; + StateNode *Previous; + }; + + /// \brief A pair of <penalty, count> that is used to prioritize the BFS on. + /// + /// In case of equal penalties, we want to prefer states that were inserted + /// first. During state generation we make sure that we insert states first + /// that break the line as late as possible. + typedef std::pair<unsigned, unsigned> OrderedPenalty; + + /// \brief An item in the prioritized BFS search queue. The \c StateNode's + /// \c State has the given \c OrderedPenalty. + typedef std::pair<OrderedPenalty, StateNode *> QueueItem; + + /// \brief The BFS queue type. + typedef std::priority_queue<QueueItem, std::vector<QueueItem>, + std::greater<QueueItem> > QueueType; + + /// \brief Analyze the entire solution space starting from \p InitialState. + /// + /// This implements a variant of Dijkstra's algorithm on the graph that spans + /// the solution space (\c LineStates are the nodes). The algorithm tries to + /// find the shortest path (the one with lowest penalty) from \p InitialState + /// to a state where all tokens are placed. + unsigned analyzeSolutionSpace(LineState &InitialState) { + std::set<LineState> Seen; + + // Insert start element into queue. + StateNode *Node = + new (Allocator.Allocate()) StateNode(InitialState, false, NULL); + Queue.push(QueueItem(OrderedPenalty(0, Count), Node)); + ++Count; + + // While not empty, take first element and follow edges. + while (!Queue.empty()) { + unsigned Penalty = Queue.top().first.first; + StateNode *Node = Queue.top().second; + if (Node->State.NextToken == NULL) { + DEBUG(llvm::errs() << "\n---\nPenalty for line: " << Penalty << "\n"); + break; + } + Queue.pop(); + + if (!Seen.insert(Node->State).second) + // State already examined with lower penalty. + continue; + + addNextStateToQueue(Penalty, Node, /*NewLine=*/ false); + addNextStateToQueue(Penalty, Node, /*NewLine=*/ true); + } + + if (Queue.empty()) + // We were unable to find a solution, do nothing. + // FIXME: Add diagnostic? + return 0; + + // Reconstruct the solution. + reconstructPath(InitialState, Queue.top().second); + DEBUG(llvm::errs() << "---\n"); + + // Return the column after the last token of the solution. + return Queue.top().second->State.Column; + } + + void reconstructPath(LineState &State, StateNode *Current) { + // FIXME: This recursive implementation limits the possible number + // of tokens per line if compiled into a binary with small stack space. + // To become more independent of stack frame limitations we would need + // to also change the TokenAnnotator. + if (Current->Previous == NULL) + return; + reconstructPath(State, Current->Previous); + DEBUG({ + if (Current->NewLine) { + llvm::errs() + << "Penalty for splitting before " + << Current->Previous->State.NextToken->FormatTok.Tok.getName() + << ": " << Current->Previous->State.NextToken->SplitPenalty << "\n"; + } + }); + addTokenToState(Current->NewLine, false, State); + } + + /// \brief Add the following state to the analysis queue \c Queue. + /// + /// Assume the current state is \p PreviousNode and has been reached with a + /// penalty of \p Penalty. Insert a line break if \p NewLine is \c true. + void addNextStateToQueue(unsigned Penalty, StateNode *PreviousNode, + bool NewLine) { + if (NewLine && !canBreak(PreviousNode->State)) + return; + if (!NewLine && mustBreak(PreviousNode->State)) + return; + if (NewLine) + Penalty += PreviousNode->State.NextToken->SplitPenalty; + + StateNode *Node = new (Allocator.Allocate()) + StateNode(PreviousNode->State, NewLine, PreviousNode); + Penalty += addTokenToState(NewLine, true, Node->State); + if (Node->State.Column > getColumnLimit()) { + unsigned ExcessCharacters = Node->State.Column - getColumnLimit(); + Penalty += Style.PenaltyExcessCharacter * ExcessCharacters; + } + + Queue.push(QueueItem(OrderedPenalty(Penalty, Count), Node)); + ++Count; + } + + /// \brief Returns \c true, if a line break after \p State is allowed. + bool canBreak(const LineState &State) { + if (!State.NextToken->CanBreakBefore && + !(State.NextToken->is(tok::r_brace) && + State.Stack.back().BreakBeforeClosingBrace)) + return false; + // Trying to insert a parameter on a new line if there are already more than + // one parameter on the current line is bin packing. + if (State.Stack.back().HasMultiParameterLine && + State.Stack.back().AvoidBinPacking) + return false; + return true; + } + + /// \brief Returns \c true, if a line break after \p State is mandatory. + bool mustBreak(const LineState &State) { + if (State.NextToken->MustBreakBefore) + return true; + if (State.NextToken->is(tok::r_brace) && + State.Stack.back().BreakBeforeClosingBrace) + return true; + if (State.NextToken->Parent->is(tok::semi) && + State.LineContainsContinuedForLoopSection) + return true; + if ((State.NextToken->Parent->isOneOf(tok::comma, tok::semi) || + State.NextToken->is(tok::question) || + State.NextToken->Type == TT_ConditionalExpr) && + State.Stack.back().BreakBeforeParameter && + !isTrailingComment(*State.NextToken) && + State.NextToken->isNot(tok::r_paren) && + State.NextToken->isNot(tok::r_brace)) + return true; + // FIXME: Comparing LongestObjCSelectorName to 0 is a hacky way of finding + // out whether it is the first parameter. Clean this up. + if (State.NextToken->Type == TT_ObjCSelectorName && + State.NextToken->LongestObjCSelectorName == 0 && + State.Stack.back().BreakBeforeParameter) + return true; + if ((State.NextToken->Type == TT_CtorInitializerColon || + (State.NextToken->Parent->ClosesTemplateDeclaration && + State.ParenLevel == 0))) + return true; + if (State.NextToken->Type == TT_InlineASMColon) + return true; + // This prevents breaks like: + // ... + // SomeParameter, OtherParameter).DoSomething( + // ... + // As they hide "DoSomething" and generally bad for readability. + if (State.NextToken->isOneOf(tok::period, tok::arrow) && + getRemainingLength(State) + State.Column > getColumnLimit() && + State.ParenLevel < State.StartOfLineLevel) + return true; + return false; + } + + // Returns the total number of columns required for the remaining tokens. + unsigned getRemainingLength(const LineState &State) { + if (State.NextToken && State.NextToken->Parent) + return Line.Last->TotalLength - State.NextToken->Parent->TotalLength; + return 0; + } + + FormatStyle Style; + SourceManager &SourceMgr; + const AnnotatedLine &Line; + const unsigned FirstIndent; + const AnnotatedToken &RootToken; + WhitespaceManager &Whitespaces; + + llvm::SpecificBumpPtrAllocator<StateNode> Allocator; + QueueType Queue; + // Increasing count of \c StateNode items we have created. This is used + // to create a deterministic order independent of the container. + unsigned Count; +}; + +class LexerBasedFormatTokenSource : public FormatTokenSource { +public: + LexerBasedFormatTokenSource(Lexer &Lex, SourceManager &SourceMgr) + : GreaterStashed(false), Lex(Lex), SourceMgr(SourceMgr), + IdentTable(Lex.getLangOpts()) { + Lex.SetKeepWhitespaceMode(true); + } + + virtual FormatToken getNextToken() { + if (GreaterStashed) { + FormatTok.NewlinesBefore = 0; + FormatTok.WhiteSpaceStart = + FormatTok.Tok.getLocation().getLocWithOffset(1); + FormatTok.WhiteSpaceLength = 0; + GreaterStashed = false; + return FormatTok; + } + + FormatTok = FormatToken(); + Lex.LexFromRawLexer(FormatTok.Tok); + StringRef Text = rawTokenText(FormatTok.Tok); + FormatTok.WhiteSpaceStart = FormatTok.Tok.getLocation(); + if (SourceMgr.getFileOffset(FormatTok.WhiteSpaceStart) == 0) + FormatTok.IsFirst = true; + + // Consume and record whitespace until we find a significant token. + while (FormatTok.Tok.is(tok::unknown)) { + unsigned Newlines = Text.count('\n'); + if (Newlines > 0) + FormatTok.LastNewlineOffset = + FormatTok.WhiteSpaceLength + Text.rfind('\n') + 1; + unsigned EscapedNewlines = Text.count("\\\n"); + FormatTok.NewlinesBefore += Newlines; + FormatTok.HasUnescapedNewline |= EscapedNewlines != Newlines; + FormatTok.WhiteSpaceLength += FormatTok.Tok.getLength(); + + if (FormatTok.Tok.is(tok::eof)) + return FormatTok; + Lex.LexFromRawLexer(FormatTok.Tok); + Text = rawTokenText(FormatTok.Tok); + } + + // Now FormatTok is the next non-whitespace token. + FormatTok.TokenLength = Text.size(); + + // In case the token starts with escaped newlines, we want to + // take them into account as whitespace - this pattern is quite frequent + // in macro definitions. + // FIXME: What do we want to do with other escaped spaces, and escaped + // spaces or newlines in the middle of tokens? + // FIXME: Add a more explicit test. + unsigned i = 0; + while (i + 1 < Text.size() && Text[i] == '\\' && Text[i + 1] == '\n') { + // FIXME: ++FormatTok.NewlinesBefore is missing... + FormatTok.WhiteSpaceLength += 2; + FormatTok.TokenLength -= 2; + i += 2; + } + + if (FormatTok.Tok.is(tok::raw_identifier)) { + IdentifierInfo &Info = IdentTable.get(Text); + FormatTok.Tok.setIdentifierInfo(&Info); + FormatTok.Tok.setKind(Info.getTokenID()); + } + + if (FormatTok.Tok.is(tok::greatergreater)) { + FormatTok.Tok.setKind(tok::greater); + FormatTok.TokenLength = 1; + GreaterStashed = true; + } + + // If we reformat comments, we remove trailing whitespace. Update the length + // accordingly. + if (FormatTok.Tok.is(tok::comment)) + FormatTok.TokenLength = Text.rtrim().size(); + + return FormatTok; + } + + IdentifierTable &getIdentTable() { return IdentTable; } + +private: + FormatToken FormatTok; + bool GreaterStashed; + Lexer &Lex; + SourceManager &SourceMgr; + IdentifierTable IdentTable; + + /// Returns the text of \c FormatTok. + StringRef rawTokenText(Token &Tok) { + return StringRef(SourceMgr.getCharacterData(Tok.getLocation()), + Tok.getLength()); + } +}; + +class Formatter : public UnwrappedLineConsumer { +public: + Formatter(DiagnosticsEngine &Diag, const FormatStyle &Style, Lexer &Lex, + SourceManager &SourceMgr, + const std::vector<CharSourceRange> &Ranges) + : Diag(Diag), Style(Style), Lex(Lex), SourceMgr(SourceMgr), + Whitespaces(SourceMgr, Style), Ranges(Ranges) {} + + virtual ~Formatter() {} + + tooling::Replacements format() { + LexerBasedFormatTokenSource Tokens(Lex, SourceMgr); + UnwrappedLineParser Parser(Diag, Style, Tokens, *this); + StructuralError = Parser.parse(); + unsigned PreviousEndOfLineColumn = 0; + TokenAnnotator Annotator(Style, SourceMgr, Lex, + Tokens.getIdentTable().get("in")); + for (unsigned i = 0, e = AnnotatedLines.size(); i != e; ++i) { + Annotator.annotate(AnnotatedLines[i]); + } + deriveLocalStyle(); + for (unsigned i = 0, e = AnnotatedLines.size(); i != e; ++i) { + Annotator.calculateFormattingInformation(AnnotatedLines[i]); + + // Adapt level to the next line if this is a comment. + // FIXME: Can/should this be done in the UnwrappedLineParser? + if (i + 1 != e && AnnotatedLines[i].First.is(tok::comment) && + AnnotatedLines[i].First.Children.empty() && + AnnotatedLines[i + 1].First.isNot(tok::r_brace)) + AnnotatedLines[i].Level = AnnotatedLines[i + 1].Level; + } + std::vector<int> IndentForLevel; + bool PreviousLineWasTouched = false; + const AnnotatedToken *PreviousLineLastToken = 0; + for (std::vector<AnnotatedLine>::iterator I = AnnotatedLines.begin(), + E = AnnotatedLines.end(); + I != E; ++I) { + const AnnotatedLine &TheLine = *I; + const FormatToken &FirstTok = TheLine.First.FormatTok; + int Offset = getIndentOffset(TheLine.First); + while (IndentForLevel.size() <= TheLine.Level) + IndentForLevel.push_back(-1); + IndentForLevel.resize(TheLine.Level + 1); + bool WasMoved = PreviousLineWasTouched && FirstTok.NewlinesBefore == 0; + if (TheLine.First.is(tok::eof)) { + if (PreviousLineWasTouched) { + unsigned NewLines = std::min(FirstTok.NewlinesBefore, 1u); + Whitespaces.replaceWhitespace(TheLine.First, NewLines, /*Indent*/ 0, + /*WhitespaceStartColumn*/ 0); + } + } else if (TheLine.Type != LT_Invalid && + (WasMoved || touchesLine(TheLine))) { + unsigned LevelIndent = getIndent(IndentForLevel, TheLine.Level); + unsigned Indent = LevelIndent; + if (static_cast<int>(Indent) + Offset >= 0) + Indent += Offset; + if (!FirstTok.WhiteSpaceStart.isValid() || StructuralError) { + Indent = LevelIndent = + SourceMgr.getSpellingColumnNumber(FirstTok.Tok.getLocation()) - 1; + } else { + formatFirstToken(TheLine.First, PreviousLineLastToken, Indent, + TheLine.InPPDirective, PreviousEndOfLineColumn); + } + tryFitMultipleLinesInOne(Indent, I, E); + UnwrappedLineFormatter Formatter(Style, SourceMgr, TheLine, Indent, + TheLine.First, Whitespaces, + StructuralError); + PreviousEndOfLineColumn = + Formatter.format(I + 1 != E ? &*(I + 1) : NULL); + IndentForLevel[TheLine.Level] = LevelIndent; + PreviousLineWasTouched = true; + } else { + if (FirstTok.NewlinesBefore > 0 || FirstTok.IsFirst) { + unsigned Indent = + SourceMgr.getSpellingColumnNumber(FirstTok.Tok.getLocation()) - 1; + unsigned LevelIndent = Indent; + if (static_cast<int>(LevelIndent) - Offset >= 0) + LevelIndent -= Offset; + if (TheLine.First.isNot(tok::comment)) + IndentForLevel[TheLine.Level] = LevelIndent; + + // Remove trailing whitespace of the previous line if it was touched. + if (PreviousLineWasTouched || touchesEmptyLineBefore(TheLine)) + formatFirstToken(TheLine.First, PreviousLineLastToken, Indent, + TheLine.InPPDirective, PreviousEndOfLineColumn); + } + // If we did not reformat this unwrapped line, the column at the end of + // the last token is unchanged - thus, we can calculate the end of the + // last token. + SourceLocation LastLoc = TheLine.Last->FormatTok.Tok.getLocation(); + PreviousEndOfLineColumn = + SourceMgr.getSpellingColumnNumber(LastLoc) + + Lex.MeasureTokenLength(LastLoc, SourceMgr, Lex.getLangOpts()) - 1; + PreviousLineWasTouched = false; + if (TheLine.Last->is(tok::comment)) + Whitespaces.addUntouchableComment(SourceMgr.getSpellingColumnNumber( + TheLine.Last->FormatTok.Tok.getLocation()) - 1); + } + PreviousLineLastToken = I->Last; + } + return Whitespaces.generateReplacements(); + } + +private: + void deriveLocalStyle() { + unsigned CountBoundToVariable = 0; + unsigned CountBoundToType = 0; + bool HasCpp03IncompatibleFormat = false; + for (unsigned i = 0, e = AnnotatedLines.size(); i != e; ++i) { + if (AnnotatedLines[i].First.Children.empty()) + continue; + AnnotatedToken *Tok = &AnnotatedLines[i].First.Children[0]; + while (!Tok->Children.empty()) { + if (Tok->Type == TT_PointerOrReference) { + bool SpacesBefore = Tok->FormatTok.WhiteSpaceLength > 0; + bool SpacesAfter = Tok->Children[0].FormatTok.WhiteSpaceLength > 0; + if (SpacesBefore && !SpacesAfter) + ++CountBoundToVariable; + else if (!SpacesBefore && SpacesAfter) + ++CountBoundToType; + } + + if (Tok->Type == TT_TemplateCloser && + Tok->Parent->Type == TT_TemplateCloser && + Tok->FormatTok.WhiteSpaceLength == 0) + HasCpp03IncompatibleFormat = true; + Tok = &Tok->Children[0]; + } + } + if (Style.DerivePointerBinding) { + if (CountBoundToType > CountBoundToVariable) + Style.PointerBindsToType = true; + else if (CountBoundToType < CountBoundToVariable) + Style.PointerBindsToType = false; + } + if (Style.Standard == FormatStyle::LS_Auto) { + Style.Standard = HasCpp03IncompatibleFormat ? FormatStyle::LS_Cpp11 + : FormatStyle::LS_Cpp03; + } + } + + /// \brief Get the indent of \p Level from \p IndentForLevel. + /// + /// \p IndentForLevel must contain the indent for the level \c l + /// at \p IndentForLevel[l], or a value < 0 if the indent for + /// that level is unknown. + unsigned getIndent(const std::vector<int> IndentForLevel, unsigned Level) { + if (IndentForLevel[Level] != -1) + return IndentForLevel[Level]; + if (Level == 0) + return 0; + return getIndent(IndentForLevel, Level - 1) + 2; + } + + /// \brief Get the offset of the line relatively to the level. + /// + /// For example, 'public:' labels in classes are offset by 1 or 2 + /// characters to the left from their level. + int getIndentOffset(const AnnotatedToken &RootToken) { + if (RootToken.isAccessSpecifier(false) || RootToken.isObjCAccessSpecifier()) + return Style.AccessModifierOffset; + return 0; + } + + /// \brief Tries to merge lines into one. + /// + /// This will change \c Line and \c AnnotatedLine to contain the merged line, + /// if possible; note that \c I will be incremented when lines are merged. + /// + /// Returns whether the resulting \c Line can fit in a single line. + void tryFitMultipleLinesInOne(unsigned Indent, + std::vector<AnnotatedLine>::iterator &I, + std::vector<AnnotatedLine>::iterator E) { + // We can never merge stuff if there are trailing line comments. + if (I->Last->Type == TT_LineComment) + return; + + unsigned Limit = Style.ColumnLimit - Indent; + // If we already exceed the column limit, we set 'Limit' to 0. The different + // tryMerge..() functions can then decide whether to still do merging. + Limit = I->Last->TotalLength > Limit ? 0 : Limit - I->Last->TotalLength; + + if (I + 1 == E || (I + 1)->Type == LT_Invalid) + return; + + if (I->Last->is(tok::l_brace)) { + tryMergeSimpleBlock(I, E, Limit); + } else if (I->First.is(tok::kw_if)) { + tryMergeSimpleIf(I, E, Limit); + } else if (I->InPPDirective && (I->First.FormatTok.HasUnescapedNewline || + I->First.FormatTok.IsFirst)) { + tryMergeSimplePPDirective(I, E, Limit); + } + return; + } + + void tryMergeSimplePPDirective(std::vector<AnnotatedLine>::iterator &I, + std::vector<AnnotatedLine>::iterator E, + unsigned Limit) { + if (Limit == 0) + return; + AnnotatedLine &Line = *I; + if (!(I + 1)->InPPDirective || (I + 1)->First.FormatTok.HasUnescapedNewline) + return; + if (I + 2 != E && (I + 2)->InPPDirective && + !(I + 2)->First.FormatTok.HasUnescapedNewline) + return; + if (1 + (I + 1)->Last->TotalLength > Limit) + return; + join(Line, *(++I)); + } + + void tryMergeSimpleIf(std::vector<AnnotatedLine>::iterator &I, + std::vector<AnnotatedLine>::iterator E, + unsigned Limit) { + if (Limit == 0) + return; + if (!Style.AllowShortIfStatementsOnASingleLine) + return; + if ((I + 1)->InPPDirective != I->InPPDirective || + ((I + 1)->InPPDirective && + (I + 1)->First.FormatTok.HasUnescapedNewline)) + return; + AnnotatedLine &Line = *I; + if (Line.Last->isNot(tok::r_paren)) + return; + if (1 + (I + 1)->Last->TotalLength > Limit) + return; + if ((I + 1)->First.is(tok::kw_if) || (I + 1)->First.Type == TT_LineComment) + return; + // Only inline simple if's (no nested if or else). + if (I + 2 != E && (I + 2)->First.is(tok::kw_else)) + return; + join(Line, *(++I)); + } + + void tryMergeSimpleBlock(std::vector<AnnotatedLine>::iterator &I, + std::vector<AnnotatedLine>::iterator E, + unsigned Limit) { + // First, check that the current line allows merging. This is the case if + // we're not in a control flow statement and the last token is an opening + // brace. + AnnotatedLine &Line = *I; + if (Line.First.isOneOf(tok::kw_if, tok::kw_while, tok::kw_do, tok::r_brace, + tok::kw_else, tok::kw_try, tok::kw_catch, + tok::kw_for, + // This gets rid of all ObjC @ keywords and methods. + tok::at, tok::minus, tok::plus)) + return; + + AnnotatedToken *Tok = &(I + 1)->First; + if (Tok->Children.empty() && Tok->is(tok::r_brace) && + !Tok->MustBreakBefore) { + // We merge empty blocks even if the line exceeds the column limit. + Tok->SpacesRequiredBefore = 0; + Tok->CanBreakBefore = true; + join(Line, *(I + 1)); + I += 1; + } else if (Limit != 0) { + // Check that we still have three lines and they fit into the limit. + if (I + 2 == E || (I + 2)->Type == LT_Invalid || + !nextTwoLinesFitInto(I, Limit)) + return; + + // Second, check that the next line does not contain any braces - if it + // does, readability declines when putting it into a single line. + if ((I + 1)->Last->Type == TT_LineComment || Tok->MustBreakBefore) + return; + do { + if (Tok->isOneOf(tok::l_brace, tok::r_brace)) + return; + Tok = Tok->Children.empty() ? NULL : &Tok->Children.back(); + } while (Tok != NULL); + + // Last, check that the third line contains a single closing brace. + Tok = &(I + 2)->First; + if (!Tok->Children.empty() || Tok->isNot(tok::r_brace) || + Tok->MustBreakBefore) + return; + + join(Line, *(I + 1)); + join(Line, *(I + 2)); + I += 2; + } + } + + bool nextTwoLinesFitInto(std::vector<AnnotatedLine>::iterator I, + unsigned Limit) { + return 1 + (I + 1)->Last->TotalLength + 1 + (I + 2)->Last->TotalLength <= + Limit; + } + + void join(AnnotatedLine &A, const AnnotatedLine &B) { + unsigned LengthA = A.Last->TotalLength + B.First.SpacesRequiredBefore; + A.Last->Children.push_back(B.First); + while (!A.Last->Children.empty()) { + A.Last->Children[0].Parent = A.Last; + A.Last->Children[0].TotalLength += LengthA; + A.Last = &A.Last->Children[0]; + } + } + + bool touchesRanges(const CharSourceRange &Range) { + for (unsigned i = 0, e = Ranges.size(); i != e; ++i) { + if (!SourceMgr.isBeforeInTranslationUnit(Range.getEnd(), + Ranges[i].getBegin()) && + !SourceMgr.isBeforeInTranslationUnit(Ranges[i].getEnd(), + Range.getBegin())) + return true; + } + return false; + } + + bool touchesLine(const AnnotatedLine &TheLine) { + const FormatToken *First = &TheLine.First.FormatTok; + const FormatToken *Last = &TheLine.Last->FormatTok; + CharSourceRange LineRange = CharSourceRange::getTokenRange( + First->WhiteSpaceStart.getLocWithOffset(First->LastNewlineOffset), + Last->Tok.getLocation()); + return touchesRanges(LineRange); + } + + bool touchesEmptyLineBefore(const AnnotatedLine &TheLine) { + const FormatToken *First = &TheLine.First.FormatTok; + CharSourceRange LineRange = CharSourceRange::getCharRange( + First->WhiteSpaceStart, + First->WhiteSpaceStart.getLocWithOffset(First->LastNewlineOffset)); + return touchesRanges(LineRange); + } + + virtual void consumeUnwrappedLine(const UnwrappedLine &TheLine) { + AnnotatedLines.push_back(AnnotatedLine(TheLine)); + } + + /// \brief Add a new line and the required indent before the first Token + /// of the \c UnwrappedLine if there was no structural parsing error. + /// Returns the indent level of the \c UnwrappedLine. + void formatFirstToken(const AnnotatedToken &RootToken, + const AnnotatedToken *PreviousToken, unsigned Indent, + bool InPPDirective, unsigned PreviousEndOfLineColumn) { + const FormatToken &Tok = RootToken.FormatTok; + + unsigned Newlines = + std::min(Tok.NewlinesBefore, Style.MaxEmptyLinesToKeep + 1); + if (Newlines == 0 && !Tok.IsFirst) + Newlines = 1; + + if (!InPPDirective || Tok.HasUnescapedNewline) { + // Insert extra new line before access specifiers. + if (PreviousToken && PreviousToken->isOneOf(tok::semi, tok::r_brace) && + RootToken.isAccessSpecifier() && Tok.NewlinesBefore == 1) + ++Newlines; + + Whitespaces.replaceWhitespace(RootToken, Newlines, Indent, 0); + } else { + Whitespaces.replacePPWhitespace(RootToken, Newlines, Indent, + PreviousEndOfLineColumn); + } + } + + DiagnosticsEngine &Diag; + FormatStyle Style; + Lexer &Lex; + SourceManager &SourceMgr; + WhitespaceManager Whitespaces; + std::vector<CharSourceRange> Ranges; + std::vector<AnnotatedLine> AnnotatedLines; + bool StructuralError; +}; + +tooling::Replacements +reformat(const FormatStyle &Style, Lexer &Lex, SourceManager &SourceMgr, + std::vector<CharSourceRange> Ranges, DiagnosticConsumer *DiagClient) { + IntrusiveRefCntPtr<DiagnosticOptions> DiagOpts = new DiagnosticOptions(); + OwningPtr<DiagnosticConsumer> DiagPrinter; + if (DiagClient == 0) { + DiagPrinter.reset(new TextDiagnosticPrinter(llvm::errs(), &*DiagOpts)); + DiagPrinter->BeginSourceFile(Lex.getLangOpts(), Lex.getPP()); + DiagClient = DiagPrinter.get(); + } + DiagnosticsEngine Diagnostics( + IntrusiveRefCntPtr<DiagnosticIDs>(new DiagnosticIDs()), &*DiagOpts, + DiagClient, false); + Diagnostics.setSourceManager(&SourceMgr); + Formatter formatter(Diagnostics, Style, Lex, SourceMgr, Ranges); + return formatter.format(); +} + +LangOptions getFormattingLangOpts() { + LangOptions LangOpts; + LangOpts.CPlusPlus = 1; + LangOpts.CPlusPlus11 = 1; + LangOpts.LineComment = 1; + LangOpts.Bool = 1; + LangOpts.ObjC1 = 1; + LangOpts.ObjC2 = 1; + return LangOpts; +} + +} // namespace format +} // namespace clang diff --git a/lib/Format/Makefile b/lib/Format/Makefile new file mode 100644 index 0000000..f4d2b98 --- /dev/null +++ b/lib/Format/Makefile @@ -0,0 +1,13 @@ +##===- clang/lib/Format/Makefile ---------------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +CLANG_LEVEL := ../.. +LIBRARYNAME := clangFormat + +include $(CLANG_LEVEL)/Makefile diff --git a/lib/Format/TokenAnnotator.cpp b/lib/Format/TokenAnnotator.cpp new file mode 100644 index 0000000..427157e --- /dev/null +++ b/lib/Format/TokenAnnotator.cpp @@ -0,0 +1,1187 @@ +//===--- TokenAnnotator.cpp - Format C++ code -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements a token annotator, i.e. creates +/// \c AnnotatedTokens out of \c FormatTokens with required extra information. +/// +//===----------------------------------------------------------------------===// + +#include "TokenAnnotator.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Lex/Lexer.h" + +namespace clang { +namespace format { + +static bool isUnaryOperator(const AnnotatedToken &Tok) { + switch (Tok.FormatTok.Tok.getKind()) { + case tok::plus: + case tok::plusplus: + case tok::minus: + case tok::minusminus: + case tok::exclaim: + case tok::tilde: + case tok::kw_sizeof: + case tok::kw_alignof: + return true; + default: + return false; + } +} + +static bool isBinaryOperator(const AnnotatedToken &Tok) { + // Comma is a binary operator, but does not behave as such wrt. formatting. + return getPrecedence(Tok) > prec::Comma; +} + +// Returns the previous token ignoring comments. +static AnnotatedToken *getPreviousToken(AnnotatedToken &Tok) { + AnnotatedToken *PrevToken = Tok.Parent; + while (PrevToken != NULL && PrevToken->is(tok::comment)) + PrevToken = PrevToken->Parent; + return PrevToken; +} +static const AnnotatedToken *getPreviousToken(const AnnotatedToken &Tok) { + return getPreviousToken(const_cast<AnnotatedToken &>(Tok)); +} + +static bool isTrailingComment(AnnotatedToken *Tok) { + return Tok != NULL && Tok->is(tok::comment) && + (Tok->Children.empty() || + Tok->Children[0].FormatTok.NewlinesBefore > 0); +} + +// Returns the next token ignoring comments. +static const AnnotatedToken *getNextToken(const AnnotatedToken &Tok) { + if (Tok.Children.empty()) + return NULL; + const AnnotatedToken *NextToken = &Tok.Children[0]; + while (NextToken->is(tok::comment)) { + if (NextToken->Children.empty()) + return NULL; + NextToken = &NextToken->Children[0]; + } + return NextToken; +} + +static bool closesScope(const AnnotatedToken &Tok) { + return Tok.isOneOf(tok::r_paren, tok::r_brace, tok::r_square) || + Tok.Type == TT_TemplateCloser; +} + +static bool opensScope(const AnnotatedToken &Tok) { + return Tok.isOneOf(tok::l_paren, tok::l_brace, tok::l_square) || + Tok.Type == TT_TemplateOpener; +} + +/// \brief A parser that gathers additional information about tokens. +/// +/// The \c TokenAnnotator tries to match parenthesis and square brakets and +/// store a parenthesis levels. It also tries to resolve matching "<" and ">" +/// into template parameter lists. +class AnnotatingParser { +public: + AnnotatingParser(SourceManager &SourceMgr, Lexer &Lex, AnnotatedLine &Line, + IdentifierInfo &Ident_in) + : SourceMgr(SourceMgr), Lex(Lex), Line(Line), CurrentToken(&Line.First), + KeywordVirtualFound(false), Ident_in(Ident_in) { + Contexts.push_back(Context(tok::unknown, 1, /*IsExpression=*/ false)); + } + +private: + bool parseAngle() { + if (CurrentToken == NULL) + return false; + ScopedContextCreator ContextCreator(*this, tok::less, 10); + AnnotatedToken *Left = CurrentToken->Parent; + Contexts.back().IsExpression = false; + while (CurrentToken != NULL) { + if (CurrentToken->is(tok::greater)) { + Left->MatchingParen = CurrentToken; + CurrentToken->MatchingParen = Left; + CurrentToken->Type = TT_TemplateCloser; + next(); + return true; + } + if (CurrentToken->isOneOf(tok::r_paren, tok::r_square, tok::r_brace, + tok::pipepipe, tok::ampamp, tok::question, + tok::colon)) + return false; + updateParameterCount(Left, CurrentToken); + if (!consumeToken()) + return false; + } + return false; + } + + bool parseParens(bool LookForDecls = false) { + if (CurrentToken == NULL) + return false; + ScopedContextCreator ContextCreator(*this, tok::l_paren, 1); + + // FIXME: This is a bit of a hack. Do better. + Contexts.back().ColonIsForRangeExpr = + Contexts.size() == 2 && Contexts[0].ColonIsForRangeExpr; + + bool StartsObjCMethodExpr = false; + AnnotatedToken *Left = CurrentToken->Parent; + if (CurrentToken->is(tok::caret)) { + // ^( starts a block. + Left->Type = TT_ObjCBlockLParen; + } else if (AnnotatedToken *MaybeSel = Left->Parent) { + // @selector( starts a selector. + if (MaybeSel->isObjCAtKeyword(tok::objc_selector) && MaybeSel->Parent && + MaybeSel->Parent->is(tok::at)) { + StartsObjCMethodExpr = true; + } + } + + if (StartsObjCMethodExpr) { + Contexts.back().ColonIsObjCMethodExpr = true; + Left->Type = TT_ObjCMethodExpr; + } + + while (CurrentToken != NULL) { + // LookForDecls is set when "if (" has been seen. Check for + // 'identifier' '*' 'identifier' followed by not '=' -- this + // '*' has to be a binary operator but determineStarAmpUsage() will + // categorize it as an unary operator, so set the right type here. + if (LookForDecls && !CurrentToken->Children.empty()) { + AnnotatedToken &Prev = *CurrentToken->Parent; + AnnotatedToken &Next = CurrentToken->Children[0]; + if (Prev.Parent->is(tok::identifier) && + Prev.isOneOf(tok::star, tok::amp, tok::ampamp) && + CurrentToken->is(tok::identifier) && Next.isNot(tok::equal)) { + Prev.Type = TT_BinaryOperator; + LookForDecls = false; + } + } + + if (CurrentToken->is(tok::r_paren)) { + Left->MatchingParen = CurrentToken; + CurrentToken->MatchingParen = Left; + + if (StartsObjCMethodExpr) { + CurrentToken->Type = TT_ObjCMethodExpr; + if (Contexts.back().FirstObjCSelectorName != NULL) { + Contexts.back().FirstObjCSelectorName->LongestObjCSelectorName = + Contexts.back().LongestObjCSelectorName; + } + } + + next(); + return true; + } + if (CurrentToken->isOneOf(tok::r_square, tok::r_brace)) + return false; + updateParameterCount(Left, CurrentToken); + if (!consumeToken()) + return false; + } + return false; + } + + bool parseSquare() { + if (!CurrentToken) + return false; + + // A '[' could be an index subscript (after an indentifier or after + // ')' or ']'), it could be the start of an Objective-C method + // expression, or it could the the start of an Objective-C array literal. + AnnotatedToken *Left = CurrentToken->Parent; + AnnotatedToken *Parent = getPreviousToken(*Left); + bool StartsObjCMethodExpr = + Contexts.back().CanBeExpression && + (!Parent || Parent->isOneOf(tok::colon, tok::l_square, tok::l_paren, + tok::kw_return, tok::kw_throw) || + isUnaryOperator(*Parent) || Parent->Type == TT_ObjCForIn || + Parent->Type == TT_CastRParen || + getBinOpPrecedence(Parent->FormatTok.Tok.getKind(), true, true) > + prec::Unknown); + ScopedContextCreator ContextCreator(*this, tok::l_square, 10); + Contexts.back().IsExpression = true; + bool StartsObjCArrayLiteral = Parent && Parent->is(tok::at); + + if (StartsObjCMethodExpr) { + Contexts.back().ColonIsObjCMethodExpr = true; + Left->Type = TT_ObjCMethodExpr; + } else if (StartsObjCArrayLiteral) { + Left->Type = TT_ObjCArrayLiteral; + } + + while (CurrentToken != NULL) { + if (CurrentToken->is(tok::r_square)) { + if (!CurrentToken->Children.empty() && + CurrentToken->Children[0].is(tok::l_paren)) { + // An ObjC method call is rarely followed by an open parenthesis. + // FIXME: Do we incorrectly label ":" with this? + StartsObjCMethodExpr = false; + Left->Type = TT_Unknown; + } + if (StartsObjCMethodExpr) { + CurrentToken->Type = TT_ObjCMethodExpr; + // determineStarAmpUsage() thinks that '*' '[' is allocating an + // array of pointers, but if '[' starts a selector then '*' is a + // binary operator. + if (Parent != NULL && Parent->Type == TT_PointerOrReference) + Parent->Type = TT_BinaryOperator; + } else if (StartsObjCArrayLiteral) { + CurrentToken->Type = TT_ObjCArrayLiteral; + } + Left->MatchingParen = CurrentToken; + CurrentToken->MatchingParen = Left; + if (Contexts.back().FirstObjCSelectorName != NULL) + Contexts.back().FirstObjCSelectorName->LongestObjCSelectorName = + Contexts.back().LongestObjCSelectorName; + next(); + return true; + } + if (CurrentToken->isOneOf(tok::r_paren, tok::r_brace)) + return false; + updateParameterCount(Left, CurrentToken); + if (!consumeToken()) + return false; + } + return false; + } + + bool parseBrace() { + // Lines are fine to end with '{'. + if (CurrentToken == NULL) + return true; + ScopedContextCreator ContextCreator(*this, tok::l_brace, 1); + AnnotatedToken *Left = CurrentToken->Parent; + while (CurrentToken != NULL) { + if (CurrentToken->is(tok::r_brace)) { + Left->MatchingParen = CurrentToken; + CurrentToken->MatchingParen = Left; + next(); + return true; + } + if (CurrentToken->isOneOf(tok::r_paren, tok::r_square)) + return false; + updateParameterCount(Left, CurrentToken); + if (!consumeToken()) + return false; + } + return true; + } + + void updateParameterCount(AnnotatedToken *Left, AnnotatedToken *Current) { + if (Current->is(tok::comma)) + ++Left->ParameterCount; + else if (Left->ParameterCount == 0 && Current->isNot(tok::comment)) + Left->ParameterCount = 1; + } + + bool parseConditional() { + while (CurrentToken != NULL) { + if (CurrentToken->is(tok::colon)) { + CurrentToken->Type = TT_ConditionalExpr; + next(); + return true; + } + if (!consumeToken()) + return false; + } + return false; + } + + bool parseTemplateDeclaration() { + if (CurrentToken != NULL && CurrentToken->is(tok::less)) { + CurrentToken->Type = TT_TemplateOpener; + next(); + if (!parseAngle()) + return false; + if (CurrentToken != NULL) + CurrentToken->Parent->ClosesTemplateDeclaration = true; + return true; + } + return false; + } + + bool consumeToken() { + AnnotatedToken *Tok = CurrentToken; + next(); + switch (Tok->FormatTok.Tok.getKind()) { + case tok::plus: + case tok::minus: + if (Tok->Parent == NULL && Line.MustBeDeclaration) + Tok->Type = TT_ObjCMethodSpecifier; + break; + case tok::colon: + if (Tok->Parent == NULL) + return false; + // Colons from ?: are handled in parseConditional(). + if (Tok->Parent->is(tok::r_paren) && Contexts.size() == 1) { + Tok->Type = TT_CtorInitializerColon; + } else if (Contexts.back().ColonIsObjCMethodExpr || + Line.First.Type == TT_ObjCMethodSpecifier) { + Tok->Type = TT_ObjCMethodExpr; + Tok->Parent->Type = TT_ObjCSelectorName; + if (Tok->Parent->FormatTok.TokenLength > + Contexts.back().LongestObjCSelectorName) + Contexts.back().LongestObjCSelectorName = + Tok->Parent->FormatTok.TokenLength; + if (Contexts.back().FirstObjCSelectorName == NULL) + Contexts.back().FirstObjCSelectorName = Tok->Parent; + } else if (Contexts.back().ColonIsForRangeExpr) { + Tok->Type = TT_RangeBasedForLoopColon; + } else if (Contexts.size() == 1) { + Tok->Type = TT_InheritanceColon; + } else if (Contexts.back().ContextKind == tok::l_paren) { + Tok->Type = TT_InlineASMColon; + } + break; + case tok::kw_if: + case tok::kw_while: + if (CurrentToken != NULL && CurrentToken->is(tok::l_paren)) { + next(); + if (!parseParens(/*LookForDecls=*/ true)) + return false; + } + break; + case tok::kw_for: + Contexts.back().ColonIsForRangeExpr = true; + next(); + if (!parseParens()) + return false; + break; + case tok::l_paren: + if (!parseParens()) + return false; + if (Line.MustBeDeclaration) + Line.MightBeFunctionDecl = true; + break; + case tok::l_square: + if (!parseSquare()) + return false; + break; + case tok::l_brace: + if (!parseBrace()) + return false; + break; + case tok::less: + if (parseAngle()) + Tok->Type = TT_TemplateOpener; + else { + Tok->Type = TT_BinaryOperator; + CurrentToken = Tok; + next(); + } + break; + case tok::r_paren: + case tok::r_square: + return false; + case tok::r_brace: + // Lines can start with '}'. + if (Tok->Parent != NULL) + return false; + break; + case tok::greater: + Tok->Type = TT_BinaryOperator; + break; + case tok::kw_operator: + while (CurrentToken && CurrentToken->isNot(tok::l_paren)) { + if (CurrentToken->isOneOf(tok::star, tok::amp)) + CurrentToken->Type = TT_PointerOrReference; + consumeToken(); + } + if (CurrentToken) + CurrentToken->Type = TT_OverloadedOperatorLParen; + break; + case tok::question: + parseConditional(); + break; + case tok::kw_template: + parseTemplateDeclaration(); + break; + case tok::identifier: + if (Line.First.is(tok::kw_for) && + Tok->FormatTok.Tok.getIdentifierInfo() == &Ident_in) + Tok->Type = TT_ObjCForIn; + break; + case tok::comma: + if (Contexts.back().FirstStartOfName) + Contexts.back().FirstStartOfName->PartOfMultiVariableDeclStmt = true; + break; + default: + break; + } + return true; + } + + void parseIncludeDirective() { + next(); + if (CurrentToken != NULL && CurrentToken->is(tok::less)) { + next(); + while (CurrentToken != NULL) { + if (CurrentToken->isNot(tok::comment) || + !CurrentToken->Children.empty()) + CurrentToken->Type = TT_ImplicitStringLiteral; + next(); + } + } else { + while (CurrentToken != NULL) { + if (CurrentToken->is(tok::string_literal)) + // Mark these string literals as "implicit" literals, too, so that + // they are not split or line-wrapped. + CurrentToken->Type = TT_ImplicitStringLiteral; + next(); + } + } + } + + void parseWarningOrError() { + next(); + // We still want to format the whitespace left of the first token of the + // warning or error. + next(); + while (CurrentToken != NULL) { + CurrentToken->Type = TT_ImplicitStringLiteral; + next(); + } + } + + void parsePreprocessorDirective() { + next(); + if (CurrentToken == NULL) + return; + // Hashes in the middle of a line can lead to any strange token + // sequence. + if (CurrentToken->FormatTok.Tok.getIdentifierInfo() == NULL) + return; + switch (CurrentToken->FormatTok.Tok.getIdentifierInfo()->getPPKeywordID()) { + case tok::pp_include: + case tok::pp_import: + parseIncludeDirective(); + break; + case tok::pp_error: + case tok::pp_warning: + parseWarningOrError(); + break; + default: + break; + } + while (CurrentToken != NULL) + next(); + } + +public: + LineType parseLine() { + int PeriodsAndArrows = 0; + AnnotatedToken *LastPeriodOrArrow = NULL; + bool CanBeBuilderTypeStmt = true; + if (CurrentToken->is(tok::hash)) { + parsePreprocessorDirective(); + return LT_PreprocessorDirective; + } + while (CurrentToken != NULL) { + if (CurrentToken->is(tok::kw_virtual)) + KeywordVirtualFound = true; + if (CurrentToken->isOneOf(tok::period, tok::arrow)) { + ++PeriodsAndArrows; + LastPeriodOrArrow = CurrentToken; + } + AnnotatedToken *TheToken = CurrentToken; + if (!consumeToken()) + return LT_Invalid; + if (getPrecedence(*TheToken) > prec::Assignment && + TheToken->Type == TT_BinaryOperator) + CanBeBuilderTypeStmt = false; + } + if (KeywordVirtualFound) + return LT_VirtualFunctionDecl; + + // Assume a builder-type call if there are 2 or more "." and "->". + if (PeriodsAndArrows >= 2 && CanBeBuilderTypeStmt) { + LastPeriodOrArrow->LastInChainOfCalls = true; + return LT_BuilderTypeCall; + } + + if (Line.First.Type == TT_ObjCMethodSpecifier) { + if (Contexts.back().FirstObjCSelectorName != NULL) + Contexts.back().FirstObjCSelectorName->LongestObjCSelectorName = + Contexts.back().LongestObjCSelectorName; + return LT_ObjCMethodDecl; + } + + return LT_Other; + } + +private: + void next() { + if (CurrentToken != NULL) { + determineTokenType(*CurrentToken); + CurrentToken->BindingStrength = Contexts.back().BindingStrength; + } + + if (CurrentToken != NULL && !CurrentToken->Children.empty()) + CurrentToken = &CurrentToken->Children[0]; + else + CurrentToken = NULL; + + // Reset token type in case we have already looked at it and then recovered + // from an error (e.g. failure to find the matching >). + if (CurrentToken != NULL) + CurrentToken->Type = TT_Unknown; + } + + /// \brief A struct to hold information valid in a specific context, e.g. + /// a pair of parenthesis. + struct Context { + Context(tok::TokenKind ContextKind, unsigned BindingStrength, + bool IsExpression) + : ContextKind(ContextKind), BindingStrength(BindingStrength), + LongestObjCSelectorName(0), ColonIsForRangeExpr(false), + ColonIsObjCMethodExpr(false), FirstObjCSelectorName(NULL), + FirstStartOfName(NULL), IsExpression(IsExpression), + CanBeExpression(true) {} + + tok::TokenKind ContextKind; + unsigned BindingStrength; + unsigned LongestObjCSelectorName; + bool ColonIsForRangeExpr; + bool ColonIsObjCMethodExpr; + AnnotatedToken *FirstObjCSelectorName; + AnnotatedToken *FirstStartOfName; + bool IsExpression; + bool CanBeExpression; + }; + + /// \brief Puts a new \c Context onto the stack \c Contexts for the lifetime + /// of each instance. + struct ScopedContextCreator { + AnnotatingParser &P; + + ScopedContextCreator(AnnotatingParser &P, tok::TokenKind ContextKind, + unsigned Increase) + : P(P) { + P.Contexts.push_back( + Context(ContextKind, P.Contexts.back().BindingStrength + Increase, + P.Contexts.back().IsExpression)); + } + + ~ScopedContextCreator() { P.Contexts.pop_back(); } + }; + + void determineTokenType(AnnotatedToken &Current) { + if (getPrecedence(Current) == prec::Assignment) { + Contexts.back().IsExpression = true; + for (AnnotatedToken *Previous = Current.Parent; + Previous && Previous->isNot(tok::comma); + Previous = Previous->Parent) { + if (Previous->is(tok::r_square)) + Previous = Previous->MatchingParen; + if (Previous->Type == TT_BinaryOperator && + Previous->isOneOf(tok::star, tok::amp)) { + Previous->Type = TT_PointerOrReference; + } + } + } else if (Current.isOneOf(tok::kw_return, tok::kw_throw) || + (Current.is(tok::l_paren) && !Line.MustBeDeclaration && + (!Current.Parent || Current.Parent->isNot(tok::kw_for)))) { + Contexts.back().IsExpression = true; + } else if (Current.isOneOf(tok::r_paren, tok::greater, tok::comma)) { + for (AnnotatedToken *Previous = Current.Parent; + Previous && Previous->isOneOf(tok::star, tok::amp); + Previous = Previous->Parent) + Previous->Type = TT_PointerOrReference; + } else if (Current.Parent && + Current.Parent->Type == TT_CtorInitializerColon) { + Contexts.back().IsExpression = true; + } else if (Current.is(tok::kw_new)) { + Contexts.back().CanBeExpression = false; + } + + if (Current.Type == TT_Unknown) { + if (Current.Parent && Current.is(tok::identifier) && + ((Current.Parent->is(tok::identifier) && + Current.Parent->FormatTok.Tok.getIdentifierInfo() + ->getPPKeywordID() == tok::pp_not_keyword) || + isSimpleTypeSpecifier(*Current.Parent) || + Current.Parent->Type == TT_PointerOrReference || + Current.Parent->Type == TT_TemplateCloser)) { + Contexts.back().FirstStartOfName = &Current; + Current.Type = TT_StartOfName; + } else if (Current.isOneOf(tok::star, tok::amp, tok::ampamp)) { + Current.Type = + determineStarAmpUsage(Current, Contexts.back().IsExpression); + } else if (Current.isOneOf(tok::minus, tok::plus, tok::caret)) { + Current.Type = determinePlusMinusCaretUsage(Current); + } else if (Current.isOneOf(tok::minusminus, tok::plusplus)) { + Current.Type = determineIncrementUsage(Current); + } else if (Current.is(tok::exclaim)) { + Current.Type = TT_UnaryOperator; + } else if (isBinaryOperator(Current)) { + Current.Type = TT_BinaryOperator; + } else if (Current.is(tok::comment)) { + std::string Data(Lexer::getSpelling(Current.FormatTok.Tok, SourceMgr, + Lex.getLangOpts())); + if (StringRef(Data).startswith("//")) + Current.Type = TT_LineComment; + else + Current.Type = TT_BlockComment; + } else if (Current.is(tok::r_paren)) { + bool ParensNotExpr = !Current.Parent || + Current.Parent->Type == TT_PointerOrReference || + Current.Parent->Type == TT_TemplateCloser; + bool ParensCouldEndDecl = + !Current.Children.empty() && + Current.Children[0].isOneOf(tok::equal, tok::semi, tok::l_brace); + bool IsSizeOfOrAlignOf = + Current.MatchingParen && Current.MatchingParen->Parent && + Current.MatchingParen->Parent->isOneOf(tok::kw_sizeof, + tok::kw_alignof); + if (ParensNotExpr && !ParensCouldEndDecl && !IsSizeOfOrAlignOf && + Contexts.back().IsExpression) + // FIXME: We need to get smarter and understand more cases of casts. + Current.Type = TT_CastRParen; + } else if (Current.is(tok::at) && Current.Children.size()) { + switch (Current.Children[0].FormatTok.Tok.getObjCKeywordID()) { + case tok::objc_interface: + case tok::objc_implementation: + case tok::objc_protocol: + Current.Type = TT_ObjCDecl; + break; + case tok::objc_property: + Current.Type = TT_ObjCProperty; + break; + default: + break; + } + } + } + } + + /// \brief Return the type of the given token assuming it is * or &. + TokenType + determineStarAmpUsage(const AnnotatedToken &Tok, bool IsExpression) { + const AnnotatedToken *PrevToken = getPreviousToken(Tok); + if (PrevToken == NULL) + return TT_UnaryOperator; + + const AnnotatedToken *NextToken = getNextToken(Tok); + if (NextToken == NULL) + return TT_Unknown; + + if (PrevToken->is(tok::l_paren) && !IsExpression) + return TT_PointerOrReference; + + if (PrevToken->isOneOf(tok::l_paren, tok::l_square, tok::l_brace, + tok::comma, tok::semi, tok::kw_return, tok::colon, + tok::equal) || + PrevToken->Type == TT_BinaryOperator || + PrevToken->Type == TT_UnaryOperator || PrevToken->Type == TT_CastRParen) + return TT_UnaryOperator; + + if (NextToken->is(tok::l_square)) + return TT_PointerOrReference; + + if (PrevToken->FormatTok.Tok.isLiteral() || + PrevToken->isOneOf(tok::r_paren, tok::r_square) || + NextToken->FormatTok.Tok.isLiteral() || isUnaryOperator(*NextToken)) + return TT_BinaryOperator; + + // It is very unlikely that we are going to find a pointer or reference type + // definition on the RHS of an assignment. + if (IsExpression) + return TT_BinaryOperator; + + return TT_PointerOrReference; + } + + TokenType determinePlusMinusCaretUsage(const AnnotatedToken &Tok) { + const AnnotatedToken *PrevToken = getPreviousToken(Tok); + if (PrevToken == NULL) + return TT_UnaryOperator; + + // Use heuristics to recognize unary operators. + if (PrevToken->isOneOf(tok::equal, tok::l_paren, tok::comma, tok::l_square, + tok::question, tok::colon, tok::kw_return, + tok::kw_case, tok::at, tok::l_brace)) + return TT_UnaryOperator; + + // There can't be two consecutive binary operators. + if (PrevToken->Type == TT_BinaryOperator) + return TT_UnaryOperator; + + // Fall back to marking the token as binary operator. + return TT_BinaryOperator; + } + + /// \brief Determine whether ++/-- are pre- or post-increments/-decrements. + TokenType determineIncrementUsage(const AnnotatedToken &Tok) { + const AnnotatedToken *PrevToken = getPreviousToken(Tok); + if (PrevToken == NULL) + return TT_UnaryOperator; + if (PrevToken->isOneOf(tok::r_paren, tok::r_square, tok::identifier)) + return TT_TrailingUnaryOperator; + + return TT_UnaryOperator; + } + + // FIXME: This is copy&pasted from Sema. Put it in a common place and remove + // duplication. + /// \brief Determine whether the token kind starts a simple-type-specifier. + bool isSimpleTypeSpecifier(const AnnotatedToken &Tok) const { + switch (Tok.FormatTok.Tok.getKind()) { + case tok::kw_short: + case tok::kw_long: + case tok::kw___int64: + case tok::kw___int128: + case tok::kw_signed: + case tok::kw_unsigned: + case tok::kw_void: + case tok::kw_char: + case tok::kw_int: + case tok::kw_half: + case tok::kw_float: + case tok::kw_double: + case tok::kw_wchar_t: + case tok::kw_bool: + case tok::kw___underlying_type: + return true; + case tok::annot_typename: + case tok::kw_char16_t: + case tok::kw_char32_t: + case tok::kw_typeof: + case tok::kw_decltype: + return Lex.getLangOpts().CPlusPlus; + default: + break; + } + return false; + } + + SmallVector<Context, 8> Contexts; + + SourceManager &SourceMgr; + Lexer &Lex; + AnnotatedLine &Line; + AnnotatedToken *CurrentToken; + bool KeywordVirtualFound; + IdentifierInfo &Ident_in; +}; + +/// \brief Parses binary expressions by inserting fake parenthesis based on +/// operator precedence. +class ExpressionParser { +public: + ExpressionParser(AnnotatedLine &Line) : Current(&Line.First) {} + + /// \brief Parse expressions with the given operatore precedence. + void parse(int Precedence = 0) { + if (Precedence > prec::PointerToMember || Current == NULL) + return; + + // Skip over "return" until we can properly parse it. + if (Current->is(tok::kw_return)) + next(); + + // Eagerly consume trailing comments. + while (isTrailingComment(Current)) { + next(); + } + + AnnotatedToken *Start = Current; + bool OperatorFound = false; + + while (Current) { + // Consume operators with higher precedence. + parse(prec::Level(Precedence + 1)); + + int CurrentPrecedence = 0; + if (Current) { + if (Current->Type == TT_ConditionalExpr) + CurrentPrecedence = 1 + (int) prec::Conditional; + else if (Current->is(tok::semi) || Current->Type == TT_InlineASMColon || + Current->Type == TT_CtorInitializerColon) + CurrentPrecedence = 1; + else if (Current->Type == TT_BinaryOperator || Current->is(tok::comma)) + CurrentPrecedence = 1 + (int) getPrecedence(*Current); + } + + // At the end of the line or when an operator with higher precedence is + // found, insert fake parenthesis and return. + if (Current == NULL || closesScope(*Current) || + (CurrentPrecedence != 0 && CurrentPrecedence < Precedence)) { + if (OperatorFound) { + ++Start->FakeLParens; + if (Current) + ++Current->Parent->FakeRParens; + } + return; + } + + // Consume scopes: (), [], <> and {} + if (opensScope(*Current)) { + AnnotatedToken *Left = Current; + while (Current && !closesScope(*Current)) { + next(); + parse(); + } + // Remove fake parens that just duplicate the real parens. + if (Current && Left->Children[0].FakeLParens > 0 && + Current->Parent->FakeRParens > 0) { + --Left->Children[0].FakeLParens; + --Current->Parent->FakeRParens; + } + next(); + } else { + // Operator found. + if (CurrentPrecedence == Precedence) + OperatorFound = true; + + next(); + } + } + } + +private: + void next() { + if (Current != NULL) + Current = Current->Children.empty() ? NULL : &Current->Children[0]; + } + + AnnotatedToken *Current; +}; + +void TokenAnnotator::annotate(AnnotatedLine &Line) { + AnnotatingParser Parser(SourceMgr, Lex, Line, Ident_in); + Line.Type = Parser.parseLine(); + if (Line.Type == LT_Invalid) + return; + + ExpressionParser ExprParser(Line); + ExprParser.parse(); + + if (Line.First.Type == TT_ObjCMethodSpecifier) + Line.Type = LT_ObjCMethodDecl; + else if (Line.First.Type == TT_ObjCDecl) + Line.Type = LT_ObjCDecl; + else if (Line.First.Type == TT_ObjCProperty) + Line.Type = LT_ObjCProperty; + + Line.First.SpacesRequiredBefore = 1; + Line.First.MustBreakBefore = Line.First.FormatTok.MustBreakBefore; + Line.First.CanBreakBefore = Line.First.MustBreakBefore; + + Line.First.TotalLength = Line.First.FormatTok.TokenLength; +} + +void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) { + if (Line.First.Children.empty()) + return; + AnnotatedToken *Current = &Line.First.Children[0]; + while (Current != NULL) { + if (Current->Type == TT_LineComment) + Current->SpacesRequiredBefore = Style.SpacesBeforeTrailingComments; + else + Current->SpacesRequiredBefore = + spaceRequiredBefore(Line, *Current) ? 1 : 0; + + if (Current->FormatTok.MustBreakBefore) { + Current->MustBreakBefore = true; + } else if (Current->Type == TT_LineComment) { + Current->MustBreakBefore = Current->FormatTok.NewlinesBefore > 0; + } else if (isTrailingComment(Current->Parent) || + (Current->is(tok::string_literal) && + Current->Parent->is(tok::string_literal))) { + Current->MustBreakBefore = true; + } else if (Current->is(tok::lessless) && !Current->Children.empty() && + Current->Parent->is(tok::string_literal) && + Current->Children[0].is(tok::string_literal)) { + Current->MustBreakBefore = true; + } else { + Current->MustBreakBefore = false; + } + Current->CanBreakBefore = + Current->MustBreakBefore || canBreakBefore(Line, *Current); + if (Current->MustBreakBefore) + Current->TotalLength = Current->Parent->TotalLength + Style.ColumnLimit; + else + Current->TotalLength = + Current->Parent->TotalLength + Current->FormatTok.TokenLength + + Current->SpacesRequiredBefore; + // FIXME: Only calculate this if CanBreakBefore is true once static + // initializers etc. are sorted out. + // FIXME: Move magic numbers to a better place. + Current->SplitPenalty = + 20 * Current->BindingStrength + splitPenalty(Line, *Current); + + Current = Current->Children.empty() ? NULL : &Current->Children[0]; + } +} + +unsigned TokenAnnotator::splitPenalty(const AnnotatedLine &Line, + const AnnotatedToken &Tok) { + const AnnotatedToken &Left = *Tok.Parent; + const AnnotatedToken &Right = Tok; + + if (Right.Type == TT_StartOfName) { + if (Line.First.is(tok::kw_for) && Right.PartOfMultiVariableDeclStmt) + return 3; + else if (Line.MightBeFunctionDecl && Right.BindingStrength == 1) + // FIXME: Clean up hack of using BindingStrength to find top-level names. + return Style.PenaltyReturnTypeOnItsOwnLine; + else + return 100; + } + if (Left.is(tok::equal) && Right.is(tok::l_brace)) + return 150; + if (Left.is(tok::coloncolon)) + return 500; + + if (Left.Type == TT_RangeBasedForLoopColon || + Left.Type == TT_InheritanceColon) + return 2; + + if (Right.isOneOf(tok::arrow, tok::period)) { + if (Line.Type == LT_BuilderTypeCall) + return prec::PointerToMember; + if (Left.isOneOf(tok::r_paren, tok::r_square) && Left.MatchingParen && + Left.MatchingParen->ParameterCount > 0) + return 20; // Should be smaller than breaking at a nested comma. + return 150; + } + + // In for-loops, prefer breaking at ',' and ';'. + if (Line.First.is(tok::kw_for) && Left.is(tok::equal)) + return 4; + + if (Left.is(tok::semi)) + return 0; + if (Left.is(tok::comma)) + return 1; + + // In Objective-C method expressions, prefer breaking before "param:" over + // breaking after it. + if (Right.Type == TT_ObjCSelectorName) + return 0; + if (Left.is(tok::colon) && Left.Type == TT_ObjCMethodExpr) + return 20; + + if (opensScope(Left)) + return Left.ParameterCount > 1 ? prec::Comma : 20; + + if (Right.is(tok::lessless)) { + if (Left.is(tok::string_literal)) { + StringRef Content = StringRef(Left.FormatTok.Tok.getLiteralData(), + Left.FormatTok.TokenLength); + Content = Content.drop_back(1).drop_front(1).trim(); + if (Content.size() > 1 && + (Content.back() == ':' || Content.back() == '=')) + return 100; + } + return prec::Shift; + } + if (Left.Type == TT_ConditionalExpr) + return prec::Conditional; + prec::Level Level = getPrecedence(Left); + + if (Level != prec::Unknown) + return Level; + + return 3; +} + +bool TokenAnnotator::spaceRequiredBetween(const AnnotatedLine &Line, + const AnnotatedToken &Left, + const AnnotatedToken &Right) { + if (Right.is(tok::hashhash)) + return Left.is(tok::hash); + if (Left.isOneOf(tok::hashhash, tok::hash)) + return Right.is(tok::hash); + if (Right.isOneOf(tok::r_paren, tok::semi, tok::comma)) + return false; + if (Right.is(tok::less) && + (Left.is(tok::kw_template) || + (Line.Type == LT_ObjCDecl && Style.ObjCSpaceBeforeProtocolList))) + return true; + if (Left.is(tok::arrow) || Right.is(tok::arrow)) + return false; + if (Left.isOneOf(tok::exclaim, tok::tilde)) + return false; + if (Left.is(tok::at) && + Right.isOneOf(tok::identifier, tok::string_literal, tok::char_constant, + tok::numeric_constant, tok::l_paren, tok::l_brace, + tok::kw_true, tok::kw_false)) + return false; + if (Left.is(tok::coloncolon)) + return false; + if (Right.is(tok::coloncolon)) + return !Left.isOneOf(tok::identifier, tok::greater, tok::l_paren); + if (Left.is(tok::less) || Right.isOneOf(tok::greater, tok::less)) + return false; + if (Right.Type == TT_PointerOrReference) + return Left.FormatTok.Tok.isLiteral() || + ((Left.Type != TT_PointerOrReference) && Left.isNot(tok::l_paren) && + !Style.PointerBindsToType); + if (Left.Type == TT_PointerOrReference) + return Right.FormatTok.Tok.isLiteral() || + ((Right.Type != TT_PointerOrReference) && + Right.isNot(tok::l_paren) && Style.PointerBindsToType && + Left.Parent && Left.Parent->isNot(tok::l_paren)); + if (Right.is(tok::star) && Left.is(tok::l_paren)) + return false; + if (Left.is(tok::l_square)) + return Left.Type == TT_ObjCArrayLiteral && Right.isNot(tok::r_square); + if (Right.is(tok::r_square)) + return Right.Type == TT_ObjCArrayLiteral; + if (Right.is(tok::l_square) && Right.Type != TT_ObjCMethodExpr) + return false; + if (Left.is(tok::period) || Right.is(tok::period)) + return false; + if (Left.is(tok::colon)) + return Left.Type != TT_ObjCMethodExpr; + if (Right.is(tok::colon)) + return Right.Type != TT_ObjCMethodExpr; + if (Left.is(tok::l_paren)) + return false; + if (Right.is(tok::l_paren)) { + return Line.Type == LT_ObjCDecl || + Left.isOneOf(tok::kw_if, tok::kw_for, tok::kw_while, tok::kw_switch, + tok::kw_return, tok::kw_catch, tok::kw_new, + tok::kw_delete); + } + if (Left.is(tok::at) && + Right.FormatTok.Tok.getObjCKeywordID() != tok::objc_not_keyword) + return false; + if (Left.is(tok::l_brace) && Right.is(tok::r_brace)) + return false; + return true; +} + +bool TokenAnnotator::spaceRequiredBefore(const AnnotatedLine &Line, + const AnnotatedToken &Tok) { + if (Tok.FormatTok.Tok.getIdentifierInfo() && + Tok.Parent->FormatTok.Tok.getIdentifierInfo()) + return true; // Never ever merge two identifiers. + if (Line.Type == LT_ObjCMethodDecl) { + if (Tok.Parent->Type == TT_ObjCMethodSpecifier) + return true; + if (Tok.Parent->is(tok::r_paren) && Tok.is(tok::identifier)) + // Don't space between ')' and <id> + return false; + } + if (Line.Type == LT_ObjCProperty && + (Tok.is(tok::equal) || Tok.Parent->is(tok::equal))) + return false; + + if (Tok.Parent->is(tok::comma)) + return true; + if (Tok.is(tok::comma)) + return false; + if (Tok.Type == TT_CtorInitializerColon || Tok.Type == TT_ObjCBlockLParen) + return true; + if (Tok.Parent->FormatTok.Tok.is(tok::kw_operator)) + return false; + if (Tok.Type == TT_OverloadedOperatorLParen) + return false; + if (Tok.is(tok::colon)) + return !Line.First.isOneOf(tok::kw_case, tok::kw_default) && + !Tok.Children.empty() && Tok.Type != TT_ObjCMethodExpr; + if (Tok.is(tok::l_paren) && !Tok.Children.empty() && + Tok.Children[0].Type == TT_PointerOrReference && + !Tok.Children[0].Children.empty() && + Tok.Children[0].Children[0].isNot(tok::r_paren) && + Tok.Parent->isNot(tok::l_paren) && + (Tok.Parent->Type != TT_PointerOrReference || Style.PointerBindsToType)) + return true; + if (Tok.Parent->Type == TT_UnaryOperator || Tok.Parent->Type == TT_CastRParen) + return false; + if (Tok.Type == TT_UnaryOperator) + return !Tok.Parent->isOneOf(tok::l_paren, tok::l_square, tok::at) && + (Tok.Parent->isNot(tok::colon) || + Tok.Parent->Type != TT_ObjCMethodExpr); + if (Tok.Parent->is(tok::greater) && Tok.is(tok::greater)) { + return Tok.Type == TT_TemplateCloser && + Tok.Parent->Type == TT_TemplateCloser && + Style.Standard != FormatStyle::LS_Cpp11; + } + if (Tok.isOneOf(tok::arrowstar, tok::periodstar) || + Tok.Parent->isOneOf(tok::arrowstar, tok::periodstar)) + return false; + if (Tok.Type == TT_BinaryOperator || Tok.Parent->Type == TT_BinaryOperator) + return true; + if (Tok.Parent->Type == TT_TemplateCloser && Tok.is(tok::l_paren)) + return false; + if (Tok.is(tok::less) && Line.First.is(tok::hash)) + return true; + if (Tok.Type == TT_TrailingUnaryOperator) + return false; + return spaceRequiredBetween(Line, *Tok.Parent, Tok); +} + +bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line, + const AnnotatedToken &Right) { + const AnnotatedToken &Left = *Right.Parent; + if (Right.Type == TT_StartOfName) + return true; + if (Right.is(tok::colon) && Right.Type == TT_ObjCMethodExpr) + return false; + if (Left.is(tok::colon) && Left.Type == TT_ObjCMethodExpr) + return true; + if (Right.Type == TT_ObjCSelectorName) + return true; + if (Left.ClosesTemplateDeclaration) + return true; + if (Right.Type == TT_ConditionalExpr || Right.is(tok::question)) + return true; + if (Right.Type == TT_RangeBasedForLoopColon || + Right.Type == TT_InheritanceColon) + return false; + if (Left.Type == TT_RangeBasedForLoopColon || + Left.Type == TT_InheritanceColon) + return true; + if (Right.Type == TT_RangeBasedForLoopColon) + return false; + if (Left.Type == TT_PointerOrReference || Left.Type == TT_TemplateCloser || + Left.Type == TT_UnaryOperator || Left.Type == TT_ConditionalExpr || + Left.isOneOf(tok::question, tok::kw_operator)) + return false; + if (Left.is(tok::equal) && Line.Type == LT_VirtualFunctionDecl) + return false; + if (Left.is(tok::l_paren) && Right.is(tok::l_paren) && Left.Parent && + Left.Parent->is(tok::kw___attribute)) + return false; + + if (Right.Type == TT_LineComment) + // We rely on MustBreakBefore being set correctly here as we should not + // change the "binding" behavior of a comment. + return false; + + // Allow breaking after a trailing 'const', e.g. after a method declaration, + // unless it is follow by ';', '{' or '='. + if (Left.is(tok::kw_const) && Left.Parent != NULL && + Left.Parent->is(tok::r_paren)) + return !Right.isOneOf(tok::l_brace, tok::semi, tok::equal); + + if (Right.is(tok::kw___attribute)) + return true; + + // We only break before r_brace if there was a corresponding break before + // the l_brace, which is tracked by BreakBeforeClosingBrace. + if (Right.isOneOf(tok::r_brace, tok::r_paren, tok::greater)) + return false; + if (Left.is(tok::identifier) && Right.is(tok::string_literal)) + return true; + return (isBinaryOperator(Left) && Left.isNot(tok::lessless)) || + Left.isOneOf(tok::comma, tok::coloncolon, tok::semi, tok::l_brace) || + Right.isOneOf(tok::lessless, tok::arrow, tok::period, tok::colon) || + (Left.is(tok::r_paren) && Left.Type != TT_CastRParen && + Right.isOneOf(tok::identifier, tok::kw___attribute)) || + (Left.is(tok::l_paren) && !Right.is(tok::r_paren)) || + (Left.is(tok::l_square) && !Right.is(tok::r_square)); +} + +} // namespace format +} // namespace clang diff --git a/lib/Format/TokenAnnotator.h b/lib/Format/TokenAnnotator.h new file mode 100644 index 0000000..c41ee33 --- /dev/null +++ b/lib/Format/TokenAnnotator.h @@ -0,0 +1,262 @@ +//===--- TokenAnnotator.h - Format C++ code ---------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements a token annotator, i.e. creates +/// \c AnnotatedTokens out of \c FormatTokens with required extra information. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_FORMAT_TOKEN_ANNOTATOR_H +#define LLVM_CLANG_FORMAT_TOKEN_ANNOTATOR_H + +#include "UnwrappedLineParser.h" +#include "clang/Basic/OperatorPrecedence.h" +#include "clang/Format/Format.h" +#include <string> + +namespace clang { +class Lexer; +class SourceManager; + +namespace format { + +enum TokenType { + TT_BinaryOperator, + TT_BlockComment, + TT_CastRParen, + TT_ConditionalExpr, + TT_CtorInitializerColon, + TT_ImplicitStringLiteral, + TT_InlineASMColon, + TT_InheritanceColon, + TT_LineComment, + TT_ObjCArrayLiteral, + TT_ObjCBlockLParen, + TT_ObjCDecl, + TT_ObjCForIn, + TT_ObjCMethodExpr, + TT_ObjCMethodSpecifier, + TT_ObjCProperty, + TT_ObjCSelectorName, + TT_OverloadedOperatorLParen, + TT_PointerOrReference, + TT_PureVirtualSpecifier, + TT_RangeBasedForLoopColon, + TT_StartOfName, + TT_TemplateCloser, + TT_TemplateOpener, + TT_TrailingUnaryOperator, + TT_UnaryOperator, + TT_Unknown +}; + +enum LineType { + LT_Invalid, + LT_Other, + LT_BuilderTypeCall, + LT_PreprocessorDirective, + LT_VirtualFunctionDecl, + LT_ObjCDecl, // An @interface, @implementation, or @protocol line. + LT_ObjCMethodDecl, + LT_ObjCProperty // An @property line. +}; + +class AnnotatedToken { +public: + explicit AnnotatedToken(const FormatToken &FormatTok) + : FormatTok(FormatTok), Type(TT_Unknown), SpacesRequiredBefore(0), + CanBreakBefore(false), MustBreakBefore(false), + ClosesTemplateDeclaration(false), MatchingParen(NULL), + ParameterCount(0), BindingStrength(0), SplitPenalty(0), + LongestObjCSelectorName(0), Parent(NULL), FakeLParens(0), + FakeRParens(0), LastInChainOfCalls(false), + PartOfMultiVariableDeclStmt(false) {} + + bool is(tok::TokenKind Kind) const { return FormatTok.Tok.is(Kind); } + + bool isOneOf(tok::TokenKind K1, tok::TokenKind K2) const { + return is(K1) || is(K2); + } + + bool isOneOf(tok::TokenKind K1, tok::TokenKind K2, tok::TokenKind K3) const { + return is(K1) || is(K2) || is(K3); + } + + bool isOneOf( + tok::TokenKind K1, tok::TokenKind K2, tok::TokenKind K3, + tok::TokenKind K4, tok::TokenKind K5 = tok::NUM_TOKENS, + tok::TokenKind K6 = tok::NUM_TOKENS, tok::TokenKind K7 = tok::NUM_TOKENS, + tok::TokenKind K8 = tok::NUM_TOKENS, tok::TokenKind K9 = tok::NUM_TOKENS, + tok::TokenKind K10 = tok::NUM_TOKENS, + tok::TokenKind K11 = tok::NUM_TOKENS, + tok::TokenKind K12 = tok::NUM_TOKENS) const { + return is(K1) || is(K2) || is(K3) || is(K4) || is(K5) || is(K6) || is(K7) || + is(K8) || is(K9) || is(K10) || is(K11) || is(K12); + } + + bool isNot(tok::TokenKind Kind) const { return FormatTok.Tok.isNot(Kind); } + + bool isObjCAtKeyword(tok::ObjCKeywordKind Kind) const { + return FormatTok.Tok.isObjCAtKeyword(Kind); + } + + bool isAccessSpecifier(bool ColonRequired = true) const { + return isOneOf(tok::kw_public, tok::kw_protected, tok::kw_private) && + (!ColonRequired || + (!Children.empty() && Children[0].is(tok::colon))); + } + + bool isObjCAccessSpecifier() const { + return is(tok::at) && !Children.empty() && + (Children[0].isObjCAtKeyword(tok::objc_public) || + Children[0].isObjCAtKeyword(tok::objc_protected) || + Children[0].isObjCAtKeyword(tok::objc_package) || + Children[0].isObjCAtKeyword(tok::objc_private)); + } + + FormatToken FormatTok; + + TokenType Type; + + unsigned SpacesRequiredBefore; + bool CanBreakBefore; + bool MustBreakBefore; + + bool ClosesTemplateDeclaration; + + AnnotatedToken *MatchingParen; + + /// \brief Number of parameters, if this is "(", "[" or "<". + /// + /// This is initialized to 1 as we don't need to distinguish functions with + /// 0 parameters from functions with 1 parameter. Thus, we can simply count + /// the number of commas. + unsigned ParameterCount; + + /// \brief The total length of the line up to and including this token. + unsigned TotalLength; + + // FIXME: Come up with a 'cleaner' concept. + /// \brief The binding strength of a token. This is a combined value of + /// operator precedence, parenthesis nesting, etc. + unsigned BindingStrength; + + /// \brief Penalty for inserting a line break before this token. + unsigned SplitPenalty; + + /// \brief If this is the first ObjC selector name in an ObjC method + /// definition or call, this contains the length of the longest name. + unsigned LongestObjCSelectorName; + + std::vector<AnnotatedToken> Children; + AnnotatedToken *Parent; + + /// \brief Insert this many fake ( before this token for correct indentation. + unsigned FakeLParens; + /// \brief Insert this many fake ) after this token for correct indentation. + unsigned FakeRParens; + + /// \brief Is this the last "." or "->" in a builder-type call? + bool LastInChainOfCalls; + + /// \brief Is this token part of a \c DeclStmt defining multiple variables? + /// + /// Only set if \c Type == \c TT_StartOfName. + bool PartOfMultiVariableDeclStmt; + + const AnnotatedToken *getPreviousNoneComment() const { + AnnotatedToken *Tok = Parent; + while (Tok != NULL && Tok->is(tok::comment)) + Tok = Tok->Parent; + return Tok; + } +}; + +class AnnotatedLine { +public: + AnnotatedLine(const UnwrappedLine &Line) + : First(Line.Tokens.front()), Level(Line.Level), + InPPDirective(Line.InPPDirective), + MustBeDeclaration(Line.MustBeDeclaration), + MightBeFunctionDecl(false) { + assert(!Line.Tokens.empty()); + AnnotatedToken *Current = &First; + for (std::list<FormatToken>::const_iterator I = ++Line.Tokens.begin(), + E = Line.Tokens.end(); + I != E; ++I) { + Current->Children.push_back(AnnotatedToken(*I)); + Current->Children[0].Parent = Current; + Current = &Current->Children[0]; + } + Last = Current; + } + AnnotatedLine(const AnnotatedLine &Other) + : First(Other.First), Type(Other.Type), Level(Other.Level), + InPPDirective(Other.InPPDirective), + MustBeDeclaration(Other.MustBeDeclaration), + MightBeFunctionDecl(Other.MightBeFunctionDecl) { + Last = &First; + while (!Last->Children.empty()) { + Last->Children[0].Parent = Last; + Last = &Last->Children[0]; + } + } + + AnnotatedToken First; + AnnotatedToken *Last; + + LineType Type; + unsigned Level; + bool InPPDirective; + bool MustBeDeclaration; + bool MightBeFunctionDecl; +}; + +inline prec::Level getPrecedence(const AnnotatedToken &Tok) { + return getBinOpPrecedence(Tok.FormatTok.Tok.getKind(), true, true); +} + +/// \brief Determines extra information about the tokens comprising an +/// \c UnwrappedLine. +class TokenAnnotator { +public: + TokenAnnotator(const FormatStyle &Style, SourceManager &SourceMgr, Lexer &Lex, + IdentifierInfo &Ident_in) + : Style(Style), SourceMgr(SourceMgr), Lex(Lex), Ident_in(Ident_in) { + } + + void annotate(AnnotatedLine &Line); + void calculateFormattingInformation(AnnotatedLine &Line); + +private: + /// \brief Calculate the penalty for splitting before \c Tok. + unsigned splitPenalty(const AnnotatedLine &Line, const AnnotatedToken &Tok); + + bool spaceRequiredBetween(const AnnotatedLine &Line, + const AnnotatedToken &Left, + const AnnotatedToken &Right); + + bool spaceRequiredBefore(const AnnotatedLine &Line, + const AnnotatedToken &Tok); + + bool canBreakBefore(const AnnotatedLine &Line, const AnnotatedToken &Right); + + const FormatStyle &Style; + SourceManager &SourceMgr; + Lexer &Lex; + + // Contextual keywords: + IdentifierInfo &Ident_in; +}; + +} // end namespace format +} // end namespace clang + +#endif // LLVM_CLANG_FORMAT_TOKEN_ANNOTATOR_H diff --git a/lib/Format/UnwrappedLineParser.cpp b/lib/Format/UnwrappedLineParser.cpp new file mode 100644 index 0000000..89a391b --- /dev/null +++ b/lib/Format/UnwrappedLineParser.cpp @@ -0,0 +1,858 @@ +//===--- UnwrappedLineParser.cpp - Format C++ code ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file contains the implementation of the UnwrappedLineParser, +/// which turns a stream of tokens into UnwrappedLines. +/// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "format-parser" + +#include "UnwrappedLineParser.h" +#include "clang/Basic/Diagnostic.h" +#include "llvm/Support/Debug.h" + +namespace clang { +namespace format { + +class ScopedDeclarationState { +public: + ScopedDeclarationState(UnwrappedLine &Line, std::vector<bool> &Stack, + bool MustBeDeclaration) + : Line(Line), Stack(Stack) { + Line.MustBeDeclaration = MustBeDeclaration; + Stack.push_back(MustBeDeclaration); + } + ~ScopedDeclarationState() { + Stack.pop_back(); + if (!Stack.empty()) + Line.MustBeDeclaration = Stack.back(); + else + Line.MustBeDeclaration = true; + } +private: + UnwrappedLine &Line; + std::vector<bool> &Stack; +}; + +class ScopedMacroState : public FormatTokenSource { +public: + ScopedMacroState(UnwrappedLine &Line, FormatTokenSource *&TokenSource, + FormatToken &ResetToken) + : Line(Line), TokenSource(TokenSource), ResetToken(ResetToken), + PreviousLineLevel(Line.Level), PreviousTokenSource(TokenSource) { + TokenSource = this; + Line.Level = 0; + Line.InPPDirective = true; + } + + ~ScopedMacroState() { + TokenSource = PreviousTokenSource; + ResetToken = Token; + Line.InPPDirective = false; + Line.Level = PreviousLineLevel; + } + + virtual FormatToken getNextToken() { + // The \c UnwrappedLineParser guards against this by never calling + // \c getNextToken() after it has encountered the first eof token. + assert(!eof()); + Token = PreviousTokenSource->getNextToken(); + if (eof()) + return createEOF(); + return Token; + } + +private: + bool eof() { return Token.NewlinesBefore > 0 && Token.HasUnescapedNewline; } + + FormatToken createEOF() { + FormatToken FormatTok; + FormatTok.Tok.startToken(); + FormatTok.Tok.setKind(tok::eof); + return FormatTok; + } + + UnwrappedLine &Line; + FormatTokenSource *&TokenSource; + FormatToken &ResetToken; + unsigned PreviousLineLevel; + FormatTokenSource *PreviousTokenSource; + + FormatToken Token; +}; + +class ScopedLineState { +public: + ScopedLineState(UnwrappedLineParser &Parser, + bool SwitchToPreprocessorLines = false) + : Parser(Parser), SwitchToPreprocessorLines(SwitchToPreprocessorLines) { + if (SwitchToPreprocessorLines) + Parser.CurrentLines = &Parser.PreprocessorDirectives; + PreBlockLine = Parser.Line.take(); + Parser.Line.reset(new UnwrappedLine()); + Parser.Line->Level = PreBlockLine->Level; + Parser.Line->InPPDirective = PreBlockLine->InPPDirective; + } + + ~ScopedLineState() { + if (!Parser.Line->Tokens.empty()) { + Parser.addUnwrappedLine(); + } + assert(Parser.Line->Tokens.empty()); + Parser.Line.reset(PreBlockLine); + Parser.MustBreakBeforeNextToken = true; + if (SwitchToPreprocessorLines) + Parser.CurrentLines = &Parser.Lines; + } + +private: + UnwrappedLineParser &Parser; + const bool SwitchToPreprocessorLines; + + UnwrappedLine *PreBlockLine; +}; + +UnwrappedLineParser::UnwrappedLineParser( + clang::DiagnosticsEngine &Diag, const FormatStyle &Style, + FormatTokenSource &Tokens, UnwrappedLineConsumer &Callback) + : Line(new UnwrappedLine), MustBreakBeforeNextToken(false), + CurrentLines(&Lines), Diag(Diag), Style(Style), Tokens(&Tokens), + Callback(Callback) {} + +bool UnwrappedLineParser::parse() { + DEBUG(llvm::dbgs() << "----\n"); + readToken(); + bool Error = parseFile(); + for (std::vector<UnwrappedLine>::iterator I = Lines.begin(), E = Lines.end(); + I != E; ++I) { + Callback.consumeUnwrappedLine(*I); + } + + // Create line with eof token. + pushToken(FormatTok); + Callback.consumeUnwrappedLine(*Line); + + return Error; +} + +bool UnwrappedLineParser::parseFile() { + ScopedDeclarationState DeclarationState( + *Line, DeclarationScopeStack, + /*MustBeDeclaration=*/ !Line->InPPDirective); + bool Error = parseLevel(/*HasOpeningBrace=*/ false); + // Make sure to format the remaining tokens. + flushComments(true); + addUnwrappedLine(); + return Error; +} + +bool UnwrappedLineParser::parseLevel(bool HasOpeningBrace) { + bool Error = false; + do { + switch (FormatTok.Tok.getKind()) { + case tok::comment: + nextToken(); + addUnwrappedLine(); + break; + case tok::l_brace: + // FIXME: Add parameter whether this can happen - if this happens, we must + // be in a non-declaration context. + Error |= parseBlock(/*MustBeDeclaration=*/ false); + addUnwrappedLine(); + break; + case tok::r_brace: + if (HasOpeningBrace) { + return false; + } else { + Diag.Report(FormatTok.Tok.getLocation(), + Diag.getCustomDiagID(clang::DiagnosticsEngine::Error, + "unexpected '}'")); + Error = true; + nextToken(); + addUnwrappedLine(); + } + break; + default: + parseStructuralElement(); + break; + } + } while (!eof()); + return Error; +} + +bool UnwrappedLineParser::parseBlock(bool MustBeDeclaration, + unsigned AddLevels) { + assert(FormatTok.Tok.is(tok::l_brace) && "'{' expected"); + nextToken(); + + addUnwrappedLine(); + + ScopedDeclarationState DeclarationState(*Line, DeclarationScopeStack, + MustBeDeclaration); + Line->Level += AddLevels; + parseLevel(/*HasOpeningBrace=*/ true); + + if (!FormatTok.Tok.is(tok::r_brace)) { + Line->Level -= AddLevels; + return true; + } + + nextToken(); // Munch the closing brace. + Line->Level -= AddLevels; + return false; +} + +void UnwrappedLineParser::parsePPDirective() { + assert(FormatTok.Tok.is(tok::hash) && "'#' expected"); + ScopedMacroState MacroState(*Line, Tokens, FormatTok); + nextToken(); + + if (FormatTok.Tok.getIdentifierInfo() == NULL) { + parsePPUnknown(); + return; + } + + switch (FormatTok.Tok.getIdentifierInfo()->getPPKeywordID()) { + case tok::pp_define: + parsePPDefine(); + break; + default: + parsePPUnknown(); + break; + } +} + +void UnwrappedLineParser::parsePPDefine() { + nextToken(); + + if (FormatTok.Tok.getKind() != tok::identifier) { + parsePPUnknown(); + return; + } + nextToken(); + if (FormatTok.Tok.getKind() == tok::l_paren && + FormatTok.WhiteSpaceLength == 0) { + parseParens(); + } + addUnwrappedLine(); + Line->Level = 1; + + // Errors during a preprocessor directive can only affect the layout of the + // preprocessor directive, and thus we ignore them. An alternative approach + // would be to use the same approach we use on the file level (no + // re-indentation if there was a structural error) within the macro + // definition. + parseFile(); +} + +void UnwrappedLineParser::parsePPUnknown() { + do { + nextToken(); + } while (!eof()); + addUnwrappedLine(); +} + +void UnwrappedLineParser::parseStructuralElement() { + assert(!FormatTok.Tok.is(tok::l_brace)); + int TokenNumber = 0; + switch (FormatTok.Tok.getKind()) { + case tok::at: + nextToken(); + if (FormatTok.Tok.is(tok::l_brace)) { + parseBracedList(); + break; + } + switch (FormatTok.Tok.getObjCKeywordID()) { + case tok::objc_public: + case tok::objc_protected: + case tok::objc_package: + case tok::objc_private: + return parseAccessSpecifier(); + case tok::objc_interface: + case tok::objc_implementation: + return parseObjCInterfaceOrImplementation(); + case tok::objc_protocol: + return parseObjCProtocol(); + case tok::objc_end: + return; // Handled by the caller. + case tok::objc_optional: + case tok::objc_required: + nextToken(); + addUnwrappedLine(); + return; + default: + break; + } + break; + case tok::kw_namespace: + parseNamespace(); + return; + case tok::kw_inline: + nextToken(); + TokenNumber++; + if (FormatTok.Tok.is(tok::kw_namespace)) { + parseNamespace(); + return; + } + break; + case tok::kw_public: + case tok::kw_protected: + case tok::kw_private: + parseAccessSpecifier(); + return; + case tok::kw_if: + parseIfThenElse(); + return; + case tok::kw_for: + case tok::kw_while: + parseForOrWhileLoop(); + return; + case tok::kw_do: + parseDoWhile(); + return; + case tok::kw_switch: + parseSwitch(); + return; + case tok::kw_default: + nextToken(); + parseLabel(); + return; + case tok::kw_case: + parseCaseLabel(); + return; + case tok::kw_return: + parseReturn(); + return; + case tok::kw_extern: + nextToken(); + if (FormatTok.Tok.is(tok::string_literal)) { + nextToken(); + if (FormatTok.Tok.is(tok::l_brace)) { + parseBlock(/*MustBeDeclaration=*/ true, 0); + addUnwrappedLine(); + return; + } + } + // In all other cases, parse the declaration. + break; + default: + break; + } + do { + ++TokenNumber; + switch (FormatTok.Tok.getKind()) { + case tok::at: + nextToken(); + if (FormatTok.Tok.is(tok::l_brace)) + parseBracedList(); + break; + case tok::kw_enum: + parseEnum(); + break; + case tok::kw_struct: + case tok::kw_union: + case tok::kw_class: + parseRecord(); + // A record declaration or definition is always the start of a structural + // element. + break; + case tok::semi: + nextToken(); + addUnwrappedLine(); + return; + case tok::r_brace: + addUnwrappedLine(); + return; + case tok::l_paren: + parseParens(); + break; + case tok::l_brace: + // A block outside of parentheses must be the last part of a + // structural element. + // FIXME: Figure out cases where this is not true, and add projections for + // them (the one we know is missing are lambdas). + parseBlock(/*MustBeDeclaration=*/ false); + addUnwrappedLine(); + return; + case tok::identifier: + nextToken(); + if (TokenNumber == 1 && FormatTok.Tok.is(tok::colon)) { + parseLabel(); + return; + } + break; + case tok::equal: + nextToken(); + if (FormatTok.Tok.is(tok::l_brace)) { + parseBracedList(); + } + break; + default: + nextToken(); + break; + } + } while (!eof()); +} + +void UnwrappedLineParser::parseBracedList() { + nextToken(); + + do { + switch (FormatTok.Tok.getKind()) { + case tok::l_brace: + parseBracedList(); + break; + case tok::r_brace: + nextToken(); + return; + default: + nextToken(); + break; + } + } while (!eof()); +} + +void UnwrappedLineParser::parseReturn() { + nextToken(); + + do { + switch (FormatTok.Tok.getKind()) { + case tok::l_brace: + parseBracedList(); + break; + case tok::l_paren: + parseParens(); + break; + case tok::r_brace: + // Assume missing ';'. + addUnwrappedLine(); + return; + case tok::semi: + nextToken(); + addUnwrappedLine(); + return; + default: + nextToken(); + break; + } + } while (!eof()); +} + +void UnwrappedLineParser::parseParens() { + assert(FormatTok.Tok.is(tok::l_paren) && "'(' expected."); + nextToken(); + do { + switch (FormatTok.Tok.getKind()) { + case tok::l_paren: + parseParens(); + break; + case tok::r_paren: + nextToken(); + return; + case tok::l_brace: { + nextToken(); + ScopedLineState LineState(*this); + ScopedDeclarationState DeclarationState(*Line, DeclarationScopeStack, + /*MustBeDeclaration=*/ false); + Line->Level += 1; + parseLevel(/*HasOpeningBrace=*/ true); + Line->Level -= 1; + break; + } + case tok::at: + nextToken(); + if (FormatTok.Tok.is(tok::l_brace)) + parseBracedList(); + break; + default: + nextToken(); + break; + } + } while (!eof()); +} + +void UnwrappedLineParser::parseIfThenElse() { + assert(FormatTok.Tok.is(tok::kw_if) && "'if' expected"); + nextToken(); + if (FormatTok.Tok.is(tok::l_paren)) + parseParens(); + bool NeedsUnwrappedLine = false; + if (FormatTok.Tok.is(tok::l_brace)) { + parseBlock(/*MustBeDeclaration=*/ false); + NeedsUnwrappedLine = true; + } else { + addUnwrappedLine(); + ++Line->Level; + parseStructuralElement(); + --Line->Level; + } + if (FormatTok.Tok.is(tok::kw_else)) { + nextToken(); + if (FormatTok.Tok.is(tok::l_brace)) { + parseBlock(/*MustBeDeclaration=*/ false); + addUnwrappedLine(); + } else if (FormatTok.Tok.is(tok::kw_if)) { + parseIfThenElse(); + } else { + addUnwrappedLine(); + ++Line->Level; + parseStructuralElement(); + --Line->Level; + } + } else if (NeedsUnwrappedLine) { + addUnwrappedLine(); + } +} + +void UnwrappedLineParser::parseNamespace() { + assert(FormatTok.Tok.is(tok::kw_namespace) && "'namespace' expected"); + nextToken(); + if (FormatTok.Tok.is(tok::identifier)) + nextToken(); + if (FormatTok.Tok.is(tok::l_brace)) { + parseBlock(/*MustBeDeclaration=*/ true, 0); + // Munch the semicolon after a namespace. This is more common than one would + // think. Puttin the semicolon into its own line is very ugly. + if (FormatTok.Tok.is(tok::semi)) + nextToken(); + addUnwrappedLine(); + } + // FIXME: Add error handling. +} + +void UnwrappedLineParser::parseForOrWhileLoop() { + assert((FormatTok.Tok.is(tok::kw_for) || FormatTok.Tok.is(tok::kw_while)) && + "'for' or 'while' expected"); + nextToken(); + if (FormatTok.Tok.is(tok::l_paren)) + parseParens(); + if (FormatTok.Tok.is(tok::l_brace)) { + parseBlock(/*MustBeDeclaration=*/ false); + addUnwrappedLine(); + } else { + addUnwrappedLine(); + ++Line->Level; + parseStructuralElement(); + --Line->Level; + } +} + +void UnwrappedLineParser::parseDoWhile() { + assert(FormatTok.Tok.is(tok::kw_do) && "'do' expected"); + nextToken(); + if (FormatTok.Tok.is(tok::l_brace)) { + parseBlock(/*MustBeDeclaration=*/ false); + } else { + addUnwrappedLine(); + ++Line->Level; + parseStructuralElement(); + --Line->Level; + } + + // FIXME: Add error handling. + if (!FormatTok.Tok.is(tok::kw_while)) { + addUnwrappedLine(); + return; + } + + nextToken(); + parseStructuralElement(); +} + +void UnwrappedLineParser::parseLabel() { + if (FormatTok.Tok.isNot(tok::colon)) + return; + nextToken(); + unsigned OldLineLevel = Line->Level; + if (Line->Level > 1 || (!Line->InPPDirective && Line->Level > 0)) + --Line->Level; + if (CommentsBeforeNextToken.empty() && FormatTok.Tok.is(tok::l_brace)) { + parseBlock(/*MustBeDeclaration=*/ false); + if (FormatTok.Tok.is(tok::kw_break)) + parseStructuralElement(); // "break;" after "}" goes on the same line. + } + addUnwrappedLine(); + Line->Level = OldLineLevel; +} + +void UnwrappedLineParser::parseCaseLabel() { + assert(FormatTok.Tok.is(tok::kw_case) && "'case' expected"); + // FIXME: fix handling of complex expressions here. + do { + nextToken(); + } while (!eof() && !FormatTok.Tok.is(tok::colon)); + parseLabel(); +} + +void UnwrappedLineParser::parseSwitch() { + assert(FormatTok.Tok.is(tok::kw_switch) && "'switch' expected"); + nextToken(); + if (FormatTok.Tok.is(tok::l_paren)) + parseParens(); + if (FormatTok.Tok.is(tok::l_brace)) { + parseBlock(/*MustBeDeclaration=*/ false, Style.IndentCaseLabels ? 2 : 1); + addUnwrappedLine(); + } else { + addUnwrappedLine(); + Line->Level += (Style.IndentCaseLabels ? 2 : 1); + parseStructuralElement(); + Line->Level -= (Style.IndentCaseLabels ? 2 : 1); + } +} + +void UnwrappedLineParser::parseAccessSpecifier() { + nextToken(); + // Otherwise, we don't know what it is, and we'd better keep the next token. + if (FormatTok.Tok.is(tok::colon)) + nextToken(); + addUnwrappedLine(); +} + +void UnwrappedLineParser::parseEnum() { + nextToken(); + if (FormatTok.Tok.is(tok::identifier) || + FormatTok.Tok.is(tok::kw___attribute) || + FormatTok.Tok.is(tok::kw___declspec)) { + nextToken(); + // We can have macros or attributes in between 'enum' and the enum name. + if (FormatTok.Tok.is(tok::l_paren)) { + parseParens(); + } + if (FormatTok.Tok.is(tok::identifier)) + nextToken(); + } + if (FormatTok.Tok.is(tok::l_brace)) { + nextToken(); + addUnwrappedLine(); + ++Line->Level; + do { + switch (FormatTok.Tok.getKind()) { + case tok::l_paren: + parseParens(); + break; + case tok::r_brace: + addUnwrappedLine(); + nextToken(); + --Line->Level; + return; + case tok::comma: + nextToken(); + addUnwrappedLine(); + break; + default: + nextToken(); + break; + } + } while (!eof()); + } + // We fall through to parsing a structural element afterwards, so that in + // enum A {} n, m; + // "} n, m;" will end up in one unwrapped line. +} + +void UnwrappedLineParser::parseRecord() { + nextToken(); + if (FormatTok.Tok.is(tok::identifier) || + FormatTok.Tok.is(tok::kw___attribute) || + FormatTok.Tok.is(tok::kw___declspec)) { + nextToken(); + // We can have macros or attributes in between 'class' and the class name. + if (FormatTok.Tok.is(tok::l_paren)) { + parseParens(); + } + // The actual identifier can be a nested name specifier, and in macros + // it is often token-pasted. + while (FormatTok.Tok.is(tok::identifier) || + FormatTok.Tok.is(tok::coloncolon) || FormatTok.Tok.is(tok::hashhash)) + nextToken(); + + // Note that parsing away template declarations here leads to incorrectly + // accepting function declarations as record declarations. + // In general, we cannot solve this problem. Consider: + // class A<int> B() {} + // which can be a function definition or a class definition when B() is a + // macro. If we find enough real-world cases where this is a problem, we + // can parse for the 'template' keyword in the beginning of the statement, + // and thus rule out the record production in case there is no template + // (this would still leave us with an ambiguity between template function + // and class declarations). + if (FormatTok.Tok.is(tok::colon) || FormatTok.Tok.is(tok::less)) { + while (!eof() && FormatTok.Tok.isNot(tok::l_brace)) { + if (FormatTok.Tok.is(tok::semi)) + return; + nextToken(); + } + } + } + if (FormatTok.Tok.is(tok::l_brace)) + parseBlock(/*MustBeDeclaration=*/ true); + // We fall through to parsing a structural element afterwards, so + // class A {} n, m; + // will end up in one unwrapped line. +} + +void UnwrappedLineParser::parseObjCProtocolList() { + assert(FormatTok.Tok.is(tok::less) && "'<' expected."); + do + nextToken(); + while (!eof() && FormatTok.Tok.isNot(tok::greater)); + nextToken(); // Skip '>'. +} + +void UnwrappedLineParser::parseObjCUntilAtEnd() { + do { + if (FormatTok.Tok.isObjCAtKeyword(tok::objc_end)) { + nextToken(); + addUnwrappedLine(); + break; + } + parseStructuralElement(); + } while (!eof()); +} + +void UnwrappedLineParser::parseObjCInterfaceOrImplementation() { + nextToken(); + nextToken(); // interface name + + // @interface can be followed by either a base class, or a category. + if (FormatTok.Tok.is(tok::colon)) { + nextToken(); + nextToken(); // base class name + } else if (FormatTok.Tok.is(tok::l_paren)) + // Skip category, if present. + parseParens(); + + if (FormatTok.Tok.is(tok::less)) + parseObjCProtocolList(); + + // If instance variables are present, keep the '{' on the first line too. + if (FormatTok.Tok.is(tok::l_brace)) + parseBlock(/*MustBeDeclaration=*/ true); + + // With instance variables, this puts '}' on its own line. Without instance + // variables, this ends the @interface line. + addUnwrappedLine(); + + parseObjCUntilAtEnd(); +} + +void UnwrappedLineParser::parseObjCProtocol() { + nextToken(); + nextToken(); // protocol name + + if (FormatTok.Tok.is(tok::less)) + parseObjCProtocolList(); + + // Check for protocol declaration. + if (FormatTok.Tok.is(tok::semi)) { + nextToken(); + return addUnwrappedLine(); + } + + addUnwrappedLine(); + parseObjCUntilAtEnd(); +} + +void UnwrappedLineParser::addUnwrappedLine() { + if (Line->Tokens.empty()) + return; + DEBUG({ + llvm::dbgs() << "Line(" << Line->Level << ")" + << (Line->InPPDirective ? " MACRO" : "") << ": "; + for (std::list<FormatToken>::iterator I = Line->Tokens.begin(), + E = Line->Tokens.end(); + I != E; ++I) { + llvm::dbgs() << I->Tok.getName() << " "; + + } + llvm::dbgs() << "\n"; + }); + CurrentLines->push_back(*Line); + Line->Tokens.clear(); + if (CurrentLines == &Lines && !PreprocessorDirectives.empty()) { + for (std::vector<UnwrappedLine>::iterator + I = PreprocessorDirectives.begin(), + E = PreprocessorDirectives.end(); + I != E; ++I) { + CurrentLines->push_back(*I); + } + PreprocessorDirectives.clear(); + } +} + +bool UnwrappedLineParser::eof() const { return FormatTok.Tok.is(tok::eof); } + +void UnwrappedLineParser::flushComments(bool NewlineBeforeNext) { + bool JustComments = Line->Tokens.empty(); + for (SmallVectorImpl<FormatToken>::const_iterator + I = CommentsBeforeNextToken.begin(), + E = CommentsBeforeNextToken.end(); + I != E; ++I) { + if (I->NewlinesBefore && JustComments) { + addUnwrappedLine(); + } + pushToken(*I); + } + if (NewlineBeforeNext && JustComments) { + addUnwrappedLine(); + } + CommentsBeforeNextToken.clear(); +} + +void UnwrappedLineParser::nextToken() { + if (eof()) + return; + flushComments(FormatTok.NewlinesBefore > 0); + pushToken(FormatTok); + readToken(); +} + +void UnwrappedLineParser::readToken() { + bool CommentsInCurrentLine = true; + do { + FormatTok = Tokens->getNextToken(); + while (!Line->InPPDirective && FormatTok.Tok.is(tok::hash) && + ((FormatTok.NewlinesBefore > 0 && FormatTok.HasUnescapedNewline) || + FormatTok.IsFirst)) { + // If there is an unfinished unwrapped line, we flush the preprocessor + // directives only after that unwrapped line was finished later. + bool SwitchToPreprocessorLines = + !Line->Tokens.empty() && CurrentLines == &Lines; + ScopedLineState BlockState(*this, SwitchToPreprocessorLines); + // Comments stored before the preprocessor directive need to be output + // before the preprocessor directive, at the same level as the + // preprocessor directive, as we consider them to apply to the directive. + flushComments(FormatTok.NewlinesBefore > 0); + parsePPDirective(); + } + if (!FormatTok.Tok.is(tok::comment)) + return; + if (FormatTok.NewlinesBefore > 0 || FormatTok.IsFirst) { + CommentsInCurrentLine = false; + } + if (CommentsInCurrentLine) { + pushToken(FormatTok); + } else { + CommentsBeforeNextToken.push_back(FormatTok); + } + } while (!eof()); +} + +void UnwrappedLineParser::pushToken(const FormatToken &Tok) { + Line->Tokens.push_back(Tok); + if (MustBreakBeforeNextToken) { + Line->Tokens.back().MustBreakBefore = true; + MustBreakBeforeNextToken = false; + } +} + +} // end namespace format +} // end namespace clang diff --git a/lib/Format/UnwrappedLineParser.h b/lib/Format/UnwrappedLineParser.h new file mode 100644 index 0000000..f4fecc5 --- /dev/null +++ b/lib/Format/UnwrappedLineParser.h @@ -0,0 +1,201 @@ +//===--- UnwrappedLineParser.h - Format C++ code ----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file contains the declaration of the UnwrappedLineParser, +/// which turns a stream of tokens into UnwrappedLines. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_FORMAT_UNWRAPPED_LINE_PARSER_H +#define LLVM_CLANG_FORMAT_UNWRAPPED_LINE_PARSER_H + +#include "clang/Basic/IdentifierTable.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Format/Format.h" +#include "clang/Lex/Lexer.h" +#include <list> + +namespace clang { + +class DiagnosticsEngine; + +namespace format { + +/// \brief A wrapper around a \c Token storing information about the +/// whitespace characters preceeding it. +struct FormatToken { + FormatToken() + : NewlinesBefore(0), HasUnescapedNewline(false), WhiteSpaceLength(0), + LastNewlineOffset(0), TokenLength(0), IsFirst(false), + MustBreakBefore(false) {} + + /// \brief The \c Token. + Token Tok; + + /// \brief The number of newlines immediately before the \c Token. + /// + /// This can be used to determine what the user wrote in the original code + /// and thereby e.g. leave an empty line between two function definitions. + unsigned NewlinesBefore; + + /// \brief Whether there is at least one unescaped newline before the \c + /// Token. + bool HasUnescapedNewline; + + /// \brief The location of the start of the whitespace immediately preceeding + /// the \c Token. + /// + /// Used together with \c WhiteSpaceLength to create a \c Replacement. + SourceLocation WhiteSpaceStart; + + /// \brief The length in characters of the whitespace immediately preceeding + /// the \c Token. + unsigned WhiteSpaceLength; + + /// \brief The offset just past the last '\n' in this token's leading + /// whitespace (relative to \c WhiteSpaceStart). 0 if there is no '\n'. + unsigned LastNewlineOffset; + + /// \brief The length of the non-whitespace parts of the token. This is + /// necessary because we need to handle escaped newlines that are stored + /// with the token. + unsigned TokenLength; + + /// \brief Indicates that this is the first token. + bool IsFirst; + + /// \brief Whether there must be a line break before this token. + /// + /// This happens for example when a preprocessor directive ended directly + /// before the token. + bool MustBreakBefore; +}; + +/// \brief An unwrapped line is a sequence of \c Token, that we would like to +/// put on a single line if there was no column limit. +/// +/// This is used as a main interface between the \c UnwrappedLineParser and the +/// \c UnwrappedLineFormatter. The key property is that changing the formatting +/// within an unwrapped line does not affect any other unwrapped lines. +struct UnwrappedLine { + UnwrappedLine() : Level(0), InPPDirective(false), MustBeDeclaration(false) { + } + + // FIXME: Don't use std::list here. + /// \brief The \c Tokens comprising this \c UnwrappedLine. + std::list<FormatToken> Tokens; + + /// \brief The indent level of the \c UnwrappedLine. + unsigned Level; + + /// \brief Whether this \c UnwrappedLine is part of a preprocessor directive. + bool InPPDirective; + + bool MustBeDeclaration; +}; + +class UnwrappedLineConsumer { +public: + virtual ~UnwrappedLineConsumer() { + } + virtual void consumeUnwrappedLine(const UnwrappedLine &Line) = 0; +}; + +class FormatTokenSource { +public: + virtual ~FormatTokenSource() { + } + virtual FormatToken getNextToken() = 0; +}; + +class UnwrappedLineParser { +public: + UnwrappedLineParser(clang::DiagnosticsEngine &Diag, const FormatStyle &Style, + FormatTokenSource &Tokens, + UnwrappedLineConsumer &Callback); + + /// Returns true in case of a structural error. + bool parse(); + +private: + bool parseFile(); + bool parseLevel(bool HasOpeningBrace); + bool parseBlock(bool MustBeDeclaration, unsigned AddLevels = 1); + void parsePPDirective(); + void parsePPDefine(); + void parsePPUnknown(); + void parseStructuralElement(); + void parseBracedList(); + void parseReturn(); + void parseParens(); + void parseIfThenElse(); + void parseForOrWhileLoop(); + void parseDoWhile(); + void parseLabel(); + void parseCaseLabel(); + void parseSwitch(); + void parseNamespace(); + void parseAccessSpecifier(); + void parseEnum(); + void parseRecord(); + void parseObjCProtocolList(); + void parseObjCUntilAtEnd(); + void parseObjCInterfaceOrImplementation(); + void parseObjCProtocol(); + void addUnwrappedLine(); + bool eof() const; + void nextToken(); + void readToken(); + void flushComments(bool NewlineBeforeNext); + void pushToken(const FormatToken &Tok); + + // FIXME: We are constantly running into bugs where Line.Level is incorrectly + // subtracted from beyond 0. Introduce a method to subtract from Line.Level + // and use that everywhere in the Parser. + OwningPtr<UnwrappedLine> Line; + + // Comments are sorted into unwrapped lines by whether they are in the same + // line as the previous token, or not. If not, they belong to the next token. + // Since the next token might already be in a new unwrapped line, we need to + // store the comments belonging to that token. + SmallVector<FormatToken, 1> CommentsBeforeNextToken; + FormatToken FormatTok; + bool MustBreakBeforeNextToken; + + // The parsed lines. Only added to through \c CurrentLines. + std::vector<UnwrappedLine> Lines; + + // Preprocessor directives are parsed out-of-order from other unwrapped lines. + // Thus, we need to keep a list of preprocessor directives to be reported + // after an unwarpped line that has been started was finished. + std::vector<UnwrappedLine> PreprocessorDirectives; + + // New unwrapped lines are added via CurrentLines. + // Usually points to \c &Lines. While parsing a preprocessor directive when + // there is an unfinished previous unwrapped line, will point to + // \c &PreprocessorDirectives. + std::vector<UnwrappedLine> *CurrentLines; + + // We store for each line whether it must be a declaration depending on + // whether we are in a compound statement or not. + std::vector<bool> DeclarationScopeStack; + + clang::DiagnosticsEngine &Diag; + const FormatStyle &Style; + FormatTokenSource *Tokens; + UnwrappedLineConsumer &Callback; + + friend class ScopedLineState; +}; + +} // end namespace format +} // end namespace clang + +#endif // LLVM_CLANG_FORMAT_UNWRAPPED_LINE_PARSER_H |