diff options
Diffstat (limited to 'contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp')
-rw-r--r-- | contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp | 901 |
1 files changed, 608 insertions, 293 deletions
diff --git a/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp index f5e3056..03dda8b 100644 --- a/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -26,6 +26,8 @@ //===----------------------------------------------------------------------===// #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "BranchFolding.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" @@ -62,10 +64,12 @@ static cl::opt<unsigned> AlignAllBlock("align-all-blocks", "blocks in the function."), cl::init(0), cl::Hidden); -static cl::opt<unsigned> - AlignAllLoops("align-all-loops", - cl::desc("Force the alignment of all loops in the function."), - cl::init(0), cl::Hidden); +static cl::opt<unsigned> AlignAllNonFallThruBlocks( + "align-all-nofallthru-blocks", + cl::desc("Force the alignment of all " + "blocks that have no fall-through predecessors (i.e. don't add " + "nops that are executed)."), + cl::init(0), cl::Hidden); // FIXME: Find a good default for this flag and remove the flag. static cl::opt<unsigned> ExitBlockBias( @@ -97,10 +101,15 @@ static cl::opt<bool> cl::desc("Model the cost of loop rotation more " "precisely by using profile data."), cl::init(false), cl::Hidden); +static cl::opt<bool> + ForcePreciseRotationCost("force-precise-rotation-cost", + cl::desc("Force the use of precise cost " + "loop rotation strategy."), + cl::init(false), cl::Hidden); static cl::opt<unsigned> MisfetchCost( "misfetch-cost", - cl::desc("Cost that models the probablistic risk of an instruction " + cl::desc("Cost that models the probabilistic risk of an instruction " "misfetch due to a jump comparing to falling through, whose cost " "is zero."), cl::init(1), cl::Hidden); @@ -109,6 +118,15 @@ static cl::opt<unsigned> JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden); +static cl::opt<bool> +BranchFoldPlacement("branch-fold-placement", + cl::desc("Perform branch folding during placement. " + "Reduces code size."), + cl::init(true), cl::Hidden); + +extern cl::opt<unsigned> StaticLikelyProb; +extern cl::opt<unsigned> ProfileLikelyProb; + namespace { class BlockChain; /// \brief Type for our function-wide basic block -> block chain mapping. @@ -149,7 +167,7 @@ public: /// function. It also registers itself as the chain that block participates /// in with the BlockToChain mapping. BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB) - : Blocks(1, BB), BlockToChain(BlockToChain), LoopPredecessors(0) { + : Blocks(1, BB), BlockToChain(BlockToChain), UnscheduledPredecessors(0) { assert(BB && "Cannot create a chain with a null basic block"); BlockToChain[BB] = this; } @@ -201,11 +219,16 @@ public: } #endif // NDEBUG - /// \brief Count of predecessors within the loop currently being processed. + /// \brief Count of predecessors of any block within the chain which have not + /// yet been scheduled. In general, we will delay scheduling this chain + /// until those predecessors are scheduled (or we find a sufficiently good + /// reason to override this heuristic.) Note that when forming loop chains, + /// blocks outside the loop are ignored and treated as if they were already + /// scheduled. /// - /// This count is updated at each loop we process to represent the number of - /// in-loop predecessors of this chain. - unsigned LoopPredecessors; + /// Note: This field is reinitialized multiple times - once for each loop, + /// and then once for the function as a whole. + unsigned UnscheduledPredecessors; }; } @@ -214,14 +237,21 @@ class MachineBlockPlacement : public MachineFunctionPass { /// \brief A typedef for a block filter set. typedef SmallPtrSet<MachineBasicBlock *, 16> BlockFilterSet; + /// \brief work lists of blocks that are ready to be laid out + SmallVector<MachineBasicBlock *, 16> BlockWorkList; + SmallVector<MachineBasicBlock *, 16> EHPadWorkList; + + /// \brief Machine Function + MachineFunction *F; + /// \brief A handle to the branch probability pass. const MachineBranchProbabilityInfo *MBPI; /// \brief A handle to the function-wide block frequency pass. - const MachineBlockFrequencyInfo *MBFI; + std::unique_ptr<BranchFolder::MBFIWrapper> MBFI; /// \brief A handle to the loop info. - const MachineLoopInfo *MLI; + MachineLoopInfo *MLI; /// \brief A handle to the target's instruction info. const TargetInstrInfo *TII; @@ -254,33 +284,56 @@ class MachineBlockPlacement : public MachineFunctionPass { DenseMap<MachineBasicBlock *, BlockChain *> BlockToChain; void markChainSuccessors(BlockChain &Chain, MachineBasicBlock *LoopHeaderBB, - SmallVectorImpl<MachineBasicBlock *> &BlockWorkList, const BlockFilterSet *BlockFilter = nullptr); + BranchProbability + collectViableSuccessors(MachineBasicBlock *BB, BlockChain &Chain, + const BlockFilterSet *BlockFilter, + SmallVector<MachineBasicBlock *, 4> &Successors); + bool shouldPredBlockBeOutlined(MachineBasicBlock *BB, MachineBasicBlock *Succ, + BlockChain &Chain, + const BlockFilterSet *BlockFilter, + BranchProbability SuccProb, + BranchProbability HotProb); + bool + hasBetterLayoutPredecessor(MachineBasicBlock *BB, MachineBasicBlock *Succ, + BlockChain &SuccChain, BranchProbability SuccProb, + BranchProbability RealSuccProb, BlockChain &Chain, + const BlockFilterSet *BlockFilter); MachineBasicBlock *selectBestSuccessor(MachineBasicBlock *BB, BlockChain &Chain, const BlockFilterSet *BlockFilter); MachineBasicBlock * selectBestCandidateBlock(BlockChain &Chain, - SmallVectorImpl<MachineBasicBlock *> &WorkList, - const BlockFilterSet *BlockFilter); + SmallVectorImpl<MachineBasicBlock *> &WorkList); MachineBasicBlock * - getFirstUnplacedBlock(MachineFunction &F, const BlockChain &PlacedChain, + getFirstUnplacedBlock(const BlockChain &PlacedChain, MachineFunction::iterator &PrevUnplacedBlockIt, const BlockFilterSet *BlockFilter); + + /// \brief Add a basic block to the work list if it is appropriate. + /// + /// If the optional parameter BlockFilter is provided, only MBB + /// present in the set will be added to the worklist. If nullptr + /// is provided, no filtering occurs. + void fillWorkLists(MachineBasicBlock *MBB, + SmallPtrSetImpl<BlockChain *> &UpdatedPreds, + const BlockFilterSet *BlockFilter); void buildChain(MachineBasicBlock *BB, BlockChain &Chain, - SmallVectorImpl<MachineBasicBlock *> &BlockWorkList, const BlockFilterSet *BlockFilter = nullptr); MachineBasicBlock *findBestLoopTop(MachineLoop &L, const BlockFilterSet &LoopBlockSet); - MachineBasicBlock *findBestLoopExit(MachineFunction &F, MachineLoop &L, + MachineBasicBlock *findBestLoopExit(MachineLoop &L, const BlockFilterSet &LoopBlockSet); - BlockFilterSet collectLoopBlockSet(MachineFunction &F, MachineLoop &L); - void buildLoopChains(MachineFunction &F, MachineLoop &L); + BlockFilterSet collectLoopBlockSet(MachineLoop &L); + void buildLoopChains(MachineLoop &L); void rotateLoop(BlockChain &LoopChain, MachineBasicBlock *ExitingBB, const BlockFilterSet &LoopBlockSet); void rotateLoopWithProfile(BlockChain &LoopChain, MachineLoop &L, const BlockFilterSet &LoopBlockSet); - void buildCFGChains(MachineFunction &F); + void collectMustExecuteBBs(); + void buildCFGChains(); + void optimizeBranches(); + void alignBlocks(); public: static char ID; // Pass identification, replacement for typeid @@ -295,6 +348,7 @@ public: AU.addRequired<MachineBlockFrequencyInfo>(); AU.addRequired<MachineDominatorTree>(); AU.addRequired<MachineLoopInfo>(); + AU.addRequired<TargetPassConfig>(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -319,18 +373,7 @@ static std::string getBlockName(MachineBasicBlock *BB) { std::string Result; raw_string_ostream OS(Result); OS << "BB#" << BB->getNumber(); - OS << " (derived from LLVM BB '" << BB->getName() << "')"; - OS.flush(); - return Result; -} - -/// \brief Helper to print the number of a MBB. -/// -/// Only used by debug logging. -static std::string getBlockNum(MachineBasicBlock *BB) { - std::string Result; - raw_string_ostream OS(Result); - OS << "BB#" << BB->getNumber(); + OS << " ('" << BB->getName() << "')"; OS.flush(); return Result; } @@ -344,7 +387,6 @@ static std::string getBlockNum(MachineBasicBlock *BB) { /// chain which reach the zero-predecessor state to the worklist passed in. void MachineBlockPlacement::markChainSuccessors( BlockChain &Chain, MachineBasicBlock *LoopHeaderBB, - SmallVectorImpl<MachineBasicBlock *> &BlockWorkList, const BlockFilterSet *BlockFilter) { // Walk all the blocks in this chain, marking their successors as having // a predecessor placed. @@ -363,30 +405,26 @@ void MachineBlockPlacement::markChainSuccessors( // This is a cross-chain edge that is within the loop, so decrement the // loop predecessor count of the destination chain. - if (SuccChain.LoopPredecessors > 0 && --SuccChain.LoopPredecessors == 0) - BlockWorkList.push_back(*SuccChain.begin()); + if (SuccChain.UnscheduledPredecessors == 0 || + --SuccChain.UnscheduledPredecessors > 0) + continue; + + auto *MBB = *SuccChain.begin(); + if (MBB->isEHPad()) + EHPadWorkList.push_back(MBB); + else + BlockWorkList.push_back(MBB); } } } -/// \brief Select the best successor for a block. -/// -/// This looks across all successors of a particular block and attempts to -/// select the "best" one to be the layout successor. It only considers direct -/// successors which also pass the block filter. It will attempt to avoid -/// breaking CFG structure, but cave and break such structures in the case of -/// very hot successor edges. -/// -/// \returns The best successor block found, or null if none are viable. -MachineBasicBlock * -MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, - BlockChain &Chain, - const BlockFilterSet *BlockFilter) { - const BranchProbability HotProb(4, 5); // 80% - - MachineBasicBlock *BestSucc = nullptr; - auto BestProb = BranchProbability::getZero(); - +/// This helper function collects the set of successors of block +/// \p BB that are allowed to be its layout successors, and return +/// the total branch probability of edges from \p BB to those +/// blocks. +BranchProbability MachineBlockPlacement::collectViableSuccessors( + MachineBasicBlock *BB, BlockChain &Chain, const BlockFilterSet *BlockFilter, + SmallVector<MachineBasicBlock *, 4> &Successors) { // Adjust edge probabilities by excluding edges pointing to blocks that is // either not in BlockFilter or is already in the current chain. Consider the // following CFG: @@ -400,20 +438,17 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after // A->C is chosen as a fall-through, D won't be selected as a successor of C // due to CFG constraint (the probability of C->D is not greater than - // HotProb). If we exclude E that is not in BlockFilter when calculating the - // probability of C->D, D will be selected and we will get A C D B as the - // layout of this loop. + // HotProb to break top-order). If we exclude E that is not in BlockFilter + // when calculating the probability of C->D, D will be selected and we + // will get A C D B as the layout of this loop. auto AdjustedSumProb = BranchProbability::getOne(); - SmallVector<MachineBasicBlock *, 4> Successors; for (MachineBasicBlock *Succ : BB->successors()) { bool SkipSucc = false; - if (BlockFilter && !BlockFilter->count(Succ)) { + if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) { SkipSucc = true; } else { BlockChain *SuccChain = BlockToChain[Succ]; if (SuccChain == &Chain) { - DEBUG(dbgs() << " " << getBlockName(Succ) - << " -> Already merged!\n"); SkipSucc = true; } else if (Succ != *SuccChain->begin()) { DEBUG(dbgs() << " " << getBlockName(Succ) << " -> Mid chain!\n"); @@ -426,78 +461,267 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, Successors.push_back(Succ); } - DEBUG(dbgs() << "Attempting merge from: " << getBlockName(BB) << "\n"); - for (MachineBasicBlock *Succ : Successors) { - BranchProbability SuccProb; - uint32_t SuccProbN = MBPI->getEdgeProbability(BB, Succ).getNumerator(); - uint32_t SuccProbD = AdjustedSumProb.getNumerator(); - if (SuccProbN >= SuccProbD) - SuccProb = BranchProbability::getOne(); - else - SuccProb = BranchProbability(SuccProbN, SuccProbD); - - // If we outline optional branches, look whether Succ is unavoidable, i.e. - // dominates all terminators of the MachineFunction. If it does, other - // successors must be optional. Don't do this for cold branches. - if (OutlineOptionalBranches && SuccProb > HotProb.getCompl() && - UnavoidableBlocks.count(Succ) > 0) { - auto HasShortOptionalBranch = [&]() { - for (MachineBasicBlock *Pred : Succ->predecessors()) { - // Check whether there is an unplaced optional branch. - if (Pred == Succ || (BlockFilter && !BlockFilter->count(Pred)) || - BlockToChain[Pred] == &Chain) - continue; - // Check whether the optional branch has exactly one BB. - if (Pred->pred_size() > 1 || *Pred->pred_begin() != BB) - continue; - // Check whether the optional branch is small. - if (Pred->size() < OutlineOptionalThreshold) - return true; - } + return AdjustedSumProb; +} + +/// The helper function returns the branch probability that is adjusted +/// or normalized over the new total \p AdjustedSumProb. +static BranchProbability +getAdjustedProbability(BranchProbability OrigProb, + BranchProbability AdjustedSumProb) { + BranchProbability SuccProb; + uint32_t SuccProbN = OrigProb.getNumerator(); + uint32_t SuccProbD = AdjustedSumProb.getNumerator(); + if (SuccProbN >= SuccProbD) + SuccProb = BranchProbability::getOne(); + else + SuccProb = BranchProbability(SuccProbN, SuccProbD); + + return SuccProb; +} + +/// When the option OutlineOptionalBranches is on, this method +/// checks if the fallthrough candidate block \p Succ (of block +/// \p BB) also has other unscheduled predecessor blocks which +/// are also successors of \p BB (forming triangular shape CFG). +/// If none of such predecessors are small, it returns true. +/// The caller can choose to select \p Succ as the layout successors +/// so that \p Succ's predecessors (optional branches) can be +/// outlined. +/// FIXME: fold this with more general layout cost analysis. +bool MachineBlockPlacement::shouldPredBlockBeOutlined( + MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &Chain, + const BlockFilterSet *BlockFilter, BranchProbability SuccProb, + BranchProbability HotProb) { + if (!OutlineOptionalBranches) + return false; + // If we outline optional branches, look whether Succ is unavoidable, i.e. + // dominates all terminators of the MachineFunction. If it does, other + // successors must be optional. Don't do this for cold branches. + if (SuccProb > HotProb.getCompl() && UnavoidableBlocks.count(Succ) > 0) { + for (MachineBasicBlock *Pred : Succ->predecessors()) { + // Check whether there is an unplaced optional branch. + if (Pred == Succ || (BlockFilter && !BlockFilter->count(Pred)) || + BlockToChain[Pred] == &Chain) + continue; + // Check whether the optional branch has exactly one BB. + if (Pred->pred_size() > 1 || *Pred->pred_begin() != BB) + continue; + // Check whether the optional branch is small. + if (Pred->size() < OutlineOptionalThreshold) return false; - }; - if (!HasShortOptionalBranch()) - return Succ; } + return true; + } else + return false; +} - // Only consider successors which are either "hot", or wouldn't violate - // any CFG constraints. - BlockChain &SuccChain = *BlockToChain[Succ]; - if (SuccChain.LoopPredecessors != 0) { - if (SuccProb < HotProb) { - DEBUG(dbgs() << " " << getBlockName(Succ) << " -> " << SuccProb - << " (prob) (CFG conflict)\n"); - continue; - } +// When profile is not present, return the StaticLikelyProb. +// When profile is available, we need to handle the triangle-shape CFG. +static BranchProbability getLayoutSuccessorProbThreshold( + MachineBasicBlock *BB) { + if (!BB->getParent()->getFunction()->getEntryCount()) + return BranchProbability(StaticLikelyProb, 100); + if (BB->succ_size() == 2) { + const MachineBasicBlock *Succ1 = *BB->succ_begin(); + const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1); + if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) { + /* See case 1 below for the cost analysis. For BB->Succ to + * be taken with smaller cost, the following needs to hold: + * Prob(BB->Succ) > 2* Prob(BB->Pred) + * So the threshold T + * T = 2 * (1-Prob(BB->Pred). Since T + Prob(BB->Pred) == 1, + * We have T + T/2 = 1, i.e. T = 2/3. Also adding user specified + * branch bias, we have + * T = (2/3)*(ProfileLikelyProb/50) + * = (2*ProfileLikelyProb)/150) + */ + return BranchProbability(2 * ProfileLikelyProb, 150); + } + } + return BranchProbability(ProfileLikelyProb, 100); +} - // Make sure that a hot successor doesn't have a globally more - // important predecessor. - auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ); - BlockFrequency CandidateEdgeFreq = - MBFI->getBlockFreq(BB) * RealSuccProb * HotProb.getCompl(); - bool BadCFGConflict = false; - for (MachineBasicBlock *Pred : Succ->predecessors()) { - if (Pred == Succ || (BlockFilter && !BlockFilter->count(Pred)) || - BlockToChain[Pred] == &Chain) - continue; - BlockFrequency PredEdgeFreq = - MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ); - if (PredEdgeFreq >= CandidateEdgeFreq) { - BadCFGConflict = true; - break; - } - } - if (BadCFGConflict) { - DEBUG(dbgs() << " " << getBlockName(Succ) << " -> " << SuccProb - << " (prob) (non-cold CFG conflict)\n"); - continue; - } +/// Checks to see if the layout candidate block \p Succ has a better layout +/// predecessor than \c BB. If yes, returns true. +bool MachineBlockPlacement::hasBetterLayoutPredecessor( + MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &SuccChain, + BranchProbability SuccProb, BranchProbability RealSuccProb, + BlockChain &Chain, const BlockFilterSet *BlockFilter) { + + // There isn't a better layout when there are no unscheduled predecessors. + if (SuccChain.UnscheduledPredecessors == 0) + return false; + + // There are two basic scenarios here: + // ------------------------------------- + // Case 1: triangular shape CFG (if-then): + // BB + // | \ + // | \ + // | Pred + // | / + // Succ + // In this case, we are evaluating whether to select edge -> Succ, e.g. + // set Succ as the layout successor of BB. Picking Succ as BB's + // successor breaks the CFG constraints (FIXME: define these constraints). + // With this layout, Pred BB + // is forced to be outlined, so the overall cost will be cost of the + // branch taken from BB to Pred, plus the cost of back taken branch + // from Pred to Succ, as well as the additional cost associated + // with the needed unconditional jump instruction from Pred To Succ. + + // The cost of the topological order layout is the taken branch cost + // from BB to Succ, so to make BB->Succ a viable candidate, the following + // must hold: + // 2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost + // < freq(BB->Succ) * taken_branch_cost. + // Ignoring unconditional jump cost, we get + // freq(BB->Succ) > 2 * freq(BB->Pred), i.e., + // prob(BB->Succ) > 2 * prob(BB->Pred) + // + // When real profile data is available, we can precisely compute the + // probability threshold that is needed for edge BB->Succ to be considered. + // Without profile data, the heuristic requires the branch bias to be + // a lot larger to make sure the signal is very strong (e.g. 80% default). + // ----------------------------------------------------------------- + // Case 2: diamond like CFG (if-then-else): + // S + // / \ + // | \ + // BB Pred + // \ / + // Succ + // .. + // + // The current block is BB and edge BB->Succ is now being evaluated. + // Note that edge S->BB was previously already selected because + // prob(S->BB) > prob(S->Pred). + // At this point, 2 blocks can be placed after BB: Pred or Succ. If we + // choose Pred, we will have a topological ordering as shown on the left + // in the picture below. If we choose Succ, we have the solution as shown + // on the right: + // + // topo-order: + // + // S----- ---S + // | | | | + // ---BB | | BB + // | | | | + // | pred-- | Succ-- + // | | | | + // ---succ ---pred-- + // + // cost = freq(S->Pred) + freq(BB->Succ) cost = 2 * freq (S->Pred) + // = freq(S->Pred) + freq(S->BB) + // + // If we have profile data (i.e, branch probabilities can be trusted), the + // cost (number of taken branches) with layout S->BB->Succ->Pred is 2 * + // freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB). + // We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which + // means the cost of topological order is greater. + // When profile data is not available, however, we need to be more + // conservative. If the branch prediction is wrong, breaking the topo-order + // will actually yield a layout with large cost. For this reason, we need + // strong biased branch at block S with Prob(S->BB) in order to select + // BB->Succ. This is equivalent to looking the CFG backward with backward + // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without + // profile data). + + BranchProbability HotProb = getLayoutSuccessorProbThreshold(BB); + + // Forward checking. For case 2, SuccProb will be 1. + if (SuccProb < HotProb) { + DEBUG(dbgs() << " " << getBlockName(Succ) << " -> " << SuccProb + << " (prob) (CFG conflict)\n"); + return true; + } + + // Make sure that a hot successor doesn't have a globally more + // important predecessor. + BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb; + bool BadCFGConflict = false; + + for (MachineBasicBlock *Pred : Succ->predecessors()) { + if (Pred == Succ || BlockToChain[Pred] == &SuccChain || + (BlockFilter && !BlockFilter->count(Pred)) || + BlockToChain[Pred] == &Chain) + continue; + // Do backward checking. For case 1, it is actually redundant check. For + // case 2 above, we need a backward checking to filter out edges that are + // not 'strongly' biased. With profile data available, the check is mostly + // redundant too (when threshold prob is set at 50%) unless S has more than + // two successors. + // BB Pred + // \ / + // Succ + // We select edge BB->Succ if + // freq(BB->Succ) > freq(Succ) * HotProb + // i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) * + // HotProb + // i.e. freq((BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb + BlockFrequency PredEdgeFreq = + MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ); + if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) { + BadCFGConflict = true; + break; } + } + if (BadCFGConflict) { DEBUG(dbgs() << " " << getBlockName(Succ) << " -> " << SuccProb - << " (prob)" - << (SuccChain.LoopPredecessors != 0 ? " (CFG break)" : "") - << "\n"); + << " (prob) (non-cold CFG conflict)\n"); + return true; + } + + return false; +} + +/// \brief Select the best successor for a block. +/// +/// This looks across all successors of a particular block and attempts to +/// select the "best" one to be the layout successor. It only considers direct +/// successors which also pass the block filter. It will attempt to avoid +/// breaking CFG structure, but cave and break such structures in the case of +/// very hot successor edges. +/// +/// \returns The best successor block found, or null if none are viable. +MachineBasicBlock * +MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, + BlockChain &Chain, + const BlockFilterSet *BlockFilter) { + const BranchProbability HotProb(StaticLikelyProb, 100); + + MachineBasicBlock *BestSucc = nullptr; + auto BestProb = BranchProbability::getZero(); + + SmallVector<MachineBasicBlock *, 4> Successors; + auto AdjustedSumProb = + collectViableSuccessors(BB, Chain, BlockFilter, Successors); + + DEBUG(dbgs() << "Attempting merge from: " << getBlockName(BB) << "\n"); + for (MachineBasicBlock *Succ : Successors) { + auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ); + BranchProbability SuccProb = + getAdjustedProbability(RealSuccProb, AdjustedSumProb); + + // This heuristic is off by default. + if (shouldPredBlockBeOutlined(BB, Succ, Chain, BlockFilter, SuccProb, + HotProb)) + return Succ; + + BlockChain &SuccChain = *BlockToChain[Succ]; + // Skip the edge \c BB->Succ if block \c Succ has a better layout + // predecessor that yields lower global cost. + if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb, + Chain, BlockFilter)) + continue; + + DEBUG( + dbgs() << " " << getBlockName(Succ) << " -> " << SuccProb + << " (prob)" + << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "") + << "\n"); if (BestSucc && BestProb >= SuccProb) continue; BestSucc = Succ; @@ -513,12 +737,11 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB, /// profitable only really makes sense in the context of a loop. This returns /// the most frequently visited block in the worklist, which in the case of /// a loop, is the one most desirable to be physically close to the rest of the -/// loop body in order to improve icache behavior. +/// loop body in order to improve i-cache behavior. /// /// \returns The best block found, or null if none are viable. MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock( - BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList, - const BlockFilterSet *BlockFilter) { + BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) { // Once we need to walk the worklist looking for a candidate, cleanup the // worklist of already placed entries. // FIXME: If this shows up on profiles, it could be folded (at the cost of @@ -529,24 +752,51 @@ MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock( }), WorkList.end()); + if (WorkList.empty()) + return nullptr; + + bool IsEHPad = WorkList[0]->isEHPad(); + MachineBasicBlock *BestBlock = nullptr; BlockFrequency BestFreq; for (MachineBasicBlock *MBB : WorkList) { + assert(MBB->isEHPad() == IsEHPad); + BlockChain &SuccChain = *BlockToChain[MBB]; - if (&SuccChain == &Chain) { - DEBUG(dbgs() << " " << getBlockName(MBB) << " -> Already merged!\n"); + if (&SuccChain == &Chain) continue; - } - assert(SuccChain.LoopPredecessors == 0 && "Found CFG-violating block"); + + assert(SuccChain.UnscheduledPredecessors == 0 && "Found CFG-violating block"); BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB); DEBUG(dbgs() << " " << getBlockName(MBB) << " -> "; MBFI->printBlockFreq(dbgs(), CandidateFreq) << " (freq)\n"); - if (BestBlock && BestFreq >= CandidateFreq) + + // For ehpad, we layout the least probable first as to avoid jumping back + // from least probable landingpads to more probable ones. + // + // FIXME: Using probability is probably (!) not the best way to achieve + // this. We should probably have a more principled approach to layout + // cleanup code. + // + // The goal is to get: + // + // +--------------------------+ + // | V + // InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume + // + // Rather than: + // + // +-------------------------------------+ + // V | + // OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup + if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq))) continue; + BestBlock = MBB; BestFreq = CandidateFreq; } + return BestBlock; } @@ -558,10 +808,10 @@ MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock( /// LastUnplacedBlockIt. We update this iterator on each call to avoid /// re-scanning the entire sequence on repeated calls to this routine. MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock( - MachineFunction &F, const BlockChain &PlacedChain, + const BlockChain &PlacedChain, MachineFunction::iterator &PrevUnplacedBlockIt, const BlockFilterSet *BlockFilter) { - for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F.end(); I != E; + for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E; ++I) { if (BlockFilter && !BlockFilter->count(&*I)) continue; @@ -576,22 +826,51 @@ MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock( return nullptr; } +void MachineBlockPlacement::fillWorkLists( + MachineBasicBlock *MBB, + SmallPtrSetImpl<BlockChain *> &UpdatedPreds, + const BlockFilterSet *BlockFilter = nullptr) { + BlockChain &Chain = *BlockToChain[MBB]; + if (!UpdatedPreds.insert(&Chain).second) + return; + + assert(Chain.UnscheduledPredecessors == 0); + for (MachineBasicBlock *ChainBB : Chain) { + assert(BlockToChain[ChainBB] == &Chain); + for (MachineBasicBlock *Pred : ChainBB->predecessors()) { + if (BlockFilter && !BlockFilter->count(Pred)) + continue; + if (BlockToChain[Pred] == &Chain) + continue; + ++Chain.UnscheduledPredecessors; + } + } + + if (Chain.UnscheduledPredecessors != 0) + return; + + MBB = *Chain.begin(); + if (MBB->isEHPad()) + EHPadWorkList.push_back(MBB); + else + BlockWorkList.push_back(MBB); +} + void MachineBlockPlacement::buildChain( MachineBasicBlock *BB, BlockChain &Chain, - SmallVectorImpl<MachineBasicBlock *> &BlockWorkList, const BlockFilterSet *BlockFilter) { - assert(BB); - assert(BlockToChain[BB] == &Chain); - MachineFunction &F = *BB->getParent(); - MachineFunction::iterator PrevUnplacedBlockIt = F.begin(); + assert(BB && "BB must not be null.\n"); + assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match.\n"); + MachineFunction::iterator PrevUnplacedBlockIt = F->begin(); MachineBasicBlock *LoopHeaderBB = BB; - markChainSuccessors(Chain, LoopHeaderBB, BlockWorkList, BlockFilter); + markChainSuccessors(Chain, LoopHeaderBB, BlockFilter); BB = *std::prev(Chain.end()); for (;;) { - assert(BB); - assert(BlockToChain[BB] == &Chain); - assert(*std::prev(Chain.end()) == BB); + assert(BB && "null block found at end of chain in loop."); + assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop."); + assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain."); + // Look for the best viable successor if there is one to place immediately // after this block. @@ -601,11 +880,12 @@ void MachineBlockPlacement::buildChain( // block among those we've identified as not violating the loop's CFG at // this point. This won't be a fallthrough, but it will increase locality. if (!BestSucc) - BestSucc = selectBestCandidateBlock(Chain, BlockWorkList, BlockFilter); + BestSucc = selectBestCandidateBlock(Chain, BlockWorkList); + if (!BestSucc) + BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList); if (!BestSucc) { - BestSucc = - getFirstUnplacedBlock(F, Chain, PrevUnplacedBlockIt, BlockFilter); + BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt, BlockFilter); if (!BestSucc) break; @@ -615,18 +895,18 @@ void MachineBlockPlacement::buildChain( // Place this block, updating the datastructures to reflect its placement. BlockChain &SuccChain = *BlockToChain[BestSucc]; - // Zero out LoopPredecessors for the successor we're about to merge in case + // Zero out UnscheduledPredecessors for the successor we're about to merge in case // we selected a successor that didn't fit naturally into the CFG. - SuccChain.LoopPredecessors = 0; - DEBUG(dbgs() << "Merging from " << getBlockNum(BB) << " to " - << getBlockNum(BestSucc) << "\n"); - markChainSuccessors(SuccChain, LoopHeaderBB, BlockWorkList, BlockFilter); + SuccChain.UnscheduledPredecessors = 0; + DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to " + << getBlockName(BestSucc) << "\n"); + markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter); Chain.merge(BestSucc, &SuccChain); BB = *std::prev(Chain.end()); } DEBUG(dbgs() << "Finished forming chain for header block " - << getBlockNum(*Chain.begin()) << "\n"); + << getBlockName(*Chain.begin()) << "\n"); } /// \brief Find the best loop top block for layout. @@ -673,8 +953,10 @@ MachineBlockPlacement::findBestLoopTop(MachineLoop &L, } // If no direct predecessor is fine, just use the loop header. - if (!BestPred) + if (!BestPred) { + DEBUG(dbgs() << " final top unchanged\n"); return L.getHeader(); + } // Walk backwards through any straight line of predecessors. while (BestPred->pred_size() == 1 && @@ -692,7 +974,7 @@ MachineBlockPlacement::findBestLoopTop(MachineLoop &L, /// block to layout at the top of the loop. Typically this is done to maximize /// fallthrough opportunities. MachineBasicBlock * -MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L, +MachineBlockPlacement::findBestLoopExit(MachineLoop &L, const BlockFilterSet &LoopBlockSet) { // We don't want to layout the loop linearly in all cases. If the loop header // is just a normal basic block in the loop, we want to look for what block @@ -710,7 +992,7 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L, unsigned BestExitLoopDepth = 0; MachineBasicBlock *ExitingBB = nullptr; // If there are exits to outer loops, loop rotation can severely limit - // fallthrough opportunites unless it selects such an exit. Keep a set of + // fallthrough opportunities unless it selects such an exit. Keep a set of // blocks where rotating to exit with that block will reach an outer loop. SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop; @@ -780,7 +1062,6 @@ MachineBlockPlacement::findBestLoopExit(MachineFunction &F, MachineLoop &L, // Restore the old exiting state, no viable looping successor was found. ExitingBB = OldExitingBB; BestExitEdgeFreq = OldBestExitEdgeFreq; - continue; } } // Without a candidate exiting block or with only a single block in the @@ -973,7 +1254,7 @@ void MachineBlockPlacement::rotateLoopWithProfile( } } - DEBUG(dbgs() << "The cost of loop rotation by making " << getBlockNum(*Iter) + DEBUG(dbgs() << "The cost of loop rotation by making " << getBlockName(*Iter) << " to the top: " << Cost.getFrequency() << "\n"); if (Cost < SmallestRotationCost) { @@ -983,7 +1264,7 @@ void MachineBlockPlacement::rotateLoopWithProfile( } if (RotationPos != LoopChain.end()) { - DEBUG(dbgs() << "Rotate loop by making " << getBlockNum(*RotationPos) + DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos) << " to the top\n"); std::rotate(LoopChain.begin(), RotationPos, LoopChain.end()); } @@ -994,7 +1275,7 @@ void MachineBlockPlacement::rotateLoopWithProfile( /// When profile data is available, exclude cold blocks from the returned set; /// otherwise, collect all blocks in the loop. MachineBlockPlacement::BlockFilterSet -MachineBlockPlacement::collectLoopBlockSet(MachineFunction &F, MachineLoop &L) { +MachineBlockPlacement::collectLoopBlockSet(MachineLoop &L) { BlockFilterSet LoopBlockSet; // Filter cold blocks off from LoopBlockSet when profile data is available. @@ -1006,7 +1287,7 @@ MachineBlockPlacement::collectLoopBlockSet(MachineFunction &F, MachineLoop &L) { // will be merged into the first outer loop chain for which this block is not // cold anymore. This needs precise profile data and we only do this when // profile data is available. - if (F.getFunction()->getEntryCount()) { + if (F->getFunction()->getEntryCount()) { BlockFrequency LoopFreq(0); for (auto LoopPred : L.getHeader()->predecessors()) if (!L.contains(LoopPred)) @@ -1031,21 +1312,22 @@ MachineBlockPlacement::collectLoopBlockSet(MachineFunction &F, MachineLoop &L) { /// as much as possible. We can then stitch the chains together in a way which /// both preserves the topological structure and minimizes taken conditional /// branches. -void MachineBlockPlacement::buildLoopChains(MachineFunction &F, - MachineLoop &L) { +void MachineBlockPlacement::buildLoopChains(MachineLoop &L) { // First recurse through any nested loops, building chains for those inner // loops. for (MachineLoop *InnerLoop : L) - buildLoopChains(F, *InnerLoop); + buildLoopChains(*InnerLoop); - SmallVector<MachineBasicBlock *, 16> BlockWorkList; - BlockFilterSet LoopBlockSet = collectLoopBlockSet(F, L); + assert(BlockWorkList.empty()); + assert(EHPadWorkList.empty()); + BlockFilterSet LoopBlockSet = collectLoopBlockSet(L); // Check if we have profile data for this function. If yes, we will rotate // this loop by modeling costs more precisely which requires the profile data // for better layout. bool RotateLoopWithProfile = - PreciseRotationCost && F.getFunction()->getEntryCount(); + ForcePreciseRotationCost || + (PreciseRotationCost && F->getFunction()->getEntryCount()); // First check to see if there is an obviously preferable top block for the // loop. This will default to the header, but may end up as one of the @@ -1060,7 +1342,7 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F, // branches by placing an exit edge at the bottom. MachineBasicBlock *ExitingBB = nullptr; if (!RotateLoopWithProfile && LoopTop == L.getHeader()) - ExitingBB = findBestLoopExit(F, L, LoopBlockSet); + ExitingBB = findBestLoopExit(L, LoopBlockSet); BlockChain &LoopChain = *BlockToChain[LoopTop]; @@ -1068,29 +1350,13 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F, // walk the blocks, and use a set to prevent visiting a particular chain // twice. SmallPtrSet<BlockChain *, 4> UpdatedPreds; - assert(LoopChain.LoopPredecessors == 0); + assert(LoopChain.UnscheduledPredecessors == 0); UpdatedPreds.insert(&LoopChain); - for (MachineBasicBlock *LoopBB : LoopBlockSet) { - BlockChain &Chain = *BlockToChain[LoopBB]; - if (!UpdatedPreds.insert(&Chain).second) - continue; + for (MachineBasicBlock *LoopBB : LoopBlockSet) + fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet); - assert(Chain.LoopPredecessors == 0); - for (MachineBasicBlock *ChainBB : Chain) { - assert(BlockToChain[ChainBB] == &Chain); - for (MachineBasicBlock *Pred : ChainBB->predecessors()) { - if (BlockToChain[Pred] == &Chain || !LoopBlockSet.count(Pred)) - continue; - ++Chain.LoopPredecessors; - } - } - - if (Chain.LoopPredecessors == 0) - BlockWorkList.push_back(*Chain.begin()); - } - - buildChain(LoopTop, LoopChain, BlockWorkList, &LoopBlockSet); + buildChain(LoopTop, LoopChain, &LoopBlockSet); if (RotateLoopWithProfile) rotateLoopWithProfile(LoopChain, L, LoopBlockSet); @@ -1100,7 +1366,7 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F, DEBUG({ // Crash at the end so we get all of the debugging output first. bool BadLoop = false; - if (LoopChain.LoopPredecessors) { + if (LoopChain.UnscheduledPredecessors) { BadLoop = true; dbgs() << "Loop chain contains a block without its preds placed!\n" << " Loop header: " << getBlockName(*L.block_begin()) << "\n" @@ -1129,13 +1395,42 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F, } assert(!BadLoop && "Detected problems with the placement of this loop."); }); + + BlockWorkList.clear(); + EHPadWorkList.clear(); +} + +/// When OutlineOpitonalBranches is on, this method collects BBs that +/// dominates all terminator blocks of the function \p F. +void MachineBlockPlacement::collectMustExecuteBBs() { + if (OutlineOptionalBranches) { + // Find the nearest common dominator of all of F's terminators. + MachineBasicBlock *Terminator = nullptr; + for (MachineBasicBlock &MBB : *F) { + if (MBB.succ_size() == 0) { + if (Terminator == nullptr) + Terminator = &MBB; + else + Terminator = MDT->findNearestCommonDominator(Terminator, &MBB); + } + } + + // MBBs dominating this common dominator are unavoidable. + UnavoidableBlocks.clear(); + for (MachineBasicBlock &MBB : *F) { + if (MDT->dominates(&MBB, Terminator)) { + UnavoidableBlocks.insert(&MBB); + } + } + } } -void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { +void MachineBlockPlacement::buildCFGChains() { // Ensure that every BB in the function has an associated chain to simplify // the assumptions of the remaining algorithm. SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch. - for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { + for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE; + ++FI) { MachineBasicBlock *BB = &*FI; BlockChain *Chain = new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB); @@ -1144,7 +1439,7 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { for (;;) { Cond.clear(); MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch. - if (!TII->AnalyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough()) + if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough()) break; MachineFunction::iterator NextFI = std::next(FI); @@ -1161,55 +1456,22 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { } } - if (OutlineOptionalBranches) { - // Find the nearest common dominator of all of F's terminators. - MachineBasicBlock *Terminator = nullptr; - for (MachineBasicBlock &MBB : F) { - if (MBB.succ_size() == 0) { - if (Terminator == nullptr) - Terminator = &MBB; - else - Terminator = MDT->findNearestCommonDominator(Terminator, &MBB); - } - } - - // MBBs dominating this common dominator are unavoidable. - UnavoidableBlocks.clear(); - for (MachineBasicBlock &MBB : F) { - if (MDT->dominates(&MBB, Terminator)) { - UnavoidableBlocks.insert(&MBB); - } - } - } + // Turned on with OutlineOptionalBranches option + collectMustExecuteBBs(); // Build any loop-based chains. for (MachineLoop *L : *MLI) - buildLoopChains(F, *L); + buildLoopChains(*L); - SmallVector<MachineBasicBlock *, 16> BlockWorkList; + assert(BlockWorkList.empty()); + assert(EHPadWorkList.empty()); SmallPtrSet<BlockChain *, 4> UpdatedPreds; - for (MachineBasicBlock &MBB : F) { - BlockChain &Chain = *BlockToChain[&MBB]; - if (!UpdatedPreds.insert(&Chain).second) - continue; - - assert(Chain.LoopPredecessors == 0); - for (MachineBasicBlock *ChainBB : Chain) { - assert(BlockToChain[ChainBB] == &Chain); - for (MachineBasicBlock *Pred : ChainBB->predecessors()) { - if (BlockToChain[Pred] == &Chain) - continue; - ++Chain.LoopPredecessors; - } - } - - if (Chain.LoopPredecessors == 0) - BlockWorkList.push_back(*Chain.begin()); - } + for (MachineBasicBlock &MBB : *F) + fillWorkLists(&MBB, UpdatedPreds); - BlockChain &FunctionChain = *BlockToChain[&F.front()]; - buildChain(&F.front(), FunctionChain, BlockWorkList); + BlockChain &FunctionChain = *BlockToChain[&F->front()]; + buildChain(&F->front(), FunctionChain); #ifndef NDEBUG typedef SmallPtrSet<MachineBasicBlock *, 16> FunctionBlockSetType; @@ -1218,7 +1480,7 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { // Crash at the end so we get all of the debugging output first. bool BadFunc = false; FunctionBlockSetType FunctionBlockSet; - for (MachineBasicBlock &MBB : F) + for (MachineBasicBlock &MBB : *F) FunctionBlockSet.insert(&MBB); for (MachineBasicBlock *ChainBB : FunctionChain) @@ -1238,13 +1500,14 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { }); // Splice the blocks into place. - MachineFunction::iterator InsertPos = F.begin(); + MachineFunction::iterator InsertPos = F->begin(); + DEBUG(dbgs() << "[MBP] Function: "<< F->getName() << "\n"); for (MachineBasicBlock *ChainBB : FunctionChain) { DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain " : " ... ") << getBlockName(ChainBB) << "\n"); if (InsertPos != MachineFunction::iterator(ChainBB)) - F.splice(InsertPos, ChainBB); + F->splice(InsertPos, ChainBB); else ++InsertPos; @@ -1258,69 +1521,90 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { // boiler plate. Cond.clear(); MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch. - if (!TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond)) { - // The "PrevBB" is not yet updated to reflect current code layout, so, - // o. it may fall-through to a block without explict "goto" instruction - // before layout, and no longer fall-through it after layout; or - // o. just opposite. - // - // AnalyzeBranch() may return erroneous value for FBB when these two - // situations take place. For the first scenario FBB is mistakenly set - // NULL; for the 2nd scenario, the FBB, which is expected to be NULL, - // is mistakenly pointing to "*BI". - // - bool needUpdateBr = true; - if (!Cond.empty() && (!FBB || FBB == ChainBB)) { - PrevBB->updateTerminator(); - needUpdateBr = false; - Cond.clear(); - TBB = FBB = nullptr; - if (TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond)) { - // FIXME: This should never take place. - TBB = FBB = nullptr; - } - } + // The "PrevBB" is not yet updated to reflect current code layout, so, + // o. it may fall-through to a block without explicit "goto" instruction + // before layout, and no longer fall-through it after layout; or + // o. just opposite. + // + // analyzeBranch() may return erroneous value for FBB when these two + // situations take place. For the first scenario FBB is mistakenly set NULL; + // for the 2nd scenario, the FBB, which is expected to be NULL, is + // mistakenly pointing to "*BI". + // Thus, if the future change needs to use FBB before the layout is set, it + // has to correct FBB first by using the code similar to the following: + // + // if (!Cond.empty() && (!FBB || FBB == ChainBB)) { + // PrevBB->updateTerminator(); + // Cond.clear(); + // TBB = FBB = nullptr; + // if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) { + // // FIXME: This should never take place. + // TBB = FBB = nullptr; + // } + // } + if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) + PrevBB->updateTerminator(); + } + + // Fixup the last block. + Cond.clear(); + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch. + if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond)) + F->back().updateTerminator(); + + BlockWorkList.clear(); + EHPadWorkList.clear(); +} + +void MachineBlockPlacement::optimizeBranches() { + BlockChain &FunctionChain = *BlockToChain[&F->front()]; + SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch. + + // Now that all the basic blocks in the chain have the proper layout, + // make a final call to AnalyzeBranch with AllowModify set. + // Indeed, the target may be able to optimize the branches in a way we + // cannot because all branches may not be analyzable. + // E.g., the target may be able to remove an unconditional branch to + // a fallthrough when it occurs after predicated terminators. + for (MachineBasicBlock *ChainBB : FunctionChain) { + Cond.clear(); + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch. + if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) { // If PrevBB has a two-way branch, try to re-order the branches // such that we branch to the successor with higher probability first. if (TBB && !Cond.empty() && FBB && - MBPI->getEdgeProbability(PrevBB, FBB) > - MBPI->getEdgeProbability(PrevBB, TBB) && + MBPI->getEdgeProbability(ChainBB, FBB) > + MBPI->getEdgeProbability(ChainBB, TBB) && !TII->ReverseBranchCondition(Cond)) { DEBUG(dbgs() << "Reverse order of the two branches: " - << getBlockName(PrevBB) << "\n"); + << getBlockName(ChainBB) << "\n"); DEBUG(dbgs() << " Edge probability: " - << MBPI->getEdgeProbability(PrevBB, FBB) << " vs " - << MBPI->getEdgeProbability(PrevBB, TBB) << "\n"); + << MBPI->getEdgeProbability(ChainBB, FBB) << " vs " + << MBPI->getEdgeProbability(ChainBB, TBB) << "\n"); DebugLoc dl; // FIXME: this is nowhere - TII->RemoveBranch(*PrevBB); - TII->InsertBranch(*PrevBB, FBB, TBB, Cond, dl); - needUpdateBr = true; + TII->RemoveBranch(*ChainBB); + TII->InsertBranch(*ChainBB, FBB, TBB, Cond, dl); + ChainBB->updateTerminator(); } - if (needUpdateBr) - PrevBB->updateTerminator(); } } +} - // Fixup the last block. - Cond.clear(); - MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch. - if (!TII->AnalyzeBranch(F.back(), TBB, FBB, Cond)) - F.back().updateTerminator(); - +void MachineBlockPlacement::alignBlocks() { // Walk through the backedges of the function now that we have fully laid out // the basic blocks and align the destination of each backedge. We don't rely // exclusively on the loop info here so that we can align backedges in // unnatural CFGs and backedges that were introduced purely because of the // loop rotations done during this layout pass. - // FIXME: Use Function::optForSize(). - if (F.getFunction()->hasFnAttribute(Attribute::OptimizeForSize)) + if (F->getFunction()->optForSize()) return; + BlockChain &FunctionChain = *BlockToChain[&F->front()]; if (FunctionChain.begin() == FunctionChain.end()) return; // Empty chain. const BranchProbability ColdProb(1, 5); // 20% - BlockFrequency EntryFreq = MBFI->getBlockFreq(&F.front()); + BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front()); BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb; for (MachineBasicBlock *ChainBB : FunctionChain) { if (ChainBB == *FunctionChain.begin()) @@ -1334,11 +1618,6 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { if (!L) continue; - if (AlignAllLoops) { - ChainBB->setAlignment(AlignAllLoops); - continue; - } - unsigned Align = TLI->getPrefLoopAlignment(L); if (!Align) continue; // Don't care about loop alignment. @@ -1380,31 +1659,67 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { } } -bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &F) { - // Check for single-block functions and skip them. - if (std::next(F.begin()) == F.end()) +bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(*MF.getFunction())) return false; - if (skipOptnoneFunction(*F.getFunction())) + // Check for single-block functions and skip them. + if (std::next(MF.begin()) == MF.end()) return false; + F = &MF; MBPI = &getAnalysis<MachineBranchProbabilityInfo>(); - MBFI = &getAnalysis<MachineBlockFrequencyInfo>(); + MBFI = llvm::make_unique<BranchFolder::MBFIWrapper>( + getAnalysis<MachineBlockFrequencyInfo>()); MLI = &getAnalysis<MachineLoopInfo>(); - TII = F.getSubtarget().getInstrInfo(); - TLI = F.getSubtarget().getTargetLowering(); + TII = MF.getSubtarget().getInstrInfo(); + TLI = MF.getSubtarget().getTargetLowering(); MDT = &getAnalysis<MachineDominatorTree>(); assert(BlockToChain.empty()); - buildCFGChains(F); + buildCFGChains(); + + // Changing the layout can create new tail merging opportunities. + TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>(); + // TailMerge can create jump into if branches that make CFG irreducible for + // HW that requires structured CFG. + bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() && + PassConfig->getEnableTailMerge() && + BranchFoldPlacement; + // No tail merging opportunities if the block number is less than four. + if (MF.size() > 3 && EnableTailMerge) { + BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI, + *MBPI); + + if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), + getAnalysisIfAvailable<MachineModuleInfo>(), MLI, + /*AfterBlockPlacement=*/true)) { + // Redo the layout if tail merging creates/removes/moves blocks. + BlockToChain.clear(); + ChainAllocator.DestroyAll(); + buildCFGChains(); + } + } + + optimizeBranches(); + alignBlocks(); BlockToChain.clear(); ChainAllocator.DestroyAll(); if (AlignAllBlock) // Align all of the blocks in the function to a specific alignment. - for (MachineBasicBlock &MBB : F) + for (MachineBasicBlock &MBB : MF) MBB.setAlignment(AlignAllBlock); + else if (AlignAllNonFallThruBlocks) { + // Align all of the blocks that have no fall-through predecessors to a + // specific alignment. + for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) { + auto LayoutPred = std::prev(MBI); + if (!LayoutPred->isSuccessor(&*MBI)) + MBI->setAlignment(AlignAllNonFallThruBlocks); + } + } // We always return true as we have no way to track whether the final order // differs from the original order. |