Diffstat (limited to 'contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp')
-rw-r--r--	contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp	263
1 file changed, 221 insertions(+), 42 deletions(-)
diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 1130d22..087ce8a 100644
--- a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -18,15 +18,22 @@
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LazyValueInfo.h"
 #include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/Pass.h"
@@ -36,6 +43,8 @@
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <algorithm>
+#include <memory>
 using namespace llvm;
 
 #define DEBUG_TYPE "jump-threading"
@@ -49,6 +58,13 @@ BBDuplicateThreshold("jump-threading-threshold",
           cl::desc("Max block size to duplicate for jump threading"),
           cl::init(6), cl::Hidden);
 
+static cl::opt<unsigned>
+ImplicationSearchThreshold(
+  "jump-threading-implication-search-threshold",
+  cl::desc("The number of predecessors to search for a stronger "
+           "condition to use to thread over a weaker condition"),
+  cl::init(3), cl::Hidden);
+
 namespace {
   // These are at global scope so static functions can use them too.
   typedef SmallVectorImpl<std::pair<Constant*, BasicBlock*> > PredValueInfo;
@@ -80,6 +96,9 @@ namespace {
   class JumpThreading : public FunctionPass {
     TargetLibraryInfo *TLI;
     LazyValueInfo *LVI;
+    std::unique_ptr<BlockFrequencyInfo> BFI;
+    std::unique_ptr<BranchProbabilityInfo> BPI;
+    bool HasProfileData;
 #ifdef NDEBUG
     SmallPtrSet<BasicBlock*, 16> LoopHeaders;
 #else
@@ -114,9 +133,15 @@
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<LazyValueInfo>();
       AU.addPreserved<LazyValueInfo>();
+      AU.addPreserved<GlobalsAAWrapperPass>();
       AU.addRequired<TargetLibraryInfoWrapperPass>();
     }
 
+    void releaseMemory() override {
+      BFI.reset();
+      BPI.reset();
+    }
+
     void FindLoopHeaders(Function &F);
     bool ProcessBlock(BasicBlock *BB);
     bool ThreadEdge(BasicBlock *BB, const SmallVectorImpl<BasicBlock*> &PredBBs,
@@ -134,9 +159,16 @@
     bool ProcessBranchOnPHI(PHINode *PN);
     bool ProcessBranchOnXOR(BinaryOperator *BO);
+    bool ProcessImpliedCondition(BasicBlock *BB);
 
     bool SimplifyPartiallyRedundantLoad(LoadInst *LI);
     bool TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB);
+
+  private:
+    BasicBlock *SplitBlockPreds(BasicBlock *BB, ArrayRef<BasicBlock *> Preds,
+                                const char *Suffix);
+    void UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB, BasicBlock *BB,
+                                      BasicBlock *NewBB, BasicBlock *SuccBB);
   };
 }
 
@@ -160,11 +192,21 @@ bool JumpThreading::runOnFunction(Function &F) {
   DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
   TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
   LVI = &getAnalysis<LazyValueInfo>();
+  BFI.reset();
+  BPI.reset();
+  // When profile data is available, we need to update edge weights after
+  // successful jump threading, which requires both BPI and BFI being available.
+  HasProfileData = F.getEntryCount().hasValue();
+  if (HasProfileData) {
+    LoopInfo LI{DominatorTree(F)};
+    BPI.reset(new BranchProbabilityInfo(F, LI));
+    BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
+  }
 
   // Remove unreachable blocks from function as they may result in infinite
   // loop. We do threading if we found something profitable. Jump threading a
   // branch can create other opportunities. If these opportunities form a cycle
-  // i.e. if any jump treading is undoing previous threading in the path, then
+  // i.e. if any jump threading is undoing previous threading in the path, then
   // we will loop forever. We take care of this issue by not jump threading for
   // back edges. This works for normal cases but not for unreachable blocks as
   // they may have cycle with no back edge.
@@ -176,7 +218,7 @@
   do {
     Changed = false;
     for (Function::iterator I = F.begin(), E = F.end(); I != E;) {
-      BasicBlock *BB = I;
+      BasicBlock *BB = &*I;
       // Thread all of the branches we can over this block.
       while (ProcessBlock(BB))
         Changed = true;
@@ -239,11 +281,26 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
                                              unsigned Threshold) {
   /// Ignore PHI nodes, these will be flattened when duplication happens.
-  BasicBlock::const_iterator I = BB->getFirstNonPHI();
+  BasicBlock::const_iterator I(BB->getFirstNonPHI());
 
   // FIXME: THREADING will delete values that are just used to compute the
   // branch, so they shouldn't count against the duplication cost.
 
+  unsigned Bonus = 0;
+  const TerminatorInst *BBTerm = BB->getTerminator();
+  // Threading through a switch statement is particularly profitable.  If this
+  // block ends in a switch, decrease its cost to make it more likely to happen.
+  if (isa<SwitchInst>(BBTerm))
+    Bonus = 6;
+
+  // The same holds for indirect branches, but slightly more so.
+  if (isa<IndirectBrInst>(BBTerm))
+    Bonus = 8;
+
+  // Bump the threshold up so the early exit from the loop doesn't skip the
+  // terminator-based Size adjustment at the end.
+  Threshold += Bonus;
+
   // Sum up the cost of each instruction until we get to the terminator.  Don't
   // include the terminator because the copy won't include it.
   unsigned Size = 0;
@@ -260,6 +317,11 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
     if (isa<BitCastInst>(I) && I->getType()->isPointerTy())
       continue;
 
+    // Bail out if this instruction gives back a token type, it is not possible
+    // to duplicate it if it is used outside this BB.
+    if (I->getType()->isTokenTy() && I->isUsedOutsideOfBlock(BB))
+      return ~0U;
+
     // All other instructions count for at least one unit.
     ++Size;
@@ -268,7 +330,7 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
     // as having cost of 2 total, and if they are a vector intrinsic, we model
     // them as having cost 1.
     if (const CallInst *CI = dyn_cast<CallInst>(I)) {
-      if (CI->cannotDuplicate())
+      if (CI->cannotDuplicate() || CI->isConvergent())
         // Blocks with NoDuplicate are modelled as having infinite cost, so they
         // are never duplicated.
         return ~0U;
@@ -279,16 +341,7 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
     }
   }
 
-  // Threading through a switch statement is particularly profitable.  If this
-  // block ends in a switch, decrease its cost to make it more likely to happen.
-  if (isa<SwitchInst>(I))
-    Size = Size > 6 ? Size-6 : 0;
-
-  // The same holds for indirect branches, but slightly more so.
-  if (isa<IndirectBrInst>(I))
-    Size = Size > 8 ? Size-8 : 0;
-
-  return Size;
+  return Size > Bonus ? Size - Bonus : 0;
 }
 
 /// FindLoopHeaders - We do not want jump threading to turn proper loop
@@ -669,7 +722,8 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) {
   // because now the condition in this block can be threaded through
   // predecessors of our predecessor block.
   if (BasicBlock *SinglePred = BB->getSinglePredecessor()) {
-    if (SinglePred->getTerminator()->getNumSuccessors() == 1 &&
+    const TerminatorInst *TI = SinglePred->getTerminator();
+    if (!TI->isExceptional() && TI->getNumSuccessors() == 1 &&
         SinglePred != BB && !hasAddressTakenAndUsed(BB)) {
       // If SinglePred was a loop header, BB becomes one.
       if (LoopHeaders.erase(SinglePred))
@@ -761,7 +815,7 @@
     // If we're branching on a conditional, LVI might be able to determine
     // it's value at the branch instruction.  We only handle comparisons
     // against a constant at this time.
-    // TODO: This should be extended to handle switches as well.
+    // TODO: This should be extended to handle switches as well.
     BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
     Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1));
     if (CondBr && CondConst && CondBr->isConditional()) {
@@ -829,9 +883,40 @@
       CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
     return ProcessBranchOnXOR(cast<BinaryOperator>(CondInst));
 
+  // Search for a stronger dominating condition that can be used to simplify a
+  // conditional branch leaving BB.
+  if (ProcessImpliedCondition(BB))
+    return true;
+
+  return false;
+}
+
+bool JumpThreading::ProcessImpliedCondition(BasicBlock *BB) {
+  auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+  if (!BI || !BI->isConditional())
+    return false;
+
+  Value *Cond = BI->getCondition();
+  BasicBlock *CurrentBB = BB;
+  BasicBlock *CurrentPred = BB->getSinglePredecessor();
+  unsigned Iter = 0;
+
+  auto &DL = BB->getModule()->getDataLayout();
+
+  while (CurrentPred && Iter++ < ImplicationSearchThreshold) {
+    auto *PBI = dyn_cast<BranchInst>(CurrentPred->getTerminator());
+    if (!PBI || !PBI->isConditional() || PBI->getSuccessor(0) != CurrentBB)
+      return false;
 
-  // TODO: If we have: "br (X > 0)" and we have a predecessor where we know
-  // "(X == 4)", thread through this block.
+    if (isImpliedCondition(PBI->getCondition(), Cond, DL)) {
+      BI->getSuccessor(1)->removePredecessor(BB);
+      BranchInst::Create(BI->getSuccessor(0), BI);
+      BI->eraseFromParent();
+      return true;
+    }
+    CurrentBB = CurrentPred;
+    CurrentPred = CurrentBB->getSinglePredecessor();
+  }
 
   return false;
 }
@@ -850,10 +935,10 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
   if (LoadBB->getSinglePredecessor())
     return false;
 
-  // If the load is defined in a landing pad, it can't be partially redundant,
-  // because the edges between the invoke and the landing pad cannot have other
+  // If the load is defined in an EH pad, it can't be partially redundant,
+  // because the edges between the invoke and the EH pad cannot have other
   // instructions between them.
-  if (LoadBB->isLandingPad())
+  if (LoadBB->isEHPad())
     return false;
 
   Value *LoadedPtr = LI->getOperand(0);
@@ -866,11 +951,11 @@
   // Scan a few instructions up from the load, to see if it is obviously live at
   // the entry to its block.
-  BasicBlock::iterator BBIt = LI;
+  BasicBlock::iterator BBIt(LI);
 
   if (Value *AvailableVal =
-          FindAvailableLoadedValue(LoadedPtr, LoadBB, BBIt, 6)) {
-    // If the value if the load is locally available within the block, just use
+          FindAvailableLoadedValue(LoadedPtr, LoadBB, BBIt, DefMaxInstsToScan)) {
+    // If the value of the load is locally available within the block, just use
     // it.  This frequently occurs for reg2mem'd allocas.
 
     //cerr << "LOAD ELIMINATED:\n" << *BBIt << *LI << "\n";
@@ -914,7 +999,8 @@
     // Scan the predecessor to see if the value is available in the pred.
     BBIt = PredBB->end();
     AAMDNodes ThisAATags;
-    Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt, 6,
+    Value *PredAvailable = FindAvailableLoadedValue(LoadedPtr, PredBB, BBIt,
+                                                    DefMaxInstsToScan,
                                                     nullptr, &ThisAATags);
     if (!PredAvailable) {
       OneUnavailablePred = PredBB;
@@ -968,8 +1054,7 @@
     }
 
     // Split them out to their own block.
-    UnavailablePred =
-      SplitBlockPredecessors(LoadBB, PredsToSplit, "thread-pre-split");
+    UnavailablePred = SplitBlockPreds(LoadBB, PredsToSplit, "thread-pre-split");
   }
 
   // If the value isn't available in all predecessors, then there will be
@@ -995,7 +1080,7 @@
   // Create a PHI node at the start of the block for the PRE'd load value.
   pred_iterator PB = pred_begin(LoadBB), PE = pred_end(LoadBB);
   PHINode *PN = PHINode::Create(LI->getType(), std::distance(PB, PE), "",
-                                LoadBB->begin());
+                                &LoadBB->front());
   PN->takeName(LI);
   PN->setDebugLoc(LI->getDebugLoc());
@@ -1262,7 +1347,7 @@ bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) {
   // Into:
   //  BB':
   //    %Y = icmp ne i32 %A, %B
-  //    br i1 %Z, ...
+  //    br i1 %Y, ...
 
   PredValueInfoTy XorOpValues;
   bool isLHS = true;
@@ -1387,14 +1472,14 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,
     return false;
   }
 
-  // And finally, do it!  Start by factoring the predecessors is needed.
+  // And finally, do it!  Start by factoring the predecessors if needed.
   BasicBlock *PredBB;
   if (PredBBs.size() == 1)
     PredBB = PredBBs[0];
   else {
     DEBUG(dbgs() << "  Factoring out " << PredBBs.size()
           << " common predecessors.\n");
-    PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm");
+    PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm");
   }
 
   // And finally, do it!
@@ -1415,6 +1500,13 @@
                                          BB->getParent(), BB);
   NewBB->moveAfter(PredBB);
 
+  // Set the block frequency of NewBB.
+  if (HasProfileData) {
+    auto NewBBFreq =
+        BFI->getBlockFreq(PredBB) * BPI->getEdgeProbability(PredBB, BB);
+    BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
+  }
+
   BasicBlock::iterator BI = BB->begin();
   for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
     ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
@@ -1425,7 +1517,7 @@
     Instruction *New = BI->clone();
     New->setName(BI->getName());
     NewBB->getInstList().push_back(New);
-    ValueMapping[BI] = New;
+    ValueMapping[&*BI] = New;
 
     // Remap operands to patch up intra-block references.
     for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
@@ -1438,7 +1530,7 @@
   // We didn't copy the terminator from BB over to NewBB, because there is now
   // an unconditional jump to SuccBB.  Insert the unconditional jump.
-  BranchInst *NewBI =BranchInst::Create(SuccBB, NewBB);
+  BranchInst *NewBI = BranchInst::Create(SuccBB, NewBB);
   NewBI->setDebugLoc(BB->getTerminator()->getDebugLoc());
 
   // Check to see if SuccBB has PHI nodes.  If so, we need to add entries to the
@@ -1475,8 +1567,8 @@
     // its block to be uses of the appropriate PHI node etc.  See ValuesInBlocks
     // with the two values we know.
     SSAUpdate.Initialize(I->getType(), I->getName());
-    SSAUpdate.AddAvailableValue(BB, I);
-    SSAUpdate.AddAvailableValue(NewBB, ValueMapping[I]);
+    SSAUpdate.AddAvailableValue(BB, &*I);
+    SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&*I]);
 
     while (!UsesToRename.empty())
       SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
@@ -1499,11 +1591,98 @@
   // frequently happens because of phi translation.
   SimplifyInstructionsInBlock(NewBB, TLI);
 
+  // Update the edge weight from BB to SuccBB, which should be less than before.
+  UpdateBlockFreqAndEdgeWeight(PredBB, BB, NewBB, SuccBB);
+
   // Threaded an edge!
   ++NumThreads;
   return true;
 }
 
+/// Create a new basic block that will be the predecessor of BB and successor of
+/// all blocks in Preds. When profile data is availble, update the frequency of
+/// this new block.
+BasicBlock *JumpThreading::SplitBlockPreds(BasicBlock *BB,
+                                           ArrayRef<BasicBlock *> Preds,
+                                           const char *Suffix) {
+  // Collect the frequencies of all predecessors of BB, which will be used to
+  // update the edge weight on BB->SuccBB.
+  BlockFrequency PredBBFreq(0);
+  if (HasProfileData)
+    for (auto Pred : Preds)
+      PredBBFreq += BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB);
+
+  BasicBlock *PredBB = SplitBlockPredecessors(BB, Preds, Suffix);
+
+  // Set the block frequency of the newly created PredBB, which is the sum of
+  // frequencies of Preds.
+  if (HasProfileData)
+    BFI->setBlockFreq(PredBB, PredBBFreq.getFrequency());
+  return PredBB;
+}
+
+/// Update the block frequency of BB and branch weight and the metadata on the
+/// edge BB->SuccBB. This is done by scaling the weight of BB->SuccBB by 1 -
+/// Freq(PredBB->BB) / Freq(BB->SuccBB).
+void JumpThreading::UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB,
+                                                 BasicBlock *BB,
+                                                 BasicBlock *NewBB,
+                                                 BasicBlock *SuccBB) {
+  if (!HasProfileData)
+    return;
+
+  assert(BFI && BPI && "BFI & BPI should have been created here");
+
+  // As the edge from PredBB to BB is deleted, we have to update the block
+  // frequency of BB.
+  auto BBOrigFreq = BFI->getBlockFreq(BB);
+  auto NewBBFreq = BFI->getBlockFreq(NewBB);
+  auto BB2SuccBBFreq = BBOrigFreq * BPI->getEdgeProbability(BB, SuccBB);
+  auto BBNewFreq = BBOrigFreq - NewBBFreq;
+  BFI->setBlockFreq(BB, BBNewFreq.getFrequency());
+
+  // Collect updated outgoing edges' frequencies from BB and use them to update
+  // edge probabilities.
+  SmallVector<uint64_t, 4> BBSuccFreq;
+  for (auto I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+    auto SuccFreq = (*I == SuccBB)
+                        ? BB2SuccBBFreq - NewBBFreq
+                        : BBOrigFreq * BPI->getEdgeProbability(BB, *I);
+    BBSuccFreq.push_back(SuccFreq.getFrequency());
+  }
+
+  uint64_t MaxBBSuccFreq =
+      *std::max_element(BBSuccFreq.begin(), BBSuccFreq.end());
+
+  SmallVector<BranchProbability, 4> BBSuccProbs;
+  if (MaxBBSuccFreq == 0)
+    BBSuccProbs.assign(BBSuccFreq.size(),
+                       {1, static_cast<unsigned>(BBSuccFreq.size())});
+  else {
+    for (uint64_t Freq : BBSuccFreq)
+      BBSuccProbs.push_back(
+          BranchProbability::getBranchProbability(Freq, MaxBBSuccFreq));
+    // Normalize edge probabilities so that they sum up to one.
+    BranchProbability::normalizeProbabilities(BBSuccProbs.begin(),
+                                              BBSuccProbs.end());
+  }
+
+  // Update edge probabilities in BPI.
+  for (int I = 0, E = BBSuccProbs.size(); I < E; I++)
+    BPI->setEdgeProbability(BB, I, BBSuccProbs[I]);
+
+  if (BBSuccProbs.size() >= 2) {
+    SmallVector<uint32_t, 4> Weights;
+    for (auto Prob : BBSuccProbs)
+      Weights.push_back(Prob.getNumerator());
+
+    auto TI = BB->getTerminator();
+    TI->setMetadata(
+        LLVMContext::MD_prof,
+        MDBuilder(TI->getParent()->getContext()).createBranchWeights(Weights));
+  }
+}
+
 /// DuplicateCondBranchOnPHIIntoPred - PredBB contains an unconditional branch
 /// to BB which contains an i1 PHI node and a conditional branch on that PHI.
 /// If we can duplicate the contents of BB up into PredBB do so now, this
@@ -1530,14 +1709,14 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
     return false;
   }
 
-  // And finally, do it!  Start by factoring the predecessors is needed.
+  // And finally, do it!  Start by factoring the predecessors if needed.
   BasicBlock *PredBB;
   if (PredBBs.size() == 1)
     PredBB = PredBBs[0];
  else {
     DEBUG(dbgs() << "  Factoring out " << PredBBs.size()
           << " common predecessors.\n");
-    PredBB = SplitBlockPredecessors(BB, PredBBs, ".thr_comm");
+    PredBB = SplitBlockPreds(BB, PredBBs, ".thr_comm");
   }
 
   // Okay, we decided to do this!  Clone all the instructions in BB onto the end
@@ -1581,12 +1760,12 @@
     if (Value *IV = SimplifyInstruction(New, BB->getModule()->getDataLayout())) {
       delete New;
-      ValueMapping[BI] = IV;
+      ValueMapping[&*BI] = IV;
     } else {
       // Otherwise, insert the new instruction into the block.
       New->setName(BI->getName());
-      PredBB->getInstList().insert(OldPredBranch, New);
-      ValueMapping[BI] = New;
+      PredBB->getInstList().insert(OldPredBranch->getIterator(), New);
+      ValueMapping[&*BI] = New;
     }
   }
@@ -1628,8 +1807,8 @@
     // its block to be uses of the appropriate PHI node etc.  See ValuesInBlocks
     // with the two values we know.
     SSAUpdate.Initialize(I->getType(), I->getName());
-    SSAUpdate.AddAvailableValue(BB, I);
-    SSAUpdate.AddAvailableValue(PredBB, ValueMapping[I]);
+    SSAUpdate.AddAvailableValue(BB, &*I);
+    SSAUpdate.AddAvailableValue(PredBB, ValueMapping[&*I]);
 
     while (!UsesToRename.empty())
       SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
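
Note on the ProcessImpliedCondition addition above: it replaces the old TODO ("br (X > 0)" with a predecessor known to satisfy "(X == 4)") with an isImpliedCondition-based transform that walks up to jump-threading-implication-search-threshold (default 3) single-predecessor blocks looking for a dominating branch whose condition is strictly stronger than the current one, and then folds the weaker conditional branch into an unconditional one. The snippet below is a minimal, hypothetical C++ illustration, not part of this patch (the function and variable names are made up), of the kind of source pattern whose IR this can now simplify, assuming earlier passes have not already folded it:

    // Illustrative only: once the 'x > 10' branch has been taken, the inner
    // 'x > 0' test is implied, so jump threading can rewrite that conditional
    // branch into an unconditional branch to the 'return a + b' successor.
    int apply_bonus(int x, int a, int b) {
      if (x > 10) {      // stronger, dominating condition
        if (x > 0)       // weaker condition, implied by the outer one
          return a + b;  // becomes the unconditional target on this path
        return a - b;    // dead on this path once the branch is folded
      }
      return a;
    }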