Diffstat (limited to 'contrib/llvm/lib/Transforms/Scalar'): 58 files changed, 9712 insertions, 3373 deletions
diff --git a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp index adc903c..5b467dc 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ADCE.cpp @@ -41,8 +41,8 @@ using namespace llvm; STATISTIC(NumRemoved, "Number of instructions removed"); STATISTIC(NumBranchesRemoved, "Number of branch instructions removed"); -// This is a tempoary option until we change the interface -// to this pass based on optimization level. +// This is a temporary option until we change the interface to this pass based +// on optimization level. static cl::opt<bool> RemoveControlFlowFlag("adce-remove-control-flow", cl::init(true), cl::Hidden); @@ -110,7 +110,7 @@ class AggressiveDeadCodeElimination { /// The set of blocks which we have determined whose control /// dependence sources must be live and which have not had - /// those dependences analyized. + /// those dependences analyzed. SmallPtrSet<BasicBlock *, 16> NewLiveBlocks; /// Set up auxiliary data structures for Instructions and BasicBlocks and @@ -145,7 +145,7 @@ class AggressiveDeadCodeElimination { /// was removed. bool removeDeadInstructions(); - /// Identify connected sections of the control flow grap which have + /// Identify connected sections of the control flow graph which have /// dead terminators and rewrite the control flow graph to remove them. void updateDeadRegions(); @@ -234,7 +234,7 @@ void AggressiveDeadCodeElimination::initialize() { return Iter != end() && Iter->second; } } State; - + State.reserve(F.size()); // Iterate over blocks in depth-first pre-order and // treat all edges to a block already seen as loop back edges @@ -262,25 +262,6 @@ void AggressiveDeadCodeElimination::initialize() { continue; auto *BB = BBInfo.BB; if (!PDT.getNode(BB)) { - markLive(BBInfo.Terminator); - continue; - } - for (auto *Succ : successors(BB)) - if (!PDT.getNode(Succ)) { - markLive(BBInfo.Terminator); - break; - } - } - - // Mark blocks live if there is no path from the block to the - // return of the function or a successor for which this is true. - // This protects IDFCalculator which cannot handle such blocks. - for (auto &BBInfoPair : BlockInfo) { - auto &BBInfo = BBInfoPair.second; - if (BBInfo.terminatorIsLive()) - continue; - auto *BB = BBInfo.BB; - if (!PDT.getNode(BB)) { DEBUG(dbgs() << "Not post-dominated by return: " << BB->getName() << '\n';); markLive(BBInfo.Terminator); @@ -579,7 +560,7 @@ void AggressiveDeadCodeElimination::updateDeadRegions() { PreferredSucc = Info; } assert((PreferredSucc && PreferredSucc->PostOrder > 0) && - "Failed to find safe successor for dead branc"); + "Failed to find safe successor for dead branch"); bool First = true; for (auto *Succ : successors(BB)) { if (!First || Succ != PreferredSucc->BB) @@ -594,13 +575,13 @@ void AggressiveDeadCodeElimination::updateDeadRegions() { // reverse top-sort order void AggressiveDeadCodeElimination::computeReversePostOrder() { - - // This provides a post-order numbering of the reverse conrtol flow graph + + // This provides a post-order numbering of the reverse control flow graph // Note that it is incomplete in the presence of infinite loops but we don't // need numbers blocks which don't reach the end of the functions since // all branches in those blocks are forced live. 
- - // For each block without successors, extend the DFS from the bloack + + // For each block without successors, extend the DFS from the block // backward through the graph SmallPtrSet<BasicBlock*, 16> Visited; unsigned PostOrder = 0; @@ -644,8 +625,8 @@ PreservedAnalyses ADCEPass::run(Function &F, FunctionAnalysisManager &FAM) { if (!AggressiveDeadCodeElimination(F, PDT).performDeadCodeElimination()) return PreservedAnalyses::all(); - // FIXME: This should also 'preserve the CFG'. - auto PA = PreservedAnalyses(); + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); PA.preserve<GlobalsAA>(); return PA; } diff --git a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp index c1df317..99480f1 100644 --- a/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp @@ -19,12 +19,11 @@ #define AA_NAME "alignment-from-assumptions" #define DEBUG_TYPE AA_NAME #include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h" -#include "llvm/Transforms/Scalar.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ValueTracking.h" @@ -35,6 +34,7 @@ #include "llvm/IR/Module.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" using namespace llvm; STATISTIC(NumLoadAlignChanged, @@ -438,19 +438,13 @@ AlignmentFromAssumptionsPass::run(Function &F, FunctionAnalysisManager &AM) { AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F); ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F); DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F); - bool Changed = runImpl(F, AC, &SE, &DT); - - // FIXME: We need to invalidate this to avoid PR28400. Is there a better - // solution? - AM.invalidate<ScalarEvolutionAnalysis>(F); - - if (!Changed) + if (!runImpl(F, AC, &SE, &DT)) return PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); PA.preserve<AAManager>(); PA.preserve<ScalarEvolutionAnalysis>(); PA.preserve<GlobalsAA>(); - PA.preserve<LoopAnalysis>(); - PA.preserve<DominatorTreeAnalysis>(); return PA; } diff --git a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp index 251b387..2e56186 100644 --- a/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/BDCE.cpp @@ -15,6 +15,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/BDCE.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/DemandedBits.h" @@ -35,6 +36,46 @@ using namespace llvm; STATISTIC(NumRemoved, "Number of instructions removed (unused)"); STATISTIC(NumSimplified, "Number of instructions trivialized (dead bits)"); +/// If an instruction is trivialized (dead), then the chain of users of that +/// instruction may need to be cleared of assumptions that can no longer be +/// guaranteed correct. 
+static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) { + assert(I->getType()->isIntegerTy() && "Trivializing a non-integer value?"); + + // Initialize the worklist with eligible direct users. + SmallVector<Instruction *, 16> WorkList; + for (User *JU : I->users()) { + // If all bits of a user are demanded, then we know that nothing below that + // in the def-use chain needs to be changed. + auto *J = dyn_cast<Instruction>(JU); + if (J && !DB.getDemandedBits(J).isAllOnesValue()) + WorkList.push_back(J); + } + + // DFS through subsequent users while tracking visits to avoid cycles. + SmallPtrSet<Instruction *, 16> Visited; + while (!WorkList.empty()) { + Instruction *J = WorkList.pop_back_val(); + + // NSW, NUW, and exact are based on operands that might have changed. + J->dropPoisonGeneratingFlags(); + + // We do not have to worry about llvm.assume or range metadata: + // 1. llvm.assume demands its operand, so trivializing can't change it. + // 2. range metadata only applies to memory accesses which demand all bits. + + Visited.insert(J); + + for (User *KU : J->users()) { + // If all bits of a user are demanded, then we know that nothing below + // that in the def-use chain needs to be changed. + auto *K = dyn_cast<Instruction>(KU); + if (K && !Visited.count(K) && !DB.getDemandedBits(K).isAllOnesValue()) + WorkList.push_back(K); + } + } +} + static bool bitTrackingDCE(Function &F, DemandedBits &DB) { SmallVector<Instruction*, 128> Worklist; bool Changed = false; @@ -51,6 +92,9 @@ static bool bitTrackingDCE(Function &F, DemandedBits &DB) { // replacing all uses with something else. Then, if they don't need to // remain live (because they have side effects, etc.) we can remove them. DEBUG(dbgs() << "BDCE: Trivializing: " << I << " (all bits dead)\n"); + + clearAssumptionsOfUsers(&I, DB); + // FIXME: In theory we could substitute undef here instead of zero. // This should be reconsidered once we settle on the semantics of // undef, poison, etc. @@ -80,8 +124,8 @@ PreservedAnalyses BDCEPass::run(Function &F, FunctionAnalysisManager &AM) { if (!bitTrackingDCE(F, DB)) return PreservedAnalyses::all(); - // FIXME: This should also 'preserve the CFG'. - auto PA = PreservedAnalyses(); + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); PA.preserve<GlobalsAA>(); return PA; } diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp index 3826251..122c931 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -38,11 +38,13 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include <tuple> using namespace llvm; @@ -53,6 +55,12 @@ using namespace consthoist; STATISTIC(NumConstantsHoisted, "Number of constants hoisted"); STATISTIC(NumConstantsRebased, "Number of constants rebased"); +static cl::opt<bool> ConstHoistWithBlockFrequency( + "consthoist-with-block-frequency", cl::init(true), cl::Hidden, + cl::desc("Enable the use of the block frequency analysis to reduce the " + "chance to execute const materialization more frequently than " + "without hoisting.")); + namespace { /// \brief The constant hoisting pass. 
class ConstantHoistingLegacyPass : public FunctionPass { @@ -68,6 +76,8 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + if (ConstHoistWithBlockFrequency) + AU.addRequired<BlockFrequencyInfoWrapperPass>(); AU.addRequired<DominatorTreeWrapperPass>(); AU.addRequired<TargetTransformInfoWrapperPass>(); } @@ -82,6 +92,7 @@ private: char ConstantHoistingLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(ConstantHoistingLegacyPass, "consthoist", "Constant Hoisting", false, false) +INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(ConstantHoistingLegacyPass, "consthoist", @@ -99,9 +110,13 @@ bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) { DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n"); DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n'); - bool MadeChange = Impl.runImpl( - Fn, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn), - getAnalysis<DominatorTreeWrapperPass>().getDomTree(), Fn.getEntryBlock()); + bool MadeChange = + Impl.runImpl(Fn, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn), + getAnalysis<DominatorTreeWrapperPass>().getDomTree(), + ConstHoistWithBlockFrequency + ? &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI() + : nullptr, + Fn.getEntryBlock()); if (MadeChange) { DEBUG(dbgs() << "********** Function after Constant Hoisting: " @@ -136,37 +151,163 @@ Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst, if (Idx != ~0U && isa<PHINode>(Inst)) return cast<PHINode>(Inst)->getIncomingBlock(Idx)->getTerminator(); - BasicBlock *IDom = DT->getNode(Inst->getParent())->getIDom()->getBlock(); - return IDom->getTerminator(); + // This must be an EH pad. Iterate over immediate dominators until we find a + // non-EH pad. We need to skip over catchswitch blocks, which are both EH pads + // and terminators. + auto IDom = DT->getNode(Inst->getParent())->getIDom(); + while (IDom->getBlock()->isEHPad()) { + assert(Entry != IDom->getBlock() && "eh pad in entry block"); + IDom = IDom->getIDom(); + } + + return IDom->getBlock()->getTerminator(); +} + +/// \brief Given \p BBs as input, find another set of BBs which collectively +/// dominates \p BBs and have the minimal sum of frequencies. Return the BB +/// set found in \p BBs. +static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, + BasicBlock *Entry, + SmallPtrSet<BasicBlock *, 8> &BBs) { + assert(!BBs.count(Entry) && "Assume Entry is not in BBs"); + // Nodes on the current path to the root. + SmallPtrSet<BasicBlock *, 8> Path; + // Candidates includes any block 'BB' in set 'BBs' that is not strictly + // dominated by any other blocks in set 'BBs', and all nodes in the path + // in the dominator tree from Entry to 'BB'. + SmallPtrSet<BasicBlock *, 16> Candidates; + for (auto BB : BBs) { + Path.clear(); + // Walk up the dominator tree until Entry or another BB in BBs + // is reached. Insert the nodes on the way to the Path. + BasicBlock *Node = BB; + // The "Path" is a candidate path to be added into Candidates set. 
+ bool isCandidate = false; + do { + Path.insert(Node); + if (Node == Entry || Candidates.count(Node)) { + isCandidate = true; + break; + } + assert(DT.getNode(Node)->getIDom() && + "Entry doens't dominate current Node"); + Node = DT.getNode(Node)->getIDom()->getBlock(); + } while (!BBs.count(Node)); + + // If isCandidate is false, Node is another Block in BBs dominating + // current 'BB'. Drop the nodes on the Path. + if (!isCandidate) + continue; + + // Add nodes on the Path into Candidates. + Candidates.insert(Path.begin(), Path.end()); + } + + // Sort the nodes in Candidates in top-down order and save the nodes + // in Orders. + unsigned Idx = 0; + SmallVector<BasicBlock *, 16> Orders; + Orders.push_back(Entry); + while (Idx != Orders.size()) { + BasicBlock *Node = Orders[Idx++]; + for (auto ChildDomNode : DT.getNode(Node)->getChildren()) { + if (Candidates.count(ChildDomNode->getBlock())) + Orders.push_back(ChildDomNode->getBlock()); + } + } + + // Visit Orders in bottom-up order. + typedef std::pair<SmallPtrSet<BasicBlock *, 16>, BlockFrequency> + InsertPtsCostPair; + // InsertPtsMap is a map from a BB to the best insertion points for the + // subtree of BB (subtree not including the BB itself). + DenseMap<BasicBlock *, InsertPtsCostPair> InsertPtsMap; + InsertPtsMap.reserve(Orders.size() + 1); + for (auto RIt = Orders.rbegin(); RIt != Orders.rend(); RIt++) { + BasicBlock *Node = *RIt; + bool NodeInBBs = BBs.count(Node); + SmallPtrSet<BasicBlock *, 16> &InsertPts = InsertPtsMap[Node].first; + BlockFrequency &InsertPtsFreq = InsertPtsMap[Node].second; + + // Return the optimal insert points in BBs. + if (Node == Entry) { + BBs.clear(); + if (InsertPtsFreq > BFI.getBlockFreq(Node) || + (InsertPtsFreq == BFI.getBlockFreq(Node) && InsertPts.size() > 1)) + BBs.insert(Entry); + else + BBs.insert(InsertPts.begin(), InsertPts.end()); + break; + } + + BasicBlock *Parent = DT.getNode(Node)->getIDom()->getBlock(); + // Initially, ParentInsertPts is empty and ParentPtsFreq is 0. Every child + // will update its parent's ParentInsertPts and ParentPtsFreq. + SmallPtrSet<BasicBlock *, 16> &ParentInsertPts = InsertPtsMap[Parent].first; + BlockFrequency &ParentPtsFreq = InsertPtsMap[Parent].second; + // Choose to insert in Node or in subtree of Node. + // Don't hoist to EHPad because we may not find a proper place to insert + // in EHPad. + // If the total frequency of InsertPts is the same as the frequency of the + // target Node, and InsertPts contains more than one nodes, choose hoisting + // to reduce code size. + if (NodeInBBs || + (!Node->isEHPad() && + (InsertPtsFreq > BFI.getBlockFreq(Node) || + (InsertPtsFreq == BFI.getBlockFreq(Node) && InsertPts.size() > 1)))) { + ParentInsertPts.insert(Node); + ParentPtsFreq += BFI.getBlockFreq(Node); + } else { + ParentInsertPts.insert(InsertPts.begin(), InsertPts.end()); + ParentPtsFreq += InsertPtsFreq; + } + } } /// \brief Find an insertion point that dominates all uses. -Instruction *ConstantHoistingPass::findConstantInsertionPoint( +SmallPtrSet<Instruction *, 8> ConstantHoistingPass::findConstantInsertionPoint( const ConstantInfo &ConstInfo) const { assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry."); // Collect all basic blocks. 
SmallPtrSet<BasicBlock *, 8> BBs; + SmallPtrSet<Instruction *, 8> InsertPts; for (auto const &RCI : ConstInfo.RebasedConstants) for (auto const &U : RCI.Uses) BBs.insert(findMatInsertPt(U.Inst, U.OpndIdx)->getParent()); - if (BBs.count(Entry)) - return &Entry->front(); + if (BBs.count(Entry)) { + InsertPts.insert(&Entry->front()); + return InsertPts; + } + + if (BFI) { + findBestInsertionSet(*DT, *BFI, Entry, BBs); + for (auto BB : BBs) { + BasicBlock::iterator InsertPt = BB->begin(); + for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt) + ; + InsertPts.insert(&*InsertPt); + } + return InsertPts; + } while (BBs.size() >= 2) { BasicBlock *BB, *BB1, *BB2; BB1 = *BBs.begin(); BB2 = *std::next(BBs.begin()); BB = DT->findNearestCommonDominator(BB1, BB2); - if (BB == Entry) - return &Entry->front(); + if (BB == Entry) { + InsertPts.insert(&Entry->front()); + return InsertPts; + } BBs.erase(BB1); BBs.erase(BB2); BBs.insert(BB); } assert((BBs.size() == 1) && "Expected only one element."); Instruction &FirstInst = (*BBs.begin())->front(); - return findMatInsertPt(&FirstInst); + InsertPts.insert(findMatInsertPt(&FirstInst)); + return InsertPts; } @@ -210,68 +351,65 @@ void ConstantHoistingPass::collectConstantCandidates( } } -/// \brief Scan the instruction for expensive integer constants and record them -/// in the constant candidate vector. -void ConstantHoistingPass::collectConstantCandidates( - ConstCandMapType &ConstCandMap, Instruction *Inst) { - // Skip all cast instructions. They are visited indirectly later on. - if (Inst->isCast()) - return; - - // Can't handle inline asm. Skip it. - if (auto Call = dyn_cast<CallInst>(Inst)) - if (isa<InlineAsm>(Call->getCalledValue())) - return; - // Switch cases must remain constant, and if the value being tested is - // constant the entire thing should disappear. - if (isa<SwitchInst>(Inst)) - return; +/// \brief Check the operand for instruction Inst at index Idx. +void ConstantHoistingPass::collectConstantCandidates( + ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx) { + Value *Opnd = Inst->getOperand(Idx); - // Static allocas (constant size in the entry block) are handled by - // prologue/epilogue insertion so they're free anyway. We definitely don't - // want to make them non-constant. - auto AI = dyn_cast<AllocaInst>(Inst); - if (AI && AI->isStaticAlloca()) + // Visit constant integers. + if (auto ConstInt = dyn_cast<ConstantInt>(Opnd)) { + collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt); return; + } - // Scan all operands. - for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) { - Value *Opnd = Inst->getOperand(Idx); + // Visit cast instructions that have constant integers. + if (auto CastInst = dyn_cast<Instruction>(Opnd)) { + // Only visit cast instructions, which have been skipped. All other + // instructions should have already been visited. + if (!CastInst->isCast()) + return; - // Visit constant integers. - if (auto ConstInt = dyn_cast<ConstantInt>(Opnd)) { + if (auto *ConstInt = dyn_cast<ConstantInt>(CastInst->getOperand(0))) { + // Pretend the constant is directly used by the instruction and ignore + // the cast instruction. collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt); - continue; + return; } + } - // Visit cast instructions that have constant integers. - if (auto CastInst = dyn_cast<Instruction>(Opnd)) { - // Only visit cast instructions, which have been skipped. All other - // instructions should have already been visited. 
- if (!CastInst->isCast()) - continue; + // Visit constant expressions that have constant integers. + if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) { + // Only visit constant cast expressions. + if (!ConstExpr->isCast()) + return; - if (auto *ConstInt = dyn_cast<ConstantInt>(CastInst->getOperand(0))) { - // Pretend the constant is directly used by the instruction and ignore - // the cast instruction. - collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt); - continue; - } + if (auto ConstInt = dyn_cast<ConstantInt>(ConstExpr->getOperand(0))) { + // Pretend the constant is directly used by the instruction and ignore + // the constant expression. + collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt); + return; } + } +} - // Visit constant expressions that have constant integers. - if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) { - // Only visit constant cast expressions. - if (!ConstExpr->isCast()) - continue; - if (auto ConstInt = dyn_cast<ConstantInt>(ConstExpr->getOperand(0))) { - // Pretend the constant is directly used by the instruction and ignore - // the constant expression. - collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt); - continue; - } +/// \brief Scan the instruction for expensive integer constants and record them +/// in the constant candidate vector. +void ConstantHoistingPass::collectConstantCandidates( + ConstCandMapType &ConstCandMap, Instruction *Inst) { + // Skip all cast instructions. They are visited indirectly later on. + if (Inst->isCast()) + return; + + // Scan all operands. + for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) { + // The cost of materializing the constants (defined in + // `TargetTransformInfo::getIntImmCost`) for instructions which only take + // constant variables is lower than `TargetTransformInfo::TCC_Basic`. So + // it's safe for us to collect constant candidates from all IntrinsicInsts. + if (canReplaceOperandWithVariable(Inst, Idx) || isa<IntrinsicInst>(Inst)) { + collectConstantCandidates(ConstCandMap, Inst, Idx); } } // end of for all operands } @@ -289,8 +427,8 @@ void ConstantHoistingPass::collectConstantCandidates(Function &Fn) { // bit widths (APInt Operator- does not like that). If the value cannot be // represented in uint64 we return an "empty" APInt. This is then interpreted // as the value is not in range. -static llvm::Optional<APInt> calculateOffsetDiff(APInt V1, APInt V2) -{ +static llvm::Optional<APInt> calculateOffsetDiff(const APInt &V1, + const APInt &V2) { llvm::Optional<APInt> Res = None; unsigned BW = V1.getBitWidth() > V2.getBitWidth() ? V1.getBitWidth() : V2.getBitWidth(); @@ -549,29 +687,54 @@ bool ConstantHoistingPass::emitBaseConstants() { bool MadeChange = false; for (auto const &ConstInfo : ConstantVec) { // Hoist and hide the base constant behind a bitcast. 
- Instruction *IP = findConstantInsertionPoint(ConstInfo); - IntegerType *Ty = ConstInfo.BaseConstant->getType(); - Instruction *Base = - new BitCastInst(ConstInfo.BaseConstant, Ty, "const", IP); - DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant << ") to BB " - << IP->getParent()->getName() << '\n' << *Base << '\n'); - NumConstantsHoisted++; + SmallPtrSet<Instruction *, 8> IPSet = findConstantInsertionPoint(ConstInfo); + assert(!IPSet.empty() && "IPSet is empty"); + + unsigned UsesNum = 0; + unsigned ReBasesNum = 0; + for (Instruction *IP : IPSet) { + IntegerType *Ty = ConstInfo.BaseConstant->getType(); + Instruction *Base = + new BitCastInst(ConstInfo.BaseConstant, Ty, "const", IP); + DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseConstant + << ") to BB " << IP->getParent()->getName() << '\n' + << *Base << '\n'); + + // Emit materialization code for all rebased constants. + unsigned Uses = 0; + for (auto const &RCI : ConstInfo.RebasedConstants) { + for (auto const &U : RCI.Uses) { + Uses++; + BasicBlock *OrigMatInsertBB = + findMatInsertPt(U.Inst, U.OpndIdx)->getParent(); + // If Base constant is to be inserted in multiple places, + // generate rebase for U using the Base dominating U. + if (IPSet.size() == 1 || + DT->dominates(Base->getParent(), OrigMatInsertBB)) { + emitBaseConstants(Base, RCI.Offset, U); + ReBasesNum++; + } + } + } + UsesNum = Uses; - // Emit materialization code for all rebased constants. - for (auto const &RCI : ConstInfo.RebasedConstants) { - NumConstantsRebased++; - for (auto const &U : RCI.Uses) - emitBaseConstants(Base, RCI.Offset, U); + // Use the same debug location as the last user of the constant. + assert(!Base->use_empty() && "The use list is empty!?"); + assert(isa<Instruction>(Base->user_back()) && + "All uses should be instructions."); + Base->setDebugLoc(cast<Instruction>(Base->user_back())->getDebugLoc()); } + (void)UsesNum; + (void)ReBasesNum; + // Expect all uses are rebased after rebase is done. + assert(UsesNum == ReBasesNum && "Not all uses are rebased"); + + NumConstantsHoisted++; - // Use the same debug location as the last user of the constant. - assert(!Base->use_empty() && "The use list is empty!?"); - assert(isa<Instruction>(Base->user_back()) && - "All uses should be instructions."); - Base->setDebugLoc(cast<Instruction>(Base->user_back())->getDebugLoc()); + // Base constant is also included in ConstInfo.RebasedConstants, so + // deduct 1 from ConstInfo.RebasedConstants.size(). + NumConstantsRebased = ConstInfo.RebasedConstants.size() - 1; - // Correct for base constant, which we counted above too. - NumConstantsRebased--; MadeChange = true; } return MadeChange; @@ -587,9 +750,11 @@ void ConstantHoistingPass::deleteDeadCastInst() const { /// \brief Optimize expensive integer constants in the given function. bool ConstantHoistingPass::runImpl(Function &Fn, TargetTransformInfo &TTI, - DominatorTree &DT, BasicBlock &Entry) { + DominatorTree &DT, BlockFrequencyInfo *BFI, + BasicBlock &Entry) { this->TTI = &TTI; this->DT = &DT; + this->BFI = BFI; this->Entry = &Entry; // Collect all constant candidates. collectConstantCandidates(Fn); @@ -620,9 +785,13 @@ PreservedAnalyses ConstantHoistingPass::run(Function &F, FunctionAnalysisManager &AM) { auto &DT = AM.getResult<DominatorTreeAnalysis>(F); auto &TTI = AM.getResult<TargetIRAnalysis>(F); - if (!runImpl(F, TTI, DT, F.getEntryBlock())) + auto BFI = ConstHoistWithBlockFrequency + ? 
&AM.getResult<BlockFrequencyAnalysis>(F) + : nullptr; + if (!runImpl(F, TTI, DT, BFI, F.getEntryBlock())) return PreservedAnalyses::all(); - // FIXME: This should also 'preserve the CFG'. - return PreservedAnalyses::none(); + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); + return PA; } diff --git a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp index 9e98219..4fa2789 100644 --- a/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/ConstantProp.cpp @@ -18,15 +18,15 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/Constant.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" #include "llvm/Pass.h" -#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" #include <set> using namespace llvm; diff --git a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 84f9373..2815778 100644 --- a/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -12,8 +12,8 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h" -#include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyValueInfo.h" @@ -26,6 +26,7 @@ #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -95,7 +96,8 @@ static bool processSelect(SelectInst *S, LazyValueInfo *LVI) { return true; } -static bool processPHI(PHINode *P, LazyValueInfo *LVI) { +static bool processPHI(PHINode *P, LazyValueInfo *LVI, + const SimplifyQuery &SQ) { bool Changed = false; BasicBlock *BB = P->getParent(); @@ -149,9 +151,7 @@ static bool processPHI(PHINode *P, LazyValueInfo *LVI) { Changed = true; } - // FIXME: Provide TLI, DT, AT to SimplifyInstruction. - const DataLayout &DL = BB->getModule()->getDataLayout(); - if (Value *V = SimplifyInstruction(P, DL)) { + if (Value *V = SimplifyInstruction(P, SQ)) { P->replaceAllUsesWith(V); P->eraseFromParent(); Changed = true; @@ -232,12 +232,10 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) { pred_iterator PB = pred_begin(BB), PE = pred_end(BB); if (PB == PE) return false; - // Analyse each switch case in turn. This is done in reverse order so that - // removing a case doesn't cause trouble for the iteration. + // Analyse each switch case in turn. bool Changed = false; - for (SwitchInst::CaseIt CI = SI->case_end(), CE = SI->case_begin(); CI-- != CE; - ) { - ConstantInt *Case = CI.getCaseValue(); + for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) { + ConstantInt *Case = CI->getCaseValue(); // Check to see if the switch condition is equal to/not equal to the case // value on every incoming edge, equal/not equal being the same each time. 
@@ -270,8 +268,9 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) { if (State == LazyValueInfo::False) { // This case never fires - remove it. - CI.getCaseSuccessor()->removePredecessor(BB); - SI->removeCase(CI); // Does not invalidate the iterator. + CI->getCaseSuccessor()->removePredecessor(BB); + CI = SI->removeCase(CI); + CE = SI->case_end(); // The condition can be modified by removePredecessor's PHI simplification // logic. @@ -279,7 +278,9 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) { ++NumDeadCases; Changed = true; - } else if (State == LazyValueInfo::True) { + continue; + } + if (State == LazyValueInfo::True) { // This case always fires. Arrange for the switch to be turned into an // unconditional branch by replacing the switch condition with the case // value. @@ -288,6 +289,9 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) { Changed = true; break; } + + // Increment the case iterator since we didn't delete it. + ++CI; } if (Changed) @@ -300,7 +304,7 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) { /// Infer nonnull attributes for the arguments at the specified callsite. static bool processCallSite(CallSite CS, LazyValueInfo *LVI) { - SmallVector<unsigned, 4> Indices; + SmallVector<unsigned, 4> ArgNos; unsigned ArgNo = 0; for (Value *V : CS.args()) { @@ -308,23 +312,24 @@ static bool processCallSite(CallSite CS, LazyValueInfo *LVI) { // Try to mark pointer typed parameters as non-null. We skip the // relatively expensive analysis for constants which are obviously either // null or non-null to start with. - if (Type && !CS.paramHasAttr(ArgNo + 1, Attribute::NonNull) && + if (Type && !CS.paramHasAttr(ArgNo, Attribute::NonNull) && !isa<Constant>(V) && LVI->getPredicateAt(ICmpInst::ICMP_EQ, V, ConstantPointerNull::get(Type), CS.getInstruction()) == LazyValueInfo::False) - Indices.push_back(ArgNo + 1); + ArgNos.push_back(ArgNo); ArgNo++; } assert(ArgNo == CS.arg_size() && "sanity check"); - if (Indices.empty()) + if (ArgNos.empty()) return false; - AttributeSet AS = CS.getAttributes(); + AttributeList AS = CS.getAttributes(); LLVMContext &Ctx = CS.getInstruction()->getContext(); - AS = AS.addAttribute(Ctx, Indices, Attribute::get(Ctx, Attribute::NonNull)); + AS = AS.addParamAttribute(Ctx, ArgNos, + Attribute::get(Ctx, Attribute::NonNull)); CS.setAttributes(AS); return true; @@ -437,9 +442,8 @@ static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) { bool Changed = false; if (!NUW) { - ConstantRange NUWRange = - LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange, - OBO::NoUnsignedWrap); + ConstantRange NUWRange = ConstantRange::makeGuaranteedNoWrapRegion( + BinaryOperator::Add, LRange, OBO::NoUnsignedWrap); if (!NUWRange.isEmptySet()) { bool NewNUW = NUWRange.contains(LazyRRange()); AddOp->setHasNoUnsignedWrap(NewNUW); @@ -447,9 +451,8 @@ static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) { } } if (!NSW) { - ConstantRange NSWRange = - LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange, - OBO::NoSignedWrap); + ConstantRange NSWRange = ConstantRange::makeGuaranteedNoWrapRegion( + BinaryOperator::Add, LRange, OBO::NoSignedWrap); if (!NSWRange.isEmptySet()) { bool NewNSW = NSWRange.contains(LazyRRange()); AddOp->setHasNoSignedWrap(NewNSW); @@ -483,9 +486,8 @@ static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) { ConstantInt::getFalse(C->getContext()); } -static bool runImpl(Function &F, LazyValueInfo *LVI) { +static bool runImpl(Function &F, 
LazyValueInfo *LVI, const SimplifyQuery &SQ) { bool FnChanged = false; - // Visiting in a pre-order depth-first traversal causes us to simplify early // blocks before querying later blocks (which require us to analyze early // blocks). Eagerly simplifying shallow blocks means there is strictly less @@ -500,7 +502,7 @@ static bool runImpl(Function &F, LazyValueInfo *LVI) { BBChanged |= processSelect(cast<SelectInst>(II), LVI); break; case Instruction::PHI: - BBChanged |= processPHI(cast<PHINode>(II), LVI); + BBChanged |= processPHI(cast<PHINode>(II), LVI, SQ); break; case Instruction::ICmp: case Instruction::FCmp: @@ -548,7 +550,7 @@ static bool runImpl(Function &F, LazyValueInfo *LVI) { BBChanged = true; } } - }; + } FnChanged |= BBChanged; } @@ -561,18 +563,14 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) { return false; LazyValueInfo *LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI(); - return runImpl(F, LVI); + return runImpl(F, LVI, getBestSimplifyQuery(*this, F)); } PreservedAnalyses CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) { LazyValueInfo *LVI = &AM.getResult<LazyValueAnalysis>(F); - bool Changed = runImpl(F, LVI); - - // FIXME: We need to invalidate LVI to avoid PR28400. Is there a better - // solution? - AM.invalidate<LazyValueAnalysis>(F); + bool Changed = runImpl(F, LVI, getBestSimplifyQuery(AM, F)); if (!Changed) return PreservedAnalyses::all(); diff --git a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp index cc2a3cf..fa4806e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/DCE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/DCE.cpp @@ -19,10 +19,10 @@ #include "llvm/Transforms/Scalar/DCE.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instruction.h" #include "llvm/Pass.h" -#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -124,9 +124,12 @@ static bool eliminateDeadCode(Function &F, TargetLibraryInfo *TLI) { } PreservedAnalyses DCEPass::run(Function &F, FunctionAnalysisManager &AM) { - if (eliminateDeadCode(F, AM.getCachedResult<TargetLibraryAnalysis>(F))) - return PreservedAnalyses::none(); - return PreservedAnalyses::all(); + if (!eliminateDeadCode(F, AM.getCachedResult<TargetLibraryAnalysis>(F))) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); + return PA; } namespace { diff --git a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 4d4c3ba..1ec38e5 100644 --- a/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -135,13 +135,13 @@ static bool hasMemoryWrite(Instruction *I, const TargetLibraryInfo &TLI) { if (auto CS = CallSite(I)) { if (Function *F = CS.getCalledFunction()) { StringRef FnName = F->getName(); - if (TLI.has(LibFunc::strcpy) && FnName == TLI.getName(LibFunc::strcpy)) + if (TLI.has(LibFunc_strcpy) && FnName == TLI.getName(LibFunc_strcpy)) return true; - if (TLI.has(LibFunc::strncpy) && FnName == TLI.getName(LibFunc::strncpy)) + if (TLI.has(LibFunc_strncpy) && FnName == TLI.getName(LibFunc_strncpy)) return true; - if (TLI.has(LibFunc::strcat) && FnName == TLI.getName(LibFunc::strcat)) + if (TLI.has(LibFunc_strcat) && FnName == TLI.getName(LibFunc_strcat)) 
return true; - if (TLI.has(LibFunc::strncat) && FnName == TLI.getName(LibFunc::strncat)) + if (TLI.has(LibFunc_strncat) && FnName == TLI.getName(LibFunc_strncat)) return true; } } @@ -287,19 +287,14 @@ static uint64_t getPointerSize(const Value *V, const DataLayout &DL, } namespace { -enum OverwriteResult { - OverwriteBegin, - OverwriteComplete, - OverwriteEnd, - OverwriteUnknown -}; +enum OverwriteResult { OW_Begin, OW_Complete, OW_End, OW_Unknown }; } -/// Return 'OverwriteComplete' if a store to the 'Later' location completely -/// overwrites a store to the 'Earlier' location, 'OverwriteEnd' if the end of -/// the 'Earlier' location is completely overwritten by 'Later', -/// 'OverwriteBegin' if the beginning of the 'Earlier' location is overwritten -/// by 'Later', or 'OverwriteUnknown' if nothing can be determined. +/// Return 'OW_Complete' if a store to the 'Later' location completely +/// overwrites a store to the 'Earlier' location, 'OW_End' if the end of the +/// 'Earlier' location is completely overwritten by 'Later', 'OW_Begin' if the +/// beginning of the 'Earlier' location is overwritten by 'Later', or +/// 'OW_Unknown' if nothing can be determined. static OverwriteResult isOverwrite(const MemoryLocation &Later, const MemoryLocation &Earlier, const DataLayout &DL, @@ -310,7 +305,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later, // If we don't know the sizes of either access, then we can't do a comparison. if (Later.Size == MemoryLocation::UnknownSize || Earlier.Size == MemoryLocation::UnknownSize) - return OverwriteUnknown; + return OW_Unknown; const Value *P1 = Earlier.Ptr->stripPointerCasts(); const Value *P2 = Later.Ptr->stripPointerCasts(); @@ -320,7 +315,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later, if (P1 == P2) { // Make sure that the Later size is >= the Earlier size. if (Later.Size >= Earlier.Size) - return OverwriteComplete; + return OW_Complete; } // Check to see if the later store is to the entire object (either a global, @@ -332,13 +327,13 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later, // If we can't resolve the same pointers to the same object, then we can't // analyze them at all. if (UO1 != UO2) - return OverwriteUnknown; + return OW_Unknown; // If the "Later" store is to a recognizable object, get its size. uint64_t ObjectSize = getPointerSize(UO2, DL, TLI); if (ObjectSize != MemoryLocation::UnknownSize) if (ObjectSize == Later.Size && ObjectSize >= Earlier.Size) - return OverwriteComplete; + return OW_Complete; // Okay, we have stores to two completely different pointers. Try to // decompose the pointer into a "base + constant_offset" form. If the base @@ -350,7 +345,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later, // If the base pointers still differ, we have two completely different stores. if (BP1 != BP2) - return OverwriteUnknown; + return OW_Unknown; // The later store completely overlaps the earlier store if: // @@ -370,7 +365,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later, if (EarlierOff >= LaterOff && Later.Size >= Earlier.Size && uint64_t(EarlierOff - LaterOff) + Earlier.Size <= Later.Size) - return OverwriteComplete; + return OW_Complete; // We may now overlap, although the overlap is not complete. 
There might also // be other incomplete overlaps, and together, they might cover the complete @@ -428,7 +423,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later, ") Composite Later [" << ILI->second << ", " << ILI->first << ")\n"); ++NumCompletePartials; - return OverwriteComplete; + return OW_Complete; } } @@ -443,7 +438,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later, if (!EnablePartialOverwriteTracking && (LaterOff > EarlierOff && LaterOff < int64_t(EarlierOff + Earlier.Size) && int64_t(LaterOff + Later.Size) >= int64_t(EarlierOff + Earlier.Size))) - return OverwriteEnd; + return OW_End; // Finally, we also need to check if the later store overwrites the beginning // of the earlier store. @@ -458,11 +453,11 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later, (LaterOff <= EarlierOff && int64_t(LaterOff + Later.Size) > EarlierOff)) { assert(int64_t(LaterOff + Later.Size) < int64_t(EarlierOff + Earlier.Size) && - "Expect to be handled as OverwriteComplete"); - return OverwriteBegin; + "Expect to be handled as OW_Complete"); + return OW_Begin; } // Otherwise, they don't completely overlap. - return OverwriteUnknown; + return OW_Unknown; } /// If 'Inst' might be a self read (i.e. a noop copy of a @@ -551,7 +546,7 @@ static bool memoryIsNotModifiedBetween(Instruction *FirstI, Instruction *I = &*BI; if (I->mayWriteToMemory() && I != SecondI) { auto Res = AA->getModRefInfo(I, MemLoc); - if (Res != MRI_NoModRef) + if (Res & MRI_Mod) return false; } } @@ -909,7 +904,7 @@ static bool tryToShortenBegin(Instruction *EarlierWrite, if (LaterStart <= EarlierStart && LaterStart + LaterSize > EarlierStart) { assert(LaterStart + LaterSize < EarlierStart + EarlierSize && - "Should have been handled as OverwriteComplete"); + "Should have been handled as OW_Complete"); if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart, LaterSize, false)) { IntervalMap.erase(OII); @@ -1105,7 +1100,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, OverwriteResult OR = isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset, DepWrite, IOL); - if (OR == OverwriteComplete) { + if (OR == OW_Complete) { DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite << "\n KILLER: " << *Inst << '\n'); @@ -1117,15 +1112,15 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA, // We erased DepWrite; start over. 
InstDep = MD->getDependency(Inst); continue; - } else if ((OR == OverwriteEnd && isShortenableAtTheEnd(DepWrite)) || - ((OR == OverwriteBegin && + } else if ((OR == OW_End && isShortenableAtTheEnd(DepWrite)) || + ((OR == OW_Begin && isShortenableAtTheBeginning(DepWrite)))) { assert(!EnablePartialOverwriteTracking && "Do not expect to perform " "when partial-overwrite " "tracking is enabled"); int64_t EarlierSize = DepLoc.Size; int64_t LaterSize = Loc.Size; - bool IsOverwriteEnd = (OR == OverwriteEnd); + bool IsOverwriteEnd = (OR == OW_End); MadeChange |= tryToShorten(DepWrite, DepWriteOffset, EarlierSize, InstWriteOffset, LaterSize, IsOverwriteEnd); } @@ -1186,8 +1181,9 @@ PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) { if (!eliminateDeadStores(F, AA, MD, DT, TLI)) return PreservedAnalyses::all(); + PreservedAnalyses PA; - PA.preserve<DominatorTreeAnalysis>(); + PA.preserveSet<CFGAnalyses>(); PA.preserve<GlobalsAA>(); PA.preserve<MemoryDependenceAnalysis>(); return PA; diff --git a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp index 16e08ee..c5c9b2c 100644 --- a/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/EarlyCSE.cpp @@ -15,10 +15,13 @@ #include "llvm/Transforms/Scalar/EarlyCSE.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/ScopedHashTable.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/DataLayout.h" @@ -32,7 +35,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/MemorySSA.h" #include <deque> using namespace llvm; using namespace llvm::PatternMatch; @@ -252,7 +254,9 @@ public: const TargetTransformInfo &TTI; DominatorTree &DT; AssumptionCache &AC; + const SimplifyQuery SQ; MemorySSA *MSSA; + std::unique_ptr<MemorySSAUpdater> MSSAUpdater; typedef RecyclingAllocator< BumpPtrAllocator, ScopedHashTableVal<SimpleValue, Value *>> AllocatorTy; typedef ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>, @@ -313,9 +317,12 @@ public: unsigned CurrentGeneration; /// \brief Set up the EarlyCSE runner for a particular function. 
- EarlyCSE(const TargetLibraryInfo &TLI, const TargetTransformInfo &TTI, - DominatorTree &DT, AssumptionCache &AC, MemorySSA *MSSA) - : TLI(TLI), TTI(TTI), DT(DT), AC(AC), MSSA(MSSA), CurrentGeneration(0) {} + EarlyCSE(const DataLayout &DL, const TargetLibraryInfo &TLI, + const TargetTransformInfo &TTI, DominatorTree &DT, + AssumptionCache &AC, MemorySSA *MSSA) + : TLI(TLI), TTI(TTI), DT(DT), AC(AC), SQ(DL, &TLI, &DT, &AC), MSSA(MSSA), + MSSAUpdater(make_unique<MemorySSAUpdater>(MSSA)), CurrentGeneration(0) { + } bool run(); @@ -388,7 +395,7 @@ private: ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI) : IsTargetMemInst(false), Inst(Inst) { if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) - if (TTI.getTgtMemIntrinsic(II, Info) && Info.NumMemRefs == 1) + if (TTI.getTgtMemIntrinsic(II, Info)) IsTargetMemInst = true; } bool isLoad() const { @@ -400,17 +407,14 @@ private: return isa<StoreInst>(Inst); } bool isAtomic() const { - if (IsTargetMemInst) { - assert(Info.IsSimple && "need to refine IsSimple in TTI"); - return false; - } + if (IsTargetMemInst) + return Info.Ordering != AtomicOrdering::NotAtomic; return Inst->isAtomic(); } bool isUnordered() const { - if (IsTargetMemInst) { - assert(Info.IsSimple && "need to refine IsSimple in TTI"); - return true; - } + if (IsTargetMemInst) + return Info.isUnordered(); + if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { return LI->isUnordered(); } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { @@ -421,10 +425,9 @@ private: } bool isVolatile() const { - if (IsTargetMemInst) { - assert(Info.IsSimple && "need to refine IsSimple in TTI"); - return false; - } + if (IsTargetMemInst) + return Info.IsVolatile; + if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { return LI->isVolatile(); } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { @@ -504,7 +507,7 @@ private: if (MemoryAccess *MA = MSSA->getMemoryAccess(Inst)) { // Optimize MemoryPhi nodes that may become redundant by having all the // same input values once MA is removed. - SmallVector<MemoryPhi *, 4> PhisToCheck; + SmallSetVector<MemoryPhi *, 4> PhisToCheck; SmallVector<MemoryAccess *, 8> WorkQueue; WorkQueue.push_back(MA); // Process MemoryPhi nodes in FIFO order using a ever-growing vector since @@ -515,9 +518,9 @@ private: for (auto *U : WI->users()) if (MemoryPhi *MP = dyn_cast<MemoryPhi>(U)) - PhisToCheck.push_back(MP); + PhisToCheck.insert(MP); - MSSA->removeMemoryAccess(WI); + MSSAUpdater->removeMemoryAccess(WI); for (MemoryPhi *MP : PhisToCheck) { MemoryAccess *FirstIn = MP->getIncomingValue(0); @@ -559,13 +562,27 @@ bool EarlyCSE::isSameMemGeneration(unsigned EarlierGeneration, if (!MSSA) return false; + // If MemorySSA has determined that one of EarlierInst or LaterInst does not + // read/write memory, then we can safely return true here. + // FIXME: We could be more aggressive when checking doesNotAccessMemory(), + // onlyReadsMemory(), mayReadFromMemory(), and mayWriteToMemory() in this pass + // by also checking the MemorySSA MemoryAccess on the instruction. Initial + // experiments suggest this isn't worthwhile, at least for C/C++ code compiled + // with the default optimization pipeline. 
+ auto *EarlierMA = MSSA->getMemoryAccess(EarlierInst); + if (!EarlierMA) + return true; + auto *LaterMA = MSSA->getMemoryAccess(LaterInst); + if (!LaterMA) + return true; + // Since we know LaterDef dominates LaterInst and EarlierInst dominates // LaterInst, if LaterDef dominates EarlierInst then it can't occur between // EarlierInst and LaterInst and neither can any other write that potentially // clobbers LaterInst. MemoryAccess *LaterDef = MSSA->getWalker()->getClobberingMemoryAccess(LaterInst); - return MSSA->dominates(LaterDef, MSSA->getMemoryAccess(EarlierInst)); + return MSSA->dominates(LaterDef, EarlierMA); } bool EarlyCSE::processNode(DomTreeNode *Node) { @@ -587,27 +604,28 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // which reaches this block where the condition might hold a different // value. Since we're adding this to the scoped hash table (like any other // def), it will have been popped if we encounter a future merge block. - if (BasicBlock *Pred = BB->getSinglePredecessor()) - if (auto *BI = dyn_cast<BranchInst>(Pred->getTerminator())) - if (BI->isConditional()) - if (auto *CondInst = dyn_cast<Instruction>(BI->getCondition())) - if (SimpleValue::canHandle(CondInst)) { - assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB); - auto *ConditionalConstant = (BI->getSuccessor(0) == BB) ? - ConstantInt::getTrue(BB->getContext()) : - ConstantInt::getFalse(BB->getContext()); - AvailableValues.insert(CondInst, ConditionalConstant); - DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '" - << CondInst->getName() << "' as " << *ConditionalConstant - << " in " << BB->getName() << "\n"); - // Replace all dominated uses with the known value. - if (unsigned Count = - replaceDominatedUsesWith(CondInst, ConditionalConstant, DT, - BasicBlockEdge(Pred, BB))) { - Changed = true; - NumCSECVP = NumCSECVP + Count; - } - } + if (BasicBlock *Pred = BB->getSinglePredecessor()) { + auto *BI = dyn_cast<BranchInst>(Pred->getTerminator()); + if (BI && BI->isConditional()) { + auto *CondInst = dyn_cast<Instruction>(BI->getCondition()); + if (CondInst && SimpleValue::canHandle(CondInst)) { + assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB); + auto *TorF = (BI->getSuccessor(0) == BB) + ? ConstantInt::getTrue(BB->getContext()) + : ConstantInt::getFalse(BB->getContext()); + AvailableValues.insert(CondInst, TorF); + DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '" + << CondInst->getName() << "' as " << *TorF << " in " + << BB->getName() << "\n"); + // Replace all dominated uses with the known value. + if (unsigned Count = replaceDominatedUsesWith( + CondInst, TorF, DT, BasicBlockEdge(Pred, BB))) { + Changed = true; + NumCSECVP += Count; + } + } + } + } /// LastStore - Keep track of the last non-volatile store that we saw... for /// as long as there in no instruction that reads memory. If we see a store @@ -615,8 +633,6 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { /// stores which can occur in bitfield code among other things. Instruction *LastStore = nullptr; - const DataLayout &DL = BB->getModule()->getDataLayout(); - // See if any instructions in the block can be eliminated. If so, do it. If // not, add them to AvailableValues. 
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) { @@ -634,10 +650,16 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // Skip assume intrinsics, they don't really have side effects (although // they're marked as such to ensure preservation of control dependencies), - // and this pass will not disturb any of the assumption's control - // dependencies. + // and this pass will not bother with its removal. However, we should mark + // its condition as true for all dominated blocks. if (match(Inst, m_Intrinsic<Intrinsic::assume>())) { - DEBUG(dbgs() << "EarlyCSE skipping assumption: " << *Inst << '\n'); + auto *CondI = + dyn_cast<Instruction>(cast<CallInst>(Inst)->getArgOperand(0)); + if (CondI && SimpleValue::canHandle(CondI)) { + DEBUG(dbgs() << "EarlyCSE considering assumption: " << *Inst << '\n'); + AvailableValues.insert(CondI, ConstantInt::getTrue(BB->getContext())); + } else + DEBUG(dbgs() << "EarlyCSE skipping assumption: " << *Inst << '\n'); continue; } @@ -657,10 +679,25 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { if (match(Inst, m_Intrinsic<Intrinsic::experimental_guard>())) { if (auto *CondI = dyn_cast<Instruction>(cast<CallInst>(Inst)->getArgOperand(0))) { - // The condition we're on guarding here is true for all dominated - // locations. - if (SimpleValue::canHandle(CondI)) + if (SimpleValue::canHandle(CondI)) { + // Do we already know the actual value of this condition? + if (auto *KnownCond = AvailableValues.lookup(CondI)) { + // Is the condition known to be true? + if (isa<ConstantInt>(KnownCond) && + cast<ConstantInt>(KnownCond)->isOne()) { + DEBUG(dbgs() << "EarlyCSE removing guard: " << *Inst << '\n'); + removeMSSA(Inst); + Inst->eraseFromParent(); + Changed = true; + continue; + } else + // Use the known value if it wasn't true. + cast<CallInst>(Inst)->setArgOperand(0, KnownCond); + } + // The condition we're on guarding here is true for all dominated + // locations. AvailableValues.insert(CondI, ConstantInt::getTrue(BB->getContext())); + } } // Guard intrinsics read all memory, but don't write any memory. @@ -672,7 +709,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // If the instruction can be simplified (e.g. X+0 = X) then replace it with // its simpler value. - if (Value *V = SimplifyInstruction(Inst, DL, &TLI, &DT, &AC)) { + if (Value *V = SimplifyInstruction(Inst, SQ)) { DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n'); bool Killed = false; if (!Inst->use_empty()) { @@ -761,12 +798,13 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { continue; } - // If this instruction may read from memory, forget LastStore. - // Load/store intrinsics will indicate both a read and a write to - // memory. The target may override this (e.g. so that a store intrinsic - // does not read from memory, and thus will be treated the same as a - // regular store for commoning purposes). - if (Inst->mayReadFromMemory() && + // If this instruction may read from memory or throw (and potentially read + // from memory in the exception handler), forget LastStore. Load/store + // intrinsics will indicate both a read and a write to memory. The target + // may override this (e.g. so that a store intrinsic does not read from + // memory, and thus will be treated the same as a regular store for + // commoning purposes). 
+ if ((Inst->mayReadFromMemory() || Inst->mayThrow()) && !(MemInst.isValid() && !MemInst.mayReadFromMemory())) LastStore = nullptr; @@ -962,15 +1000,13 @@ PreservedAnalyses EarlyCSEPass::run(Function &F, auto *MSSA = UseMemorySSA ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() : nullptr; - EarlyCSE CSE(TLI, TTI, DT, AC, MSSA); + EarlyCSE CSE(F.getParent()->getDataLayout(), TLI, TTI, DT, AC, MSSA); if (!CSE.run()) return PreservedAnalyses::all(); - // CSE preserves the dominator tree because it doesn't mutate the CFG. - // FIXME: Bundle this with other CFG-preservation. PreservedAnalyses PA; - PA.preserve<DominatorTreeAnalysis>(); + PA.preserveSet<CFGAnalyses>(); PA.preserve<GlobalsAA>(); if (UseMemorySSA) PA.preserve<MemorySSAAnalysis>(); @@ -1008,7 +1044,7 @@ public: auto *MSSA = UseMemorySSA ? &getAnalysis<MemorySSAWrapperPass>().getMSSA() : nullptr; - EarlyCSE CSE(TLI, TTI, DT, AC, MSSA); + EarlyCSE CSE(F.getParent()->getDataLayout(), TLI, TTI, DT, AC, MSSA); return CSE.run(); } diff --git a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp index 185cdbd..063df77 100644 --- a/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/FlattenCFGPass.cpp @@ -11,10 +11,10 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/IR/CFG.h" #include "llvm/Pass.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; diff --git a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp index 545036d..b105ece 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Float2Int.cpp @@ -137,13 +137,13 @@ void Float2IntPass::findRoots(Function &F, SmallPtrSet<Instruction*,8> &Roots) { } // Helper - mark I as having been traversed, having range R. -ConstantRange Float2IntPass::seen(Instruction *I, ConstantRange R) { +void Float2IntPass::seen(Instruction *I, ConstantRange R) { DEBUG(dbgs() << "F2I: " << *I << ":" << R << "\n"); - if (SeenInsts.find(I) != SeenInsts.end()) - SeenInsts.find(I)->second = R; + auto IT = SeenInsts.find(I); + if (IT != SeenInsts.end()) + IT->second = std::move(R); else - SeenInsts.insert(std::make_pair(I, R)); - return R; + SeenInsts.insert(std::make_pair(I, std::move(R))); } // Helper - get a range representing a poison value. @@ -516,11 +516,10 @@ FunctionPass *createFloat2IntPass() { return new Float2IntLegacyPass(); } PreservedAnalyses Float2IntPass::run(Function &F, FunctionAnalysisManager &) { if (!runImpl(F)) return PreservedAnalyses::all(); - else { - // FIXME: This should also 'preserve the CFG'. 
- PreservedAnalyses PA; - PA.preserve<GlobalsAA>(); - return PA; - } + + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); + PA.preserve<GlobalsAA>(); + return PA; } } // End namespace llvm diff --git a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp index 0137378..ea28705 100644 --- a/contrib/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/GVN.cpp @@ -36,7 +36,6 @@ #include "llvm/Analysis/OptimizationDiagnosticInfo.h" #include "llvm/Analysis/PHITransAddr.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/GlobalVariable.h" @@ -51,9 +50,12 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/Transforms/Utils/VNCoercion.h" + #include <vector> using namespace llvm; using namespace llvm::gvn; +using namespace llvm::VNCoercion; using namespace PatternMatch; #define DEBUG_TYPE "gvn" @@ -595,11 +597,12 @@ PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) { PreservedAnalyses PA; PA.preserve<DominatorTreeAnalysis>(); PA.preserve<GlobalsAA>(); + PA.preserve<TargetLibraryAnalysis>(); return PA; } -LLVM_DUMP_METHOD -void GVN::dump(DenseMap<uint32_t, Value*>& d) { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void GVN::dump(DenseMap<uint32_t, Value*>& d) const { errs() << "{\n"; for (DenseMap<uint32_t, Value*>::iterator I = d.begin(), E = d.end(); I != E; ++I) { @@ -608,6 +611,7 @@ void GVN::dump(DenseMap<uint32_t, Value*>& d) { } errs() << "}\n"; } +#endif /// Return true if we can prove that the value /// we're analyzing is fully available in the specified block. As we go, keep @@ -690,442 +694,6 @@ SpeculationFailure: } -/// Return true if CoerceAvailableValueToLoadType will succeed. -static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, - Type *LoadTy, - const DataLayout &DL) { - // If the loaded or stored value is an first class array or struct, don't try - // to transform them. We need to be able to bitcast to integer. - if (LoadTy->isStructTy() || LoadTy->isArrayTy() || - StoredVal->getType()->isStructTy() || - StoredVal->getType()->isArrayTy()) - return false; - - // The store has to be at least as big as the load. - if (DL.getTypeSizeInBits(StoredVal->getType()) < - DL.getTypeSizeInBits(LoadTy)) - return false; - - return true; -} - -/// If we saw a store of a value to memory, and -/// then a load from a must-aliased pointer of a different type, try to coerce -/// the stored value. LoadedTy is the type of the load we want to replace. -/// IRB is IRBuilder used to insert new instructions. -/// -/// If we can't do it, return null. -static Value *CoerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy, - IRBuilder<> &IRB, - const DataLayout &DL) { - assert(CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL) && - "precondition violation - materialization can't fail"); - - if (auto *C = dyn_cast<Constant>(StoredVal)) - if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL)) - StoredVal = FoldedStoredVal; - - // If this is already the right type, just return it. - Type *StoredValTy = StoredVal->getType(); - - uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy); - uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy); - - // If the store and reload are the same size, we can always reuse it. 
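The EarlyCSE and Float2Int hunks above converge on the same reporting idiom: a pass that leaves the CFG untouched preserves the whole CFGAnalyses set instead of listing individual CFG-dependent analyses. A minimal sketch of the pattern; the function name is illustrative, not taken from the patch.

#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/IR/PassManager.h"

using namespace llvm;

// Shared tail of a run() method for a transform that never changes the CFG.
static PreservedAnalyses reportNonCFGChanges(bool Changed) {
  if (!Changed)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>(); // covers dominators, post-dominators, loops
  PA.preserve<GlobalsAA>();
  return PA;
}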
- if (StoredValSize == LoadedValSize) { - // Pointer to Pointer -> use bitcast. - if (StoredValTy->getScalarType()->isPointerTy() && - LoadedTy->getScalarType()->isPointerTy()) { - StoredVal = IRB.CreateBitCast(StoredVal, LoadedTy); - } else { - // Convert source pointers to integers, which can be bitcast. - if (StoredValTy->getScalarType()->isPointerTy()) { - StoredValTy = DL.getIntPtrType(StoredValTy); - StoredVal = IRB.CreatePtrToInt(StoredVal, StoredValTy); - } - - Type *TypeToCastTo = LoadedTy; - if (TypeToCastTo->getScalarType()->isPointerTy()) - TypeToCastTo = DL.getIntPtrType(TypeToCastTo); - - if (StoredValTy != TypeToCastTo) - StoredVal = IRB.CreateBitCast(StoredVal, TypeToCastTo); - - // Cast to pointer if the load needs a pointer type. - if (LoadedTy->getScalarType()->isPointerTy()) - StoredVal = IRB.CreateIntToPtr(StoredVal, LoadedTy); - } - - if (auto *C = dyn_cast<ConstantExpr>(StoredVal)) - if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL)) - StoredVal = FoldedStoredVal; - - return StoredVal; - } - - // If the loaded value is smaller than the available value, then we can - // extract out a piece from it. If the available value is too small, then we - // can't do anything. - assert(StoredValSize >= LoadedValSize && - "CanCoerceMustAliasedValueToLoad fail"); - - // Convert source pointers to integers, which can be manipulated. - if (StoredValTy->getScalarType()->isPointerTy()) { - StoredValTy = DL.getIntPtrType(StoredValTy); - StoredVal = IRB.CreatePtrToInt(StoredVal, StoredValTy); - } - - // Convert vectors and fp to integer, which can be manipulated. - if (!StoredValTy->isIntegerTy()) { - StoredValTy = IntegerType::get(StoredValTy->getContext(), StoredValSize); - StoredVal = IRB.CreateBitCast(StoredVal, StoredValTy); - } - - // If this is a big-endian system, we need to shift the value down to the low - // bits so that a truncate will work. - if (DL.isBigEndian()) { - uint64_t ShiftAmt = DL.getTypeStoreSizeInBits(StoredValTy) - - DL.getTypeStoreSizeInBits(LoadedTy); - StoredVal = IRB.CreateLShr(StoredVal, ShiftAmt, "tmp"); - } - - // Truncate the integer to the right size now. - Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadedValSize); - StoredVal = IRB.CreateTrunc(StoredVal, NewIntTy, "trunc"); - - if (LoadedTy != NewIntTy) { - // If the result is a pointer, inttoptr. - if (LoadedTy->getScalarType()->isPointerTy()) - StoredVal = IRB.CreateIntToPtr(StoredVal, LoadedTy, "inttoptr"); - else - // Otherwise, bitcast. - StoredVal = IRB.CreateBitCast(StoredVal, LoadedTy, "bitcast"); - } - - if (auto *C = dyn_cast<Constant>(StoredVal)) - if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL)) - StoredVal = FoldedStoredVal; - - return StoredVal; -} - -/// This function is called when we have a -/// memdep query of a load that ends up being a clobbering memory write (store, -/// memset, memcpy, memmove). This means that the write *may* provide bits used -/// by the load but we can't be sure because the pointers don't mustalias. -/// -/// Check this case to see if there is anything more we can do before we give -/// up. This returns -1 if we have to give up, or a byte number in the stored -/// value of the piece that feeds the load. -static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, - Value *WritePtr, - uint64_t WriteSizeInBits, - const DataLayout &DL) { - // If the loaded or stored value is a first class array or struct, don't try - // to transform them. We need to be able to bitcast to integer. 
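For reference while reading this removal: the legality rule that gates all of the coercion logic reduces to an aggregate-type bail-out plus a size comparison. A condensed sketch with an illustrative helper name; the deleted CanCoerceMustAliasedValueToLoad, and the lower-case VNCoercion helper the callers below switch to, expresses the same rule.

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"

using namespace llvm;

// A must-aliased store can feed a load of a different type only if neither
// side is an aggregate and the stored value is at least as wide as the load.
static bool storeCanFeedLoad(Value *StoredVal, Type *LoadTy,
                             const DataLayout &DL) {
  Type *StoredTy = StoredVal->getType();
  if (LoadTy->isStructTy() || LoadTy->isArrayTy() || StoredTy->isStructTy() ||
      StoredTy->isArrayTy())
    return false;
  return DL.getTypeSizeInBits(StoredTy) >= DL.getTypeSizeInBits(LoadTy);
}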
- if (LoadTy->isStructTy() || LoadTy->isArrayTy()) - return -1; - - int64_t StoreOffset = 0, LoadOffset = 0; - Value *StoreBase = - GetPointerBaseWithConstantOffset(WritePtr, StoreOffset, DL); - Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, DL); - if (StoreBase != LoadBase) - return -1; - - // If the load and store are to the exact same address, they should have been - // a must alias. AA must have gotten confused. - // FIXME: Study to see if/when this happens. One case is forwarding a memset - // to a load from the base of the memset. - - // If the load and store don't overlap at all, the store doesn't provide - // anything to the load. In this case, they really don't alias at all, AA - // must have gotten confused. - uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy); - - if ((WriteSizeInBits & 7) | (LoadSize & 7)) - return -1; - uint64_t StoreSize = WriteSizeInBits / 8; // Convert to bytes. - LoadSize /= 8; - - - bool isAAFailure = false; - if (StoreOffset < LoadOffset) - isAAFailure = StoreOffset+int64_t(StoreSize) <= LoadOffset; - else - isAAFailure = LoadOffset+int64_t(LoadSize) <= StoreOffset; - - if (isAAFailure) - return -1; - - // If the Load isn't completely contained within the stored bits, we don't - // have all the bits to feed it. We could do something crazy in the future - // (issue a smaller load then merge the bits in) but this seems unlikely to be - // valuable. - if (StoreOffset > LoadOffset || - StoreOffset+StoreSize < LoadOffset+LoadSize) - return -1; - - // Okay, we can do this transformation. Return the number of bytes into the - // store that the load is. - return LoadOffset-StoreOffset; -} - -/// This function is called when we have a -/// memdep query of a load that ends up being a clobbering store. -static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr, - StoreInst *DepSI) { - // Cannot handle reading from store of first-class aggregate yet. - if (DepSI->getValueOperand()->getType()->isStructTy() || - DepSI->getValueOperand()->getType()->isArrayTy()) - return -1; - - const DataLayout &DL = DepSI->getModule()->getDataLayout(); - Value *StorePtr = DepSI->getPointerOperand(); - uint64_t StoreSize =DL.getTypeSizeInBits(DepSI->getValueOperand()->getType()); - return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, - StorePtr, StoreSize, DL); -} - -/// This function is called when we have a -/// memdep query of a load that ends up being clobbered by another load. See if -/// the other load can feed into the second load. -static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, - LoadInst *DepLI, const DataLayout &DL){ - // Cannot handle reading from store of first-class aggregate yet. - if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy()) - return -1; - - Value *DepPtr = DepLI->getPointerOperand(); - uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType()); - int R = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, DL); - if (R != -1) return R; - - // If we have a load/load clobber an DepLI can be widened to cover this load, - // then we should widen it! 
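The byte-offset arithmetic in the AnalyzeLoadFromClobberingWrite body above is easiest to follow with concrete numbers. A worked example with illustrative values: an 8-byte store at base+0 clobbering a 4-byte load at base+4.

#include <cassert>
#include <cstdint>

int main() {
  int64_t StoreOffset = 0, LoadOffset = 4;
  uint64_t StoreSize = 8, LoadSize = 4; // already converted from bits to bytes

  // Disjoint accesses would mean AA was confused; not the case here.
  bool IsAAFailure = (StoreOffset < LoadOffset)
                         ? StoreOffset + int64_t(StoreSize) <= LoadOffset
                         : LoadOffset + int64_t(LoadSize) <= StoreOffset;
  assert(!IsAAFailure);

  // The load must be fully covered by the stored bytes.
  bool Contained = StoreOffset <= LoadOffset &&
                   StoreOffset + int64_t(StoreSize) >=
                       LoadOffset + int64_t(LoadSize);
  assert(Contained);

  // The forwarded value starts this many bytes into the stored value.
  assert(LoadOffset - StoreOffset == 4);
  return 0;
}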
- int64_t LoadOffs = 0; - const Value *LoadBase = - GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL); - unsigned LoadSize = DL.getTypeStoreSize(LoadTy); - - unsigned Size = MemoryDependenceResults::getLoadLoadClobberFullWidthSize( - LoadBase, LoadOffs, LoadSize, DepLI); - if (Size == 0) return -1; - - // Check non-obvious conditions enforced by MDA which we rely on for being - // able to materialize this potentially available value - assert(DepLI->isSimple() && "Cannot widen volatile/atomic load!"); - assert(DepLI->getType()->isIntegerTy() && "Can't widen non-integer load"); - - return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size*8, DL); -} - - - -static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, - MemIntrinsic *MI, - const DataLayout &DL) { - // If the mem operation is a non-constant size, we can't handle it. - ConstantInt *SizeCst = dyn_cast<ConstantInt>(MI->getLength()); - if (!SizeCst) return -1; - uint64_t MemSizeInBits = SizeCst->getZExtValue()*8; - - // If this is memset, we just need to see if the offset is valid in the size - // of the memset.. - if (MI->getIntrinsicID() == Intrinsic::memset) - return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(), - MemSizeInBits, DL); - - // If we have a memcpy/memmove, the only case we can handle is if this is a - // copy from constant memory. In that case, we can read directly from the - // constant memory. - MemTransferInst *MTI = cast<MemTransferInst>(MI); - - Constant *Src = dyn_cast<Constant>(MTI->getSource()); - if (!Src) return -1; - - GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, DL)); - if (!GV || !GV->isConstant()) return -1; - - // See if the access is within the bounds of the transfer. - int Offset = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, - MI->getDest(), MemSizeInBits, DL); - if (Offset == -1) - return Offset; - - unsigned AS = Src->getType()->getPointerAddressSpace(); - // Otherwise, see if we can constant fold a load from the constant with the - // offset applied as appropriate. - Src = ConstantExpr::getBitCast(Src, - Type::getInt8PtrTy(Src->getContext(), AS)); - Constant *OffsetCst = - ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); - Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src, - OffsetCst); - Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS)); - if (ConstantFoldLoadFromConstPtr(Src, LoadTy, DL)) - return Offset; - return -1; -} - - -/// This function is called when we have a -/// memdep query of a load that ends up being a clobbering store. This means -/// that the store provides bits used by the load but we the pointers don't -/// mustalias. Check this case to see if there is anything more we can do -/// before we give up. -static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, - Type *LoadTy, - Instruction *InsertPt, const DataLayout &DL){ - LLVMContext &Ctx = SrcVal->getType()->getContext(); - - uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8; - uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8; - - IRBuilder<> Builder(InsertPt); - - // Compute which bits of the stored value are being used by the load. Convert - // to an integer type to start with. 
- if (SrcVal->getType()->getScalarType()->isPointerTy()) - SrcVal = Builder.CreatePtrToInt(SrcVal, - DL.getIntPtrType(SrcVal->getType())); - if (!SrcVal->getType()->isIntegerTy()) - SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8)); - - // Shift the bits to the least significant depending on endianness. - unsigned ShiftAmt; - if (DL.isLittleEndian()) - ShiftAmt = Offset*8; - else - ShiftAmt = (StoreSize-LoadSize-Offset)*8; - - if (ShiftAmt) - SrcVal = Builder.CreateLShr(SrcVal, ShiftAmt); - - if (LoadSize != StoreSize) - SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8)); - - return CoerceAvailableValueToLoadType(SrcVal, LoadTy, Builder, DL); -} - -/// This function is called when we have a -/// memdep query of a load that ends up being a clobbering load. This means -/// that the load *may* provide bits used by the load but we can't be sure -/// because the pointers don't mustalias. Check this case to see if there is -/// anything more we can do before we give up. -static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, - Type *LoadTy, Instruction *InsertPt, - GVN &gvn) { - const DataLayout &DL = SrcVal->getModule()->getDataLayout(); - // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to - // widen SrcVal out to a larger load. - unsigned SrcValStoreSize = DL.getTypeStoreSize(SrcVal->getType()); - unsigned LoadSize = DL.getTypeStoreSize(LoadTy); - if (Offset+LoadSize > SrcValStoreSize) { - assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!"); - assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load"); - // If we have a load/load clobber an DepLI can be widened to cover this - // load, then we should widen it to the next power of 2 size big enough! - unsigned NewLoadSize = Offset+LoadSize; - if (!isPowerOf2_32(NewLoadSize)) - NewLoadSize = NextPowerOf2(NewLoadSize); - - Value *PtrVal = SrcVal->getPointerOperand(); - - // Insert the new load after the old load. This ensures that subsequent - // memdep queries will find the new load. We can't easily remove the old - // load completely because it is already in the value numbering table. - IRBuilder<> Builder(SrcVal->getParent(), ++BasicBlock::iterator(SrcVal)); - Type *DestPTy = - IntegerType::get(LoadTy->getContext(), NewLoadSize*8); - DestPTy = PointerType::get(DestPTy, - PtrVal->getType()->getPointerAddressSpace()); - Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc()); - PtrVal = Builder.CreateBitCast(PtrVal, DestPTy); - LoadInst *NewLoad = Builder.CreateLoad(PtrVal); - NewLoad->takeName(SrcVal); - NewLoad->setAlignment(SrcVal->getAlignment()); - - DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n"); - DEBUG(dbgs() << "TO: " << *NewLoad << "\n"); - - // Replace uses of the original load with the wider load. On a big endian - // system, we need to shift down to get the relevant bits. - Value *RV = NewLoad; - if (DL.isBigEndian()) - RV = Builder.CreateLShr(RV, (NewLoadSize - SrcValStoreSize) * 8); - RV = Builder.CreateTrunc(RV, SrcVal->getType()); - SrcVal->replaceAllUsesWith(RV); - - // We would like to use gvn.markInstructionForDeletion here, but we can't - // because the load is already memoized into the leader map table that GVN - // tracks. It is potentially possible to remove the load from the table, - // but then there all of the operations based on it would need to be - // rehashed. Just leave the dead load around. 
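The shift-amount selection in the GetStoreValueForLoad body above depends on endianness; the VNCoercion helper this patch routes callers through performs the same computation. A worked example with illustrative sizes: an i64 store (8 bytes) feeding an i16 load (2 bytes) at byte offset 2.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t StoreSize = 8, LoadSize = 2, Offset = 2; // sizes in bytes

  uint64_t LittleEndianShift = Offset * 8;                       // 16 bits
  uint64_t BigEndianShift = (StoreSize - LoadSize - Offset) * 8; // 32 bits
  assert(LittleEndianShift == 16 && BigEndianShift == 32);

  // Either way, the shifted value is then truncated to LoadSize * 8 = 16 bits.
  return 0;
}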
- gvn.getMemDep().removeInstruction(SrcVal); - SrcVal = NewLoad; - } - - return GetStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, DL); -} - - -/// This function is called when we have a -/// memdep query of a load that ends up being a clobbering mem intrinsic. -static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, - Type *LoadTy, Instruction *InsertPt, - const DataLayout &DL){ - LLVMContext &Ctx = LoadTy->getContext(); - uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy)/8; - - IRBuilder<> Builder(InsertPt); - - // We know that this method is only called when the mem transfer fully - // provides the bits for the load. - if (MemSetInst *MSI = dyn_cast<MemSetInst>(SrcInst)) { - // memset(P, 'x', 1234) -> splat('x'), even if x is a variable, and - // independently of what the offset is. - Value *Val = MSI->getValue(); - if (LoadSize != 1) - Val = Builder.CreateZExt(Val, IntegerType::get(Ctx, LoadSize*8)); - - Value *OneElt = Val; - - // Splat the value out to the right number of bits. - for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize; ) { - // If we can double the number of bytes set, do it. - if (NumBytesSet*2 <= LoadSize) { - Value *ShVal = Builder.CreateShl(Val, NumBytesSet*8); - Val = Builder.CreateOr(Val, ShVal); - NumBytesSet <<= 1; - continue; - } - - // Otherwise insert one byte at a time. - Value *ShVal = Builder.CreateShl(Val, 1*8); - Val = Builder.CreateOr(OneElt, ShVal); - ++NumBytesSet; - } - - return CoerceAvailableValueToLoadType(Val, LoadTy, Builder, DL); - } - - // Otherwise, this is a memcpy/memmove from a constant global. - MemTransferInst *MTI = cast<MemTransferInst>(SrcInst); - Constant *Src = cast<Constant>(MTI->getSource()); - unsigned AS = Src->getType()->getPointerAddressSpace(); - - // Otherwise, see if we can constant fold a load from the constant with the - // offset applied as appropriate. - Src = ConstantExpr::getBitCast(Src, - Type::getInt8PtrTy(Src->getContext(), AS)); - Constant *OffsetCst = - ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); - Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src, - OffsetCst); - Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS)); - return ConstantFoldLoadFromConstPtr(Src, LoadTy, DL); -} /// Given a set of loads specified by ValuesPerBlock, @@ -1171,7 +739,7 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI, if (isSimpleValue()) { Res = getSimpleValue(); if (Res->getType() != LoadTy) { - Res = GetStoreValueForLoad(Res, Offset, LoadTy, InsertPt, DL); + Res = getStoreValueForLoad(Res, Offset, LoadTy, InsertPt, DL); DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " " << *getSimpleValue() << '\n' @@ -1182,14 +750,20 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI, if (Load->getType() == LoadTy && Offset == 0) { Res = Load; } else { - Res = GetLoadValueForLoad(Load, Offset, LoadTy, InsertPt, gvn); - + Res = getLoadValueForLoad(Load, Offset, LoadTy, InsertPt, DL); + // We would like to use gvn.markInstructionForDeletion here, but we can't + // because the load is already memoized into the leader map table that GVN + // tracks. It is potentially possible to remove the load from the table, + // but then there all of the operations based on it would need to be + // rehashed. Just leave the dead load around. 
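All of the static helpers deleted above now live behind llvm/Transforms/Utils/VNCoercion.h (note the using namespace llvm::VNCoercion added near the top of this file), so the logic can be shared with other passes such as NewGVN. A sketch of how a caller reaches the same functionality after the change; tryForwardStoreToLoad is a hypothetical wrapper, while the two VNCoercion calls mirror the signatures used by the call sites later in this diff.

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/VNCoercion.h"

using namespace llvm;
using namespace llvm::VNCoercion;

// Forward the value stored by DepSI to LI when the clobber fully covers it.
static Value *tryForwardStoreToLoad(StoreInst *DepSI, LoadInst *LI) {
  const DataLayout &DL = LI->getModule()->getDataLayout();
  int Offset = analyzeLoadFromClobberingStore(
      LI->getType(), LI->getPointerOperand(), DepSI, DL);
  if (Offset == -1)
    return nullptr;
  return getStoreValueForLoad(DepSI->getValueOperand(), Offset, LI->getType(),
                              LI, DL);
}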
+ gvn.getMemDep().removeInstruction(Load); DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " " << *getCoercedLoadValue() << '\n' - << *Res << '\n' << "\n\n\n"); + << *Res << '\n' + << "\n\n\n"); } } else if (isMemIntrinValue()) { - Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy, + Res = getMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy, InsertPt, DL); DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset << " " << *getMemIntrinValue() << '\n' @@ -1258,7 +832,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo, // Can't forward from non-atomic to atomic without violating memory model. if (Address && LI->isAtomic() <= DepSI->isAtomic()) { int Offset = - AnalyzeLoadFromClobberingStore(LI->getType(), Address, DepSI); + analyzeLoadFromClobberingStore(LI->getType(), Address, DepSI, DL); if (Offset != -1) { Res = AvailableValue::get(DepSI->getValueOperand(), Offset); return true; @@ -1276,7 +850,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo, // Can't forward from non-atomic to atomic without violating memory model. if (DepLI != LI && Address && LI->isAtomic() <= DepLI->isAtomic()) { int Offset = - AnalyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL); + analyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL); if (Offset != -1) { Res = AvailableValue::getLoad(DepLI, Offset); @@ -1289,7 +863,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo, // forward a value on from it. if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInfo.getInst())) { if (Address && !LI->isAtomic()) { - int Offset = AnalyzeLoadFromClobberingMemInst(LI->getType(), Address, + int Offset = analyzeLoadFromClobberingMemInst(LI->getType(), Address, DepMI, DL); if (Offset != -1) { Res = AvailableValue::getMI(DepMI, Offset); @@ -1334,7 +908,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo, // different types if we have to. If the stored value is larger or equal to // the loaded value, we can reuse it. if (S->getValueOperand()->getType() != LI->getType() && - !CanCoerceMustAliasedValueToLoad(S->getValueOperand(), + !canCoerceMustAliasedValueToLoad(S->getValueOperand(), LI->getType(), DL)) return false; @@ -1351,7 +925,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo, // If the stored value is larger or equal to the loaded value, we can reuse // it. if (LD->getType() != LI->getType() && - !CanCoerceMustAliasedValueToLoad(LD, LI->getType(), DL)) + !canCoerceMustAliasedValueToLoad(LD, LI->getType(), DL)) return false; // Can't forward from non-atomic to atomic without violating memory model. @@ -1592,8 +1166,9 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, auto *NewLoad = new LoadInst(LoadPtr, LI->getName()+".pre", LI->isVolatile(), LI->getAlignment(), - LI->getOrdering(), LI->getSynchScope(), + LI->getOrdering(), LI->getSyncScopeID(), UnavailablePred->getTerminator()); + NewLoad->setDebugLoc(LI->getDebugLoc()); // Transfer the old load's AA tags to the new load. 
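Two small changes in the PerformLoadPRE hunk above are easy to miss: the speculated load now carries the original load's sync scope (getSyncScopeID replaces the old getSynchScope) and inherits its debug location. Consolidated into one place, the materialization looks roughly like this; the helper name is ours, the body follows the hunk.

#include "llvm/ADT/Twine.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Create the PRE load in the predecessor where the value was unavailable.
static LoadInst *materializePRELoad(LoadInst *LI, Value *LoadPtr,
                                    BasicBlock *UnavailablePred) {
  auto *NewLoad = new LoadInst(LoadPtr, LI->getName() + ".pre",
                               LI->isVolatile(), LI->getAlignment(),
                               LI->getOrdering(), LI->getSyncScopeID(),
                               UnavailablePred->getTerminator());
  NewLoad->setDebugLoc(LI->getDebugLoc());
  return NewLoad;
}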
AAMDNodes Tags; @@ -1628,7 +1203,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, V->takeName(LI); if (Instruction *I = dyn_cast<Instruction>(V)) I->setDebugLoc(LI->getDebugLoc()); - if (V->getType()->getScalarType()->isPointerTy()) + if (V->getType()->isPtrOrPtrVectorTy()) MD->invalidateCachedPointerInfo(V); markInstructionForDeletion(LI); ORE->emit(OptimizationRemark(DEBUG_TYPE, "LoadPRE", LI) @@ -1713,9 +1288,9 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { // If instruction I has debug info, then we should not update it. // Also, if I has a null DebugLoc, then it is still potentially incorrect // to propagate LI's DebugLoc because LI may not post-dominate I. - if (LI->getDebugLoc() && ValuesPerBlock.size() != 1) + if (LI->getDebugLoc() && LI->getParent() == I->getParent()) I->setDebugLoc(LI->getDebugLoc()); - if (V->getType()->getScalarType()->isPointerTy()) + if (V->getType()->isPtrOrPtrVectorTy()) MD->invalidateCachedPointerInfo(V); markInstructionForDeletion(LI); ++NumGVNLoad; @@ -1795,7 +1370,7 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) { // Patch the replacement so that it is not more restrictive than the value // being replaced. - // Note that if 'I' is a load being replaced by some operation, + // Note that if 'I' is a load being replaced by some operation, // for example, by an arithmetic operation, then andIRFlags() // would just erase all math flags from the original arithmetic // operation, which is clearly not wanted and not needed. @@ -1869,7 +1444,7 @@ bool GVN::processLoad(LoadInst *L) { reportLoadElim(L, AvailableValue, ORE); // Tell MDA to rexamine the reused pointer since we might have more // information after forwarding it. - if (MD && AvailableValue->getType()->getScalarType()->isPointerTy()) + if (MD && AvailableValue->getType()->isPtrOrPtrVectorTy()) MD->invalidateCachedPointerInfo(AvailableValue); return true; } @@ -2024,7 +1599,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root, // RHS neither 'true' nor 'false' - bail out. continue; // Whether RHS equals 'true'. Otherwise it equals 'false'. - bool isKnownTrue = CI->isAllOnesValue(); + bool isKnownTrue = CI->isMinusOne(); bool isKnownFalse = !isKnownTrue; // If "A && B" is known true then both A and B are known true. If "A || B" @@ -2113,7 +1688,7 @@ bool GVN::processInstruction(Instruction *I) { // example if it determines that %y is equal to %x then the instruction // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify. const DataLayout &DL = I->getModule()->getDataLayout(); - if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC)) { + if (Value *V = SimplifyInstruction(I, {DL, TLI, DT, AC})) { bool Changed = false; if (!I->use_empty()) { I->replaceAllUsesWith(V); @@ -2124,7 +1699,7 @@ bool GVN::processInstruction(Instruction *I) { Changed = true; } if (Changed) { - if (MD && V->getType()->getScalarType()->isPointerTy()) + if (MD && V->getType()->isPtrOrPtrVectorTy()) MD->invalidateCachedPointerInfo(V); ++NumGVNSimpl; return true; @@ -2187,11 +1762,11 @@ bool GVN::processInstruction(Instruction *I) { for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) { - BasicBlock *Dst = i.getCaseSuccessor(); + BasicBlock *Dst = i->getCaseSuccessor(); // If there is only a single edge, propagate the case value into it. 
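Both EarlyCSE (earlier in this diff) and GVN here move from the four-pointer SimplifyInstruction overload to a SimplifyQuery; the braced initializer in the hunk above builds one on the fly. A sketch of the equivalent explicit form, assuming the caller has the usual DataLayout, TargetLibraryInfo, DominatorTree and AssumptionCache handles that GVN keeps as members; the wrapper function is illustrative.

#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Replace I with a simpler value when InstructionSimplify finds one.
static bool simplifyWithQuery(Instruction *I, const DataLayout &DL,
                              const TargetLibraryInfo *TLI,
                              const DominatorTree *DT, AssumptionCache *AC) {
  const SimplifyQuery SQ(DL, TLI, DT, AC);
  Value *V = SimplifyInstruction(I, SQ);
  if (!V)
    return false;
  I->replaceAllUsesWith(V);
  return true;
}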
if (SwitchEdges.lookup(Dst) == 1) { BasicBlockEdge E(Parent, Dst); - Changed |= propagateEquality(SwitchCond, i.getCaseValue(), E, true); + Changed |= propagateEquality(SwitchCond, i->getCaseValue(), E, true); } } return Changed; @@ -2235,7 +1810,7 @@ bool GVN::processInstruction(Instruction *I) { // Remove it! patchAndReplaceAllUsesWith(I, Repl); - if (MD && Repl->getType()->getScalarType()->isPointerTy()) + if (MD && Repl->getType()->isPtrOrPtrVectorTy()) MD->invalidateCachedPointerInfo(Repl); markInstructionForDeletion(I); return true; @@ -2483,7 +2058,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) { if (!performScalarPREInsertion(PREInstr, PREPred, ValNo)) { // If we failed insertion, make sure we remove the instruction. DEBUG(verifyRemoved(PREInstr)); - delete PREInstr; + PREInstr->deleteValue(); return false; } } @@ -2509,7 +2084,7 @@ bool GVN::performScalarPRE(Instruction *CurInst) { addToLeaderTable(ValNo, Phi, CurrentBlock); Phi->setDebugLoc(CurInst->getDebugLoc()); CurInst->replaceAllUsesWith(Phi); - if (MD && Phi->getType()->getScalarType()->isPointerTy()) + if (MD && Phi->getType()->isPtrOrPtrVectorTy()) MD->invalidateCachedPointerInfo(Phi); VN.erase(CurInst); removeFromLeaderTable(ValNo, CurInst, CurrentBlock); @@ -2581,21 +2156,12 @@ bool GVN::iterateOnFunction(Function &F) { // Top-down walk of the dominator tree bool Changed = false; - // Save the blocks this function have before transformation begins. GVN may - // split critical edge, and hence may invalidate the RPO/DT iterator. - // - std::vector<BasicBlock *> BBVect; - BBVect.reserve(256); // Needed for value numbering with phi construction to work. + // RPOT walks the graph in its constructor and will not be invalidated during + // processBlock. ReversePostOrderTraversal<Function *> RPOT(&F); - for (ReversePostOrderTraversal<Function *>::rpo_iterator RI = RPOT.begin(), - RE = RPOT.end(); - RI != RE; ++RI) - BBVect.push_back(*RI); - - for (std::vector<BasicBlock *>::iterator I = BBVect.begin(), E = BBVect.end(); - I != E; I++) - Changed |= processBlock(*I); + for (BasicBlock *BB : RPOT) + Changed |= processBlock(BB); return Changed; } @@ -2783,6 +2349,7 @@ public: AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<TargetLibraryInfoWrapperPass>(); AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); } diff --git a/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp index f8e1d2e..29de792 100644 --- a/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/GVNHoist.cpp @@ -17,16 +17,40 @@ // is disabled in the following cases. // 1. Scalars across calls. // 2. geps when corresponding load/store cannot be hoisted. +// +// TODO: Hoist from >2 successors. Currently GVNHoist will not hoist stores +// in this case because it works on two instructions at a time. 
+// entry: +// switch i32 %c1, label %exit1 [ +// i32 0, label %sw0 +// i32 1, label %sw1 +// ] +// +// sw0: +// store i32 1, i32* @G +// br label %exit +// +// sw1: +// store i32 1, i32* @G +// br label %exit +// +// exit1: +// store i32 1, i32* @G +// ret void +// exit: +// ret void //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar/GVN.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/GVN.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/MemorySSA.h" using namespace llvm; @@ -60,7 +84,7 @@ static cl::opt<int> cl::desc("Maximum length of dependent chains to hoist " "(default = 10, unlimited = -1)")); -namespace { +namespace llvm { // Provides a sorting function based on the execution order of two instructions. struct SortByDFSIn { @@ -72,13 +96,6 @@ public: // Returns true when A executes before B. bool operator()(const Instruction *A, const Instruction *B) const { - // FIXME: libc++ has a std::sort() algorithm that will call the compare - // function on the same element. Once PR20837 is fixed and some more years - // pass by and all the buildbots have moved to a corrected std::sort(), - // enable the following assert: - // - // assert(A != B); - const BasicBlock *BA = A->getParent(); const BasicBlock *BB = B->getParent(); unsigned ADFS, BDFS; @@ -202,6 +219,7 @@ public: GVNHoist(DominatorTree *DT, AliasAnalysis *AA, MemoryDependenceResults *MD, MemorySSA *MSSA) : DT(DT), AA(AA), MD(MD), MSSA(MSSA), + MSSAUpdater(make_unique<MemorySSAUpdater>(MSSA)), HoistingGeps(false), HoistedCtr(0) { } @@ -249,9 +267,11 @@ private: AliasAnalysis *AA; MemoryDependenceResults *MD; MemorySSA *MSSA; + std::unique_ptr<MemorySSAUpdater> MSSAUpdater; const bool HoistingGeps; DenseMap<const Value *, unsigned> DFSNumber; BBSideEffectsSet BBSideEffects; + DenseSet<const BasicBlock*> HoistBarrier; int HoistedCtr; enum InsKind { Unknown, Scalar, Load, Store }; @@ -307,8 +327,8 @@ private: continue; } - // Check for end of function, calls that do not return, etc. - if (!isGuaranteedToTransferExecutionToSuccessor(BB->getTerminator())) + // We reached the leaf Basic Block => not all paths have this instruction. + if (!BB->getTerminator()->getNumSuccessors()) return false; // When reaching the back-edge of a loop, there may be a path through the @@ -360,7 +380,7 @@ private: ReachedNewPt = true; } } - if (defClobbersUseOrDef(Def, MU, *AA)) + if (MemorySSAUtil::defClobbersUseOrDef(Def, MU, *AA)) return true; } @@ -387,7 +407,8 @@ private: // executed between the execution of NewBB and OldBB. Hoisting an expression // from OldBB into NewBB has to be safe on all execution paths. for (auto I = idf_begin(OldBB), E = idf_end(OldBB); I != E;) { - if (*I == NewBB) { + const BasicBlock *BB = *I; + if (BB == NewBB) { // Stop traversal when reaching HoistPt. I.skipChildren(); continue; @@ -398,11 +419,17 @@ private: return true; // Impossible to hoist with exceptions on the path. - if (hasEH(*I)) + if (hasEH(BB)) + return true; + + // No such instruction after HoistBarrier in a basic block was + // selected for hoisting so instructions selected within basic block with + // a hoist barrier can be hoisted. 
+ if ((BB != OldBB) && HoistBarrier.count(BB)) return true; // Check that we do not move a store past loads. - if (hasMemoryUse(NewPt, Def, *I)) + if (hasMemoryUse(NewPt, Def, BB)) return true; // -1 is unlimited number of blocks on all paths. @@ -419,17 +446,18 @@ private: // Decrement by 1 NBBsOnAllPaths for each block between HoistPt and BB, and // return true when the counter NBBsOnAllPaths reaches 0, except when it is // initialized to -1 which is unlimited. - bool hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *BB, + bool hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *SrcBB, int &NBBsOnAllPaths) { - assert(DT->dominates(HoistPt, BB) && "Invalid path"); + assert(DT->dominates(HoistPt, SrcBB) && "Invalid path"); // Walk all basic blocks reachable in depth-first iteration on // the inverse CFG from BBInsn to NewHoistPt. These blocks are all the // blocks that may be executed between the execution of NewHoistPt and // BBInsn. Hoisting an expression from BBInsn into NewHoistPt has to be safe // on all execution paths. - for (auto I = idf_begin(BB), E = idf_end(BB); I != E;) { - if (*I == HoistPt) { + for (auto I = idf_begin(SrcBB), E = idf_end(SrcBB); I != E;) { + const BasicBlock *BB = *I; + if (BB == HoistPt) { // Stop traversal when reaching NewHoistPt. I.skipChildren(); continue; @@ -440,7 +468,13 @@ private: return true; // Impossible to hoist with exceptions on the path. - if (hasEH(*I)) + if (hasEH(BB)) + return true; + + // No such instruction after HoistBarrier in a basic block was + // selected for hoisting so instructions selected within basic block with + // a hoist barrier can be hoisted. + if ((BB != SrcBB) && HoistBarrier.count(BB)) return true; // -1 is unlimited number of blocks on all paths. @@ -626,6 +660,8 @@ private: // Compute the insertion point and the list of expressions to be hoisted. SmallVecInsn InstructionsToHoist; for (auto I : V) + // We don't need to check for hoist-barriers here because if + // I->getParent() is a barrier then I precedes the barrier. if (!hasEH(I->getParent())) InstructionsToHoist.push_back(I); @@ -809,9 +845,9 @@ private: // legal when the ld/st is not moved past its current definition. MemoryAccess *Def = OldMemAcc->getDefiningAccess(); NewMemAcc = - MSSA->createMemoryAccessInBB(Repl, Def, HoistPt, MemorySSA::End); + MSSAUpdater->createMemoryAccessInBB(Repl, Def, HoistPt, MemorySSA::End); OldMemAcc->replaceAllUsesWith(NewMemAcc); - MSSA->removeMemoryAccess(OldMemAcc); + MSSAUpdater->removeMemoryAccess(OldMemAcc); } } @@ -850,7 +886,7 @@ private: // Update the uses of the old MSSA access with NewMemAcc. MemoryAccess *OldMA = MSSA->getMemoryAccess(I); OldMA->replaceAllUsesWith(NewMemAcc); - MSSA->removeMemoryAccess(OldMA); + MSSAUpdater->removeMemoryAccess(OldMA); } Repl->andIRFlags(I); @@ -872,7 +908,7 @@ private: auto In = Phi->incoming_values(); if (all_of(In, [&](Use &U) { return U == NewMemAcc; })) { Phi->replaceAllUsesWith(NewMemAcc); - MSSA->removeMemoryAccess(Phi); + MSSAUpdater->removeMemoryAccess(Phi); } } } @@ -896,6 +932,12 @@ private: for (BasicBlock *BB : depth_first(&F.getEntryBlock())) { int InstructionNb = 0; for (Instruction &I1 : *BB) { + // If I1 cannot guarantee progress, subsequent instructions + // in BB cannot be hoisted anyways. + if (!isGuaranteedToTransferExecutionToSuccessor(&I1)) { + HoistBarrier.insert(BB); + break; + } // Only hoist the first instructions in BB up to MaxDepthInBB. Hoisting // deeper may increase the register pressure and compilation time. 
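The HoistBarrier logic has two halves, both visible above: blocks containing an instruction that may not transfer control to its successor are recorded up front, and the inverse-CFG walks (hasEHOnPath and the memory-use check) then refuse to hoist across such blocks. A self-contained sketch of the recording half, using only the standard ValueTracking query; the function name is illustrative.

#include "llvm/ADT/DenseSet.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"

using namespace llvm;

// Any block with an instruction that may not reach its successor (for
// example a call that can throw or not return) becomes a hoist barrier.
static DenseSet<const BasicBlock *> computeHoistBarriers(Function &F) {
  DenseSet<const BasicBlock *> Barriers;
  for (BasicBlock &BB : F)
    for (Instruction &I : BB)
      if (!isGuaranteedToTransferExecutionToSuccessor(&I)) {
        Barriers.insert(&BB);
        break;
      }
  return Barriers;
}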
if (MaxDepthInBB != -1 && InstructionNb++ >= MaxDepthInBB) @@ -969,6 +1011,7 @@ public: AU.addRequired<MemorySSAWrapperPass>(); AU.addPreserved<DominatorTreeWrapperPass>(); AU.addPreserved<MemorySSAWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); } }; } // namespace @@ -985,6 +1028,7 @@ PreservedAnalyses GVNHoistPass::run(Function &F, FunctionAnalysisManager &AM) { PreservedAnalyses PA; PA.preserve<DominatorTreeAnalysis>(); PA.preserve<MemorySSAAnalysis>(); + PA.preserve<GlobalsAA>(); return PA; } diff --git a/contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp b/contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp new file mode 100644 index 0000000..5fd2dfc --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/GVNSink.cpp @@ -0,0 +1,883 @@ +//===- GVNSink.cpp - sink expressions into successors -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file GVNSink.cpp +/// This pass attempts to sink instructions into successors, reducing static +/// instruction count and enabling if-conversion. +/// +/// We use a variant of global value numbering to decide what can be sunk. +/// Consider: +/// +/// [ %a1 = add i32 %b, 1 ] [ %c1 = add i32 %d, 1 ] +/// [ %a2 = xor i32 %a1, 1 ] [ %c2 = xor i32 %c1, 1 ] +/// \ / +/// [ %e = phi i32 %a2, %c2 ] +/// [ add i32 %e, 4 ] +/// +/// +/// GVN would number %a1 and %c1 differently because they compute different +/// results - the VN of an instruction is a function of its opcode and the +/// transitive closure of its operands. This is the key property for hoisting +/// and CSE. +/// +/// What we want when sinking however is for a numbering that is a function of +/// the *uses* of an instruction, which allows us to answer the question "if I +/// replace %a1 with %c1, will it contribute in an equivalent way to all +/// successive instructions?". The PostValueTable class in GVN provides this +/// mapping. 
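The file header above contrasts operand-based numbering, which GVN uses for hoisting and CSE, with the use-based numbering that sinking needs. A stripped-down illustration of the use-based idea follows; the helper name is ours, and it ignores the allocator plumbing and the memory-order/volatility discriminators that the real InstructionUseExpr below carries.

#include "llvm/ADT/Hashing.h"
#include "llvm/IR/Instruction.h"
#include <algorithm>
#include <vector>

using namespace llvm;

// Two instructions hash alike when they are used in the same way, even if
// their operands differ; that is exactly the property sinking relies on.
static hash_code hashByUses(const Instruction *I) {
  std::vector<const User *> Users(I->user_begin(), I->user_end());
  // Sort so the hash is independent of use-list order, mirroring the sorted
  // op_push_back calls in InstructionUseExpr.
  std::sort(Users.begin(), Users.end());
  return hash_combine(I->getOpcode(), I->getType(),
                      hash_combine_range(Users.begin(), Users.end()));
}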
+/// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Scalar/GVNExpression.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include <unordered_set> +using namespace llvm; + +#define DEBUG_TYPE "gvn-sink" + +STATISTIC(NumRemoved, "Number of instructions removed"); + +namespace llvm { +namespace GVNExpression { + +LLVM_DUMP_METHOD void Expression::dump() const { + print(dbgs()); + dbgs() << "\n"; +} + +} +} + +namespace { + +static bool isMemoryInst(const Instruction *I) { + return isa<LoadInst>(I) || isa<StoreInst>(I) || + (isa<InvokeInst>(I) && !cast<InvokeInst>(I)->doesNotAccessMemory()) || + (isa<CallInst>(I) && !cast<CallInst>(I)->doesNotAccessMemory()); +} + +/// Iterates through instructions in a set of blocks in reverse order from the +/// first non-terminator. For example (assume all blocks have size n): +/// LockstepReverseIterator I([B1, B2, B3]); +/// *I-- = [B1[n], B2[n], B3[n]]; +/// *I-- = [B1[n-1], B2[n-1], B3[n-1]]; +/// *I-- = [B1[n-2], B2[n-2], B3[n-2]]; +/// ... +/// +/// It continues until all blocks have been exhausted. Use \c getActiveBlocks() +/// to +/// determine which blocks are still going and the order they appear in the +/// list returned by operator*. +class LockstepReverseIterator { + ArrayRef<BasicBlock *> Blocks; + SmallPtrSet<BasicBlock *, 4> ActiveBlocks; + SmallVector<Instruction *, 4> Insts; + bool Fail; + +public: + LockstepReverseIterator(ArrayRef<BasicBlock *> Blocks) : Blocks(Blocks) { + reset(); + } + + void reset() { + Fail = false; + ActiveBlocks.clear(); + for (BasicBlock *BB : Blocks) + ActiveBlocks.insert(BB); + Insts.clear(); + for (BasicBlock *BB : Blocks) { + if (BB->size() <= 1) { + // Block wasn't big enough - only contained a terminator. 
+ ActiveBlocks.erase(BB); + continue; + } + Insts.push_back(BB->getTerminator()->getPrevNode()); + } + if (Insts.empty()) + Fail = true; + } + + bool isValid() const { return !Fail; } + ArrayRef<Instruction *> operator*() const { return Insts; } + SmallPtrSet<BasicBlock *, 4> &getActiveBlocks() { return ActiveBlocks; } + + void restrictToBlocks(SmallPtrSetImpl<BasicBlock *> &Blocks) { + for (auto II = Insts.begin(); II != Insts.end();) { + if (std::find(Blocks.begin(), Blocks.end(), (*II)->getParent()) == + Blocks.end()) { + ActiveBlocks.erase((*II)->getParent()); + II = Insts.erase(II); + } else { + ++II; + } + } + } + + void operator--() { + if (Fail) + return; + SmallVector<Instruction *, 4> NewInsts; + for (auto *Inst : Insts) { + if (Inst == &Inst->getParent()->front()) + ActiveBlocks.erase(Inst->getParent()); + else + NewInsts.push_back(Inst->getPrevNode()); + } + if (NewInsts.empty()) { + Fail = true; + return; + } + Insts = NewInsts; + } +}; + +//===----------------------------------------------------------------------===// + +/// Candidate solution for sinking. There may be different ways to +/// sink instructions, differing in the number of instructions sunk, +/// the number of predecessors sunk from and the number of PHIs +/// required. +struct SinkingInstructionCandidate { + unsigned NumBlocks; + unsigned NumInstructions; + unsigned NumPHIs; + unsigned NumMemoryInsts; + int Cost = -1; + SmallVector<BasicBlock *, 4> Blocks; + + void calculateCost(unsigned NumOrigPHIs, unsigned NumOrigBlocks) { + unsigned NumExtraPHIs = NumPHIs - NumOrigPHIs; + unsigned SplitEdgeCost = (NumOrigBlocks > NumBlocks) ? 2 : 0; + Cost = (NumInstructions * (NumBlocks - 1)) - + (NumExtraPHIs * + NumExtraPHIs) // PHIs are expensive, so make sure they're worth it. + - SplitEdgeCost; + } + bool operator>(const SinkingInstructionCandidate &Other) const { + return Cost > Other.Cost; + } +}; + +#ifndef NDEBUG +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, + const SinkingInstructionCandidate &C) { + OS << "<Candidate Cost=" << C.Cost << " #Blocks=" << C.NumBlocks + << " #Insts=" << C.NumInstructions << " #PHIs=" << C.NumPHIs << ">"; + return OS; +} +#endif + +//===----------------------------------------------------------------------===// + +/// Describes a PHI node that may or may not exist. These track the PHIs +/// that must be created if we sunk a sequence of instructions. It provides +/// a hash function for efficient equality comparisons. +class ModelledPHI { + SmallVector<Value *, 4> Values; + SmallVector<BasicBlock *, 4> Blocks; + +public: + ModelledPHI() {} + ModelledPHI(const PHINode *PN) { + for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) + Blocks.push_back(PN->getIncomingBlock(I)); + std::sort(Blocks.begin(), Blocks.end()); + + // This assumes the PHI is already well-formed and there aren't conflicting + // incoming values for the same block. + for (auto *B : Blocks) + Values.push_back(PN->getIncomingValueForBlock(B)); + } + /// Create a dummy ModelledPHI that will compare unequal to any other ModelledPHI + /// without the same ID. + /// \note This is specifically for DenseMapInfo - do not use this! + static ModelledPHI createDummy(size_t ID) { + ModelledPHI M; + M.Values.push_back(reinterpret_cast<Value*>(ID)); + return M; + } + + /// Create a PHI from an array of incoming values and incoming blocks. 
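Returning to SinkingInstructionCandidate::calculateCost above: the heuristic rewards each sunk instruction with one saved copy per extra predecessor, charges quadratically for PHIs it would have to create, and charges a flat 2 for splitting an edge. A worked example with illustrative numbers: sinking 3 instructions from 2 predecessors that need 1 PHI beyond those already present, with no edge split.

#include <cassert>

int main() {
  int NumInstructions = 3, NumBlocks = 2, NumExtraPHIs = 1, SplitEdgeCost = 0;
  int Cost = NumInstructions * (NumBlocks - 1) -
             NumExtraPHIs * NumExtraPHIs - SplitEdgeCost;
  assert(Cost == 2); // positive, so this candidate is worth sinking
  return 0;
}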
+ template <typename VArray, typename BArray> + ModelledPHI(const VArray &V, const BArray &B) { + std::copy(V.begin(), V.end(), std::back_inserter(Values)); + std::copy(B.begin(), B.end(), std::back_inserter(Blocks)); + } + + /// Create a PHI from [I[OpNum] for I in Insts]. + template <typename BArray> + ModelledPHI(ArrayRef<Instruction *> Insts, unsigned OpNum, const BArray &B) { + std::copy(B.begin(), B.end(), std::back_inserter(Blocks)); + for (auto *I : Insts) + Values.push_back(I->getOperand(OpNum)); + } + + /// Restrict the PHI's contents down to only \c NewBlocks. + /// \c NewBlocks must be a subset of \c this->Blocks. + void restrictToBlocks(const SmallPtrSetImpl<BasicBlock *> &NewBlocks) { + auto BI = Blocks.begin(); + auto VI = Values.begin(); + while (BI != Blocks.end()) { + assert(VI != Values.end()); + if (std::find(NewBlocks.begin(), NewBlocks.end(), *BI) == + NewBlocks.end()) { + BI = Blocks.erase(BI); + VI = Values.erase(VI); + } else { + ++BI; + ++VI; + } + } + assert(Blocks.size() == NewBlocks.size()); + } + + ArrayRef<Value *> getValues() const { return Values; } + + bool areAllIncomingValuesSame() const { + return all_of(Values, [&](Value *V) { return V == Values[0]; }); + } + bool areAllIncomingValuesSameType() const { + return all_of( + Values, [&](Value *V) { return V->getType() == Values[0]->getType(); }); + } + bool areAnyIncomingValuesConstant() const { + return any_of(Values, [&](Value *V) { return isa<Constant>(V); }); + } + // Hash functor + unsigned hash() const { + return (unsigned)hash_combine_range(Values.begin(), Values.end()); + } + bool operator==(const ModelledPHI &Other) const { + return Values == Other.Values && Blocks == Other.Blocks; + } +}; + +template <typename ModelledPHI> struct DenseMapInfo { + static inline ModelledPHI &getEmptyKey() { + static ModelledPHI Dummy = ModelledPHI::createDummy(0); + return Dummy; + } + static inline ModelledPHI &getTombstoneKey() { + static ModelledPHI Dummy = ModelledPHI::createDummy(1); + return Dummy; + } + static unsigned getHashValue(const ModelledPHI &V) { return V.hash(); } + static bool isEqual(const ModelledPHI &LHS, const ModelledPHI &RHS) { + return LHS == RHS; + } +}; + +typedef DenseSet<ModelledPHI, DenseMapInfo<ModelledPHI>> ModelledPHISet; + +//===----------------------------------------------------------------------===// +// ValueTable +//===----------------------------------------------------------------------===// +// This is a value number table where the value number is a function of the +// *uses* of a value, rather than its operands. Thus, if VN(A) == VN(B) we know +// that the program would be equivalent if we replaced A with PHI(A, B). +//===----------------------------------------------------------------------===// + +/// A GVN expression describing how an instruction is used. The operands +/// field of BasicExpression is used to store uses, not operands. +/// +/// This class also contains fields for discriminators used when determining +/// equivalence of instructions with sideeffects. 
+class InstructionUseExpr : public GVNExpression::BasicExpression { + unsigned MemoryUseOrder = -1; + bool Volatile = false; + +public: + InstructionUseExpr(Instruction *I, ArrayRecycler<Value *> &R, + BumpPtrAllocator &A) + : GVNExpression::BasicExpression(I->getNumUses()) { + allocateOperands(R, A); + setOpcode(I->getOpcode()); + setType(I->getType()); + + for (auto &U : I->uses()) + op_push_back(U.getUser()); + std::sort(op_begin(), op_end()); + } + void setMemoryUseOrder(unsigned MUO) { MemoryUseOrder = MUO; } + void setVolatile(bool V) { Volatile = V; } + + virtual hash_code getHashValue() const { + return hash_combine(GVNExpression::BasicExpression::getHashValue(), + MemoryUseOrder, Volatile); + } + + template <typename Function> hash_code getHashValue(Function MapFn) { + hash_code H = + hash_combine(getOpcode(), getType(), MemoryUseOrder, Volatile); + for (auto *V : operands()) + H = hash_combine(H, MapFn(V)); + return H; + } +}; + +class ValueTable { + DenseMap<Value *, uint32_t> ValueNumbering; + DenseMap<GVNExpression::Expression *, uint32_t> ExpressionNumbering; + DenseMap<size_t, uint32_t> HashNumbering; + BumpPtrAllocator Allocator; + ArrayRecycler<Value *> Recycler; + uint32_t nextValueNumber; + + /// Create an expression for I based on its opcode and its uses. If I + /// touches or reads memory, the expression is also based upon its memory + /// order - see \c getMemoryUseOrder(). + InstructionUseExpr *createExpr(Instruction *I) { + InstructionUseExpr *E = + new (Allocator) InstructionUseExpr(I, Recycler, Allocator); + if (isMemoryInst(I)) + E->setMemoryUseOrder(getMemoryUseOrder(I)); + + if (CmpInst *C = dyn_cast<CmpInst>(I)) { + CmpInst::Predicate Predicate = C->getPredicate(); + E->setOpcode((C->getOpcode() << 8) | Predicate); + } + return E; + } + + /// Helper to compute the value number for a memory instruction + /// (LoadInst/StoreInst), including checking the memory ordering and + /// volatility. + template <class Inst> InstructionUseExpr *createMemoryExpr(Inst *I) { + if (isStrongerThanUnordered(I->getOrdering()) || I->isAtomic()) + return nullptr; + InstructionUseExpr *E = createExpr(I); + E->setVolatile(I->isVolatile()); + return E; + } + +public: + /// Returns the value number for the specified value, assigning + /// it a new number if it did not have one before. 
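One detail of createExpr above: compares fold their predicate into the low byte of the expression opcode, so two icmps with different predicates can never be numbered together. A worked example with stand-in numeric values; the real values come from Instruction::getOpcode() and CmpInst::getPredicate().

#include <cassert>
#include <cstdint>

int main() {
  // Stand-in values, chosen only for illustration.
  uint32_t CmpOpcode = 0x35;
  uint32_t PredEQ = 32, PredNE = 33;

  uint32_t PackedEQ = (CmpOpcode << 8) | PredEQ;
  uint32_t PackedNE = (CmpOpcode << 8) | PredNE;

  // Same opcode, different predicate: different packed opcodes, and the
  // original opcode is still recoverable from the high bits.
  assert(PackedEQ != PackedNE);
  assert((PackedEQ >> 8) == CmpOpcode);
  return 0;
}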
+ uint32_t lookupOrAdd(Value *V) { + auto VI = ValueNumbering.find(V); + if (VI != ValueNumbering.end()) + return VI->second; + + if (!isa<Instruction>(V)) { + ValueNumbering[V] = nextValueNumber; + return nextValueNumber++; + } + + Instruction *I = cast<Instruction>(V); + InstructionUseExpr *exp = nullptr; + switch (I->getOpcode()) { + case Instruction::Load: + exp = createMemoryExpr(cast<LoadInst>(I)); + break; + case Instruction::Store: + exp = createMemoryExpr(cast<StoreInst>(I)); + break; + case Instruction::Call: + case Instruction::Invoke: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::ICmp: + case Instruction::FCmp: + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::BitCast: + case Instruction::Select: + case Instruction::ExtractElement: + case Instruction::InsertElement: + case Instruction::ShuffleVector: + case Instruction::InsertValue: + case Instruction::GetElementPtr: + exp = createExpr(I); + break; + default: + break; + } + + if (!exp) { + ValueNumbering[V] = nextValueNumber; + return nextValueNumber++; + } + + uint32_t e = ExpressionNumbering[exp]; + if (!e) { + hash_code H = exp->getHashValue([=](Value *V) { return lookupOrAdd(V); }); + auto I = HashNumbering.find(H); + if (I != HashNumbering.end()) { + e = I->second; + } else { + e = nextValueNumber++; + HashNumbering[H] = e; + ExpressionNumbering[exp] = e; + } + } + ValueNumbering[V] = e; + return e; + } + + /// Returns the value number of the specified value. Fails if the value has + /// not yet been numbered. + uint32_t lookup(Value *V) const { + auto VI = ValueNumbering.find(V); + assert(VI != ValueNumbering.end() && "Value not numbered?"); + return VI->second; + } + + /// Removes all value numberings and resets the value table. + void clear() { + ValueNumbering.clear(); + ExpressionNumbering.clear(); + HashNumbering.clear(); + Recycler.clear(Allocator); + nextValueNumber = 1; + } + + ValueTable() : nextValueNumber(1) {} + + /// \c Inst uses or touches memory. Return an ID describing the memory state + /// at \c Inst such that if getMemoryUseOrder(I1) == getMemoryUseOrder(I2), + /// the exact same memory operations happen after I1 and I2. + /// + /// This is a very hard problem in general, so we use domain-specific + /// knowledge that we only ever check for equivalence between blocks sharing a + /// single immediate successor that is common, and when determining if I1 == + /// I2 we will have already determined that next(I1) == next(I2). This + /// inductive property allows us to simply return the value number of the next + /// instruction that defines memory. 
+ uint32_t getMemoryUseOrder(Instruction *Inst) { + auto *BB = Inst->getParent(); + for (auto I = std::next(Inst->getIterator()), E = BB->end(); + I != E && !I->isTerminator(); ++I) { + if (!isMemoryInst(&*I)) + continue; + if (isa<LoadInst>(&*I)) + continue; + CallInst *CI = dyn_cast<CallInst>(&*I); + if (CI && CI->onlyReadsMemory()) + continue; + InvokeInst *II = dyn_cast<InvokeInst>(&*I); + if (II && II->onlyReadsMemory()) + continue; + return lookupOrAdd(&*I); + } + return 0; + } +}; + +//===----------------------------------------------------------------------===// + +class GVNSink { +public: + GVNSink() : VN() {} + bool run(Function &F) { + DEBUG(dbgs() << "GVNSink: running on function @" << F.getName() << "\n"); + + unsigned NumSunk = 0; + ReversePostOrderTraversal<Function*> RPOT(&F); + for (auto *N : RPOT) + NumSunk += sinkBB(N); + + return NumSunk > 0; + } + +private: + ValueTable VN; + + bool isInstructionBlacklisted(Instruction *I) { + // These instructions may change or break semantics if moved. + if (isa<PHINode>(I) || I->isEHPad() || isa<AllocaInst>(I) || + I->getType()->isTokenTy()) + return true; + return false; + } + + /// The main heuristic function. Analyze the set of instructions pointed to by + /// LRI and return a candidate solution if these instructions can be sunk, or + /// None otherwise. + Optional<SinkingInstructionCandidate> analyzeInstructionForSinking( + LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum, + ModelledPHISet &NeededPHIs, SmallPtrSetImpl<Value *> &PHIContents); + + /// Create a ModelledPHI for each PHI in BB, adding to PHIs. + void analyzeInitialPHIs(BasicBlock *BB, ModelledPHISet &PHIs, + SmallPtrSetImpl<Value *> &PHIContents) { + for (auto &I : *BB) { + auto *PN = dyn_cast<PHINode>(&I); + if (!PN) + return; + + auto MPHI = ModelledPHI(PN); + PHIs.insert(MPHI); + for (auto *V : MPHI.getValues()) + PHIContents.insert(V); + } + } + + /// The main instruction sinking driver. Set up state and try and sink + /// instructions into BBEnd from its predecessors. + unsigned sinkBB(BasicBlock *BBEnd); + + /// Perform the actual mechanics of sinking an instruction from Blocks into + /// BBEnd, which is their only successor. + void sinkLastInstruction(ArrayRef<BasicBlock *> Blocks, BasicBlock *BBEnd); + + /// Remove PHIs that all have the same incoming value. 
+ void foldPointlessPHINodes(BasicBlock *BB) { + auto I = BB->begin(); + while (PHINode *PN = dyn_cast<PHINode>(I++)) { + if (!all_of(PN->incoming_values(), + [&](const Value *V) { return V == PN->getIncomingValue(0); })) + continue; + if (PN->getIncomingValue(0) != PN) + PN->replaceAllUsesWith(PN->getIncomingValue(0)); + else + PN->replaceAllUsesWith(UndefValue::get(PN->getType())); + PN->eraseFromParent(); + } + } +}; + +Optional<SinkingInstructionCandidate> GVNSink::analyzeInstructionForSinking( + LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum, + ModelledPHISet &NeededPHIs, SmallPtrSetImpl<Value *> &PHIContents) { + auto Insts = *LRI; + DEBUG(dbgs() << " -- Analyzing instruction set: [\n"; for (auto *I + : Insts) { + I->dump(); + } dbgs() << " ]\n";); + + DenseMap<uint32_t, unsigned> VNums; + for (auto *I : Insts) { + uint32_t N = VN.lookupOrAdd(I); + DEBUG(dbgs() << " VN=" << utohexstr(N) << " for" << *I << "\n"); + if (N == ~0U) + return None; + VNums[N]++; + } + unsigned VNumToSink = + std::max_element(VNums.begin(), VNums.end(), + [](const std::pair<uint32_t, unsigned> &I, + const std::pair<uint32_t, unsigned> &J) { + return I.second < J.second; + }) + ->first; + + if (VNums[VNumToSink] == 1) + // Can't sink anything! + return None; + + // Now restrict the number of incoming blocks down to only those with + // VNumToSink. + auto &ActivePreds = LRI.getActiveBlocks(); + unsigned InitialActivePredSize = ActivePreds.size(); + SmallVector<Instruction *, 4> NewInsts; + for (auto *I : Insts) { + if (VN.lookup(I) != VNumToSink) + ActivePreds.erase(I->getParent()); + else + NewInsts.push_back(I); + } + for (auto *I : NewInsts) + if (isInstructionBlacklisted(I)) + return None; + + // If we've restricted the incoming blocks, restrict all needed PHIs also + // to that set. + bool RecomputePHIContents = false; + if (ActivePreds.size() != InitialActivePredSize) { + ModelledPHISet NewNeededPHIs; + for (auto P : NeededPHIs) { + P.restrictToBlocks(ActivePreds); + NewNeededPHIs.insert(P); + } + NeededPHIs = NewNeededPHIs; + LRI.restrictToBlocks(ActivePreds); + RecomputePHIContents = true; + } + + // The sunk instruction's results. + ModelledPHI NewPHI(NewInsts, ActivePreds); + + // Does sinking this instruction render previous PHIs redundant? + if (NeededPHIs.find(NewPHI) != NeededPHIs.end()) { + NeededPHIs.erase(NewPHI); + RecomputePHIContents = true; + } + + if (RecomputePHIContents) { + // The needed PHIs have changed, so recompute the set of all needed + // values. + PHIContents.clear(); + for (auto &PHI : NeededPHIs) + PHIContents.insert(PHI.getValues().begin(), PHI.getValues().end()); + } + + // Is this instruction required by a later PHI that doesn't match this PHI? + // if so, we can't sink this instruction. + for (auto *V : NewPHI.getValues()) + if (PHIContents.count(V)) + // V exists in this PHI, but the whole PHI is different to NewPHI + // (else it would have been removed earlier). We cannot continue + // because this isn't representable. + return None; + + // Which operands need PHIs? + // FIXME: If any of these fail, we should partition up the candidates to + // try and continue making progress. + Instruction *I0 = NewInsts[0]; + for (unsigned OpNum = 0, E = I0->getNumOperands(); OpNum != E; ++OpNum) { + ModelledPHI PHI(NewInsts, OpNum, ActivePreds); + if (PHI.areAllIncomingValuesSame()) + continue; + if (!canReplaceOperandWithVariable(I0, OpNum)) + // We can 't create a PHI from this instruction! 
+ return None; + if (NeededPHIs.count(PHI)) + continue; + if (!PHI.areAllIncomingValuesSameType()) + return None; + // Don't create indirect calls! The called value is the final operand. + if ((isa<CallInst>(I0) || isa<InvokeInst>(I0)) && OpNum == E - 1 && + PHI.areAnyIncomingValuesConstant()) + return None; + + NeededPHIs.reserve(NeededPHIs.size()); + NeededPHIs.insert(PHI); + PHIContents.insert(PHI.getValues().begin(), PHI.getValues().end()); + } + + if (isMemoryInst(NewInsts[0])) + ++MemoryInstNum; + + SinkingInstructionCandidate Cand; + Cand.NumInstructions = ++InstNum; + Cand.NumMemoryInsts = MemoryInstNum; + Cand.NumBlocks = ActivePreds.size(); + Cand.NumPHIs = NeededPHIs.size(); + for (auto *C : ActivePreds) + Cand.Blocks.push_back(C); + + return Cand; +} + +unsigned GVNSink::sinkBB(BasicBlock *BBEnd) { + DEBUG(dbgs() << "GVNSink: running on basic block "; + BBEnd->printAsOperand(dbgs()); dbgs() << "\n"); + SmallVector<BasicBlock *, 4> Preds; + for (auto *B : predecessors(BBEnd)) { + auto *T = B->getTerminator(); + if (isa<BranchInst>(T) || isa<SwitchInst>(T)) + Preds.push_back(B); + else + return 0; + } + if (Preds.size() < 2) + return 0; + std::sort(Preds.begin(), Preds.end()); + + unsigned NumOrigPreds = Preds.size(); + // We can only sink instructions through unconditional branches. + for (auto I = Preds.begin(); I != Preds.end();) { + if ((*I)->getTerminator()->getNumSuccessors() != 1) + I = Preds.erase(I); + else + ++I; + } + + LockstepReverseIterator LRI(Preds); + SmallVector<SinkingInstructionCandidate, 4> Candidates; + unsigned InstNum = 0, MemoryInstNum = 0; + ModelledPHISet NeededPHIs; + SmallPtrSet<Value *, 4> PHIContents; + analyzeInitialPHIs(BBEnd, NeededPHIs, PHIContents); + unsigned NumOrigPHIs = NeededPHIs.size(); + + while (LRI.isValid()) { + auto Cand = analyzeInstructionForSinking(LRI, InstNum, MemoryInstNum, + NeededPHIs, PHIContents); + if (!Cand) + break; + Cand->calculateCost(NumOrigPHIs, Preds.size()); + Candidates.emplace_back(*Cand); + --LRI; + } + + std::stable_sort( + Candidates.begin(), Candidates.end(), + [](const SinkingInstructionCandidate &A, + const SinkingInstructionCandidate &B) { return A > B; }); + DEBUG(dbgs() << " -- Sinking candidates:\n"; for (auto &C + : Candidates) dbgs() + << " " << C << "\n";); + + // Pick the top candidate, as long it is positive! + if (Candidates.empty() || Candidates.front().Cost <= 0) + return 0; + auto C = Candidates.front(); + + DEBUG(dbgs() << " -- Sinking: " << C << "\n"); + BasicBlock *InsertBB = BBEnd; + if (C.Blocks.size() < NumOrigPreds) { + DEBUG(dbgs() << " -- Splitting edge to "; BBEnd->printAsOperand(dbgs()); + dbgs() << "\n"); + InsertBB = SplitBlockPredecessors(BBEnd, C.Blocks, ".gvnsink.split"); + if (!InsertBB) { + DEBUG(dbgs() << " -- FAILED to split edge!\n"); + // Edge couldn't be split. 
+ return 0; + } + } + + for (unsigned I = 0; I < C.NumInstructions; ++I) + sinkLastInstruction(C.Blocks, InsertBB); + + return C.NumInstructions; +} + +void GVNSink::sinkLastInstruction(ArrayRef<BasicBlock *> Blocks, + BasicBlock *BBEnd) { + SmallVector<Instruction *, 4> Insts; + for (BasicBlock *BB : Blocks) + Insts.push_back(BB->getTerminator()->getPrevNode()); + Instruction *I0 = Insts.front(); + + SmallVector<Value *, 4> NewOperands; + for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) { + bool NeedPHI = any_of(Insts, [&I0, O](const Instruction *I) { + return I->getOperand(O) != I0->getOperand(O); + }); + if (!NeedPHI) { + NewOperands.push_back(I0->getOperand(O)); + continue; + } + + // Create a new PHI in the successor block and populate it. + auto *Op = I0->getOperand(O); + assert(!Op->getType()->isTokenTy() && "Can't PHI tokens!"); + auto *PN = PHINode::Create(Op->getType(), Insts.size(), + Op->getName() + ".sink", &BBEnd->front()); + for (auto *I : Insts) + PN->addIncoming(I->getOperand(O), I->getParent()); + NewOperands.push_back(PN); + } + + // Arbitrarily use I0 as the new "common" instruction; remap its operands + // and move it to the start of the successor block. + for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) + I0->getOperandUse(O).set(NewOperands[O]); + I0->moveBefore(&*BBEnd->getFirstInsertionPt()); + + // Update metadata and IR flags. + for (auto *I : Insts) + if (I != I0) { + combineMetadataForCSE(I0, I); + I0->andIRFlags(I); + } + + for (auto *I : Insts) + if (I != I0) + I->replaceAllUsesWith(I0); + foldPointlessPHINodes(BBEnd); + + // Finally nuke all instructions apart from the common instruction. + for (auto *I : Insts) + if (I != I0) + I->eraseFromParent(); + + NumRemoved += Insts.size() - 1; +} + +//////////////////////////////////////////////////////////////////////////////// +// Pass machinery / boilerplate + +class GVNSinkLegacyPass : public FunctionPass { +public: + static char ID; + + GVNSinkLegacyPass() : FunctionPass(ID) { + initializeGVNSinkLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + GVNSink G; + return G.run(F); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved<GlobalsAAWrapperPass>(); + } +}; +} // namespace + +PreservedAnalyses GVNSinkPass::run(Function &F, FunctionAnalysisManager &AM) { + GVNSink G; + if (!G.run(F)) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserve<GlobalsAA>(); + return PA; +} + +char GVNSinkLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(GVNSinkLegacyPass, "gvn-sink", + "Early GVN sinking of Expressions", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) +INITIALIZE_PASS_END(GVNSinkLegacyPass, "gvn-sink", + "Early GVN sinking of Expressions", false, false) + +FunctionPass *llvm::createGVNSinkPass() { return new GVNSinkLegacyPass(); } diff --git a/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp index b05ef00..fb7c6e1 100644 --- a/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/GuardWidening.cpp @@ -40,7 +40,6 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/GuardWidening.h" -#include "llvm/Pass.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/Analysis/LoopInfo.h" @@ -50,7 +49,9 
@@ #include "llvm/IR/Dominators.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/PatternMatch.h" +#include "llvm/Pass.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Transforms/Scalar.h" using namespace llvm; @@ -536,10 +537,8 @@ bool GuardWideningImpl::parseRangeChecks( Changed = true; } else if (match(Check.getBase(), m_Or(m_Value(OpLHS), m_ConstantInt(OpRHS)))) { - unsigned BitWidth = OpLHS->getType()->getScalarSizeInBits(); - APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); - computeKnownBits(OpLHS, KnownZero, KnownOne, DL); - if ((OpRHS->getValue() & KnownZero) == OpRHS->getValue()) { + KnownBits Known = computeKnownBits(OpLHS, DL); + if ((OpRHS->getValue() & Known.Zero) == OpRHS->getValue()) { Check.setBase(OpLHS); APInt NewOffset = Check.getOffsetValue() + OpRHS->getValue(); Check.setOffset(ConstantInt::get(Ctx, NewOffset)); @@ -568,8 +567,7 @@ bool GuardWideningImpl::combineRangeChecks( return RC.getBase() == CurrentBase && RC.getLength() == CurrentLength; }; - std::copy_if(Checks.begin(), Checks.end(), - std::back_inserter(CurrentChecks), IsCurrentCheck); + copy_if(Checks, std::back_inserter(CurrentChecks), IsCurrentCheck); Checks.erase(remove_if(Checks, IsCurrentCheck), Checks.end()); assert(CurrentChecks.size() != 0 && "We know we have at least one!"); @@ -613,16 +611,16 @@ bool GuardWideningImpl::combineRangeChecks( // We have a series of f+1 checks as: // // I+k_0 u< L ... Chk_0 - // I_k_1 u< L ... Chk_1 + // I+k_1 u< L ... Chk_1 // ... - // I_k_f u< L ... Chk_(f+1) + // I+k_f u< L ... Chk_f // - // with forall i in [0,f): k_f-k_i u< k_f-k_0 ... Precond_0 + // with forall i in [0,f]: k_f-k_i u< k_f-k_0 ... Precond_0 // k_f-k_0 u< INT_MIN+k_f ... Precond_1 // k_f != k_0 ... Precond_2 // // Claim: - // Chk_0 AND Chk_(f+1) implies all the other checks + // Chk_0 AND Chk_f implies all the other checks // // Informal proof sketch: // @@ -658,8 +656,12 @@ PreservedAnalyses GuardWideningPass::run(Function &F, auto &DT = AM.getResult<DominatorTreeAnalysis>(F); auto &LI = AM.getResult<LoopAnalysis>(F); auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F); - bool Changed = GuardWideningImpl(DT, PDT, LI).run(); - return Changed ? 
PreservedAnalyses::none() : PreservedAnalyses::all(); + if (!GuardWideningImpl(DT, PDT, LI).run()) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); + return PA; } StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) { diff --git a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 1752fb7..1078296 100644 --- a/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -86,6 +86,10 @@ static cl::opt<bool> UsePostIncrementRanges( cl::desc("Use post increment control-dependent ranges in IndVarSimplify"), cl::init(true)); +static cl::opt<bool> +DisableLFTR("disable-lftr", cl::Hidden, cl::init(false), + cl::desc("Disable Linear Function Test Replace optimization")); + namespace { struct RewritePhi; @@ -97,7 +101,7 @@ class IndVarSimplify { TargetLibraryInfo *TLI; const TargetTransformInfo *TTI; - SmallVector<WeakVH, 16> DeadInsts; + SmallVector<WeakTrackingVH, 16> DeadInsts; bool Changed = false; bool isValidRewrite(Value *FromVal, Value *ToVal); @@ -231,8 +235,9 @@ static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) { bool isExact = false; // See if we can convert this to an int64_t uint64_t UIntVal; - if (APF.convertToInteger(&UIntVal, 64, true, APFloat::rmTowardZero, - &isExact) != APFloat::opOK || !isExact) + if (APF.convertToInteger(makeMutableArrayRef(UIntVal), 64, true, + APFloat::rmTowardZero, &isExact) != APFloat::opOK || + !isExact) return false; IntVal = UIntVal; return true; @@ -414,8 +419,8 @@ void IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) { Compare->getName()); // In the following deletions, PN may become dead and may be deleted. - // Use a WeakVH to observe whether this happens. - WeakVH WeakPH = PN; + // Use a WeakTrackingVH to observe whether this happens. + WeakTrackingVH WeakPH = PN; // Delete the old floating point exit comparison. The branch starts using the // new comparison. @@ -450,7 +455,7 @@ void IndVarSimplify::rewriteNonIntegerIVs(Loop *L) { // BasicBlock *Header = L->getHeader(); - SmallVector<WeakVH, 8> PHIs; + SmallVector<WeakTrackingVH, 8> PHIs; for (BasicBlock::iterator I = Header->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I) PHIs.push_back(PN); @@ -900,13 +905,13 @@ class WidenIV { PHINode *WidePhi; Instruction *WideInc; const SCEV *WideIncExpr; - SmallVectorImpl<WeakVH> &DeadInsts; + SmallVectorImpl<WeakTrackingVH> &DeadInsts; SmallPtrSet<Instruction *,16> Widened; SmallVector<NarrowIVDefUse, 8> NarrowIVUsers; enum ExtendKind { ZeroExtended, SignExtended, Unknown }; - // A map tracking the kind of extension used to widen each narrow IV + // A map tracking the kind of extension used to widen each narrow IV // and narrow IV user. // Key: pointer to a narrow IV or IV user. // Value: the kind of extension used to widen this Instruction. 
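+ // Editorial illustration (not part of the original change): for a narrow
+ // i32 IV %iv whose comparisons and arithmetic are signed, this map would
+ // conceptually record
+ //   ExtendKindMap[%iv]     = SignExtended
+ //   ExtendKindMap[%iv.inc] = SignExtended
+ // so the widened i64 IV can replace those users with sext-based values
+ // rather than zext-based ones (%iv and %iv.inc are made-up names).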
@@ -940,20 +945,13 @@ class WidenIV { } public: - WidenIV(const WideIVInfo &WI, LoopInfo *LInfo, - ScalarEvolution *SEv, DominatorTree *DTree, - SmallVectorImpl<WeakVH> &DI, bool HasGuards) : - OrigPhi(WI.NarrowIV), - WideType(WI.WidestNativeType), - LI(LInfo), - L(LI->getLoopFor(OrigPhi->getParent())), - SE(SEv), - DT(DTree), - HasGuards(HasGuards), - WidePhi(nullptr), - WideInc(nullptr), - WideIncExpr(nullptr), - DeadInsts(DI) { + WidenIV(const WideIVInfo &WI, LoopInfo *LInfo, ScalarEvolution *SEv, + DominatorTree *DTree, SmallVectorImpl<WeakTrackingVH> &DI, + bool HasGuards) + : OrigPhi(WI.NarrowIV), WideType(WI.WidestNativeType), LI(LInfo), + L(LI->getLoopFor(OrigPhi->getParent())), SE(SEv), DT(DTree), + HasGuards(HasGuards), WidePhi(nullptr), WideInc(nullptr), + WideIncExpr(nullptr), DeadInsts(DI) { assert(L->getHeader() == OrigPhi->getParent() && "Phi must be an IV"); ExtendKindMap[OrigPhi] = WI.IsSigned ? SignExtended : ZeroExtended; } @@ -1608,7 +1606,7 @@ void WidenIV::calculatePostIncRange(Instruction *NarrowDef, return; CmpInst::Predicate P = - TrueDest ? Pred : CmpInst::getInversePredicate(Pred); + TrueDest ? Pred : CmpInst::getInversePredicate(Pred); auto CmpRHSRange = SE->getSignedRange(SE->getSCEV(CmpRHS)); auto CmpConstrainedLHSRange = @@ -1634,7 +1632,7 @@ void WidenIV::calculatePostIncRange(Instruction *NarrowDef, UpdateRangeFromGuards(NarrowUser); BasicBlock *NarrowUserBB = NarrowUser->getParent(); - // If NarrowUserBB is statically unreachable asking dominator queries may + // If NarrowUserBB is statically unreachable asking dominator queries may // yield surprising results. (e.g. the block may not have a dom tree node) if (!DT->isReachableFromEntry(NarrowUserBB)) return; @@ -1829,6 +1827,7 @@ static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) { // An IV counter must preserve its type. if (IncI->getNumOperands() == 2) break; + LLVM_FALLTHROUGH; default: return nullptr; } @@ -2152,6 +2151,8 @@ linearFunctionTestReplace(Loop *L, Value *CmpIndVar = IndVar; const SCEV *IVCount = BackedgeTakenCount; + assert(L->getLoopLatch() && "Loop no longer in simplified form?"); + // If the exiting block is the same as the backedge block, we prefer to // compare against the post-incremented value, otherwise we must compare // against the preincremented value. @@ -2376,6 +2377,7 @@ bool IndVarSimplify::run(Loop *L) { // Loop::getCanonicalInductionVariable only supports loops with preheaders, // and we're in trouble if we can't find the induction variable even when // we've manually inserted one. + // - LFTR relies on having a single backedge. if (!L->isLoopSimplifyForm()) return false; @@ -2415,7 +2417,8 @@ bool IndVarSimplify::run(Loop *L) { // If we have a trip count expression, rewrite the loop's exit condition // using it. We can currently only handle loops with a single exit. - if (canExpandBackedgeTakenCount(L, SE, Rewriter) && needsLFTR(L, DT)) { + if (!DisableLFTR && canExpandBackedgeTakenCount(L, SE, Rewriter) && + needsLFTR(L, DT)) { PHINode *IndVar = FindLoopCounter(L, BackedgeTakenCount, SE, DT); if (IndVar) { // Check preconditions for proper SCEVExpander operation. SCEV does not @@ -2492,8 +2495,9 @@ PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, if (!IVS.run(&L)) return PreservedAnalyses::all(); - // FIXME: This should also 'preserve the CFG'. 
- return getLoopPassPreservedAnalyses(); + auto PA = getLoopPassPreservedAnalyses(); + PA.preserveSet<CFGAnalyses>(); + return PA; } namespace { diff --git a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index 8e81541..99b4458 100644 --- a/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -59,8 +59,8 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/LoopSimplify.h" +#include "llvm/Transforms/Utils/LoopUtils.h" using namespace llvm; @@ -446,6 +446,15 @@ struct LoopStructure { BasicBlock *LatchExit; unsigned LatchBrExitIdx; + // The loop represented by this instance of LoopStructure is semantically + // equivalent to: + // + // intN_ty inc = IndVarIncreasing ? 1 : -1; + // pred_ty predicate = IndVarIncreasing ? ICMP_SLT : ICMP_SGT; + // + // for (intN_ty iv = IndVarStart; predicate(iv, LoopExitAt); iv = IndVarNext) + // ... body ... + Value *IndVarNext; Value *IndVarStart; Value *LoopExitAt; @@ -789,9 +798,32 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP return None; } + const SCEV *StartNext = IndVarNext->getStart(); + const SCEV *Addend = SE.getNegativeSCEV(IndVarNext->getStepRecurrence(SE)); + const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend); + ConstantInt *One = ConstantInt::get(IndVarTy, 1); // TODO: generalize the predicates here to also match their unsigned variants. if (IsIncreasing) { + bool DecreasedRightValueByOne = false; + // Try to turn eq/ne predicates to those we can work with. + if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1) + // while (++i != len) { while (++i < len) { + // ... ---> ... + // } } + Pred = ICmpInst::ICMP_SLT; + else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 && + !CanBeSMin(SE, RightSCEV)) { + // while (true) { while (true) { + // if (++i == len) ---> if (++i > len - 1) + // break; break; + // ... ... + // } } + Pred = ICmpInst::ICMP_SGT; + RightSCEV = SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType())); + DecreasedRightValueByOne = true; + } + bool FoundExpectedPred = (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 1) || (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 0); @@ -809,11 +841,48 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP return None; } - IRBuilder<> B(Preheader->getTerminator()); - RightValue = B.CreateAdd(RightValue, One); - } + if (!SE.isLoopEntryGuardedByCond( + &L, CmpInst::ICMP_SLT, IndVarStart, + SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType())))) { + FailureReason = "Induction variable start not bounded by upper limit"; + return None; + } + // We need to increase the right value unless we have already decreased + // it virtually when we replaced EQ with SGT. 
+ if (!DecreasedRightValueByOne) { + IRBuilder<> B(Preheader->getTerminator()); + RightValue = B.CreateAdd(RightValue, One); + } + } else { + if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SLT, IndVarStart, + RightSCEV)) { + FailureReason = "Induction variable start not bounded by upper limit"; + return None; + } + assert(!DecreasedRightValueByOne && + "Right value can be decreased only for LatchBrExitIdx == 0!"); + } } else { + bool IncreasedRightValueByOne = false; + // Try to turn eq/ne predicates to those we can work with. + if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1) + // while (--i != len) { while (--i > len) { + // ... ---> ... + // } } + Pred = ICmpInst::ICMP_SGT; + else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0 && + !CanBeSMax(SE, RightSCEV)) { + // while (true) { while (true) { + // if (--i == len) ---> if (--i < len + 1) + // break; break; + // ... ... + // } } + Pred = ICmpInst::ICMP_SLT; + RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType())); + IncreasedRightValueByOne = true; + } + bool FoundExpectedPred = (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 1) || (Pred == ICmpInst::ICMP_SLT && LatchBrExitIdx == 0); @@ -831,15 +900,30 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP return None; } - IRBuilder<> B(Preheader->getTerminator()); - RightValue = B.CreateSub(RightValue, One); + if (!SE.isLoopEntryGuardedByCond( + &L, CmpInst::ICMP_SGT, IndVarStart, + SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType())))) { + FailureReason = "Induction variable start not bounded by lower limit"; + return None; + } + + // We need to decrease the right value unless we have already increased + // it virtually when we replaced EQ with SLT. + if (!IncreasedRightValueByOne) { + IRBuilder<> B(Preheader->getTerminator()); + RightValue = B.CreateSub(RightValue, One); + } + } else { + if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SGT, IndVarStart, + RightSCEV)) { + FailureReason = "Induction variable start not bounded by lower limit"; + return None; + } + assert(!IncreasedRightValueByOne && + "Right value can be increased only for LatchBrExitIdx == 0!"); } } - const SCEV *StartNext = IndVarNext->getStart(); - const SCEV *Addend = SE.getNegativeSCEV(IndVarNext->getStepRecurrence(SE)); - const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend); - BasicBlock *LatchExit = LatchBr->getSuccessor(LatchBrExitIdx); assert(SE.getLoopDisposition(LatchCount, &L) == @@ -883,20 +967,23 @@ LoopConstrainer::calculateSubRanges() const { // I think we can be more aggressive here and make this nuw / nsw if the // addition that feeds into the icmp for the latch's terminating branch is nuw // / nsw. In any case, a wrapping 2's complement addition is safe. - ConstantInt *One = ConstantInt::get(Ty, 1); const SCEV *Start = SE.getSCEV(MainLoopStructure.IndVarStart); const SCEV *End = SE.getSCEV(MainLoopStructure.LoopExitAt); bool Increasing = MainLoopStructure.IndVarIncreasing; - // We compute `Smallest` and `Greatest` such that [Smallest, Greatest) is the - // range of values the induction variable takes. + // We compute `Smallest` and `Greatest` such that [Smallest, Greatest), or + // [Smallest, GreatestSeen] is the range of values the induction variable + // takes. 
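+ // Editorial example (not part of the original change): for an increasing
+ // loop `for (i = Start; i < End; ++i)` this gives
+ //   Smallest     = Start
+ //   Greatest     = End
+ //   GreatestSeen = End - 1   // the largest value the IV actually takes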
- const SCEV *Smallest = nullptr, *Greatest = nullptr; + const SCEV *Smallest = nullptr, *Greatest = nullptr, *GreatestSeen = nullptr; + const SCEV *One = SE.getOne(Ty); if (Increasing) { Smallest = Start; Greatest = End; + // No overflow, because the range [Smallest, GreatestSeen] is not empty. + GreatestSeen = SE.getMinusSCEV(End, One); } else { // These two computations may sign-overflow. Here is why that is okay: // @@ -914,8 +1001,9 @@ LoopConstrainer::calculateSubRanges() const { // will be an empty range. Returning an empty range is always safe. // - Smallest = SE.getAddExpr(End, SE.getSCEV(One)); - Greatest = SE.getAddExpr(Start, SE.getSCEV(One)); + Smallest = SE.getAddExpr(End, One); + Greatest = SE.getAddExpr(Start, One); + GreatestSeen = Start; } auto Clamp = [this, Smallest, Greatest](const SCEV *S) { @@ -930,7 +1018,7 @@ LoopConstrainer::calculateSubRanges() const { Result.LowLimit = Clamp(Range.getBegin()); bool ProvablyNoPostLoop = - SE.isKnownPredicate(ICmpInst::ICMP_SLE, Greatest, Range.getEnd()); + SE.isKnownPredicate(ICmpInst::ICMP_SLT, GreatestSeen, Range.getEnd()); if (!ProvablyNoPostLoop) Result.HighLimit = Clamp(Range.getEnd()); @@ -1194,7 +1282,12 @@ void LoopConstrainer::addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs) { Loop *LoopConstrainer::createClonedLoopStructure(Loop *Original, Loop *Parent, ValueToValueMapTy &VM) { - Loop &New = LPM.addLoop(Parent); + Loop &New = *new Loop(); + if (Parent) + Parent->addChildLoop(&New); + else + LI.addTopLevelLoop(&New); + LPM.addLoop(New); // Add all of the blocks in Original to the new loop. for (auto *BB : Original->blocks()) @@ -1332,28 +1425,35 @@ bool LoopConstrainer::run() { DT.recalculate(F); + // We need to first add all the pre and post loop blocks into the loop + // structures (as part of createClonedLoopStructure), and then update the + // LCSSA form and LoopSimplifyForm. This is necessary for correctly updating + // LI when LoopSimplifyForm is generated. + Loop *PreL = nullptr, *PostL = nullptr; if (!PreLoop.Blocks.empty()) { - auto *L = createClonedLoopStructure( + PreL = createClonedLoopStructure( &OriginalLoop, OriginalLoop.getParentLoop(), PreLoop.Map); - formLCSSARecursively(*L, DT, &LI, &SE); - simplifyLoop(L, &DT, &LI, &SE, nullptr, true); - // Pre loops are slow paths, we do not need to perform any loop - // optimizations on them. - DisableAllLoopOptsOnLoop(*L); } if (!PostLoop.Blocks.empty()) { - auto *L = createClonedLoopStructure( + PostL = createClonedLoopStructure( &OriginalLoop, OriginalLoop.getParentLoop(), PostLoop.Map); + } + + // This function canonicalizes the loop into Loop-Simplify and LCSSA forms. + auto CanonicalizeLoop = [&] (Loop *L, bool IsOriginalLoop) { formLCSSARecursively(*L, DT, &LI, &SE); simplifyLoop(L, &DT, &LI, &SE, nullptr, true); - // Post loops are slow paths, we do not need to perform any loop + // Pre/post loops are slow paths, we do not need to perform any loop // optimizations on them. 
- DisableAllLoopOptsOnLoop(*L);
- }
-
- formLCSSARecursively(OriginalLoop, DT, &LI, &SE);
- simplifyLoop(&OriginalLoop, &DT, &LI, &SE, nullptr, true);
+ if (!IsOriginalLoop)
+ DisableAllLoopOptsOnLoop(*L);
+ };
+ if (PreL)
+ CanonicalizeLoop(PreL, false);
+ if (PostL)
+ CanonicalizeLoop(PostL, false);
+ CanonicalizeLoop(&OriginalLoop, true);
 return true;
}
diff --git a/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp new file mode 100644 index 0000000..89b28f0 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -0,0 +1,969 @@
+//===-- NVPTXInferAddressSpace.cpp - ---------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// CUDA C/C++ includes memory space designation as variable type qualifiers (such
+// as __global__ and __shared__). Knowing the space of a memory access allows
+// CUDA compilers to emit faster PTX loads and stores. For example, a load from
+// shared memory can be translated to `ld.shared` which is roughly 10% faster
+// than a generic `ld` on an NVIDIA Tesla K40c.
+//
+// Unfortunately, type qualifiers only apply to variable declarations, so CUDA
+// compilers must infer the memory space of an address expression from
+// type-qualified variables.
+//
+// LLVM IR uses non-zero (so-called) specific address spaces to represent memory
+// spaces (e.g. addrspace(3) means shared memory). The Clang frontend
+// places only type-qualified variables in specific address spaces, and then
+// conservatively `addrspacecast`s each type-qualified variable to addrspace(0)
+// (the so-called generic address space) for other instructions to use.
+//
+// For example, Clang translates the following CUDA code
+// __shared__ float a[10];
+// float v = a[i];
+// to
+// %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
+// %1 = gep [10 x float], [10 x float]* %0, i64 0, i64 %i
+// %v = load float, float* %1 ; emits ld.f32
+// @a is in addrspace(3) since it's type-qualified, but its use from %1 is
+// redirected to %0 (the generic version of @a).
+//
+// The optimization implemented in this file propagates specific address spaces
+// from type-qualified variable declarations to their users. For example, it
+// optimizes the above IR to
+// %1 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+// %v = load float addrspace(3)* %1 ; emits ld.shared.f32
+// propagating the addrspace(3) from @a to %1. As a result, the NVPTX
+// codegen is able to emit ld.shared.f32 for %v.
+//
+// Address space inference works in two steps. First, it uses a data-flow
+// analysis to infer as many generic pointers as possible to point to only one
+// specific address space. In the above example, it can prove that %1 only
+// points to addrspace(3). This algorithm was published in
+// CUDA: Compiling and optimizing for a GPU platform
+// Chakrabarti, Grover, Aarts, Kong, Kudlur, Lin, Marathe, Murphy, Wang
+// ICCS 2012
+//
+// Then, address space inference replaces all refinable generic pointers with
+// equivalent specific pointers.
+//
+// The major challenge of implementing this optimization is handling PHINodes,
+// which may create loops in the data flow graph. This brings two complications.
+//
+// First, the data flow analysis in Step 1 needs to be circular.
For example, +// %generic.input = addrspacecast float addrspace(3)* %input to float* +// loop: +// %y = phi [ %generic.input, %y2 ] +// %y2 = getelementptr %y, 1 +// %v = load %y2 +// br ..., label %loop, ... +// proving %y specific requires proving both %generic.input and %y2 specific, +// but proving %y2 specific circles back to %y. To address this complication, +// the data flow analysis operates on a lattice: +// uninitialized > specific address spaces > generic. +// All address expressions (our implementation only considers phi, bitcast, +// addrspacecast, and getelementptr) start with the uninitialized address space. +// The monotone transfer function moves the address space of a pointer down a +// lattice path from uninitialized to specific and then to generic. A join +// operation of two different specific address spaces pushes the expression down +// to the generic address space. The analysis completes once it reaches a fixed +// point. +// +// Second, IR rewriting in Step 2 also needs to be circular. For example, +// converting %y to addrspace(3) requires the compiler to know the converted +// %y2, but converting %y2 needs the converted %y. To address this complication, +// we break these cycles using "undef" placeholders. When converting an +// instruction `I` to a new address space, if its operand `Op` is not converted +// yet, we let `I` temporarily use `undef` and fix all the uses of undef later. +// For instance, our algorithm first converts %y to +// %y' = phi float addrspace(3)* [ %input, undef ] +// Then, it converts %y2 to +// %y2' = getelementptr %y', 1 +// Finally, it fixes the undef in %y' so that +// %y' = phi float addrspace(3)* [ %input, %y2' ] +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Operator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/ValueMapper.h" + +#define DEBUG_TYPE "infer-address-spaces" + +using namespace llvm; + +namespace { +static const unsigned UninitializedAddressSpace = ~0u; + +using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>; + +/// \brief InferAddressSpaces +class InferAddressSpaces : public FunctionPass { + /// Target specific address space which uses of should be replaced if + /// possible. + unsigned FlatAddrSpace; + +public: + static char ID; + + InferAddressSpaces() : FunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<TargetTransformInfoWrapperPass>(); + } + + bool runOnFunction(Function &F) override; + +private: + // Returns the new address space of V if updated; otherwise, returns None. + Optional<unsigned> + updateAddressSpace(const Value &V, + const ValueToAddrSpaceMapTy &InferredAddrSpace) const; + + // Tries to infer the specific address space of each address expression in + // Postorder. + void inferAddressSpaces(ArrayRef<WeakTrackingVH> Postorder, + ValueToAddrSpaceMapTy *InferredAddrSpace) const; + + bool isSafeToCastConstAddrSpace(Constant *C, unsigned NewAS) const; + + // Changes the flat address expressions in function F to point to specific + // address spaces if InferredAddrSpace says so. 
Postorder is the postorder of + // all flat expressions in the use-def graph of function F. + bool + rewriteWithNewAddressSpaces(ArrayRef<WeakTrackingVH> Postorder, + const ValueToAddrSpaceMapTy &InferredAddrSpace, + Function *F) const; + + void appendsFlatAddressExpressionToPostorderStack( + Value *V, std::vector<std::pair<Value *, bool>> &PostorderStack, + DenseSet<Value *> &Visited) const; + + bool rewriteIntrinsicOperands(IntrinsicInst *II, + Value *OldV, Value *NewV) const; + void collectRewritableIntrinsicOperands( + IntrinsicInst *II, + std::vector<std::pair<Value *, bool>> &PostorderStack, + DenseSet<Value *> &Visited) const; + + std::vector<WeakTrackingVH> collectFlatAddressExpressions(Function &F) const; + + Value *cloneValueWithNewAddressSpace( + Value *V, unsigned NewAddrSpace, + const ValueToValueMapTy &ValueWithNewAddrSpace, + SmallVectorImpl<const Use *> *UndefUsesToFix) const; + unsigned joinAddressSpaces(unsigned AS1, unsigned AS2) const; +}; +} // end anonymous namespace + +char InferAddressSpaces::ID = 0; + +namespace llvm { +void initializeInferAddressSpacesPass(PassRegistry &); +} + +INITIALIZE_PASS(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces", + false, false) + +// Returns true if V is an address expression. +// TODO: Currently, we consider only phi, bitcast, addrspacecast, and +// getelementptr operators. +static bool isAddressExpression(const Value &V) { + if (!isa<Operator>(V)) + return false; + + switch (cast<Operator>(V).getOpcode()) { + case Instruction::PHI: + case Instruction::BitCast: + case Instruction::AddrSpaceCast: + case Instruction::GetElementPtr: + case Instruction::Select: + return true; + default: + return false; + } +} + +// Returns the pointer operands of V. +// +// Precondition: V is an address expression. +static SmallVector<Value *, 2> getPointerOperands(const Value &V) { + const Operator &Op = cast<Operator>(V); + switch (Op.getOpcode()) { + case Instruction::PHI: { + auto IncomingValues = cast<PHINode>(Op).incoming_values(); + return SmallVector<Value *, 2>(IncomingValues.begin(), + IncomingValues.end()); + } + case Instruction::BitCast: + case Instruction::AddrSpaceCast: + case Instruction::GetElementPtr: + return {Op.getOperand(0)}; + case Instruction::Select: + return {Op.getOperand(1), Op.getOperand(2)}; + default: + llvm_unreachable("Unexpected instruction type."); + } +} + +// TODO: Move logic to TTI? +bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II, + Value *OldV, + Value *NewV) const { + Module *M = II->getParent()->getParent()->getParent(); + + switch (II->getIntrinsicID()) { + case Intrinsic::amdgcn_atomic_inc: + case Intrinsic::amdgcn_atomic_dec:{ + const ConstantInt *IsVolatile = dyn_cast<ConstantInt>(II->getArgOperand(4)); + if (!IsVolatile || !IsVolatile->isZero()) + return false; + + LLVM_FALLTHROUGH; + } + case Intrinsic::objectsize: { + Type *DestTy = II->getType(); + Type *SrcTy = NewV->getType(); + Function *NewDecl = + Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy}); + II->setArgOperand(0, NewV); + II->setCalledFunction(NewDecl); + return true; + } + default: + return false; + } +} + +// TODO: Move logic to TTI? 
+void InferAddressSpaces::collectRewritableIntrinsicOperands(
+ IntrinsicInst *II, std::vector<std::pair<Value *, bool>> &PostorderStack,
+ DenseSet<Value *> &Visited) const {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::objectsize:
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec:
+ appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0),
+ PostorderStack, Visited);
+ break;
+ default:
+ break;
+ }
+}
+
+// If V is an unvisited flat address expression, appends V to PostorderStack
+// and marks it as visited.
+void InferAddressSpaces::appendsFlatAddressExpressionToPostorderStack(
+ Value *V, std::vector<std::pair<Value *, bool>> &PostorderStack,
+ DenseSet<Value *> &Visited) const {
+ assert(V->getType()->isPointerTy());
+
+ // Generic addressing expressions may be hidden in nested constant
+ // expressions.
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+ // TODO: Look in non-address parts, like icmp operands.
+ if (isAddressExpression(*CE) && Visited.insert(CE).second)
+ PostorderStack.push_back(std::make_pair(CE, false));
+
+ return;
+ }
+
+ if (isAddressExpression(*V) &&
+ V->getType()->getPointerAddressSpace() == FlatAddrSpace) {
+ if (Visited.insert(V).second) {
+ PostorderStack.push_back(std::make_pair(V, false));
+
+ Operator *Op = cast<Operator>(V);
+ for (unsigned I = 0, E = Op->getNumOperands(); I != E; ++I) {
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Op->getOperand(I))) {
+ if (isAddressExpression(*CE) && Visited.insert(CE).second)
+ PostorderStack.emplace_back(CE, false);
+ }
+ }
+ }
+ }
+}
+
+// Returns all flat address expressions in function F. The elements are
+// ordered in postorder.
+std::vector<WeakTrackingVH>
+InferAddressSpaces::collectFlatAddressExpressions(Function &F) const {
+ // This function implements a non-recursive postorder traversal of a partial
+ // use-def graph of function F.
+ std::vector<std::pair<Value *, bool>> PostorderStack;
+ // The set of visited expressions.
+ DenseSet<Value *> Visited;
+
+ auto PushPtrOperand = [&](Value *Ptr) {
+ appendsFlatAddressExpressionToPostorderStack(Ptr, PostorderStack,
+ Visited);
+ };
+
+ // Look at operations that may be interesting to accelerate by moving to a
+ // known address space. We aim at generating better code for loads and
+ // stores, but pure addressing calculations may also be faster.
+ for (Instruction &I : instructions(F)) {
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ if (!GEP->getType()->isVectorTy())
+ PushPtrOperand(GEP->getPointerOperand());
+ } else if (auto *LI = dyn_cast<LoadInst>(&I))
+ PushPtrOperand(LI->getPointerOperand());
+ else if (auto *SI = dyn_cast<StoreInst>(&I))
+ PushPtrOperand(SI->getPointerOperand());
+ else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I))
+ PushPtrOperand(RMW->getPointerOperand());
+ else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I))
+ PushPtrOperand(CmpX->getPointerOperand());
+ else if (auto *MI = dyn_cast<MemIntrinsic>(&I)) {
+ // For memset/memcpy/memmove, any pointer operand can be replaced.
+ PushPtrOperand(MI->getRawDest());
+
+ // Handle 2nd operand for memcpy/memmove.
+ if (auto *MTI = dyn_cast<MemTransferInst>(MI)) + PushPtrOperand(MTI->getRawSource()); + } else if (auto *II = dyn_cast<IntrinsicInst>(&I)) + collectRewritableIntrinsicOperands(II, PostorderStack, Visited); + else if (ICmpInst *Cmp = dyn_cast<ICmpInst>(&I)) { + // FIXME: Handle vectors of pointers + if (Cmp->getOperand(0)->getType()->isPointerTy()) { + PushPtrOperand(Cmp->getOperand(0)); + PushPtrOperand(Cmp->getOperand(1)); + } + } else if (auto *ASC = dyn_cast<AddrSpaceCastInst>(&I)) { + if (!ASC->getType()->isVectorTy()) + PushPtrOperand(ASC->getPointerOperand()); + } + } + + std::vector<WeakTrackingVH> Postorder; // The resultant postorder. + while (!PostorderStack.empty()) { + Value *TopVal = PostorderStack.back().first; + // If the operands of the expression on the top are already explored, + // adds that expression to the resultant postorder. + if (PostorderStack.back().second) { + if (TopVal->getType()->getPointerAddressSpace() == FlatAddrSpace) + Postorder.push_back(TopVal); + PostorderStack.pop_back(); + continue; + } + // Otherwise, adds its operands to the stack and explores them. + PostorderStack.back().second = true; + for (Value *PtrOperand : getPointerOperands(*TopVal)) { + appendsFlatAddressExpressionToPostorderStack(PtrOperand, PostorderStack, + Visited); + } + } + return Postorder; +} + +// A helper function for cloneInstructionWithNewAddressSpace. Returns the clone +// of OperandUse.get() in the new address space. If the clone is not ready yet, +// returns an undef in the new address space as a placeholder. +static Value *operandWithNewAddressSpaceOrCreateUndef( + const Use &OperandUse, unsigned NewAddrSpace, + const ValueToValueMapTy &ValueWithNewAddrSpace, + SmallVectorImpl<const Use *> *UndefUsesToFix) { + Value *Operand = OperandUse.get(); + + Type *NewPtrTy = + Operand->getType()->getPointerElementType()->getPointerTo(NewAddrSpace); + + if (Constant *C = dyn_cast<Constant>(Operand)) + return ConstantExpr::getAddrSpaceCast(C, NewPtrTy); + + if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand)) + return NewOperand; + + UndefUsesToFix->push_back(&OperandUse); + return UndefValue::get(NewPtrTy); +} + +// Returns a clone of `I` with its operands converted to those specified in +// ValueWithNewAddrSpace. Due to potential cycles in the data flow graph, an +// operand whose address space needs to be modified might not exist in +// ValueWithNewAddrSpace. In that case, uses undef as a placeholder operand and +// adds that operand use to UndefUsesToFix so that caller can fix them later. +// +// Note that we do not necessarily clone `I`, e.g., if it is an addrspacecast +// from a pointer whose type already matches. Therefore, this function returns a +// Value* instead of an Instruction*. +static Value *cloneInstructionWithNewAddressSpace( + Instruction *I, unsigned NewAddrSpace, + const ValueToValueMapTy &ValueWithNewAddrSpace, + SmallVectorImpl<const Use *> *UndefUsesToFix) { + Type *NewPtrType = + I->getType()->getPointerElementType()->getPointerTo(NewAddrSpace); + + if (I->getOpcode() == Instruction::AddrSpaceCast) { + Value *Src = I->getOperand(0); + // Because `I` is flat, the source address space must be specific. + // Therefore, the inferred address space must be the source space, according + // to our algorithm. + assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace); + if (Src->getType() != NewPtrType) + return new BitCastInst(Src, NewPtrType); + return Src; + } + + // Computes the converted pointer operands. 
+ SmallVector<Value *, 4> NewPointerOperands; + for (const Use &OperandUse : I->operands()) { + if (!OperandUse.get()->getType()->isPointerTy()) + NewPointerOperands.push_back(nullptr); + else + NewPointerOperands.push_back(operandWithNewAddressSpaceOrCreateUndef( + OperandUse, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix)); + } + + switch (I->getOpcode()) { + case Instruction::BitCast: + return new BitCastInst(NewPointerOperands[0], NewPtrType); + case Instruction::PHI: { + assert(I->getType()->isPointerTy()); + PHINode *PHI = cast<PHINode>(I); + PHINode *NewPHI = PHINode::Create(NewPtrType, PHI->getNumIncomingValues()); + for (unsigned Index = 0; Index < PHI->getNumIncomingValues(); ++Index) { + unsigned OperandNo = PHINode::getOperandNumForIncomingValue(Index); + NewPHI->addIncoming(NewPointerOperands[OperandNo], + PHI->getIncomingBlock(Index)); + } + return NewPHI; + } + case Instruction::GetElementPtr: { + GetElementPtrInst *GEP = cast<GetElementPtrInst>(I); + GetElementPtrInst *NewGEP = GetElementPtrInst::Create( + GEP->getSourceElementType(), NewPointerOperands[0], + SmallVector<Value *, 4>(GEP->idx_begin(), GEP->idx_end())); + NewGEP->setIsInBounds(GEP->isInBounds()); + return NewGEP; + } + case Instruction::Select: { + assert(I->getType()->isPointerTy()); + return SelectInst::Create(I->getOperand(0), NewPointerOperands[1], + NewPointerOperands[2], "", nullptr, I); + } + default: + llvm_unreachable("Unexpected opcode"); + } +} + +// Similar to cloneInstructionWithNewAddressSpace, returns a clone of the +// constant expression `CE` with its operands replaced as specified in +// ValueWithNewAddrSpace. +static Value *cloneConstantExprWithNewAddressSpace( + ConstantExpr *CE, unsigned NewAddrSpace, + const ValueToValueMapTy &ValueWithNewAddrSpace) { + Type *TargetType = + CE->getType()->getPointerElementType()->getPointerTo(NewAddrSpace); + + if (CE->getOpcode() == Instruction::AddrSpaceCast) { + // Because CE is flat, the source address space must be specific. + // Therefore, the inferred address space must be the source space according + // to our algorithm. + assert(CE->getOperand(0)->getType()->getPointerAddressSpace() == + NewAddrSpace); + return ConstantExpr::getBitCast(CE->getOperand(0), TargetType); + } + + if (CE->getOpcode() == Instruction::BitCast) { + if (Value *NewOperand = ValueWithNewAddrSpace.lookup(CE->getOperand(0))) + return ConstantExpr::getBitCast(cast<Constant>(NewOperand), TargetType); + return ConstantExpr::getAddrSpaceCast(CE, TargetType); + } + + if (CE->getOpcode() == Instruction::Select) { + Constant *Src0 = CE->getOperand(1); + Constant *Src1 = CE->getOperand(2); + if (Src0->getType()->getPointerAddressSpace() == + Src1->getType()->getPointerAddressSpace()) { + + return ConstantExpr::getSelect( + CE->getOperand(0), ConstantExpr::getAddrSpaceCast(Src0, TargetType), + ConstantExpr::getAddrSpaceCast(Src1, TargetType)); + } + } + + // Computes the operands of the new constant expression. + bool IsNew = false; + SmallVector<Constant *, 4> NewOperands; + for (unsigned Index = 0; Index < CE->getNumOperands(); ++Index) { + Constant *Operand = CE->getOperand(Index); + // If the address space of `Operand` needs to be modified, the new operand + // with the new address space should already be in ValueWithNewAddrSpace + // because (1) the constant expressions we consider (i.e. addrspacecast, + // bitcast, and getelementptr) do not incur cycles in the data flow graph + // and (2) this function is called on constant expressions in postorder. 
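+ // Editorial illustration (not part of the original change): for a nested
+ // constant expression such as
+ //   getelementptr (i8, i8* addrspacecast (i8 addrspace(3)* @g to i8*), i64 4)
+ // the inner addrspacecast appears earlier in the postorder, so its clone is
+ // already in ValueWithNewAddrSpace by the time the outer expression is
+ // rewritten (@g is a made-up global used only for illustration).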
+ if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand)) { + IsNew = true; + NewOperands.push_back(cast<Constant>(NewOperand)); + } else { + // Otherwise, reuses the old operand. + NewOperands.push_back(Operand); + } + } + + // If !IsNew, we will replace the Value with itself. However, replaced values + // are assumed to wrapped in a addrspace cast later so drop it now. + if (!IsNew) + return nullptr; + + if (CE->getOpcode() == Instruction::GetElementPtr) { + // Needs to specify the source type while constructing a getelementptr + // constant expression. + return CE->getWithOperands( + NewOperands, TargetType, /*OnlyIfReduced=*/false, + NewOperands[0]->getType()->getPointerElementType()); + } + + return CE->getWithOperands(NewOperands, TargetType); +} + +// Returns a clone of the value `V`, with its operands replaced as specified in +// ValueWithNewAddrSpace. This function is called on every flat address +// expression whose address space needs to be modified, in postorder. +// +// See cloneInstructionWithNewAddressSpace for the meaning of UndefUsesToFix. +Value *InferAddressSpaces::cloneValueWithNewAddressSpace( + Value *V, unsigned NewAddrSpace, + const ValueToValueMapTy &ValueWithNewAddrSpace, + SmallVectorImpl<const Use *> *UndefUsesToFix) const { + // All values in Postorder are flat address expressions. + assert(isAddressExpression(*V) && + V->getType()->getPointerAddressSpace() == FlatAddrSpace); + + if (Instruction *I = dyn_cast<Instruction>(V)) { + Value *NewV = cloneInstructionWithNewAddressSpace( + I, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix); + if (Instruction *NewI = dyn_cast<Instruction>(NewV)) { + if (NewI->getParent() == nullptr) { + NewI->insertBefore(I); + NewI->takeName(I); + } + } + return NewV; + } + + return cloneConstantExprWithNewAddressSpace( + cast<ConstantExpr>(V), NewAddrSpace, ValueWithNewAddrSpace); +} + +// Defines the join operation on the address space lattice (see the file header +// comments). +unsigned InferAddressSpaces::joinAddressSpaces(unsigned AS1, + unsigned AS2) const { + if (AS1 == FlatAddrSpace || AS2 == FlatAddrSpace) + return FlatAddrSpace; + + if (AS1 == UninitializedAddressSpace) + return AS2; + if (AS2 == UninitializedAddressSpace) + return AS1; + + // The join of two different specific address spaces is flat. + return (AS1 == AS2) ? AS1 : FlatAddrSpace; +} + +bool InferAddressSpaces::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + const TargetTransformInfo &TTI = + getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); + FlatAddrSpace = TTI.getFlatAddressSpace(); + if (FlatAddrSpace == UninitializedAddressSpace) + return false; + + // Collects all flat address expressions in postorder. + std::vector<WeakTrackingVH> Postorder = collectFlatAddressExpressions(F); + + // Runs a data-flow analysis to refine the address spaces of every expression + // in Postorder. + ValueToAddrSpaceMapTy InferredAddrSpace; + inferAddressSpaces(Postorder, &InferredAddrSpace); + + // Changes the address spaces of the flat address expressions who are inferred + // to point to a specific address space. + return rewriteWithNewAddressSpaces(Postorder, InferredAddrSpace, &F); +} + +// Constants need to be tracked through RAUW to handle cases with nested +// constant expressions, so wrap values in WeakTrackingVH. 
+void InferAddressSpaces::inferAddressSpaces( + ArrayRef<WeakTrackingVH> Postorder, + ValueToAddrSpaceMapTy *InferredAddrSpace) const { + SetVector<Value *> Worklist(Postorder.begin(), Postorder.end()); + // Initially, all expressions are in the uninitialized address space. + for (Value *V : Postorder) + (*InferredAddrSpace)[V] = UninitializedAddressSpace; + + while (!Worklist.empty()) { + Value *V = Worklist.pop_back_val(); + + // Tries to update the address space of the stack top according to the + // address spaces of its operands. + DEBUG(dbgs() << "Updating the address space of\n " << *V << '\n'); + Optional<unsigned> NewAS = updateAddressSpace(*V, *InferredAddrSpace); + if (!NewAS.hasValue()) + continue; + // If any updates are made, grabs its users to the worklist because + // their address spaces can also be possibly updated. + DEBUG(dbgs() << " to " << NewAS.getValue() << '\n'); + (*InferredAddrSpace)[V] = NewAS.getValue(); + + for (Value *User : V->users()) { + // Skip if User is already in the worklist. + if (Worklist.count(User)) + continue; + + auto Pos = InferredAddrSpace->find(User); + // Our algorithm only updates the address spaces of flat address + // expressions, which are those in InferredAddrSpace. + if (Pos == InferredAddrSpace->end()) + continue; + + // Function updateAddressSpace moves the address space down a lattice + // path. Therefore, nothing to do if User is already inferred as flat (the + // bottom element in the lattice). + if (Pos->second == FlatAddrSpace) + continue; + + Worklist.insert(User); + } + } +} + +Optional<unsigned> InferAddressSpaces::updateAddressSpace( + const Value &V, const ValueToAddrSpaceMapTy &InferredAddrSpace) const { + assert(InferredAddrSpace.count(&V)); + + // The new inferred address space equals the join of the address spaces + // of all its pointer operands. + unsigned NewAS = UninitializedAddressSpace; + + const Operator &Op = cast<Operator>(V); + if (Op.getOpcode() == Instruction::Select) { + Value *Src0 = Op.getOperand(1); + Value *Src1 = Op.getOperand(2); + + auto I = InferredAddrSpace.find(Src0); + unsigned Src0AS = (I != InferredAddrSpace.end()) ? + I->second : Src0->getType()->getPointerAddressSpace(); + + auto J = InferredAddrSpace.find(Src1); + unsigned Src1AS = (J != InferredAddrSpace.end()) ? + J->second : Src1->getType()->getPointerAddressSpace(); + + auto *C0 = dyn_cast<Constant>(Src0); + auto *C1 = dyn_cast<Constant>(Src1); + + // If one of the inputs is a constant, we may be able to do a constant + // addrspacecast of it. Defer inferring the address space until the input + // address space is known. + if ((C1 && Src0AS == UninitializedAddressSpace) || + (C0 && Src1AS == UninitializedAddressSpace)) + return None; + + if (C0 && isSafeToCastConstAddrSpace(C0, Src1AS)) + NewAS = Src1AS; + else if (C1 && isSafeToCastConstAddrSpace(C1, Src0AS)) + NewAS = Src0AS; + else + NewAS = joinAddressSpaces(Src0AS, Src1AS); + } else { + for (Value *PtrOperand : getPointerOperands(V)) { + auto I = InferredAddrSpace.find(PtrOperand); + unsigned OperandAS = I != InferredAddrSpace.end() ? + I->second : PtrOperand->getType()->getPointerAddressSpace(); + + // join(flat, *) = flat. So we can break if NewAS is already flat. 
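+ // Editorial illustration (not part of the original change), where 1 and 3
+ // denote two different specific address spaces:
+ //   join(uninitialized, 3) == 3
+ //   join(3, 3)             == 3
+ //   join(3, 1)             == flat
+ //   join(flat, anything)   == flat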
+ NewAS = joinAddressSpaces(NewAS, OperandAS);
+ if (NewAS == FlatAddrSpace)
+ break;
+ }
+ }
+
+ unsigned OldAS = InferredAddrSpace.lookup(&V);
+ assert(OldAS != FlatAddrSpace);
+ if (OldAS == NewAS)
+ return None;
+ return NewAS;
+}
+
+/// \returns true if \p U is the pointer operand of a memory instruction with
+/// a single pointer operand that can have its address space changed by simply
+/// mutating the use to a new value.
+static bool isSimplePointerUseValidToReplace(Use &U) {
+ User *Inst = U.getUser();
+ unsigned OpNo = U.getOperandNo();
+
+ if (auto *LI = dyn_cast<LoadInst>(Inst))
+ return OpNo == LoadInst::getPointerOperandIndex() && !LI->isVolatile();
+
+ if (auto *SI = dyn_cast<StoreInst>(Inst))
+ return OpNo == StoreInst::getPointerOperandIndex() && !SI->isVolatile();
+
+ if (auto *RMW = dyn_cast<AtomicRMWInst>(Inst))
+ return OpNo == AtomicRMWInst::getPointerOperandIndex() && !RMW->isVolatile();
+
+ if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+ return OpNo == AtomicCmpXchgInst::getPointerOperandIndex() &&
+ !CmpX->isVolatile();
+ }
+
+ return false;
+}
+
+/// Update memory intrinsic uses that require more complex processing than
+/// simple memory instructions. These require re-mangling and may have multiple
+/// pointer operands.
+static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV,
+ Value *NewV) {
+ IRBuilder<> B(MI);
+ MDNode *TBAA = MI->getMetadata(LLVMContext::MD_tbaa);
+ MDNode *ScopeMD = MI->getMetadata(LLVMContext::MD_alias_scope);
+ MDNode *NoAliasMD = MI->getMetadata(LLVMContext::MD_noalias);
+
+ if (auto *MSI = dyn_cast<MemSetInst>(MI)) {
+ B.CreateMemSet(NewV, MSI->getValue(),
+ MSI->getLength(), MSI->getAlignment(),
+ false, // isVolatile
+ TBAA, ScopeMD, NoAliasMD);
+ } else if (auto *MTI = dyn_cast<MemTransferInst>(MI)) {
+ Value *Src = MTI->getRawSource();
+ Value *Dest = MTI->getRawDest();
+
+ // Be careful in case this is a self-to-self copy.
+ if (Src == OldV)
+ Src = NewV;
+
+ if (Dest == OldV)
+ Dest = NewV;
+
+ if (isa<MemCpyInst>(MTI)) {
+ MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct);
+ B.CreateMemCpy(Dest, Src, MTI->getLength(),
+ MTI->getAlignment(),
+ false, // isVolatile
+ TBAA, TBAAStruct, ScopeMD, NoAliasMD);
+ } else {
+ assert(isa<MemMoveInst>(MTI));
+ B.CreateMemMove(Dest, Src, MTI->getLength(),
+ MTI->getAlignment(),
+ false, // isVolatile
+ TBAA, ScopeMD, NoAliasMD);
+ }
+ } else
+ llvm_unreachable("unhandled MemIntrinsic");
+
+ MI->eraseFromParent();
+ return true;
+}
+
+// \returns true if it is OK to change the address space of constant \p C with
+// a ConstantExpr addrspacecast.
+bool InferAddressSpaces::isSafeToCastConstAddrSpace(Constant *C, unsigned NewAS) const {
+ assert(NewAS != UninitializedAddressSpace);
+
+ unsigned SrcAS = C->getType()->getPointerAddressSpace();
+ if (SrcAS == NewAS || isa<UndefValue>(C))
+ return true;
+
+ // Prevent illegal casts between different non-flat address spaces.
+ if (SrcAS != FlatAddrSpace && NewAS != FlatAddrSpace)
+ return false;
+
+ if (isa<ConstantPointerNull>(C))
+ return true;
+
+ if (auto *Op = dyn_cast<Operator>(C)) {
+ // If we already have a constant addrspacecast, it should be safe to cast it
+ // off.
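+ // Editorial illustration (not part of the original change): a constant such
+ // as addrspacecast (float addrspace(3)* @sv to float*) originally pointed
+ // into addrspace(3), so casting it back to addrspace(3) simply recovers the
+ // original pointer (@sv is a made-up global used only for illustration).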
+ if (Op->getOpcode() == Instruction::AddrSpaceCast) + return isSafeToCastConstAddrSpace(cast<Constant>(Op->getOperand(0)), NewAS); + + if (Op->getOpcode() == Instruction::IntToPtr && + Op->getType()->getPointerAddressSpace() == FlatAddrSpace) + return true; + } + + return false; +} + +static Value::use_iterator skipToNextUser(Value::use_iterator I, + Value::use_iterator End) { + User *CurUser = I->getUser(); + ++I; + + while (I != End && I->getUser() == CurUser) + ++I; + + return I; +} + +bool InferAddressSpaces::rewriteWithNewAddressSpaces( + ArrayRef<WeakTrackingVH> Postorder, + const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const { + // For each address expression to be modified, creates a clone of it with its + // pointer operands converted to the new address space. Since the pointer + // operands are converted, the clone is naturally in the new address space by + // construction. + ValueToValueMapTy ValueWithNewAddrSpace; + SmallVector<const Use *, 32> UndefUsesToFix; + for (Value* V : Postorder) { + unsigned NewAddrSpace = InferredAddrSpace.lookup(V); + if (V->getType()->getPointerAddressSpace() != NewAddrSpace) { + ValueWithNewAddrSpace[V] = cloneValueWithNewAddressSpace( + V, NewAddrSpace, ValueWithNewAddrSpace, &UndefUsesToFix); + } + } + + if (ValueWithNewAddrSpace.empty()) + return false; + + // Fixes all the undef uses generated by cloneInstructionWithNewAddressSpace. + for (const Use *UndefUse : UndefUsesToFix) { + User *V = UndefUse->getUser(); + User *NewV = cast<User>(ValueWithNewAddrSpace.lookup(V)); + unsigned OperandNo = UndefUse->getOperandNo(); + assert(isa<UndefValue>(NewV->getOperand(OperandNo))); + NewV->setOperand(OperandNo, ValueWithNewAddrSpace.lookup(UndefUse->get())); + } + + SmallVector<Instruction *, 16> DeadInstructions; + + // Replaces the uses of the old address expressions with the new ones. + for (const WeakTrackingVH &WVH : Postorder) { + assert(WVH && "value was unexpectedly deleted"); + Value *V = WVH; + Value *NewV = ValueWithNewAddrSpace.lookup(V); + if (NewV == nullptr) + continue; + + DEBUG(dbgs() << "Replacing the uses of " << *V + << "\n with\n " << *NewV << '\n'); + + if (Constant *C = dyn_cast<Constant>(V)) { + Constant *Replace = ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV), + C->getType()); + if (C != Replace) { + DEBUG(dbgs() << "Inserting replacement const cast: " + << Replace << ": " << *Replace << '\n'); + C->replaceAllUsesWith(Replace); + V = Replace; + } + } + + Value::use_iterator I, E, Next; + for (I = V->use_begin(), E = V->use_end(); I != E; ) { + Use &U = *I; + + // Some users may see the same pointer operand in multiple operands. Skip + // to the next instruction. + I = skipToNextUser(I, E); + + if (isSimplePointerUseValidToReplace(U)) { + // If V is used as the pointer operand of a compatible memory operation, + // sets the pointer operand to NewV. This replacement does not change + // the element type, so the resultant load/store is still valid. + U.set(NewV); + continue; + } + + User *CurUser = U.getUser(); + // Handle more complex cases like intrinsic that need to be remangled. + if (auto *MI = dyn_cast<MemIntrinsic>(CurUser)) { + if (!MI->isVolatile() && handleMemIntrinsicPtrUse(MI, V, NewV)) + continue; + } + + if (auto *II = dyn_cast<IntrinsicInst>(CurUser)) { + if (rewriteIntrinsicOperands(II, V, NewV)) + continue; + } + + if (isa<Instruction>(CurUser)) { + if (ICmpInst *Cmp = dyn_cast<ICmpInst>(CurUser)) { + // If we can infer that both pointers are in the same addrspace, + // transform e.g. 
+ // %cmp = icmp eq float* %p, %q + // into + // %cmp = icmp eq float addrspace(3)* %new_p, %new_q + + unsigned NewAS = NewV->getType()->getPointerAddressSpace(); + int SrcIdx = U.getOperandNo(); + int OtherIdx = (SrcIdx == 0) ? 1 : 0; + Value *OtherSrc = Cmp->getOperand(OtherIdx); + + if (Value *OtherNewV = ValueWithNewAddrSpace.lookup(OtherSrc)) { + if (OtherNewV->getType()->getPointerAddressSpace() == NewAS) { + Cmp->setOperand(OtherIdx, OtherNewV); + Cmp->setOperand(SrcIdx, NewV); + continue; + } + } + + // Even if the type mismatches, we can cast the constant. + if (auto *KOtherSrc = dyn_cast<Constant>(OtherSrc)) { + if (isSafeToCastConstAddrSpace(KOtherSrc, NewAS)) { + Cmp->setOperand(SrcIdx, NewV); + Cmp->setOperand(OtherIdx, + ConstantExpr::getAddrSpaceCast(KOtherSrc, NewV->getType())); + continue; + } + } + } + + if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(CurUser)) { + unsigned NewAS = NewV->getType()->getPointerAddressSpace(); + if (ASC->getDestAddressSpace() == NewAS) { + ASC->replaceAllUsesWith(NewV); + DeadInstructions.push_back(ASC); + continue; + } + } + + // Otherwise, replaces the use with flat(NewV). + if (Instruction *I = dyn_cast<Instruction>(V)) { + BasicBlock::iterator InsertPos = std::next(I->getIterator()); + while (isa<PHINode>(InsertPos)) + ++InsertPos; + U.set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos)); + } else { + U.set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV), + V->getType())); + } + } + } + + if (V->use_empty()) { + if (Instruction *I = dyn_cast<Instruction>(V)) + DeadInstructions.push_back(I); + } + } + + for (Instruction *I : DeadInstructions) + RecursivelyDeleteTriviallyDeadInstructions(I); + + return true; +} + +FunctionPass *llvm::createInferAddressSpacesPass() { + return new InferAddressSpaces(); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 1870c3d..dc9143b 100644 --- a/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -12,29 +12,33 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/JumpThreading.h" -#include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/Analysis/CFG.h" #include "llvm/Analysis/ConstantFolding.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/ConstantRange.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include <algorithm> @@ -60,6 +64,11 @@ ImplicationSearchThreshold( "condition to use to thread over a weaker condition"), cl::init(3), 
cl::Hidden); +static cl::opt<bool> PrintLVIAfterJumpThreading( + "print-lvi-after-jump-threading", + cl::desc("Print the LazyValueInfo cache after JumpThreading"), cl::init(false), + cl::Hidden); + namespace { /// This pass performs 'jump threading', which looks at blocks that have /// multiple predecessors and multiple successors. If one or more of the @@ -89,8 +98,10 @@ namespace { bool runOnFunction(Function &F) override; void getAnalysisUsage(AnalysisUsage &AU) const override { + if (PrintLVIAfterJumpThreading) + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addRequired<LazyValueInfoWrapperPass>(); - AU.addPreserved<LazyValueInfoWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); AU.addRequired<TargetLibraryInfoWrapperPass>(); } @@ -104,6 +115,7 @@ INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading", "Jump Threading", false, false) INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_END(JumpThreading, "jump-threading", "Jump Threading", false, false) @@ -121,16 +133,24 @@ bool JumpThreading::runOnFunction(Function &F) { return false; auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(); auto LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI(); + auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); std::unique_ptr<BlockFrequencyInfo> BFI; std::unique_ptr<BranchProbabilityInfo> BPI; bool HasProfileData = F.getEntryCount().hasValue(); if (HasProfileData) { LoopInfo LI{DominatorTree(F)}; - BPI.reset(new BranchProbabilityInfo(F, LI)); + BPI.reset(new BranchProbabilityInfo(F, LI, TLI)); BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); } - return Impl.runImpl(F, TLI, LVI, HasProfileData, std::move(BFI), - std::move(BPI)); + + bool Changed = Impl.runImpl(F, TLI, LVI, AA, HasProfileData, std::move(BFI), + std::move(BPI)); + if (PrintLVIAfterJumpThreading) { + dbgs() << "LVI for function '" << F.getName() << "':\n"; + LVI->printLVI(F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(), + dbgs()); + } + return Changed; } PreservedAnalyses JumpThreadingPass::run(Function &F, @@ -138,20 +158,19 @@ PreservedAnalyses JumpThreadingPass::run(Function &F, auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); auto &LVI = AM.getResult<LazyValueAnalysis>(F); + auto &AA = AM.getResult<AAManager>(F); + std::unique_ptr<BlockFrequencyInfo> BFI; std::unique_ptr<BranchProbabilityInfo> BPI; bool HasProfileData = F.getEntryCount().hasValue(); if (HasProfileData) { LoopInfo LI{DominatorTree(F)}; - BPI.reset(new BranchProbabilityInfo(F, LI)); + BPI.reset(new BranchProbabilityInfo(F, LI, &TLI)); BFI.reset(new BlockFrequencyInfo(F, *BPI, LI)); } - bool Changed = - runImpl(F, &TLI, &LVI, HasProfileData, std::move(BFI), std::move(BPI)); - // FIXME: We need to invalidate LVI to avoid PR28400. Is there a better - // solution? 
- AM.invalidate<LazyValueAnalysis>(F); + bool Changed = runImpl(F, &TLI, &LVI, &AA, HasProfileData, std::move(BFI), + std::move(BPI)); if (!Changed) return PreservedAnalyses::all(); @@ -161,18 +180,23 @@ PreservedAnalyses JumpThreadingPass::run(Function &F, } bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_, - LazyValueInfo *LVI_, bool HasProfileData_, + LazyValueInfo *LVI_, AliasAnalysis *AA_, + bool HasProfileData_, std::unique_ptr<BlockFrequencyInfo> BFI_, std::unique_ptr<BranchProbabilityInfo> BPI_) { DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n"); TLI = TLI_; LVI = LVI_; + AA = AA_; BFI.reset(); BPI.reset(); // When profile data is available, we need to update edge weights after // successful jump threading, which requires both BPI and BFI being available. HasProfileData = HasProfileData_; + auto *GuardDecl = F.getParent()->getFunction( + Intrinsic::getName(Intrinsic::experimental_guard)); + HasGuards = GuardDecl && !GuardDecl->use_empty(); if (HasProfileData) { BPI = std::move(BPI_); BFI = std::move(BFI_); @@ -219,33 +243,22 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_, // Can't thread an unconditional jump, but if the block is "almost // empty", we can replace uses of it with uses of the successor and make // this dead. - // We should not eliminate the loop header either, because eliminating - // a loop header might later prevent LoopSimplify from transforming nested - // loops into simplified form. + // We should not eliminate the loop header or latch either, because + // eliminating a loop header or latch might later prevent LoopSimplify + // from transforming nested loops into simplified form. We will rely on + // later passes in backend to clean up empty blocks. if (BI && BI->isUnconditional() && BB != &BB->getParent()->getEntryBlock() && // If the terminator is the only non-phi instruction, try to nuke it. - BB->getFirstNonPHIOrDbg()->isTerminator() && !LoopHeaders.count(BB)) { - // Since TryToSimplifyUncondBranchFromEmptyBlock may delete the - // block, we have to make sure it isn't in the LoopHeaders set. We - // reinsert afterward if needed. - bool ErasedFromLoopHeaders = LoopHeaders.erase(BB); - BasicBlock *Succ = BI->getSuccessor(0); - + BB->getFirstNonPHIOrDbg()->isTerminator() && !LoopHeaders.count(BB) && + !LoopHeaders.count(BI->getSuccessor(0))) { // FIXME: It is always conservatively correct to drop the info // for a block even if it doesn't get erased. This isn't totally // awesome, but it allows us to use AssertingVH to prevent nasty // dangling pointer issues within LazyValueInfo. LVI->eraseBlock(BB); - if (TryToSimplifyUncondBranchFromEmptyBlock(BB)) { + if (TryToSimplifyUncondBranchFromEmptyBlock(BB)) Changed = true; - // If we deleted BB and BB was the header of a loop, then the - // successor is now the header of the loop. - BB = Succ; - } - - if (ErasedFromLoopHeaders) - LoopHeaders.insert(BB); } } EverChanged |= Changed; @@ -255,10 +268,42 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_, return EverChanged; } -/// getJumpThreadDuplicationCost - Return the cost of duplicating this block to -/// thread across it. Stop scanning the block when passing the threshold. -static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB, +// Replace uses of Cond with ToVal when safe to do so. If all uses are +// replaced, we can remove Cond. 
We cannot blindly replace all uses of Cond +// because we may incorrectly replace uses when guards/assumes are uses of +// of `Cond` and we used the guards/assume to reason about the `Cond` value +// at the end of block. RAUW unconditionally replaces all uses +// including the guards/assumes themselves and the uses before the +// guard/assume. +static void ReplaceFoldableUses(Instruction *Cond, Value *ToVal) { + assert(Cond->getType() == ToVal->getType()); + auto *BB = Cond->getParent(); + // We can unconditionally replace all uses in non-local blocks (i.e. uses + // strictly dominated by BB), since LVI information is true from the + // terminator of BB. + replaceNonLocalUsesWith(Cond, ToVal); + for (Instruction &I : reverse(*BB)) { + // Reached the Cond whose uses we are trying to replace, so there are no + // more uses. + if (&I == Cond) + break; + // We only replace uses in instructions that are guaranteed to reach the end + // of BB, where we know Cond is ToVal. + if (!isGuaranteedToTransferExecutionToSuccessor(&I)) + break; + I.replaceUsesOfWith(Cond, ToVal); + } + if (Cond->use_empty() && !Cond->mayHaveSideEffects()) + Cond->eraseFromParent(); +} + +/// Return the cost of duplicating a piece of this block from first non-phi +/// and before StopAt instruction to thread across it. Stop scanning the block +/// when exceeding the threshold. If duplication is impossible, returns ~0U. +static unsigned getJumpThreadDuplicationCost(BasicBlock *BB, + Instruction *StopAt, unsigned Threshold) { + assert(StopAt->getParent() == BB && "Not an instruction from proper BB?"); /// Ignore PHI nodes, these will be flattened when duplication happens. BasicBlock::const_iterator I(BB->getFirstNonPHI()); @@ -266,15 +311,17 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB, // branch, so they shouldn't count against the duplication cost. unsigned Bonus = 0; - const TerminatorInst *BBTerm = BB->getTerminator(); - // Threading through a switch statement is particularly profitable. If this - // block ends in a switch, decrease its cost to make it more likely to happen. - if (isa<SwitchInst>(BBTerm)) - Bonus = 6; - - // The same holds for indirect branches, but slightly more so. - if (isa<IndirectBrInst>(BBTerm)) - Bonus = 8; + if (BB->getTerminator() == StopAt) { + // Threading through a switch statement is particularly profitable. If this + // block ends in a switch, decrease its cost to make it more likely to + // happen. + if (isa<SwitchInst>(StopAt)) + Bonus = 6; + + // The same holds for indirect branches, but slightly more so. + if (isa<IndirectBrInst>(StopAt)) + Bonus = 8; + } // Bump the threshold up so the early exit from the loop doesn't skip the // terminator-based Size adjustment at the end. @@ -283,7 +330,7 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB, // Sum up the cost of each instruction until we get to the terminator. Don't // include the terminator because the copy won't include it. unsigned Size = 0; - for (; !isa<TerminatorInst>(I); ++I) { + for (; &*I != StopAt; ++I) { // Stop scanning the block if we've reached the threshold. if (Size > Threshold) @@ -544,7 +591,12 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( // Handle compare with phi operand, where the PHI is defined in this block. 
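// An illustrative IR sketch of the compare-with-phi case handled below (not
// part of the upstream change; names are made up):
//
//   bb:
//     %p = phi i32 [ 1, %pred1 ], [ 7, %pred2 ]
//     %cmp = icmp eq i32 %p, 7
//     br i1 %cmp, label %t, label %f
//
// Folding the compare per incoming value gives false on the %pred1 edge and
// true on the %pred2 edge, so the destination is known for each predecessor
// and both edges become candidates for threading.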
if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) { assert(Preference == WantInteger && "Compares only produce integers"); - PHINode *PN = dyn_cast<PHINode>(Cmp->getOperand(0)); + Type *CmpType = Cmp->getType(); + Value *CmpLHS = Cmp->getOperand(0); + Value *CmpRHS = Cmp->getOperand(1); + CmpInst::Predicate Pred = Cmp->getPredicate(); + + PHINode *PN = dyn_cast<PHINode>(CmpLHS); if (PN && PN->getParent() == BB) { const DataLayout &DL = PN->getModule()->getDataLayout(); // We can do this simplification if any comparisons fold to true or false. @@ -552,15 +604,15 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { BasicBlock *PredBB = PN->getIncomingBlock(i); Value *LHS = PN->getIncomingValue(i); - Value *RHS = Cmp->getOperand(1)->DoPHITranslation(BB, PredBB); + Value *RHS = CmpRHS->DoPHITranslation(BB, PredBB); - Value *Res = SimplifyCmpInst(Cmp->getPredicate(), LHS, RHS, DL); + Value *Res = SimplifyCmpInst(Pred, LHS, RHS, {DL}); if (!Res) { if (!isa<Constant>(RHS)) continue; LazyValueInfo::Tristate - ResT = LVI->getPredicateOnEdge(Cmp->getPredicate(), LHS, + ResT = LVI->getPredicateOnEdge(Pred, LHS, cast<Constant>(RHS), PredBB, BB, CxtI ? CxtI : Cmp); if (ResT == LazyValueInfo::Unknown) @@ -577,44 +629,81 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessors( // If comparing a live-in value against a constant, see if we know the // live-in value on any predecessors. - if (isa<Constant>(Cmp->getOperand(1)) && Cmp->getType()->isIntegerTy()) { - if (!isa<Instruction>(Cmp->getOperand(0)) || - cast<Instruction>(Cmp->getOperand(0))->getParent() != BB) { - Constant *RHSCst = cast<Constant>(Cmp->getOperand(1)); + if (isa<Constant>(CmpRHS) && !CmpType->isVectorTy()) { + Constant *CmpConst = cast<Constant>(CmpRHS); + if (!isa<Instruction>(CmpLHS) || + cast<Instruction>(CmpLHS)->getParent() != BB) { for (BasicBlock *P : predecessors(BB)) { // If the value is known by LazyValueInfo to be a constant in a // predecessor, use that information to try to thread this block. LazyValueInfo::Tristate Res = - LVI->getPredicateOnEdge(Cmp->getPredicate(), Cmp->getOperand(0), - RHSCst, P, BB, CxtI ? CxtI : Cmp); + LVI->getPredicateOnEdge(Pred, CmpLHS, + CmpConst, P, BB, CxtI ? CxtI : Cmp); if (Res == LazyValueInfo::Unknown) continue; - Constant *ResC = ConstantInt::get(Cmp->getType(), Res); + Constant *ResC = ConstantInt::get(CmpType, Res); Result.push_back(std::make_pair(ResC, P)); } return !Result.empty(); } + // InstCombine can fold some forms of constant range checks into + // (icmp (add (x, C1)), C2). See if we have we have such a thing with + // x as a live-in. + { + using namespace PatternMatch; + Value *AddLHS; + ConstantInt *AddConst; + if (isa<ConstantInt>(CmpConst) && + match(CmpLHS, m_Add(m_Value(AddLHS), m_ConstantInt(AddConst)))) { + if (!isa<Instruction>(AddLHS) || + cast<Instruction>(AddLHS)->getParent() != BB) { + for (BasicBlock *P : predecessors(BB)) { + // If the value is known by LazyValueInfo to be a ConstantRange in + // a predecessor, use that information to try to thread this + // block. + ConstantRange CR = LVI->getConstantRangeOnEdge( + AddLHS, P, BB, CxtI ? CxtI : cast<Instruction>(CmpLHS)); + // Propagate the range through the addition. + CR = CR.add(AddConst->getValue()); + + // Get the range where the compare returns true. 
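// A self-contained sketch of the range arithmetic used just below, written
// against the same ConstantRange/APInt API this hunk calls; the concrete
// constants and the helper name are illustrative only. For
//   %a = add i32 %x, 5
//   %cmp = icmp ult i32 %a, 10
// with LVI reporting %x in [0, 3) on some incoming edge, the compare is known
// true on that edge:
#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/InstrTypes.h"

static bool exampleCompareKnownTrueOnEdge() {
  llvm::ConstantRange XR(llvm::APInt(32, 0), llvm::APInt(32, 3)); // %x in [0, 3)
  llvm::ConstantRange AR =
      XR.add(llvm::ConstantRange(llvm::APInt(32, 5)));            // %a in [5, 8)
  llvm::ConstantRange TrueRegion = llvm::ConstantRange::makeExactICmpRegion(
      llvm::CmpInst::ICMP_ULT, llvm::APInt(32, 10));              // [0, 10)
  return TrueRegion.contains(AR);                                 // true
}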
+ ConstantRange CmpRange = ConstantRange::makeExactICmpRegion( + Pred, cast<ConstantInt>(CmpConst)->getValue()); + + Constant *ResC; + if (CmpRange.contains(CR)) + ResC = ConstantInt::getTrue(CmpType); + else if (CmpRange.inverse().contains(CR)) + ResC = ConstantInt::getFalse(CmpType); + else + continue; + + Result.push_back(std::make_pair(ResC, P)); + } + + return !Result.empty(); + } + } + } + // Try to find a constant value for the LHS of a comparison, // and evaluate it statically if we can. - if (Constant *CmpConst = dyn_cast<Constant>(Cmp->getOperand(1))) { - PredValueInfoTy LHSVals; - ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals, - WantInteger, CxtI); - - for (const auto &LHSVal : LHSVals) { - Constant *V = LHSVal.first; - Constant *Folded = ConstantExpr::getCompare(Cmp->getPredicate(), - V, CmpConst); - if (Constant *KC = getKnownConstant(Folded, WantInteger)) - Result.push_back(std::make_pair(KC, LHSVal.second)); - } + PredValueInfoTy LHSVals; + ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals, + WantInteger, CxtI); - return !Result.empty(); + for (const auto &LHSVal : LHSVals) { + Constant *V = LHSVal.first; + Constant *Folded = ConstantExpr::getCompare(Pred, V, CmpConst); + if (Constant *KC = getKnownConstant(Folded, WantInteger)) + Result.push_back(std::make_pair(KC, LHSVal.second)); } + + return !Result.empty(); } } @@ -722,6 +811,37 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { LVI->eraseBlock(SinglePred); MergeBasicBlockIntoOnlyPred(BB); + // Now that BB is merged into SinglePred (i.e. SinglePred Code followed by + // BB code within one basic block `BB`), we need to invalidate the LVI + // information associated with BB, because the LVI information need not be + // true for all of BB after the merge. For example, + // Before the merge, LVI info and code is as follows: + // SinglePred: <LVI info1 for %p val> + // %y = use of %p + // call @exit() // need not transfer execution to successor. + // assume(%p) // from this point on %p is true + // br label %BB + // BB: <LVI info2 for %p val, i.e. %p is true> + // %x = use of %p + // br label exit + // + // Note that this LVI info for blocks BB and SinglPred is correct for %p + // (info2 and info1 respectively). After the merge and the deletion of the + // LVI info1 for SinglePred. We have the following code: + // BB: <LVI info2 for %p val> + // %y = use of %p + // call @exit() + // assume(%p) + // %x = use of %p <-- LVI info2 is correct from here onwards. + // br label exit + // LVI info2 for BB is incorrect at the beginning of BB. + + // Invalidate LVI information for BB if the LVI is not provably true for + // all of BB. + if (any_of(*BB, [](Instruction &I) { + return !isGuaranteedToTransferExecutionToSuccessor(&I); + })) + LVI->eraseBlock(BB); return true; } } @@ -729,6 +849,10 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { if (TryToUnfoldSelectInCurrBB(BB)) return true; + // Look if we can propagate guards to predecessors. + if (HasGuards && ProcessGuards(BB)) + return true; + // What kind of constant we're looking for. ConstantPreference Preference = WantInteger; @@ -804,7 +928,6 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { return false; } - if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst)) { // If we're branching on a conditional, LVI might be able to determine // it's value at the branch instruction. 
We only handle comparisons @@ -812,7 +935,12 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { // TODO: This should be extended to handle switches as well. BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator()); Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1)); - if (CondBr && CondConst && CondBr->isConditional()) { + if (CondBr && CondConst) { + // We should have returned as soon as we turn a conditional branch to + // unconditional. Because its no longer interesting as far as jump + // threading is concerned. + assert(CondBr->isConditional() && "Threading on unconditional terminator"); + LazyValueInfo::Tristate Ret = LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0), CondConst, CondBr); @@ -824,21 +952,27 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { CondBr->eraseFromParent(); if (CondCmp->use_empty()) CondCmp->eraseFromParent(); + // We can safely replace *some* uses of the CondInst if it has + // exactly one value as returned by LVI. RAUW is incorrect in the + // presence of guards and assumes, that have the `Cond` as the use. This + // is because we use the guards/assume to reason about the `Cond` value + // at the end of block, but RAUW unconditionally replaces all uses + // including the guards/assumes themselves and the uses before the + // guard/assume. else if (CondCmp->getParent() == BB) { - // If the fact we just learned is true for all uses of the - // condition, replace it with a constant value auto *CI = Ret == LazyValueInfo::True ? ConstantInt::getTrue(CondCmp->getType()) : ConstantInt::getFalse(CondCmp->getType()); - CondCmp->replaceAllUsesWith(CI); - CondCmp->eraseFromParent(); + ReplaceFoldableUses(CondCmp, CI); } return true; } - } - if (CondBr && CondConst && TryToUnfoldSelect(CondCmp, BB)) - return true; + // We did not manage to simplify this branch, try to see whether + // CondCmp depends on a known phi-select pattern. + if (TryToUnfoldSelect(CondCmp, BB)) + return true; + } } // Check for some cases that are worth simplifying. Right now we want to look @@ -857,7 +991,6 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { if (SimplifyPartiallyRedundantLoad(LI)) return true; - // Handle a variety of cases where we are branching on something derived from // a PHI node in the current block. If we can prove that any predecessors // compute a predictable value based on a PHI node, thread those predecessors. @@ -871,7 +1004,6 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator())) return ProcessBranchOnPHI(PN); - // If this is an otherwise-unfoldable branch on a XOR, see if we can simplify. if (CondInst->getOpcode() == Instruction::Xor && CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator())) @@ -920,6 +1052,14 @@ bool JumpThreadingPass::ProcessImpliedCondition(BasicBlock *BB) { return false; } +/// Return true if Op is an instruction defined in the given block. +static bool isOpDefinedInBlock(Value *Op, BasicBlock *BB) { + if (Instruction *OpInst = dyn_cast<Instruction>(Op)) + if (OpInst->getParent() == BB) + return true; + return false; +} + /// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant /// load instruction, eliminate it by replacing it with a PHI node. 
This is an /// important optimization that encourages jump threading, and needs to be run @@ -942,18 +1082,17 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) { Value *LoadedPtr = LI->getOperand(0); - // If the loaded operand is defined in the LoadBB, it can't be available. - // TODO: Could do simple PHI translation, that would be fun :) - if (Instruction *PtrOp = dyn_cast<Instruction>(LoadedPtr)) - if (PtrOp->getParent() == LoadBB) - return false; + // If the loaded operand is defined in the LoadBB and its not a phi, + // it can't be available in predecessors. + if (isOpDefinedInBlock(LoadedPtr, LoadBB) && !isa<PHINode>(LoadedPtr)) + return false; // Scan a few instructions up from the load, to see if it is obviously live at // the entry to its block. BasicBlock::iterator BBIt(LI); bool IsLoadCSE; - if (Value *AvailableVal = - FindAvailableLoadedValue(LI, LoadBB, BBIt, DefMaxInstsToScan, nullptr, &IsLoadCSE)) { + if (Value *AvailableVal = FindAvailableLoadedValue( + LI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) { // If the value of the load is locally available within the block, just use // it. This frequently occurs for reg2mem'd allocas. @@ -997,12 +1136,34 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) { if (!PredsScanned.insert(PredBB).second) continue; - // Scan the predecessor to see if the value is available in the pred. BBIt = PredBB->end(); - Value *PredAvailable = FindAvailableLoadedValue(LI, PredBB, BBIt, - DefMaxInstsToScan, - nullptr, - &IsLoadCSE); + unsigned NumScanedInst = 0; + Value *PredAvailable = nullptr; + // NOTE: We don't CSE load that is volatile or anything stronger than + // unordered, that should have been checked when we entered the function. + assert(LI->isUnordered() && "Attempting to CSE volatile or atomic loads"); + // If this is a load on a phi pointer, phi-translate it and search + // for available load/store to the pointer in predecessors. + Value *Ptr = LoadedPtr->DoPHITranslation(LoadBB, PredBB); + PredAvailable = FindAvailablePtrLoadStore( + Ptr, LI->getType(), LI->isAtomic(), PredBB, BBIt, DefMaxInstsToScan, + AA, &IsLoadCSE, &NumScanedInst); + + // If PredBB has a single predecessor, continue scanning through the + // single precessor. 
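// An illustrative IR sketch of the phi-translated load case handled below
// (not part of the upstream change; names are made up):
//
//   pred1:
//     store i32 1, i32* %p1
//     br label %bb
//   bb:
//     %ptr = phi i32* [ %p1, %pred1 ], [ %p2, %pred2 ]
//     %v = load i32, i32* %ptr
//
// Phi-translating %ptr into %pred1 yields %p1, where the stored value 1 is
// already available (and the scan may continue through a chain of single
// predecessors); only the %pred2 edge still needs a load inserted.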
+ BasicBlock *SinglePredBB = PredBB; + while (!PredAvailable && SinglePredBB && BBIt == SinglePredBB->begin() && + NumScanedInst < DefMaxInstsToScan) { + SinglePredBB = SinglePredBB->getSinglePredecessor(); + if (SinglePredBB) { + BBIt = SinglePredBB->end(); + PredAvailable = FindAvailablePtrLoadStore( + Ptr, LI->getType(), LI->isAtomic(), SinglePredBB, BBIt, + (DefMaxInstsToScan - NumScanedInst), AA, &IsLoadCSE, + &NumScanedInst); + } + } + if (!PredAvailable) { OneUnavailablePred = PredBB; continue; @@ -1062,10 +1223,10 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) { if (UnavailablePred) { assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 && "Can't handle critical edge here!"); - LoadInst *NewVal = - new LoadInst(LoadedPtr, LI->getName() + ".pr", false, - LI->getAlignment(), LI->getOrdering(), LI->getSynchScope(), - UnavailablePred->getTerminator()); + LoadInst *NewVal = new LoadInst( + LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred), + LI->getName() + ".pr", false, LI->getAlignment(), LI->getOrdering(), + LI->getSyncScopeID(), UnavailablePred->getTerminator()); NewVal->setDebugLoc(LI->getDebugLoc()); if (AATags) NewVal->setAAMetadata(AATags); @@ -1210,37 +1371,53 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, BasicBlock *OnlyDest = nullptr; BasicBlock *MultipleDestSentinel = (BasicBlock*)(intptr_t)~0ULL; + Constant *OnlyVal = nullptr; + Constant *MultipleVal = (Constant *)(intptr_t)~0ULL; + unsigned PredWithKnownDest = 0; for (const auto &PredValue : PredValues) { BasicBlock *Pred = PredValue.second; if (!SeenPreds.insert(Pred).second) continue; // Duplicate predecessor entry. - // If the predecessor ends with an indirect goto, we can't change its - // destination. - if (isa<IndirectBrInst>(Pred->getTerminator())) - continue; - Constant *Val = PredValue.first; BasicBlock *DestBB; if (isa<UndefValue>(Val)) DestBB = nullptr; - else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) + else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) { + assert(isa<ConstantInt>(Val) && "Expecting a constant integer"); DestBB = BI->getSuccessor(cast<ConstantInt>(Val)->isZero()); - else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) { - DestBB = SI->findCaseValue(cast<ConstantInt>(Val)).getCaseSuccessor(); + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) { + assert(isa<ConstantInt>(Val) && "Expecting a constant integer"); + DestBB = SI->findCaseValue(cast<ConstantInt>(Val))->getCaseSuccessor(); } else { assert(isa<IndirectBrInst>(BB->getTerminator()) && "Unexpected terminator"); + assert(isa<BlockAddress>(Val) && "Expecting a constant blockaddress"); DestBB = cast<BlockAddress>(Val)->getBasicBlock(); } // If we have exactly one destination, remember it for efficiency below. - if (PredToDestList.empty()) + if (PredToDestList.empty()) { OnlyDest = DestBB; - else if (OnlyDest != DestBB) - OnlyDest = MultipleDestSentinel; + OnlyVal = Val; + } else { + if (OnlyDest != DestBB) + OnlyDest = MultipleDestSentinel; + // It possible we have same destination, but different value, e.g. default + // case in switchinst. + if (Val != OnlyVal) + OnlyVal = MultipleVal; + } + + // We know where this predecessor is going. + ++PredWithKnownDest; + + // If the predecessor ends with an indirect goto, we can't change its + // destination. 
+ if (isa<IndirectBrInst>(Pred->getTerminator())) + continue; PredToDestList.push_back(std::make_pair(Pred, DestBB)); } @@ -1249,6 +1426,45 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, if (PredToDestList.empty()) return false; + // If all the predecessors go to a single known successor, we want to fold, + // not thread. By doing so, we do not need to duplicate the current block and + // also miss potential opportunities in case we dont/cant duplicate. + if (OnlyDest && OnlyDest != MultipleDestSentinel) { + if (PredWithKnownDest == + (size_t)std::distance(pred_begin(BB), pred_end(BB))) { + bool SeenFirstBranchToOnlyDest = false; + for (BasicBlock *SuccBB : successors(BB)) { + if (SuccBB == OnlyDest && !SeenFirstBranchToOnlyDest) + SeenFirstBranchToOnlyDest = true; // Don't modify the first branch. + else + SuccBB->removePredecessor(BB, true); // This is unreachable successor. + } + + // Finally update the terminator. + TerminatorInst *Term = BB->getTerminator(); + BranchInst::Create(OnlyDest, Term); + Term->eraseFromParent(); + + // If the condition is now dead due to the removal of the old terminator, + // erase it. + if (auto *CondInst = dyn_cast<Instruction>(Cond)) { + if (CondInst->use_empty() && !CondInst->mayHaveSideEffects()) + CondInst->eraseFromParent(); + // We can safely replace *some* uses of the CondInst if it has + // exactly one value as returned by LVI. RAUW is incorrect in the + // presence of guards and assumes, that have the `Cond` as the use. This + // is because we use the guards/assume to reason about the `Cond` value + // at the end of block, but RAUW unconditionally replaces all uses + // including the guards/assumes themselves and the uses before the + // guard/assume. + else if (OnlyVal && OnlyVal != MultipleVal && + CondInst->getParent() == BB) + ReplaceFoldableUses(CondInst, OnlyVal); + } + return true; + } + } + // Determine which is the most common successor. If we have many inputs and // this block is a switch, we want to start by threading the batch that goes // to the most popular destination first. If we only know about one @@ -1468,7 +1684,8 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB, return false; } - unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB, BBDupThreshold); + unsigned JumpThreadCost = + getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold); if (JumpThreadCost > BBDupThreshold) { DEBUG(dbgs() << " Not threading BB '" << BB->getName() << "' - Cost is too high: " << JumpThreadCost << "\n"); @@ -1756,7 +1973,8 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred( return false; } - unsigned DuplicationCost = getJumpThreadDuplicationCost(BB, BBDupThreshold); + unsigned DuplicationCost = + getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold); if (DuplicationCost > BBDupThreshold) { DEBUG(dbgs() << " Not duplicating BB '" << BB->getName() << "' - Cost is too high: " << DuplicationCost << "\n"); @@ -1811,11 +2029,12 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred( // If this instruction can be simplified after the operands are updated, // just use the simplified value instead. This frequently happens due to // phi translation. 
- if (Value *IV = - SimplifyInstruction(New, BB->getModule()->getDataLayout())) { + if (Value *IV = SimplifyInstruction( + New, + {BB->getModule()->getDataLayout(), TLI, nullptr, nullptr, New})) { ValueMapping[&*BI] = IV; if (!New->mayHaveSideEffects()) { - delete New; + New->deleteValue(); New = nullptr; } } else { @@ -1888,10 +2107,10 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred( /// TryToUnfoldSelect - Look for blocks of the form /// bb1: /// %a = select -/// br bb +/// br bb2 /// /// bb2: -/// %p = phi [%a, %bb] ... +/// %p = phi [%a, %bb1] ... /// %c = icmp %p /// br i1 %c /// @@ -1963,11 +2182,19 @@ bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) { return false; } -/// TryToUnfoldSelectInCurrBB - Look for PHI/Select in the same BB of the form +/// TryToUnfoldSelectInCurrBB - Look for PHI/Select or PHI/CMP/Select in the +/// same BB in the form /// bb: /// %p = phi [false, %bb1], [true, %bb2], [false, %bb3], [true, %bb4], ... -/// %s = select p, trueval, falseval +/// %s = select %p, trueval, falseval /// +/// or +/// +/// bb: +/// %p = phi [0, %bb1], [1, %bb2], [0, %bb3], [1, %bb4], ... +/// %c = cmp %p, 0 +/// %s = select %c, trueval, falseval +// /// And expand the select into a branch structure. This later enables /// jump-threading over bb in this pass. /// @@ -1981,43 +2208,180 @@ bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) { if (LoopHeaders.count(BB)) return false; - // Look for a Phi/Select pair in the same basic block. The Phi feeds the - // condition of the Select and at least one of the incoming values is a - // constant. for (BasicBlock::iterator BI = BB->begin(); PHINode *PN = dyn_cast<PHINode>(BI); ++BI) { - unsigned NumPHIValues = PN->getNumIncomingValues(); - if (NumPHIValues == 0 || !PN->hasOneUse()) + // Look for a Phi having at least one constant incoming value. + if (llvm::all_of(PN->incoming_values(), + [](Value *V) { return !isa<ConstantInt>(V); })) continue; - SelectInst *SI = dyn_cast<SelectInst>(PN->user_back()); - if (!SI || SI->getParent() != BB) - continue; + auto isUnfoldCandidate = [BB](SelectInst *SI, Value *V) { + // Check if SI is in BB and use V as condition. + if (SI->getParent() != BB) + return false; + Value *Cond = SI->getCondition(); + return (Cond && Cond == V && Cond->getType()->isIntegerTy(1)); + }; - Value *Cond = SI->getCondition(); - if (!Cond || Cond != PN || !Cond->getType()->isIntegerTy(1)) + SelectInst *SI = nullptr; + for (Use &U : PN->uses()) { + if (ICmpInst *Cmp = dyn_cast<ICmpInst>(U.getUser())) { + // Look for a ICmp in BB that compares PN with a constant and is the + // condition of a Select. + if (Cmp->getParent() == BB && Cmp->hasOneUse() && + isa<ConstantInt>(Cmp->getOperand(1 - U.getOperandNo()))) + if (SelectInst *SelectI = dyn_cast<SelectInst>(Cmp->user_back())) + if (isUnfoldCandidate(SelectI, Cmp->use_begin()->get())) { + SI = SelectI; + break; + } + } else if (SelectInst *SelectI = dyn_cast<SelectInst>(U.getUser())) { + // Look for a Select in BB that uses PN as condtion. + if (isUnfoldCandidate(SelectI, U.get())) { + SI = SelectI; + break; + } + } + } + + if (!SI) continue; + // Expand the select. 
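// An illustrative before/after IR sketch of the select unfolding performed
// below (not part of the upstream change; block and value names are made up):
//
//   bb:                                            ; before
//     %p = phi i1 [ false, %bb1 ], [ true, %bb2 ]
//     %s = select i1 %p, i32 %tv, i32 %fv
//
//   bb:                                            ; after
//     %p = phi i1 [ false, %bb1 ], [ true, %bb2 ]
//     br i1 %p, label %bb.then, label %bb.tail
//   bb.then:
//     br label %bb.tail
//   bb.tail:
//     %s = phi i32 [ %tv, %bb.then ], [ %fv, %bb ]
//
// Once the select is explicit control flow, the constant incoming values of
// %p let the rest of this pass thread %bb1 and %bb2 directly to the arm they
// select.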
+ TerminatorInst *Term = + SplitBlockAndInsertIfThen(SI->getCondition(), SI, false); + PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI); + NewPN->addIncoming(SI->getTrueValue(), Term->getParent()); + NewPN->addIncoming(SI->getFalseValue(), BB); + SI->replaceAllUsesWith(NewPN); + SI->eraseFromParent(); + return true; + } + return false; +} - bool HasConst = false; - for (unsigned i = 0; i != NumPHIValues; ++i) { - if (PN->getIncomingBlock(i) == BB) - return false; - if (isa<ConstantInt>(PN->getIncomingValue(i))) - HasConst = true; - } +/// Try to propagate a guard from the current BB into one of its predecessors +/// in case if another branch of execution implies that the condition of this +/// guard is always true. Currently we only process the simplest case that +/// looks like: +/// +/// Start: +/// %cond = ... +/// br i1 %cond, label %T1, label %F1 +/// T1: +/// br label %Merge +/// F1: +/// br label %Merge +/// Merge: +/// %condGuard = ... +/// call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ] +/// +/// And cond either implies condGuard or !condGuard. In this case all the +/// instructions before the guard can be duplicated in both branches, and the +/// guard is then threaded to one of them. +bool JumpThreadingPass::ProcessGuards(BasicBlock *BB) { + using namespace PatternMatch; + // We only want to deal with two predecessors. + BasicBlock *Pred1, *Pred2; + auto PI = pred_begin(BB), PE = pred_end(BB); + if (PI == PE) + return false; + Pred1 = *PI++; + if (PI == PE) + return false; + Pred2 = *PI++; + if (PI != PE) + return false; + if (Pred1 == Pred2) + return false; - if (HasConst) { - // Expand the select. - TerminatorInst *Term = - SplitBlockAndInsertIfThen(SI->getCondition(), SI, false); - PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI); - NewPN->addIncoming(SI->getTrueValue(), Term->getParent()); - NewPN->addIncoming(SI->getFalseValue(), BB); - SI->replaceAllUsesWith(NewPN); - SI->eraseFromParent(); - return true; + // Try to thread one of the guards of the block. + // TODO: Look up deeper than to immediate predecessor? + auto *Parent = Pred1->getSinglePredecessor(); + if (!Parent || Parent != Pred2->getSinglePredecessor()) + return false; + + if (auto *BI = dyn_cast<BranchInst>(Parent->getTerminator())) + for (auto &I : *BB) + if (match(&I, m_Intrinsic<Intrinsic::experimental_guard>())) + if (ThreadGuard(BB, cast<IntrinsicInst>(&I), BI)) + return true; + + return false; +} + +/// Try to propagate the guard from BB which is the lower block of a diamond +/// to one of its branches, in case if diamond's condition implies guard's +/// condition. +bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard, + BranchInst *BI) { + assert(BI->getNumSuccessors() == 2 && "Wrong number of successors?"); + assert(BI->isConditional() && "Unconditional branch has 2 successors?"); + Value *GuardCond = Guard->getArgOperand(0); + Value *BranchCond = BI->getCondition(); + BasicBlock *TrueDest = BI->getSuccessor(0); + BasicBlock *FalseDest = BI->getSuccessor(1); + + auto &DL = BB->getModule()->getDataLayout(); + bool TrueDestIsSafe = false; + bool FalseDestIsSafe = false; + + // True dest is safe if BranchCond => GuardCond. + auto Impl = isImpliedCondition(BranchCond, GuardCond, DL); + if (Impl && *Impl) + TrueDestIsSafe = true; + else { + // False dest is safe if !BranchCond => GuardCond. 
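// An illustrative instance of the guard pattern handled here (not part of the
// upstream change; the conditions are made up):
//
//   Start:
//     %cond = icmp ult i32 %x, 10
//     br i1 %cond, label %T1, label %F1
//   ...
//   Merge:
//     %g = icmp ult i32 %x, 20
//     call void (i1, ...) @llvm.experimental.guard(i1 %g) [ "deopt"() ]
//
// Since %cond (x < 10) implies %g (x < 20), the branch reached through %T1 is
// safe: the instructions before the guard are duplicated into both incoming
// branches and the guard only has to remain on the path coming from %F1.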
+ Impl = + isImpliedCondition(BranchCond, GuardCond, DL, /* InvertAPred */ true); + if (Impl && *Impl) + FalseDestIsSafe = true; + } + + if (!TrueDestIsSafe && !FalseDestIsSafe) + return false; + + BasicBlock *UnguardedBlock = TrueDestIsSafe ? TrueDest : FalseDest; + BasicBlock *GuardedBlock = FalseDestIsSafe ? TrueDest : FalseDest; + + ValueToValueMapTy UnguardedMapping, GuardedMapping; + Instruction *AfterGuard = Guard->getNextNode(); + unsigned Cost = getJumpThreadDuplicationCost(BB, AfterGuard, BBDupThreshold); + if (Cost > BBDupThreshold) + return false; + // Duplicate all instructions before the guard and the guard itself to the + // branch where implication is not proved. + GuardedBlock = DuplicateInstructionsInSplitBetween( + BB, GuardedBlock, AfterGuard, GuardedMapping); + assert(GuardedBlock && "Could not create the guarded block?"); + // Duplicate all instructions before the guard in the unguarded branch. + // Since we have successfully duplicated the guarded block and this block + // has fewer instructions, we expect it to succeed. + UnguardedBlock = DuplicateInstructionsInSplitBetween(BB, UnguardedBlock, + Guard, UnguardedMapping); + assert(UnguardedBlock && "Could not create the unguarded block?"); + DEBUG(dbgs() << "Moved guard " << *Guard << " to block " + << GuardedBlock->getName() << "\n"); + + // Some instructions before the guard may still have uses. For them, we need + // to create Phi nodes merging their copies in both guarded and unguarded + // branches. Those instructions that have no uses can be just removed. + SmallVector<Instruction *, 4> ToRemove; + for (auto BI = BB->begin(); &*BI != AfterGuard; ++BI) + if (!isa<PHINode>(&*BI)) + ToRemove.push_back(&*BI); + + Instruction *InsertionPoint = &*BB->getFirstInsertionPt(); + assert(InsertionPoint && "Empty block?"); + // Substitute with Phis & remove. + for (auto *Inst : reverse(ToRemove)) { + if (!Inst->use_empty()) { + PHINode *NewPN = PHINode::Create(Inst->getType(), 2); + NewPN->addIncoming(UnguardedMapping[Inst], UnguardedBlock); + NewPN->addIncoming(GuardedMapping[Inst], GuardedBlock); + NewPN->insertBefore(InsertionPoint); + Inst->replaceAllUsesWith(NewPN); } + Inst->eraseFromParent(); } - - return false; + return true; } diff --git a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp index f51d11c..37b9c4b 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LICM.cpp @@ -77,10 +77,16 @@ STATISTIC(NumMovedLoads, "Number of load insts hoisted or sunk"); STATISTIC(NumMovedCalls, "Number of call insts hoisted or sunk"); STATISTIC(NumPromoted, "Number of memory locations promoted to registers"); +/// Memory promotion is enabled by default. 
static cl::opt<bool> - DisablePromotion("disable-licm-promotion", cl::Hidden, + DisablePromotion("disable-licm-promotion", cl::Hidden, cl::init(false), cl::desc("Disable memory promotion in LICM pass")); +static cl::opt<uint32_t> MaxNumUsesTraversed( + "licm-max-num-uses-traversed", cl::Hidden, cl::init(8), + cl::desc("Max num uses visited for identifying load " + "invariance in loop using invariant start (default = 8)")); + static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI); static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo); @@ -201,9 +207,9 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM, if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.SE, ORE, true)) return PreservedAnalyses::all(); - // FIXME: There is no setPreservesCFG in the new PM. When that becomes - // available, it should be used here. - return getLoopPassPreservedAnalyses(); + auto PA = getLoopPassPreservedAnalyses(); + PA.preserveSet<CFGAnalyses>(); + return PA; } char LegacyLICMPass::ID = 0; @@ -425,6 +431,29 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI, continue; } + // Attempt to remove floating point division out of the loop by converting + // it to a reciprocal multiplication. + if (I.getOpcode() == Instruction::FDiv && + CurLoop->isLoopInvariant(I.getOperand(1)) && + I.hasAllowReciprocal()) { + auto Divisor = I.getOperand(1); + auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0); + auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor); + ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags()); + ReciprocalDivisor->insertBefore(&I); + + auto Product = BinaryOperator::CreateFMul(I.getOperand(0), + ReciprocalDivisor); + Product->setFastMathFlags(I.getFastMathFlags()); + Product->insertAfter(&I); + I.replaceAllUsesWith(Product); + I.eraseFromParent(); + + hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE); + Changed = true; + continue; + } + // Try hoisting the instruction out to the preheader. We can only do this // if all of the operands of the instruction are loop invariant and if it // is safe to hoist the instruction. @@ -461,7 +490,10 @@ void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) { SafetyInfo->MayThrow = SafetyInfo->HeaderMayThrow; // Iterate over loop instructions and compute safety info. - for (Loop::block_iterator BB = CurLoop->block_begin(), + // Skip header as it has been computed and stored in HeaderMayThrow. + // The first block in loopinfo.Blocks is guaranteed to be the header. + assert(Header == *CurLoop->getBlocks().begin() && "First block must be header"); + for (Loop::block_iterator BB = std::next(CurLoop->block_begin()), BBE = CurLoop->block_end(); (BB != BBE) && !SafetyInfo->MayThrow; ++BB) for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); @@ -477,6 +509,59 @@ void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) { SafetyInfo->BlockColors = colorEHFunclets(*Fn); } +// Return true if LI is invariant within scope of the loop. LI is invariant if +// CurLoop is dominated by an invariant.start representing the same memory location +// and size as the memory location LI loads from, and also the invariant.start +// has no uses. 
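// An illustrative IR sketch of the invariant.start pattern recognized below
// (not part of the upstream change; the size and names are made up):
//
//   entry:
//     %inv = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %p)
//     %q = bitcast i8* %p to i32*
//     br label %loop
//   loop:
//     %v = load i32, i32* %q
//     ...
//
// The load's pointer strips back to the i8* operand of the invariant.start,
// the 4-byte invariant region covers the 32-bit load, the invariant.start
// properly dominates the loop header, and it has no uses (no matching
// invariant.end), so the load can be treated as invariant and hoisted.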
+static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT, + Loop *CurLoop) { + Value *Addr = LI->getOperand(0); + const DataLayout &DL = LI->getModule()->getDataLayout(); + const uint32_t LocSizeInBits = DL.getTypeSizeInBits( + cast<PointerType>(Addr->getType())->getElementType()); + + // if the type is i8 addrspace(x)*, we know this is the type of + // llvm.invariant.start operand + auto *PtrInt8Ty = PointerType::get(Type::getInt8Ty(LI->getContext()), + LI->getPointerAddressSpace()); + unsigned BitcastsVisited = 0; + // Look through bitcasts until we reach the i8* type (this is invariant.start + // operand type). + while (Addr->getType() != PtrInt8Ty) { + auto *BC = dyn_cast<BitCastInst>(Addr); + // Avoid traversing high number of bitcast uses. + if (++BitcastsVisited > MaxNumUsesTraversed || !BC) + return false; + Addr = BC->getOperand(0); + } + + unsigned UsesVisited = 0; + // Traverse all uses of the load operand value, to see if invariant.start is + // one of the uses, and whether it dominates the load instruction. + for (auto *U : Addr->users()) { + // Avoid traversing for Load operand with high number of users. + if (++UsesVisited > MaxNumUsesTraversed) + return false; + IntrinsicInst *II = dyn_cast<IntrinsicInst>(U); + // If there are escaping uses of invariant.start instruction, the load maybe + // non-invariant. + if (!II || II->getIntrinsicID() != Intrinsic::invariant_start || + !II->use_empty()) + continue; + unsigned InvariantSizeInBits = + cast<ConstantInt>(II->getArgOperand(0))->getSExtValue() * 8; + // Confirm the invariant.start location size contains the load operand size + // in bits. Also, the invariant.start should dominate the load, and we + // should not hoist the load out of a loop that contains this dominating + // invariant.start. + if (LocSizeInBits <= InvariantSizeInBits && + DT->properlyDominates(II->getParent(), CurLoop->getHeader())) + return true; + } + + return false; +} + bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, Loop *CurLoop, AliasSetTracker *CurAST, LoopSafetyInfo *SafetyInfo, @@ -493,6 +578,10 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, if (LI->getMetadata(LLVMContext::MD_invariant_load)) return true; + // This checks for an invariant.start dominating the load. + if (isLoadInvariantInLoop(LI, DT, CurLoop)) + return true; + // Don't hoist loads which have may-aliased stores in loop. uint64_t Size = 0; if (LI->getType()->isSized()) @@ -782,7 +871,7 @@ static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop, DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I << "\n"); ORE->emit(OptimizationRemark(DEBUG_TYPE, "Hoisted", &I) - << "hosting " << ore::NV("Inst", &I)); + << "hoisting " << ore::NV("Inst", &I)); // Metadata can be dependent on conditions we are hoisting above. 
// Conservatively strip all metadata on the instruction unless we were @@ -852,6 +941,7 @@ class LoopPromoter : public LoadAndStorePromoter { LoopInfo &LI; DebugLoc DL; int Alignment; + bool UnorderedAtomic; AAMDNodes AATags; Value *maybeInsertLCSSAPHI(Value *V, BasicBlock *BB) const { @@ -875,10 +965,11 @@ public: SmallVectorImpl<BasicBlock *> &LEB, SmallVectorImpl<Instruction *> &LIP, PredIteratorCache &PIC, AliasSetTracker &ast, LoopInfo &li, DebugLoc dl, int alignment, - const AAMDNodes &AATags) + bool UnorderedAtomic, const AAMDNodes &AATags) : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA), LoopExitBlocks(LEB), LoopInsertPts(LIP), PredCache(PIC), AST(ast), - LI(li), DL(std::move(dl)), Alignment(alignment), AATags(AATags) {} + LI(li), DL(std::move(dl)), Alignment(alignment), + UnorderedAtomic(UnorderedAtomic),AATags(AATags) {} bool isInstInList(Instruction *I, const SmallVectorImpl<Instruction *> &) const override { @@ -902,6 +993,8 @@ public: Value *Ptr = maybeInsertLCSSAPHI(SomePtr, ExitBlock); Instruction *InsertPos = LoopInsertPts[i]; StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos); + if (UnorderedAtomic) + NewSI->setOrdering(AtomicOrdering::Unordered); NewSI->setAlignment(Alignment); NewSI->setDebugLoc(DL); if (AATags) @@ -992,18 +1085,41 @@ bool llvm::promoteLoopAccessesToScalars( // We start with an alignment of one and try to find instructions that allow // us to prove better alignment. unsigned Alignment = 1; + // Keep track of which types of access we see + bool SawUnorderedAtomic = false; + bool SawNotAtomic = false; AAMDNodes AATags; const DataLayout &MDL = Preheader->getModule()->getDataLayout(); + // Do we know this object does not escape ? + bool IsKnownNonEscapingObject = false; if (SafetyInfo->MayThrow) { // If a loop can throw, we have to insert a store along each unwind edge. // That said, we can't actually make the unwind edge explicit. Therefore, // we have to prove that the store is dead along the unwind edge. // - // Currently, this code just special-cases alloca instructions. - if (!isa<AllocaInst>(GetUnderlyingObject(SomePtr, MDL))) - return false; + // If the underlying object is not an alloca, nor a pointer that does not + // escape, then we can not effectively prove that the store is dead along + // the unwind edge. i.e. the caller of this function could have ways to + // access the pointed object. + Value *Object = GetUnderlyingObject(SomePtr, MDL); + // If this is a base pointer we do not understand, simply bail. + // We only handle alloca and return value from alloc-like fn right now. + if (!isa<AllocaInst>(Object)) { + if (!isAllocLikeFn(Object, TLI)) + return false; + // If this is an alloc like fn. There are more constraints we need to verify. + // More specifically, we must make sure that the pointer can not escape. + // + // NOTE: PointerMayBeCaptured is not enough as the pointer may have escaped + // even though its not captured by the enclosing function. Standard allocation + // functions like malloc, calloc, and operator new return values which can + // be assumed not to have previously escaped. + if (PointerMayBeCaptured(Object, true, true)) + return false; + IsKnownNonEscapingObject = true; + } } // Check that all of the pointers in the alias set have the same type. We @@ -1029,8 +1145,11 @@ bool llvm::promoteLoopAccessesToScalars( // it. 
if (LoadInst *Load = dyn_cast<LoadInst>(UI)) { assert(!Load->isVolatile() && "AST broken"); - if (!Load->isSimple()) + if (!Load->isUnordered()) return false; + + SawUnorderedAtomic |= Load->isAtomic(); + SawNotAtomic |= !Load->isAtomic(); if (!DereferenceableInPH) DereferenceableInPH = isSafeToExecuteUnconditionally( @@ -1041,9 +1160,12 @@ bool llvm::promoteLoopAccessesToScalars( if (UI->getOperand(1) != ASIV) continue; assert(!Store->isVolatile() && "AST broken"); - if (!Store->isSimple()) + if (!Store->isUnordered()) return false; + SawUnorderedAtomic |= Store->isAtomic(); + SawNotAtomic |= !Store->isAtomic(); + // If the store is guaranteed to execute, both properties are satisfied. // We may want to check if a store is guaranteed to execute even if we // already know that promotion is safe, since it may have higher @@ -1096,6 +1218,12 @@ bool llvm::promoteLoopAccessesToScalars( } } + // If we found both an unordered atomic instruction and a non-atomic memory + // access, bail. We can't blindly promote non-atomic to atomic since we + // might not be able to lower the result. We can't downgrade since that + // would violate memory model. Also, align 0 is an error for atomics. + if (SawUnorderedAtomic && SawNotAtomic) + return false; // If we couldn't prove we can hoist the load, bail. if (!DereferenceableInPH) @@ -1106,10 +1234,15 @@ bool llvm::promoteLoopAccessesToScalars( // stores along paths which originally didn't have them without violating the // memory model. if (!SafeToInsertStore) { - Value *Object = GetUnderlyingObject(SomePtr, MDL); - SafeToInsertStore = - (isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) && + // If this is a known non-escaping object, it is safe to insert the stores. + if (IsKnownNonEscapingObject) + SafeToInsertStore = true; + else { + Value *Object = GetUnderlyingObject(SomePtr, MDL); + SafeToInsertStore = + (isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) && !PointerMayBeCaptured(Object, true, true); + } } // If we've still failed to prove we can sink the store, give up. @@ -1134,12 +1267,15 @@ bool llvm::promoteLoopAccessesToScalars( SmallVector<PHINode *, 16> NewPHIs; SSAUpdater SSA(&NewPHIs); LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks, - InsertPts, PIC, *CurAST, *LI, DL, Alignment, AATags); + InsertPts, PIC, *CurAST, *LI, DL, Alignment, + SawUnorderedAtomic, AATags); // Set up the preheader to have a definition of the value. It is the live-out // value from the preheader that uses in the loop will use. LoadInst *PreheaderLoad = new LoadInst( SomePtr, SomePtr->getName() + ".promoted", Preheader->getTerminator()); + if (SawUnorderedAtomic) + PreheaderLoad->setOrdering(AtomicOrdering::Unordered); PreheaderLoad->setAlignment(Alignment); PreheaderLoad->setDebugLoc(DL); if (AATags) diff --git a/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp b/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp deleted file mode 100644 index 389f1c5..0000000 --- a/contrib/llvm/lib/Transforms/Scalar/LoadCombine.cpp +++ /dev/null @@ -1,280 +0,0 @@ -//===- LoadCombine.cpp - Combine Adjacent Loads ---------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -/// \file -/// This transformation combines adjacent loads. 
-/// -//===----------------------------------------------------------------------===// - -#include "llvm/Transforms/Scalar.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/GlobalsModRef.h" -#include "llvm/Analysis/TargetFolder.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/Pass.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -#define DEBUG_TYPE "load-combine" - -STATISTIC(NumLoadsAnalyzed, "Number of loads analyzed for combining"); -STATISTIC(NumLoadsCombined, "Number of loads combined"); - -#define LDCOMBINE_NAME "Combine Adjacent Loads" - -namespace { -struct PointerOffsetPair { - Value *Pointer; - APInt Offset; -}; - -struct LoadPOPPair { - LoadInst *Load; - PointerOffsetPair POP; - /// \brief The new load needs to be created before the first load in IR order. - unsigned InsertOrder; -}; - -class LoadCombine : public BasicBlockPass { - LLVMContext *C; - AliasAnalysis *AA; - -public: - LoadCombine() : BasicBlockPass(ID), C(nullptr), AA(nullptr) { - initializeLoadCombinePass(*PassRegistry::getPassRegistry()); - } - - using llvm::Pass::doInitialization; - bool doInitialization(Function &) override; - bool runOnBasicBlock(BasicBlock &BB) override; - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired<AAResultsWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - } - - StringRef getPassName() const override { return LDCOMBINE_NAME; } - static char ID; - - typedef IRBuilder<TargetFolder> BuilderTy; - -private: - BuilderTy *Builder; - - PointerOffsetPair getPointerOffsetPair(LoadInst &); - bool combineLoads(DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> &); - bool aggregateLoads(SmallVectorImpl<LoadPOPPair> &); - bool combineLoads(SmallVectorImpl<LoadPOPPair> &); -}; -} - -bool LoadCombine::doInitialization(Function &F) { - DEBUG(dbgs() << "LoadCombine function: " << F.getName() << "\n"); - C = &F.getContext(); - return true; -} - -PointerOffsetPair LoadCombine::getPointerOffsetPair(LoadInst &LI) { - auto &DL = LI.getModule()->getDataLayout(); - - PointerOffsetPair POP; - POP.Pointer = LI.getPointerOperand(); - unsigned BitWidth = DL.getPointerSizeInBits(LI.getPointerAddressSpace()); - POP.Offset = APInt(BitWidth, 0); - - while (isa<BitCastInst>(POP.Pointer) || isa<GetElementPtrInst>(POP.Pointer)) { - if (auto *GEP = dyn_cast<GetElementPtrInst>(POP.Pointer)) { - APInt LastOffset = POP.Offset; - if (!GEP->accumulateConstantOffset(DL, POP.Offset)) { - // Can't handle GEPs with variable indices. 
- POP.Offset = LastOffset; - return POP; - } - POP.Pointer = GEP->getPointerOperand(); - } else if (auto *BC = dyn_cast<BitCastInst>(POP.Pointer)) { - POP.Pointer = BC->getOperand(0); - } - } - return POP; -} - -bool LoadCombine::combineLoads( - DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> &LoadMap) { - bool Combined = false; - for (auto &Loads : LoadMap) { - if (Loads.second.size() < 2) - continue; - std::sort(Loads.second.begin(), Loads.second.end(), - [](const LoadPOPPair &A, const LoadPOPPair &B) { - return A.POP.Offset.slt(B.POP.Offset); - }); - if (aggregateLoads(Loads.second)) - Combined = true; - } - return Combined; -} - -/// \brief Try to aggregate loads from a sorted list of loads to be combined. -/// -/// It is guaranteed that no writes occur between any of the loads. All loads -/// have the same base pointer. There are at least two loads. -bool LoadCombine::aggregateLoads(SmallVectorImpl<LoadPOPPair> &Loads) { - assert(Loads.size() >= 2 && "Insufficient loads!"); - LoadInst *BaseLoad = nullptr; - SmallVector<LoadPOPPair, 8> AggregateLoads; - bool Combined = false; - bool ValidPrevOffset = false; - APInt PrevOffset; - uint64_t PrevSize = 0; - for (auto &L : Loads) { - if (ValidPrevOffset == false) { - BaseLoad = L.Load; - PrevOffset = L.POP.Offset; - PrevSize = L.Load->getModule()->getDataLayout().getTypeStoreSize( - L.Load->getType()); - AggregateLoads.push_back(L); - ValidPrevOffset = true; - continue; - } - if (L.Load->getAlignment() > BaseLoad->getAlignment()) - continue; - APInt PrevEnd = PrevOffset + PrevSize; - if (L.POP.Offset.sgt(PrevEnd)) { - // No other load will be combinable - if (combineLoads(AggregateLoads)) - Combined = true; - AggregateLoads.clear(); - ValidPrevOffset = false; - continue; - } - if (L.POP.Offset != PrevEnd) - // This load is offset less than the size of the last load. - // FIXME: We may want to handle this case. - continue; - PrevOffset = L.POP.Offset; - PrevSize = L.Load->getModule()->getDataLayout().getTypeStoreSize( - L.Load->getType()); - AggregateLoads.push_back(L); - } - if (combineLoads(AggregateLoads)) - Combined = true; - return Combined; -} - -/// \brief Given a list of combinable load. Combine the maximum number of them. -bool LoadCombine::combineLoads(SmallVectorImpl<LoadPOPPair> &Loads) { - // Remove loads from the end while the size is not a power of 2. - unsigned TotalSize = 0; - for (const auto &L : Loads) - TotalSize += L.Load->getType()->getPrimitiveSizeInBits(); - while (TotalSize != 0 && !isPowerOf2_32(TotalSize)) - TotalSize -= Loads.pop_back_val().Load->getType()->getPrimitiveSizeInBits(); - if (Loads.size() < 2) - return false; - - DEBUG({ - dbgs() << "***** Combining Loads ******\n"; - for (const auto &L : Loads) { - dbgs() << L.POP.Offset << ": " << *L.Load << "\n"; - } - }); - - // Find first load. This is where we put the new load. 
- LoadPOPPair FirstLP; - FirstLP.InsertOrder = -1u; - for (const auto &L : Loads) - if (L.InsertOrder < FirstLP.InsertOrder) - FirstLP = L; - - unsigned AddressSpace = - FirstLP.POP.Pointer->getType()->getPointerAddressSpace(); - - Builder->SetInsertPoint(FirstLP.Load); - Value *Ptr = Builder->CreateConstGEP1_64( - Builder->CreatePointerCast(Loads[0].POP.Pointer, - Builder->getInt8PtrTy(AddressSpace)), - Loads[0].POP.Offset.getSExtValue()); - LoadInst *NewLoad = new LoadInst( - Builder->CreatePointerCast( - Ptr, PointerType::get(IntegerType::get(Ptr->getContext(), TotalSize), - Ptr->getType()->getPointerAddressSpace())), - Twine(Loads[0].Load->getName()) + ".combined", false, - Loads[0].Load->getAlignment(), FirstLP.Load); - - for (const auto &L : Loads) { - Builder->SetInsertPoint(L.Load); - Value *V = Builder->CreateExtractInteger( - L.Load->getModule()->getDataLayout(), NewLoad, - cast<IntegerType>(L.Load->getType()), - (L.POP.Offset - Loads[0].POP.Offset).getZExtValue(), "combine.extract"); - L.Load->replaceAllUsesWith(V); - } - - NumLoadsCombined = NumLoadsCombined + Loads.size(); - return true; -} - -bool LoadCombine::runOnBasicBlock(BasicBlock &BB) { - if (skipBasicBlock(BB)) - return false; - - AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); - - IRBuilder<TargetFolder> TheBuilder( - BB.getContext(), TargetFolder(BB.getModule()->getDataLayout())); - Builder = &TheBuilder; - - DenseMap<const Value *, SmallVector<LoadPOPPair, 8>> LoadMap; - AliasSetTracker AST(*AA); - - bool Combined = false; - unsigned Index = 0; - for (auto &I : BB) { - if (I.mayThrow() || (I.mayWriteToMemory() && AST.containsUnknown(&I))) { - if (combineLoads(LoadMap)) - Combined = true; - LoadMap.clear(); - AST.clear(); - continue; - } - LoadInst *LI = dyn_cast<LoadInst>(&I); - if (!LI) - continue; - ++NumLoadsAnalyzed; - if (!LI->isSimple() || !LI->getType()->isIntegerTy()) - continue; - auto POP = getPointerOffsetPair(*LI); - if (!POP.Pointer) - continue; - LoadMap[POP.Pointer].push_back({LI, std::move(POP), Index++}); - AST.add(LI); - } - if (combineLoads(LoadMap)) - Combined = true; - return Combined; -} - -char LoadCombine::ID = 0; - -BasicBlockPass *llvm::createLoadCombinePass() { - return new LoadCombine(); -} - -INITIALIZE_PASS_BEGIN(LoadCombine, "load-combine", LDCOMBINE_NAME, false, false) -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(LoadCombine, "load-combine", LDCOMBINE_NAME, false, false) diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp index cca75a3..ac4dd44 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -20,6 +20,7 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/LoopPassManager.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -29,32 +30,45 @@ using namespace llvm; STATISTIC(NumDeleted, "Number of loops deleted"); -/// isLoopDead - Determined if a loop is dead. This assumes that we've already -/// checked for unique exit and exiting blocks, and that the code is in LCSSA -/// form. 
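The pass being deleted above combined adjacent narrow loads from the same base pointer into one wide load plus integer extracts. A standalone C++ sketch of the idea (hypothetical helper names; the memcpy-based wide load assumes a little-endian layout, a detail the IR-level pass did not need to care about):

#include <cstdint>
#include <cstring>
#include <cstdio>

// Two separate 32-bit loads from adjacent offsets.
uint64_t sum_two_separate(const uint32_t *p) {
  return (uint64_t)p[0] + p[1];
}

// The combined form: one 64-bit load, then "combine.extract"-style extraction
// of the two halves.
uint64_t sum_two_combined(const uint32_t *p) {
  uint64_t wide;
  std::memcpy(&wide, p, sizeof wide);      // single wide load
  uint32_t lo = (uint32_t)wide;            // low half  (little-endian)
  uint32_t hi = (uint32_t)(wide >> 32);    // high half (little-endian)
  return (uint64_t)lo + hi;
}

int main() {
  uint32_t a[2] = {1, 2};
  printf("%llu %llu\n", (unsigned long long)sum_two_separate(a),
         (unsigned long long)sum_two_combined(a));
}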
-bool LoopDeletionPass::isLoopDead(Loop *L, ScalarEvolution &SE, - SmallVectorImpl<BasicBlock *> &exitingBlocks, - SmallVectorImpl<BasicBlock *> &exitBlocks, - bool &Changed, BasicBlock *Preheader) { - BasicBlock *exitBlock = exitBlocks[0]; - +/// This function deletes dead loops. The caller of this function needs to +/// guarantee that the loop is infact dead. Here we handle two kinds of dead +/// loop. The first kind (\p isLoopDead) is where only invariant values from +/// within the loop are used outside of it. The second kind (\p +/// isLoopNeverExecuted) is where the loop is provably never executed. We can +/// always remove never executed loops since they will not cause any difference +/// to program behaviour. +/// +/// This also updates the relevant analysis information in \p DT, \p SE, and \p +/// LI. It also updates the loop PM if an updater struct is provided. +// TODO: This function will be used by loop-simplifyCFG as well. So, move this +// to LoopUtils.cpp +static void deleteDeadLoop(Loop *L, DominatorTree &DT, ScalarEvolution &SE, + LoopInfo &LI, LPMUpdater *Updater = nullptr); +/// Determines if a loop is dead. +/// +/// This assumes that we've already checked for unique exit and exiting blocks, +/// and that the code is in LCSSA form. +static bool isLoopDead(Loop *L, ScalarEvolution &SE, + SmallVectorImpl<BasicBlock *> &ExitingBlocks, + BasicBlock *ExitBlock, bool &Changed, + BasicBlock *Preheader) { // Make sure that all PHI entries coming from the loop are loop invariant. // Because the code is in LCSSA form, any values used outside of the loop // must pass through a PHI in the exit block, meaning that this check is // sufficient to guarantee that no loop-variant values are used outside // of the loop. - BasicBlock::iterator BI = exitBlock->begin(); + BasicBlock::iterator BI = ExitBlock->begin(); bool AllEntriesInvariant = true; bool AllOutgoingValuesSame = true; while (PHINode *P = dyn_cast<PHINode>(BI)) { - Value *incoming = P->getIncomingValueForBlock(exitingBlocks[0]); + Value *incoming = P->getIncomingValueForBlock(ExitingBlocks[0]); // Make sure all exiting blocks produce the same incoming value for the exit // block. If there are different incoming values for different exiting // blocks, then it is impossible to statically determine which value should // be used. AllOutgoingValuesSame = - all_of(makeArrayRef(exitingBlocks).slice(1), [&](BasicBlock *BB) { + all_of(makeArrayRef(ExitingBlocks).slice(1), [&](BasicBlock *BB) { return incoming == P->getIncomingValueForBlock(BB); }); @@ -78,95 +92,187 @@ bool LoopDeletionPass::isLoopDead(Loop *L, ScalarEvolution &SE, // Make sure that no instructions in the block have potential side-effects. // This includes instructions that could write to memory, and loads that are - // marked volatile. This could be made more aggressive by using aliasing - // information to identify readonly and readnone calls. - for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end(); - LI != LE; ++LI) { - for (Instruction &I : **LI) { - if (I.mayHaveSideEffects()) - return false; - } - } - + // marked volatile. + for (auto &I : L->blocks()) + if (any_of(*I, [](Instruction &I) { return I.mayHaveSideEffects(); })) + return false; return true; } -/// Remove dead loops, by which we mean loops that do not impact the observable -/// behavior of the program other than finite running time. Note we do ensure -/// that this never remove a loop that might be infinite, as doing so could -/// change the halting/non-halting nature of a program. 
NOTE: This entire -/// process relies pretty heavily on LoopSimplify and LCSSA in order to make -/// various safety checks work. -bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE, - LoopInfo &loopInfo) { - assert(L->isLCSSAForm(DT) && "Expected LCSSA!"); +/// This function returns true if there is no viable path from the +/// entry block to the header of \p L. Right now, it only does +/// a local search to save compile time. +static bool isLoopNeverExecuted(Loop *L) { + using namespace PatternMatch; - // We can only remove the loop if there is a preheader that we can - // branch from after removing it. - BasicBlock *preheader = L->getLoopPreheader(); - if (!preheader) - return false; + auto *Preheader = L->getLoopPreheader(); + // TODO: We can relax this constraint, since we just need a loop + // predecessor. + assert(Preheader && "Needs preheader!"); - // If LoopSimplify form is not available, stay out of trouble. - if (!L->hasDedicatedExits()) + if (Preheader == &Preheader->getParent()->getEntryBlock()) return false; + // All predecessors of the preheader should have a constant conditional + // branch, with the loop's preheader as not-taken. + for (auto *Pred: predecessors(Preheader)) { + BasicBlock *Taken, *NotTaken; + ConstantInt *Cond; + if (!match(Pred->getTerminator(), + m_Br(m_ConstantInt(Cond), Taken, NotTaken))) + return false; + if (!Cond->getZExtValue()) + std::swap(Taken, NotTaken); + if (Taken == Preheader) + return false; + } + assert(!pred_empty(Preheader) && + "Preheader should have predecessors at this point!"); + // All the predecessors have the loop preheader as not-taken target. + return true; +} +/// Remove a loop if it is dead. +/// +/// A loop is considered dead if it does not impact the observable behavior of +/// the program other than finite running time. This never removes a loop that +/// might be infinite (unless it is never executed), as doing so could change +/// the halting/non-halting nature of a program. +/// +/// This entire process relies pretty heavily on LoopSimplify form and LCSSA in +/// order to make various safety checks work. +/// +/// \returns true if any changes were made. This may mutate the loop even if it +/// is unable to delete it due to hoisting trivially loop invariant +/// instructions out of the loop. +static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE, + LoopInfo &LI, LPMUpdater *Updater = nullptr) { + assert(L->isLCSSAForm(DT) && "Expected LCSSA!"); + + // We can only remove the loop if there is a preheader that we can branch from + // after removing it. Also, if LoopSimplify form is not available, stay out + // of trouble. + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader || !L->hasDedicatedExits()) { + DEBUG(dbgs() + << "Deletion requires Loop with preheader and dedicated exits.\n"); + return false; + } // We can't remove loops that contain subloops. If the subloops were dead, // they would already have been removed in earlier executions of this pass. 
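The two cases described in the deleteDeadLoop comment above, written out as C++ so the distinction is concrete (a sketch only; the pass reasons about preheaders, exit blocks and SCEV trip counts on IR, not about source loops):

#include <cstdio>

// Case 1 ("isLoopDead"): only loop-invariant values are used outside the loop
// and the trip count is computable, so the whole loop can be dropped.
int invariant_only(int n) {
  int x = 42;
  for (int i = 0; i < n; ++i) {
    int t = x * 2;   // nothing here is observable outside the loop
    (void)t;
  }
  return x;
}

// Case 2 ("isLoopNeverExecuted"): every predecessor of the preheader branches
// away on a constant condition, so even a potentially infinite loop is safe
// to remove.
int never_executed() {
  int s = 0;
  if (false)
    for (;;)
      ++s;
  return s;
}

int main() { printf("%d %d\n", invariant_only(3), never_executed()); }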
- if (L->begin() != L->end()) + if (L->begin() != L->end()) { + DEBUG(dbgs() << "Loop contains subloops.\n"); return false; + } - SmallVector<BasicBlock *, 4> exitingBlocks; - L->getExitingBlocks(exitingBlocks); - SmallVector<BasicBlock *, 4> exitBlocks; - L->getUniqueExitBlocks(exitBlocks); + BasicBlock *ExitBlock = L->getUniqueExitBlock(); + + if (ExitBlock && isLoopNeverExecuted(L)) { + DEBUG(dbgs() << "Loop is proven to never execute, delete it!"); + // Set incoming value to undef for phi nodes in the exit block. + BasicBlock::iterator BI = ExitBlock->begin(); + while (PHINode *P = dyn_cast<PHINode>(BI)) { + for (unsigned i = 0; i < P->getNumIncomingValues(); i++) + P->setIncomingValue(i, UndefValue::get(P->getType())); + BI++; + } + deleteDeadLoop(L, DT, SE, LI, Updater); + ++NumDeleted; + return true; + } + + // The remaining checks below are for a loop being dead because all statements + // in the loop are invariant. + SmallVector<BasicBlock *, 4> ExitingBlocks; + L->getExitingBlocks(ExitingBlocks); // We require that the loop only have a single exit block. Otherwise, we'd // be in the situation of needing to be able to solve statically which exit // block will be branched to, or trying to preserve the branching logic in // a loop invariant manner. - if (exitBlocks.size() != 1) + if (!ExitBlock) { + DEBUG(dbgs() << "Deletion requires single exit block\n"); return false; - + } // Finally, we have to check that the loop really is dead. bool Changed = false; - if (!isLoopDead(L, SE, exitingBlocks, exitBlocks, Changed, preheader)) + if (!isLoopDead(L, SE, ExitingBlocks, ExitBlock, Changed, Preheader)) { + DEBUG(dbgs() << "Loop is not invariant, cannot delete.\n"); return Changed; + } // Don't remove loops for which we can't solve the trip count. // They could be infinite, in which case we'd be changing program behavior. const SCEV *S = SE.getMaxBackedgeTakenCount(L); - if (isa<SCEVCouldNotCompute>(S)) + if (isa<SCEVCouldNotCompute>(S)) { + DEBUG(dbgs() << "Could not compute SCEV MaxBackedgeTakenCount.\n"); return Changed; + } + + DEBUG(dbgs() << "Loop is invariant, delete it!"); + deleteDeadLoop(L, DT, SE, LI, Updater); + ++NumDeleted; + + return true; +} + +static void deleteDeadLoop(Loop *L, DominatorTree &DT, ScalarEvolution &SE, + LoopInfo &LI, LPMUpdater *Updater) { + assert(L->isLCSSAForm(DT) && "Expected LCSSA!"); + auto *Preheader = L->getLoopPreheader(); + assert(Preheader && "Preheader should exist!"); // Now that we know the removal is safe, remove the loop by changing the // branch from the preheader to go to the single exit block. - BasicBlock *exitBlock = exitBlocks[0]; - + // // Because we're deleting a large chunk of code at once, the sequence in which - // we remove things is very important to avoid invalidation issues. Don't - // mess with this unless you have good reason and know what you're doing. + // we remove things is very important to avoid invalidation issues. + + // If we have an LPM updater, tell it about the loop being removed. + if (Updater) + Updater->markLoopAsDeleted(*L); // Tell ScalarEvolution that the loop is deleted. Do this before // deleting the loop so that ScalarEvolution can look at the loop // to determine what it needs to clean up. SE.forgetLoop(L); - // Connect the preheader directly to the exit block. 
- TerminatorInst *TI = preheader->getTerminator(); - TI->replaceUsesOfWith(L->getHeader(), exitBlock); + auto *ExitBlock = L->getUniqueExitBlock(); + assert(ExitBlock && "Should have a unique exit block!"); - // Rewrite phis in the exit block to get their inputs from - // the preheader instead of the exiting block. - BasicBlock *exitingBlock = exitingBlocks[0]; - BasicBlock::iterator BI = exitBlock->begin(); + assert(L->hasDedicatedExits() && "Loop should have dedicated exits!"); + + // Connect the preheader directly to the exit block. + // Even when the loop is never executed, we cannot remove the edge from the + // source block to the exit block. Consider the case where the unexecuted loop + // branches back to an outer loop. If we deleted the loop and removed the edge + // coming to this inner loop, this will break the outer loop structure (by + // deleting the backedge of the outer loop). If the outer loop is indeed a + // non-loop, it will be deleted in a future iteration of loop deletion pass. + Preheader->getTerminator()->replaceUsesOfWith(L->getHeader(), ExitBlock); + + // Rewrite phis in the exit block to get their inputs from the Preheader + // instead of the exiting block. + BasicBlock::iterator BI = ExitBlock->begin(); while (PHINode *P = dyn_cast<PHINode>(BI)) { - int j = P->getBasicBlockIndex(exitingBlock); - assert(j >= 0 && "Can't find exiting block in exit block's phi node!"); - P->setIncomingBlock(j, preheader); - for (unsigned i = 1; i < exitingBlocks.size(); ++i) - P->removeIncomingValue(exitingBlocks[i]); + // Set the zero'th element of Phi to be from the preheader and remove all + // other incoming values. Given the loop has dedicated exits, all other + // incoming values must be from the exiting blocks. + int PredIndex = 0; + P->setIncomingBlock(PredIndex, Preheader); + // Removes all incoming values from all other exiting blocks (including + // duplicate values from an exiting block). + // Nuke all entries except the zero'th entry which is the preheader entry. + // NOTE! We need to remove Incoming Values in the reverse order as done + // below, to keep the indices valid for deletion (removeIncomingValues + // updates getNumIncomingValues and shifts all values down into the operand + // being deleted). + for (unsigned i = 0, e = P->getNumIncomingValues() - 1; i != e; ++i) + P->removeIncomingValue(e-i, false); + + assert((P->getNumIncomingValues() == 1 && + P->getIncomingBlock(PredIndex) == Preheader) && + "Should have exactly one value and that's from the preheader!"); ++BI; } @@ -175,11 +281,11 @@ bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE, SmallVector<DomTreeNode*, 8> ChildNodes; for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end(); LI != LE; ++LI) { - // Move all of the block's children to be children of the preheader, which + // Move all of the block's children to be children of the Preheader, which // allows us to remove the domtree entry for the block. 
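The PHI rewrite above deletes incoming values back-to-front so that the indices of the entries still to be removed stay valid. The same pattern in plain C++, on a hypothetical container rather than the PHINode API:

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  std::vector<int> incoming = {10, 20, 30, 40};   // entry 0 plays the preheader
  // Erase everything except entry 0, walking from the back. Erasing from the
  // front instead would shift the remaining entries and invalidate the indices
  // we still intend to erase.
  for (std::size_t e = incoming.size() - 1; e >= 1; --e)
    incoming.erase(incoming.begin() + static_cast<std::ptrdiff_t>(e));
  printf("left: %zu, value: %d\n", incoming.size(), incoming[0]);
}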
ChildNodes.insert(ChildNodes.begin(), DT[*LI]->begin(), DT[*LI]->end()); for (DomTreeNode *ChildNode : ChildNodes) { - DT.changeImmediateDominator(ChildNode, DT[preheader]); + DT.changeImmediateDominator(ChildNode, DT[Preheader]); } ChildNodes.clear(); @@ -204,22 +310,19 @@ bool LoopDeletionPass::runImpl(Loop *L, DominatorTree &DT, ScalarEvolution &SE, SmallPtrSet<BasicBlock *, 8> blocks; blocks.insert(L->block_begin(), L->block_end()); for (BasicBlock *BB : blocks) - loopInfo.removeBlock(BB); + LI.removeBlock(BB); // The last step is to update LoopInfo now that we've eliminated this loop. - loopInfo.markAsRemoved(L); - Changed = true; - - ++NumDeleted; - - return Changed; + LI.markAsRemoved(L); } PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, - LPMUpdater &) { - bool Changed = runImpl(&L, AR.DT, AR.SE, AR.LI); - if (!Changed) + LPMUpdater &Updater) { + + DEBUG(dbgs() << "Analyzing Loop for deletion: "); + DEBUG(L.dump()); + if (!deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI, &Updater)) return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); @@ -254,11 +357,11 @@ Pass *llvm::createLoopDeletionPass() { return new LoopDeletionLegacyPass(); } bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &) { if (skipLoop(L)) return false; - DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); - LoopInfo &loopInfo = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - LoopDeletionPass Impl; - return Impl.runImpl(L, DT, SE, loopInfo); + DEBUG(dbgs() << "Analyzing Loop for deletion: "); + DEBUG(L->dump()); + return deleteLoopIfDead(L, DT, SE, LI); } diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp index 19716b2..3624bba 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopDistribute.cpp @@ -812,29 +812,29 @@ private: const RuntimePointerChecking *RtPtrChecking) { SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks; - std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks), - [&](const RuntimePointerChecking::PointerCheck &Check) { - for (unsigned PtrIdx1 : Check.first->Members) - for (unsigned PtrIdx2 : Check.second->Members) - // Only include this check if there is a pair of pointers - // that require checking and the pointers fall into - // separate partitions. - // - // (Note that we already know at this point that the two - // pointer groups need checking but it doesn't follow - // that each pair of pointers within the two groups need - // checking as well. - // - // In other words we don't want to include a check just - // because there is a pair of pointers between the two - // pointer groups that require checks and a different - // pair whose pointers fall into different partitions.) - if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) && - !RuntimePointerChecking::arePointersInSamePartition( - PtrToPartition, PtrIdx1, PtrIdx2)) - return true; - return false; - }); + copy_if(AllChecks, std::back_inserter(Checks), + [&](const RuntimePointerChecking::PointerCheck &Check) { + for (unsigned PtrIdx1 : Check.first->Members) + for (unsigned PtrIdx2 : Check.second->Members) + // Only include this check if there is a pair of pointers + // that require checking and the pointers fall into + // separate partitions. 
+ // + // (Note that we already know at this point that the two + // pointer groups need checking but it doesn't follow + // that each pair of pointers within the two groups need + // checking as well. + // + // In other words we don't want to include a check just + // because there is a pair of pointers between the two + // pointer groups that require checks and a different + // pair whose pointers fall into different partitions.) + if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) && + !RuntimePointerChecking::arePointersInSamePartition( + PtrToPartition, PtrIdx1, PtrIdx2)) + return true; + return false; + }); return Checks; } diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 5fec51c..4a6a35c 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -110,6 +110,16 @@ private: bool HasMemset; bool HasMemsetPattern; bool HasMemcpy; + /// Return code for isLegalStore() + enum LegalStoreKind { + None = 0, + Memset, + MemsetPattern, + Memcpy, + UnorderedAtomicMemcpy, + DontUse // Dummy retval never to be used. Allows catching errors in retval + // handling. + }; /// \name Countable Loop Idiom Handling /// @{ @@ -119,8 +129,7 @@ private: SmallVectorImpl<BasicBlock *> &ExitBlocks); void collectStores(BasicBlock *BB); - bool isLegalStore(StoreInst *SI, bool &ForMemset, bool &ForMemsetPattern, - bool &ForMemcpy); + LegalStoreKind isLegalStore(StoreInst *SI); bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount, bool ForMemset); bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); @@ -144,6 +153,10 @@ private: bool recognizePopcount(); void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst, PHINode *CntPhi, Value *Var); + bool recognizeAndInsertCTLZ(); + void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst, + PHINode *CntPhi, Value *Var, const DebugLoc DL, + bool ZeroCheck, bool IsCntPhiUsedOutsideLoop); /// @} }; @@ -236,9 +249,9 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L) { ApplyCodeSizeHeuristics = L->getHeader()->getParent()->optForSize() && UseLIRCodeSizeHeurs; - HasMemset = TLI->has(LibFunc::memset); - HasMemsetPattern = TLI->has(LibFunc::memset_pattern16); - HasMemcpy = TLI->has(LibFunc::memcpy); + HasMemset = TLI->has(LibFunc_memset); + HasMemsetPattern = TLI->has(LibFunc_memset_pattern16); + HasMemcpy = TLI->has(LibFunc_memcpy); if (HasMemset || HasMemsetPattern || HasMemcpy) if (SE->hasLoopInvariantBackedgeTakenCount(L)) @@ -339,15 +352,24 @@ static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) { return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C)); } -bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset, - bool &ForMemsetPattern, bool &ForMemcpy) { +LoopIdiomRecognize::LegalStoreKind +LoopIdiomRecognize::isLegalStore(StoreInst *SI) { + // Don't touch volatile stores. - if (!SI->isSimple()) - return false; + if (SI->isVolatile()) + return LegalStoreKind::None; + // We only want simple or unordered-atomic stores. + if (!SI->isUnordered()) + return LegalStoreKind::None; + + // Don't convert stores of non-integral pointer types to memsets (which stores + // integers). + if (DL->isNonIntegralPointerType(SI->getValueOperand()->getType())) + return LegalStoreKind::None; // Avoid merging nontemporal stores. 
if (SI->getMetadata(LLVMContext::MD_nontemporal)) - return false; + return LegalStoreKind::None; Value *StoredVal = SI->getValueOperand(); Value *StorePtr = SI->getPointerOperand(); @@ -355,7 +377,7 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset, // Reject stores that are so large that they overflow an unsigned. uint64_t SizeInBits = DL->getTypeSizeInBits(StoredVal->getType()); if ((SizeInBits & 7) || (SizeInBits >> 32) != 0) - return false; + return LegalStoreKind::None; // See if the pointer expression is an AddRec like {base,+,1} on the current // loop, which indicates a strided store. If we have something else, it's a @@ -363,11 +385,11 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset, const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine()) - return false; + return LegalStoreKind::None; // Check to see if we have a constant stride. if (!isa<SCEVConstant>(StoreEv->getOperand(1))) - return false; + return LegalStoreKind::None; // See if the store can be turned into a memset. @@ -378,22 +400,23 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset, Value *SplatValue = isBytewiseValue(StoredVal); Constant *PatternValue = nullptr; + // Note: memset and memset_pattern on unordered-atomic is yet not supported + bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple(); + // If we're allowed to form a memset, and the stored value would be // acceptable for memset, use it. - if (HasMemset && SplatValue && + if (!UnorderedAtomic && HasMemset && SplatValue && // Verify that the stored value is loop invariant. If not, we can't // promote the memset. CurLoop->isLoopInvariant(SplatValue)) { // It looks like we can use SplatValue. - ForMemset = true; - return true; - } else if (HasMemsetPattern && + return LegalStoreKind::Memset; + } else if (!UnorderedAtomic && HasMemsetPattern && // Don't create memset_pattern16s with address spaces. StorePtr->getType()->getPointerAddressSpace() == 0 && (PatternValue = getMemSetPatternValue(StoredVal, DL))) { // It looks like we can use PatternValue! - ForMemsetPattern = true; - return true; + return LegalStoreKind::MemsetPattern; } // Otherwise, see if the store can be turned into a memcpy. @@ -403,12 +426,17 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset, APInt Stride = getStoreStride(StoreEv); unsigned StoreSize = getStoreSizeInBytes(SI, DL); if (StoreSize != Stride && StoreSize != -Stride) - return false; + return LegalStoreKind::None; // The store must be feeding a non-volatile load. LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand()); - if (!LI || !LI->isSimple()) - return false; + + // Only allow non-volatile loads + if (!LI || LI->isVolatile()) + return LegalStoreKind::None; + // Only allow simple or unordered-atomic loads + if (!LI->isUnordered()) + return LegalStoreKind::None; // See if the pointer expression is an AddRec like {base,+,1} on the current // loop, which indicates a strided load. If we have something else, it's a @@ -416,18 +444,19 @@ bool LoopIdiomRecognize::isLegalStore(StoreInst *SI, bool &ForMemset, const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand())); if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine()) - return false; + return LegalStoreKind::None; // The store and load must share the same stride. 
if (StoreEv->getOperand(1) != LoadEv->getOperand(1)) - return false; + return LegalStoreKind::None; // Success. This store can be converted into a memcpy. - ForMemcpy = true; - return true; + UnorderedAtomic = UnorderedAtomic || LI->isAtomic(); + return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy + : LegalStoreKind::Memcpy; } // This store can't be transformed into a memset/memcpy. - return false; + return LegalStoreKind::None; } void LoopIdiomRecognize::collectStores(BasicBlock *BB) { @@ -439,24 +468,29 @@ void LoopIdiomRecognize::collectStores(BasicBlock *BB) { if (!SI) continue; - bool ForMemset = false; - bool ForMemsetPattern = false; - bool ForMemcpy = false; // Make sure this is a strided store with a constant stride. - if (!isLegalStore(SI, ForMemset, ForMemsetPattern, ForMemcpy)) - continue; - - // Save the store locations. - if (ForMemset) { + switch (isLegalStore(SI)) { + case LegalStoreKind::None: + // Nothing to do + break; + case LegalStoreKind::Memset: { // Find the base pointer. Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL); StoreRefsForMemset[Ptr].push_back(SI); - } else if (ForMemsetPattern) { + } break; + case LegalStoreKind::MemsetPattern: { // Find the base pointer. Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), *DL); StoreRefsForMemsetPattern[Ptr].push_back(SI); - } else if (ForMemcpy) + } break; + case LegalStoreKind::Memcpy: + case LegalStoreKind::UnorderedAtomicMemcpy: StoreRefsForMemcpy.push_back(SI); + break; + default: + assert(false && "unhandled return value"); + break; + } } } @@ -494,7 +528,7 @@ bool LoopIdiomRecognize::runOnLoopBlock( Instruction *Inst = &*I++; // Look for memset instructions, which may be optimized to a larger memset. if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) { - WeakVH InstPtr(&*I); + WeakTrackingVH InstPtr(&*I); if (!processLoopMemSet(MSI, BECount)) continue; MadeChange = true; @@ -778,6 +812,11 @@ bool LoopIdiomRecognize::processLoopStridedStore( if (NegStride) Start = getStartForNegStride(Start, BECount, IntPtr, StoreSize, SE); + // TODO: ideally we should still be able to generate memset if SCEV expander + // is taught to generate the dependencies at the latest point. + if (!isSafeToExpand(Start, *SE)) + return false; + // Okay, we have a strided store "p[i]" of a splattable value. We can turn // this into a memset in the loop preheader now if we want. However, this // would be unsafe to do if there is anything else in the loop that may read @@ -809,6 +848,11 @@ bool LoopIdiomRecognize::processLoopStridedStore( SCEV::FlagNUW); } + // TODO: ideally we should still be able to generate memset if SCEV expander + // is taught to generate the dependencies at the latest point. + if (!isSafeToExpand(NumBytesS, *SE)) + return false; + Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); @@ -823,7 +867,7 @@ bool LoopIdiomRecognize::processLoopStridedStore( Module *M = TheStore->getModule(); Value *MSP = M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(), - Int8PtrTy, Int8PtrTy, IntPtr, (void *)nullptr); + Int8PtrTy, Int8PtrTy, IntPtr); inferLibFuncAttributes(*M->getFunction("memset_pattern16"), *TLI); // Otherwise we should form a memset_pattern16. PatternValue is known to be @@ -851,10 +895,10 @@ bool LoopIdiomRecognize::processLoopStridedStore( /// If the stored value is a strided load in the same loop with the same stride /// this may be transformable into a memcpy. 
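The refactor above replaces three bool out-parameters with a single LegalStoreKind return value, which lets collectStores switch over an exhaustive classification. A toy C++ sketch of that design choice (the StoreDesc struct and classify function are invented for illustration and are far simpler than the real isLegalStore):

#include <cstdio>

enum class LegalStoreKind { None, Memset, MemsetPattern, Memcpy, UnorderedAtomicMemcpy };

struct StoreDesc {
  bool Volatile;            // never touched
  bool Atomic;              // unordered-atomic access
  bool SplatValue;          // stored value is a byte splat
  bool FedByStridedLoad;    // value comes from a load with the same stride
};

LegalStoreKind classify(const StoreDesc &S) {
  if (S.Volatile)
    return LegalStoreKind::None;
  if (S.SplatValue && !S.Atomic)   // as in the hunk, no memset is formed for
    return LegalStoreKind::Memset; // unordered-atomic stores
  if (S.FedByStridedLoad)
    return S.Atomic ? LegalStoreKind::UnorderedAtomicMemcpy
                    : LegalStoreKind::Memcpy;
  return LegalStoreKind::None;
}

int main() {
  StoreDesc S{false, true, false, true};
  printf("kind = %d\n", (int)classify(S));   // prints the UnorderedAtomicMemcpy case
}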
This kicks in for stuff like -/// for (i) A[i] = B[i]; +/// for (i) A[i] = B[i]; bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount) { - assert(SI->isSimple() && "Expected only non-volatile stores."); + assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores."); Value *StorePtr = SI->getPointerOperand(); const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); @@ -864,7 +908,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, // The store must be feeding a non-volatile load. LoadInst *LI = cast<LoadInst>(SI->getValueOperand()); - assert(LI->isSimple() && "Expected only non-volatile stores."); + assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads."); // See if the pointer expression is an AddRec like {base,+,1} on the current // loop, which indicates a strided load. If we have something else, it's a @@ -938,6 +982,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getOne(IntPtrTy), SCEV::FlagNUW); + if (StoreSize != 1) NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtrTy, StoreSize), SCEV::FlagNUW); @@ -945,9 +990,37 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI, Value *NumBytes = Expander.expandCodeFor(NumBytesS, IntPtrTy, Preheader->getTerminator()); - CallInst *NewCall = - Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, - std::min(SI->getAlignment(), LI->getAlignment())); + unsigned Align = std::min(SI->getAlignment(), LI->getAlignment()); + CallInst *NewCall = nullptr; + // Check whether to generate an unordered atomic memcpy: + // If the load or store are atomic, then they must neccessarily be unordered + // by previous checks. + if (!SI->isAtomic() && !LI->isAtomic()) + NewCall = Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, Align); + else { + // We cannot allow unaligned ops for unordered load/store, so reject + // anything where the alignment isn't at least the element size. + if (Align < StoreSize) + return false; + + // If the element.atomic memcpy is not lowered into explicit + // loads/stores later, then it will be lowered into an element-size + // specific lib call. If the lib call doesn't exist for our store size, then + // we shouldn't generate the memcpy. + if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize()) + return false; + + NewCall = Builder.CreateElementUnorderedAtomicMemCpy( + StoreBasePtr, LoadBasePtr, NumBytes, StoreSize); + + // Propagate alignment info onto the pointer args. Note that unordered + // atomic loads/stores are *required* by the spec to have an alignment + // but non-atomic loads/stores may not. + NewCall->addParamAttr(0, Attribute::getWithAlignment(NewCall->getContext(), + SI->getAlignment())); + NewCall->addParamAttr(1, Attribute::getWithAlignment(NewCall->getContext(), + LI->getAlignment())); + } NewCall->setDebugLoc(SI->getDebugLoc()); DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n" @@ -979,7 +1052,7 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset, } bool LoopIdiomRecognize::runOnNoncountableLoop() { - return recognizePopcount(); + return recognizePopcount() || recognizeAndInsertCTLZ(); } /// Check if the given conditional branch is based on the comparison between @@ -1007,6 +1080,17 @@ static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) { return nullptr; } +// Check if the recurrence variable `VarX` is in the right form to create +// the idiom. 
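The "for (i) A[i] = B[i];" idiom named in the comment above, and what processLoopStoreOfLoopLoad turns it into. A plain C++ illustration (the rewrite really happens on IR, and the unordered-atomic case becomes an element-wise atomic memcpy intrinsic rather than a libc call):

#include <cstdio>
#include <cstring>

// The recognized shape: a strided store fed by a strided load with the same
// stride.
void copy_loop(int *A, const int *B, unsigned n) {
  for (unsigned i = 0; i < n; ++i)
    A[i] = B[i];
}

// What the idiom becomes for ordinary (non-atomic) accesses.
void copy_memcpy(int *A, const int *B, unsigned n) {
  std::memcpy(A, B, n * sizeof(int));
}

int main() {
  int src[4] = {1, 2, 3, 4}, d1[4], d2[4];
  copy_loop(d1, src, 4);
  copy_memcpy(d2, src, 4);
  printf("%d %d\n", d1[3], d2[3]);
}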
Returns the value coerced to a PHINode if so. +static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX, + BasicBlock *LoopEntry) { + auto *PhiX = dyn_cast<PHINode>(VarX); + if (PhiX && PhiX->getParent() == LoopEntry && + (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX)) + return PhiX; + return nullptr; +} + /// Return true iff the idiom is detected in the loop. /// /// Additionally: @@ -1076,19 +1160,15 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, if (!Dec || !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) || (SubInst->getOpcode() == Instruction::Add && - Dec->isAllOnesValue()))) { + Dec->isMinusOne()))) { return false; } } // step 3: Check the recurrence of variable X - { - PhiX = dyn_cast<PHINode>(VarX1); - if (!PhiX || - (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) { - return false; - } - } + PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry); + if (!PhiX) + return false; // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1 { @@ -1104,8 +1184,8 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, if (!Inc || !Inc->isOne()) continue; - PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0)); - if (!Phi || Phi->getParent() != LoopEntry) + PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry); + if (!Phi) continue; // Check if the result of the instruction is live of the loop. @@ -1144,6 +1224,169 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, return true; } +/// Return true if the idiom is detected in the loop. +/// +/// Additionally: +/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ) +/// or nullptr if there is no such. +/// 2) \p CntPhi is set to the corresponding phi node +/// or nullptr if there is no such. +/// 3) \p Var is set to the value whose CTLZ could be used. +/// 4) \p DefX is set to the instruction calculating Loop exit condition. +/// +/// The core idiom we are trying to detect is: +/// \code +/// if (x0 == 0) +/// goto loop-exit // the precondition of the loop +/// cnt0 = init-val; +/// do { +/// x = phi (x0, x.next); //PhiX +/// cnt = phi(cnt0, cnt.next); +/// +/// cnt.next = cnt + 1; +/// ... +/// x.next = x >> 1; // DefX +/// ... +/// } while(x.next != 0); +/// +/// loop-exit: +/// \endcode +static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX, + Instruction *&CntInst, PHINode *&CntPhi, + Instruction *&DefX) { + BasicBlock *LoopEntry; + Value *VarX = nullptr; + + DefX = nullptr; + PhiX = nullptr; + CntInst = nullptr; + CntPhi = nullptr; + LoopEntry = *(CurLoop->block_begin()); + + // step 1: Check if the loop-back branch is in desirable form. + if (Value *T = matchCondition( + dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry)) + DefX = dyn_cast<Instruction>(T); + else + return false; + + // step 2: detect instructions corresponding to "x.next = x >> 1" + if (!DefX || DefX->getOpcode() != Instruction::AShr) + return false; + if (ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1))) + if (!Shft || !Shft->isOne()) + return false; + VarX = DefX->getOperand(0); + + // step 3: Check the recurrence of variable X + PhiX = getRecurrenceVar(VarX, DefX, LoopEntry); + if (!PhiX) + return false; + + // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1 + // TODO: We can skip the step. If loop trip count is known (CTLZ), + // then all uses of "cnt.next" could be optimized to the trip count + // plus "cnt0". Currently it is not optimized. 
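A source-level view of the CTLZ idiom sketched in the \code block above, together with the closed form the transformation computes. This is an illustration only: the pass emits the llvm.ctlz intrinsic on IR, which is stood in for here by GCC/Clang's __builtin_clz:

#include <cstdio>

// The non-countable original: count how many times x must be shifted right
// until it becomes zero.
int count_shifts_loop(unsigned x) {
  int cnt = 0;
  do {
    ++cnt;
    x >>= 1;
  } while (x != 0);
  return cnt;
}

// The countable replacement: BitWidth - ctlz(x), with the zero case handled
// explicitly (this mirrors the ZeroCheck flag in recognizeAndInsertCTLZ).
int count_shifts_ctlz(unsigned x) {
  if (x == 0)
    return 1;                    // the loop above runs exactly once for x == 0
  return 32 - __builtin_clz(x);
}

int main() {
  unsigned tests[] = {0u, 1u, 5u, 0x80000000u};
  for (unsigned x : tests)
    printf("%u -> %d %d\n", x, count_shifts_loop(x), count_shifts_ctlz(x));
}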
+ // This step could be used to detect POPCNT instruction: + // cnt.next = cnt + (x.next & 1) + for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(), + IterE = LoopEntry->end(); + Iter != IterE; Iter++) { + Instruction *Inst = &*Iter; + if (Inst->getOpcode() != Instruction::Add) + continue; + + ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1)); + if (!Inc || !Inc->isOne()) + continue; + + PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry); + if (!Phi) + continue; + + CntInst = Inst; + CntPhi = Phi; + break; + } + if (!CntInst) + return false; + + return true; +} + +/// Recognize CTLZ idiom in a non-countable loop and convert the loop +/// to countable (with CTLZ trip count). +/// If CTLZ inserted as a new trip count returns true; otherwise, returns false. +bool LoopIdiomRecognize::recognizeAndInsertCTLZ() { + // Give up if the loop has multiple blocks or multiple backedges. + if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1) + return false; + + Instruction *CntInst, *DefX; + PHINode *CntPhi, *PhiX; + if (!detectCTLZIdiom(CurLoop, PhiX, CntInst, CntPhi, DefX)) + return false; + + bool IsCntPhiUsedOutsideLoop = false; + for (User *U : CntPhi->users()) + if (!CurLoop->contains(dyn_cast<Instruction>(U))) { + IsCntPhiUsedOutsideLoop = true; + break; + } + bool IsCntInstUsedOutsideLoop = false; + for (User *U : CntInst->users()) + if (!CurLoop->contains(dyn_cast<Instruction>(U))) { + IsCntInstUsedOutsideLoop = true; + break; + } + // If both CntInst and CntPhi are used outside the loop the profitability + // is questionable. + if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop) + return false; + + // For some CPUs result of CTLZ(X) intrinsic is undefined + // when X is 0. If we can not guarantee X != 0, we need to check this + // when expand. + bool ZeroCheck = false; + // It is safe to assume Preheader exist as it was checked in + // parent function RunOnLoop. + BasicBlock *PH = CurLoop->getLoopPreheader(); + Value *InitX = PhiX->getIncomingValueForBlock(PH); + // If we check X != 0 before entering the loop we don't need a zero + // check in CTLZ intrinsic, but only if Cnt Phi is not used outside of the + // loop (if it is used we count CTLZ(X >> 1)). + if (!IsCntPhiUsedOutsideLoop) + if (BasicBlock *PreCondBB = PH->getSinglePredecessor()) + if (BranchInst *PreCondBr = + dyn_cast<BranchInst>(PreCondBB->getTerminator())) { + if (matchCondition(PreCondBr, PH) == InitX) + ZeroCheck = true; + } + + // Check if CTLZ intrinsic is profitable. Assume it is always profitable + // if we delete the loop (the loop has only 6 instructions): + // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ] + // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ] + // %shr = ashr %n.addr.0, 1 + // %tobool = icmp eq %shr, 0 + // %inc = add nsw %i.0, 1 + // br i1 %tobool + + IRBuilder<> Builder(PH->getTerminator()); + SmallVector<const Value *, 2> Ops = + {InitX, ZeroCheck ? Builder.getTrue() : Builder.getFalse()}; + ArrayRef<const Value *> Args(Ops); + if (CurLoop->getHeader()->size() != 6 && + TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) > + TargetTransformInfo::TCC_Basic) + return false; + + const DebugLoc DL = DefX->getDebugLoc(); + transformLoopToCountable(PH, CntInst, CntPhi, InitX, DL, ZeroCheck, + IsCntPhiUsedOutsideLoop); + return true; +} + /// Recognizes a population count idiom in a non-countable loop. 
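For comparison with the CTLZ case, a hedged source-level sketch of the kind of population-count loop the recognizePopcount machinery targets (the exact IR shape the pass matches is stricter than this, and __builtin_popcount stands in for llvm.ctpop):

#include <cstdio>

// Kernighan-style popcount loop: each iteration clears the lowest set bit.
int popcount_loop(unsigned x) {
  int cnt = 0;
  while (x != 0) {
    x &= x - 1;
    ++cnt;
  }
  return cnt;
}

int main() {
  unsigned tests[] = {0u, 7u, 0xF0u};
  for (unsigned x : tests)
    printf("%u -> %d %d\n", x, popcount_loop(x), __builtin_popcount(x));
}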
/// /// If detected, transforms the relevant code to issue the popcount intrinsic @@ -1207,6 +1450,134 @@ static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, return CI; } +static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val, + const DebugLoc &DL, bool ZeroCheck) { + Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()}; + Type *Tys[] = {Val->getType()}; + + Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent(); + Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, Tys); + CallInst *CI = IRBuilder.CreateCall(Func, Ops); + CI->setDebugLoc(DL); + + return CI; +} + +/// Transform the following loop: +/// loop: +/// CntPhi = PHI [Cnt0, CntInst] +/// PhiX = PHI [InitX, DefX] +/// CntInst = CntPhi + 1 +/// DefX = PhiX >> 1 +// LOOP_BODY +/// Br: loop if (DefX != 0) +/// Use(CntPhi) or Use(CntInst) +/// +/// Into: +/// If CntPhi used outside the loop: +/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1) +/// Count = CountPrev + 1 +/// else +/// Count = BitWidth(InitX) - CTLZ(InitX) +/// loop: +/// CntPhi = PHI [Cnt0, CntInst] +/// PhiX = PHI [InitX, DefX] +/// PhiCount = PHI [Count, Dec] +/// CntInst = CntPhi + 1 +/// DefX = PhiX >> 1 +/// Dec = PhiCount - 1 +/// LOOP_BODY +/// Br: loop if (Dec != 0) +/// Use(CountPrev + Cnt0) // Use(CntPhi) +/// or +/// Use(Count + Cnt0) // Use(CntInst) +/// +/// If LOOP_BODY is empty the loop will be deleted. +/// If CntInst and DefX are not used in LOOP_BODY they will be removed. +void LoopIdiomRecognize::transformLoopToCountable( + BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX, + const DebugLoc DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) { + BranchInst *PreheaderBr = dyn_cast<BranchInst>(Preheader->getTerminator()); + + // Step 1: Insert the CTLZ instruction at the end of the preheader block + // Count = BitWidth - CTLZ(InitX); + // If there are uses of CntPhi create: + // CountPrev = BitWidth - CTLZ(InitX >> 1); + IRBuilder<> Builder(PreheaderBr); + Builder.SetCurrentDebugLocation(DL); + Value *CTLZ, *Count, *CountPrev, *NewCount, *InitXNext; + + if (IsCntPhiUsedOutsideLoop) + InitXNext = Builder.CreateAShr(InitX, + ConstantInt::get(InitX->getType(), 1)); + else + InitXNext = InitX; + CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck); + Count = Builder.CreateSub( + ConstantInt::get(CTLZ->getType(), + CTLZ->getType()->getIntegerBitWidth()), + CTLZ); + if (IsCntPhiUsedOutsideLoop) { + CountPrev = Count; + Count = Builder.CreateAdd( + CountPrev, + ConstantInt::get(CountPrev->getType(), 1)); + } + if (IsCntPhiUsedOutsideLoop) + NewCount = Builder.CreateZExtOrTrunc(CountPrev, + cast<IntegerType>(CntInst->getType())); + else + NewCount = Builder.CreateZExtOrTrunc(Count, + cast<IntegerType>(CntInst->getType())); + + // If the CTLZ counter's initial value is not zero, insert Add Inst. + Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader); + ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal); + if (!InitConst || !InitConst->isZero()) + NewCount = Builder.CreateAdd(NewCount, CntInitVal); + + // Step 2: Insert new IV and loop condition: + // loop: + // ... + // PhiCount = PHI [Count, Dec] + // ... + // Dec = PhiCount - 1 + // ... 
+ // Br: loop if (Dec != 0) + BasicBlock *Body = *(CurLoop->block_begin()); + auto *LbBr = dyn_cast<BranchInst>(Body->getTerminator()); + ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition()); + Type *Ty = Count->getType(); + + PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front()); + + Builder.SetInsertPoint(LbCond); + Instruction *TcDec = cast<Instruction>( + Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1), + "tcdec", false, true)); + + TcPhi->addIncoming(Count, Preheader); + TcPhi->addIncoming(TcDec, Body); + + CmpInst::Predicate Pred = + (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ; + LbCond->setPredicate(Pred); + LbCond->setOperand(0, TcDec); + LbCond->setOperand(1, ConstantInt::get(Ty, 0)); + + // Step 3: All the references to the original counter outside + // the loop are replaced with the NewCount -- the value returned from + // __builtin_ctlz(x). + if (IsCntPhiUsedOutsideLoop) + CntPhi->replaceUsesOutsideBlock(NewCount, Body); + else + CntInst->replaceUsesOutsideBlock(NewCount, Body); + + // step 4: Forget the "non-computable" trip-count SCEV associated with the + // loop. The loop would otherwise not be deleted even if it becomes empty. + SE->forgetLoop(CurLoop); +} + void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst, PHINode *CntPhi, Value *Var) { diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp index 69102d1..af09556 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -77,7 +77,7 @@ static bool SimplifyLoopInst(Loop *L, DominatorTree *DT, LoopInfo *LI, // Don't bother simplifying unused instructions. if (!I->use_empty()) { - Value *V = SimplifyInstruction(I, DL, TLI, DT, AC); + Value *V = SimplifyInstruction(I, {DL, TLI, DT, AC}); if (V && LI->replacementPreservesLCSSAForm(I, V)) { // Mark all uses for resimplification next time round the loop. 
for (User *U : I->users()) @@ -189,7 +189,9 @@ PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM, if (!SimplifyLoopInst(&L, &AR.DT, &AR.LI, &AR.AC, &AR.TLI)) return PreservedAnalyses::all(); - return getLoopPassPreservedAnalyses(); + auto PA = getLoopPassPreservedAnalyses(); + PA.preserveSet<CFGAnalyses>(); + return PA; } char LoopInstSimplifyLegacyPass::ID = 0; diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp index e9f84ed..2e0d8e0 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopInterchange.cpp @@ -22,6 +22,7 @@ #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" @@ -39,7 +40,7 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/LoopUtils.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" + using namespace llvm; #define DEBUG_TYPE "loop-interchange" @@ -323,9 +324,10 @@ static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) { class LoopInterchangeLegality { public: LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE, - LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA) + LoopInfo *LI, DominatorTree *DT, bool PreserveLCSSA, + OptimizationRemarkEmitter *ORE) : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT), - PreserveLCSSA(PreserveLCSSA), InnerLoopHasReduction(false) {} + PreserveLCSSA(PreserveLCSSA), ORE(ORE), InnerLoopHasReduction(false) {} /// Check if the loops can be interchanged. bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId, @@ -353,6 +355,8 @@ private: LoopInfo *LI; DominatorTree *DT; bool PreserveLCSSA; + /// Interface to emit optimization remarks. + OptimizationRemarkEmitter *ORE; bool InnerLoopHasReduction; }; @@ -361,8 +365,9 @@ private: /// loop. class LoopInterchangeProfitability { public: - LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE) - : OuterLoop(Outer), InnerLoop(Inner), SE(SE) {} + LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE, + OptimizationRemarkEmitter *ORE) + : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {} /// Check if the loop interchange is profitable. bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId, @@ -376,6 +381,8 @@ private: /// Scev analysis. ScalarEvolution *SE; + /// Interface to emit optimization remarks. + OptimizationRemarkEmitter *ORE; }; /// LoopInterchangeTransform interchanges the loop. @@ -422,6 +429,9 @@ struct LoopInterchange : public FunctionPass { DependenceInfo *DI; DominatorTree *DT; bool PreserveLCSSA; + /// Interface to emit optimization remarks. 
+ OptimizationRemarkEmitter *ORE; + LoopInterchange() : FunctionPass(ID), SE(nullptr), LI(nullptr), DI(nullptr), DT(nullptr) { initializeLoopInterchangePass(*PassRegistry::getPassRegistry()); @@ -435,6 +445,7 @@ struct LoopInterchange : public FunctionPass { AU.addRequired<DependenceAnalysisWrapperPass>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); + AU.addRequired<OptimizationRemarkEmitterWrapperPass>(); } bool runOnFunction(Function &F) override { @@ -446,6 +457,7 @@ struct LoopInterchange : public FunctionPass { DI = &getAnalysis<DependenceAnalysisWrapperPass>().getDI(); auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>(); DT = DTWP ? &DTWP->getDomTree() : nullptr; + ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(); PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); // Build up a worklist of loop pairs to analyze. @@ -575,18 +587,23 @@ struct LoopInterchange : public FunctionPass { Loop *OuterLoop = LoopList[OuterLoopId]; LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, LI, DT, - PreserveLCSSA); + PreserveLCSSA, ORE); if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) { DEBUG(dbgs() << "Not interchanging Loops. Cannot prove legality\n"); return false; } DEBUG(dbgs() << "Loops are legal to interchange\n"); - LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE); + LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE); if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) { DEBUG(dbgs() << "Interchanging loops not profitable\n"); return false; } + ORE->emit(OptimizationRemark(DEBUG_TYPE, "Interchanged", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Loop interchanged with enclosing loop."); + LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LoopNestExit, LIL.hasInnerLoopReduction()); LIT.transform(); @@ -757,13 +774,28 @@ bool LoopInterchangeLegality::currentLimitations() { PHINode *InnerInductionVar; SmallVector<PHINode *, 8> Inductions; SmallVector<PHINode *, 8> Reductions; - if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) + if (!findInductionAndReductions(InnerLoop, Inductions, Reductions)) { + DEBUG(dbgs() << "Only inner loops with induction or reduction PHI nodes " + << "are supported currently.\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "UnsupportedPHIInner", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Only inner loops with induction or reduction PHI nodes can be" + " interchange currently."); return true; + } // TODO: Currently we handle only loops with 1 induction variable. if (Inductions.size() != 1) { DEBUG(dbgs() << "We currently only support loops with 1 induction variable." 
<< "Failed to interchange due to current limitation\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "MultiInductionInner", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Only inner loops with 1 induction variable can be " + "interchanged currently."); return true; } if (Reductions.size() > 0) @@ -771,32 +803,80 @@ bool LoopInterchangeLegality::currentLimitations() { InnerInductionVar = Inductions.pop_back_val(); Reductions.clear(); - if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) + if (!findInductionAndReductions(OuterLoop, Inductions, Reductions)) { + DEBUG(dbgs() << "Only outer loops with induction or reduction PHI nodes " + << "are supported currently.\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "UnsupportedPHIOuter", + OuterLoop->getStartLoc(), + OuterLoop->getHeader()) + << "Only outer loops with induction or reduction PHI nodes can be" + " interchanged currently."); return true; + } // Outer loop cannot have reduction because then loops will not be tightly // nested. - if (!Reductions.empty()) + if (!Reductions.empty()) { + DEBUG(dbgs() << "Outer loops with reductions are not supported " + << "currently.\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "ReductionsOuter", + OuterLoop->getStartLoc(), + OuterLoop->getHeader()) + << "Outer loops with reductions cannot be interchangeed " + "currently."); return true; + } // TODO: Currently we handle only loops with 1 induction variable. - if (Inductions.size() != 1) + if (Inductions.size() != 1) { + DEBUG(dbgs() << "Loops with more than 1 induction variables are not " + << "supported currently.\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "MultiIndutionOuter", + OuterLoop->getStartLoc(), + OuterLoop->getHeader()) + << "Only outer loops with 1 induction variable can be " + "interchanged currently."); return true; + } // TODO: Triangular loops are not handled for now. if (!isLoopStructureUnderstood(InnerInductionVar)) { DEBUG(dbgs() << "Loop structure not understood by pass\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "UnsupportedStructureInner", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Inner loop structure not understood currently."); return true; } // TODO: We only handle LCSSA PHI's corresponding to reduction for now. 
BasicBlock *LoopExitBlock = getLoopLatchExitBlock(OuterLoopLatch, OuterLoopHeader); - if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, true)) + if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, true)) { + DEBUG(dbgs() << "Can only handle LCSSA PHIs in outer loops currently.\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "NoLCSSAPHIOuter", + OuterLoop->getStartLoc(), + OuterLoop->getHeader()) + << "Only outer loops with LCSSA PHIs can be interchange " + "currently."); return true; + } LoopExitBlock = getLoopLatchExitBlock(InnerLoopLatch, InnerLoopHeader); - if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, false)) + if (!LoopExitBlock || !containsSafePHI(LoopExitBlock, false)) { + DEBUG(dbgs() << "Can only handle LCSSA PHIs in inner loops currently.\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "NoLCSSAPHIOuterInner", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Only inner loops with LCSSA PHIs can be interchange " + "currently."); return true; + } // TODO: Current limitation: Since we split the inner loop latch at the point // were induction variable is incremented (induction.next); We cannot have @@ -816,8 +896,16 @@ bool LoopInterchangeLegality::currentLimitations() { InnerIndexVarInc = dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0)); - if (!InnerIndexVarInc) + if (!InnerIndexVarInc) { + DEBUG(dbgs() << "Did not find an instruction to increment the induction " + << "variable.\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "NoIncrementInInner", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "The inner loop does not increment the induction variable."); return true; + } // Since we split the inner loop latch on this induction variable. Make sure // we do not have any instruction between the induction variable and branch @@ -827,19 +915,35 @@ bool LoopInterchangeLegality::currentLimitations() { for (const Instruction &I : reverse(*InnerLoopLatch)) { if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I)) continue; + // We found an instruction. If this is not induction variable then it is not // safe to split this loop latch. - if (!I.isIdenticalTo(InnerIndexVarInc)) + if (!I.isIdenticalTo(InnerIndexVarInc)) { + DEBUG(dbgs() << "Found unsupported instructions between induction " + << "variable increment and branch.\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "UnsupportedInsBetweenInduction", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Found unsupported instruction between induction variable " + "increment and branch."); return true; + } FoundInduction = true; break; } // The loop latch ended and we didn't find the induction variable return as // current limitation. 
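Context for the interchange remarks added in this hunk: the payoff the pass is after is cache locality. A sketch in plain C++ (row-major arrays; the 256x256 size is arbitrary):

#include <cstdio>

enum { N = 256 };
static int A[N][N];

// Inner loop walks a column: consecutive iterations touch elements N ints
// apart, which is cache-unfriendly for row-major storage.
void column_major_walk() {
  for (int j = 0; j < N; ++j)
    for (int i = 0; i < N; ++i)
      A[i][j] += 1;
}

// After interchange the inner loop is unit-stride.
void row_major_walk() {
  for (int i = 0; i < N; ++i)
    for (int j = 0; j < N; ++j)
      A[i][j] += 1;
}

int main() {
  column_major_walk();
  row_major_walk();
  printf("%d\n", A[N - 1][N - 1]);   // 2 after both walks
}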
- if (!FoundInduction) + if (!FoundInduction) { + DEBUG(dbgs() << "Did not find the induction variable.\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "NoIndutionVariable", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Did not find the induction variable."); return true; - + } return false; } @@ -851,6 +955,11 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId << " and OuterLoopId = " << OuterLoopId << " due to dependence\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "Dependence", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Cannot interchange loops due to dependences."); return false; } @@ -886,6 +995,12 @@ bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId, // Check if the loops are tightly nested. if (!tightlyNested(OuterLoop, InnerLoop)) { DEBUG(dbgs() << "Loops not tightly nested\n"); + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "NotTightlyNested", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Cannot interchange loops because they are not tightly " + "nested."); return false; } @@ -981,9 +1096,18 @@ bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId, // It is not profitable as per current cache profitability model. But check if // we can move this loop outside to improve parallelism. - bool ImprovesPar = - isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix); - return ImprovesPar; + if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix)) + return true; + + ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, + "InterchangeNotProfitable", + InnerLoop->getStartLoc(), + InnerLoop->getHeader()) + << "Interchanging loops is too costly (cost=" + << ore::NV("Cost", Cost) << ", threshold=" + << ore::NV("Threshold", LoopInterchangeCostThreshold) << + ") and it does not improve parallelism."); + return false; } void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop, @@ -1267,6 +1391,7 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(LoopInterchange, "loop-interchange", "Interchanges loops for cache reuse", false, false) diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index 8fb5801..20b37c4 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -20,13 +20,14 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Scalar/LoopLoadElimination.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopAccessAnalysis.h" #include "llvm/Analysis/LoopInfo.h" @@ -45,9 +46,9 @@ #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/LoopVersioning.h" -#include <forward_list> -#include <cassert> #include <algorithm> +#include <cassert> +#include <forward_list> #include <set> #include <tuple> #include <utility> @@ -196,8 
+197,7 @@ public: continue; // Only progagate the value if they are of the same type. - if (Store->getPointerOperand()->getType() != - Load->getPointerOperand()->getType()) + if (Store->getPointerOperandType() != Load->getPointerOperandType()) continue; Candidates.emplace_front(Load, Store); @@ -373,15 +373,15 @@ public: const auto &AllChecks = LAI.getRuntimePointerChecking()->getChecks(); SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks; - std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks), - [&](const RuntimePointerChecking::PointerCheck &Check) { - for (auto PtrIdx1 : Check.first->Members) - for (auto PtrIdx2 : Check.second->Members) - if (needsChecking(PtrIdx1, PtrIdx2, - PtrsWrittenOnFwdingPath, CandLoadPtrs)) - return true; - return false; - }); + copy_if(AllChecks, std::back_inserter(Checks), + [&](const RuntimePointerChecking::PointerCheck &Check) { + for (auto PtrIdx1 : Check.first->Members) + for (auto PtrIdx2 : Check.second->Members) + if (needsChecking(PtrIdx1, PtrIdx2, PtrsWrittenOnFwdingPath, + CandLoadPtrs)) + return true; + return false; + }); DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size() << "):\n"); DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks)); @@ -558,6 +558,32 @@ private: PredicatedScalarEvolution PSE; }; +static bool +eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT, + function_ref<const LoopAccessInfo &(Loop &)> GetLAI) { + // Build up a worklist of inner-loops to transform to avoid iterator + // invalidation. + // FIXME: This logic comes from other passes that actually change the loop + // nest structure. It isn't clear this is necessary (or useful) for a pass + // which merely optimizes the use of loads in a loop. + SmallVector<Loop *, 8> Worklist; + + for (Loop *TopLevelLoop : LI) + for (Loop *L : depth_first(TopLevelLoop)) + // We only handle inner-most loops. + if (L->empty()) + Worklist.push_back(L); + + // Now walk the identified inner loops. + bool Changed = false; + for (Loop *L : Worklist) { + // The actual work is performed by LoadEliminationForLoop. + LoadEliminationForLoop LEL(L, &LI, GetLAI(*L), &DT); + Changed |= LEL.processLoop(); + } + return Changed; +} + /// \brief The pass. Most of the work is delegated to the per-loop /// LoadEliminationForLoop class. class LoopLoadElimination : public FunctionPass { @@ -570,32 +596,14 @@ public: if (skipFunction(F)) return false; - auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); - auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); - - // Build up a worklist of inner-loops to vectorize. This is necessary as the - // act of distributing a loop creates new loops and can invalidate iterators - // across the loops. - SmallVector<Loop *, 8> Worklist; - - for (Loop *TopLevelLoop : *LI) - for (Loop *L : depth_first(TopLevelLoop)) - // We only handle inner-most loops. - if (L->empty()) - Worklist.push_back(L); - - // Now walk the identified inner loops. - bool Changed = false; - for (Loop *L : Worklist) { - const LoopAccessInfo &LAI = LAA->getInfo(L); - // The actual work is performed by LoadEliminationForLoop. - LoadEliminationForLoop LEL(L, LI, LAI, DT); - Changed |= LEL.processLoop(); - } + auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + auto &LAA = getAnalysis<LoopAccessLegacyAnalysis>(); + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); // Process each loop nest in the function. 
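The eliminateLoadsAcrossLoops helper introduced above lets the legacy pass and the new pass manager share one implementation by taking a function_ref that fetches the per-loop LoopAccessInfo. A minimal sketch of that callback pattern in isolation; the function and variable names here are illustrative and not taken from the patch:

    #include "llvm/ADT/DepthFirstIterator.h"
    #include "llvm/ADT/STLExtras.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Analysis/LoopInfo.h"
    using namespace llvm;

    // The shared worker only knows how to ask for a per-loop result; each pass
    // manager captures whatever analysis source it has inside the callback.
    static bool processInnerLoops(LoopInfo &LI,
                                  function_ref<bool(Loop &)> ProcessLoop) {
      SmallVector<Loop *, 8> Worklist;
      for (Loop *TopLevel : LI)
        for (Loop *L : depth_first(TopLevel))
          if (L->empty()) // inner-most loops only
            Worklist.push_back(L);
      bool Changed = false;
      for (Loop *L : Worklist)
        Changed |= ProcessLoop(*L);
      return Changed;
    }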
- return Changed; + return eliminateLoadsAcrossLoops( + F, LI, DT, + [&LAA](Loop &L) -> const LoopAccessInfo & { return LAA.getInfo(&L); }); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -631,4 +639,28 @@ FunctionPass *createLoopLoadEliminationPass() { return new LoopLoadElimination(); } +PreservedAnalyses LoopLoadEliminationPass::run(Function &F, + FunctionAnalysisManager &AM) { + auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F); + auto &LI = AM.getResult<LoopAnalysis>(F); + auto &TTI = AM.getResult<TargetIRAnalysis>(F); + auto &DT = AM.getResult<DominatorTreeAnalysis>(F); + auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); + auto &AA = AM.getResult<AAManager>(F); + auto &AC = AM.getResult<AssumptionAnalysis>(F); + + auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager(); + bool Changed = eliminateLoadsAcrossLoops( + F, LI, DT, [&](Loop &L) -> const LoopAccessInfo & { + LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI}; + return LAM.getResult<LoopAccessAnalysis>(L, AR); + }); + + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + return PA; +} + } // end namespace llvm diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp index 028f4bb..10f6fcd 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopPassManager.cpp @@ -42,6 +42,13 @@ PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &, break; } +#ifndef NDEBUG + // Verify the loop structure and LCSSA form before visiting the loop. + L.verifyLoop(); + assert(L.isRecursivelyLCSSAForm(AR.DT, AR.LI) && + "Loops must remain in LCSSA form!"); +#endif + // Update the analysis manager as each pass runs and potentially // invalidates analyses. AM.invalidate(L, PassPA); diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp new file mode 100644 index 0000000..9b12ba1 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/LoopPredication.cpp @@ -0,0 +1,330 @@ +//===-- LoopPredication.cpp - Guard based loop predication pass -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// The LoopPredication pass tries to convert loop variant range checks to loop +// invariant by widening checks across loop iterations. For example, it will +// convert +// +// for (i = 0; i < n; i++) { +// guard(i < len); +// ... +// } +// +// to +// +// for (i = 0; i < n; i++) { +// guard(n - 1 < len); +// ... +// } +// +// After this transformation the condition of the guard is loop invariant, so +// loop-unswitch can later unswitch the loop by this condition which basically +// predicates the loop by the widened condition: +// +// if (n - 1 < len) +// for (i = 0; i < n; i++) { +// ... 
+// } +// else +// deoptimize +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/LoopPredication.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/LoopUtils.h" + +#define DEBUG_TYPE "loop-predication" + +using namespace llvm; + +namespace { +class LoopPredication { + /// Represents an induction variable check: + /// icmp Pred, <induction variable>, <loop invariant limit> + struct LoopICmp { + ICmpInst::Predicate Pred; + const SCEVAddRecExpr *IV; + const SCEV *Limit; + LoopICmp(ICmpInst::Predicate Pred, const SCEVAddRecExpr *IV, + const SCEV *Limit) + : Pred(Pred), IV(IV), Limit(Limit) {} + LoopICmp() {} + }; + + ScalarEvolution *SE; + + Loop *L; + const DataLayout *DL; + BasicBlock *Preheader; + + Optional<LoopICmp> parseLoopICmp(ICmpInst *ICI); + + Value *expandCheck(SCEVExpander &Expander, IRBuilder<> &Builder, + ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS, + Instruction *InsertAt); + + Optional<Value *> widenICmpRangeCheck(ICmpInst *ICI, SCEVExpander &Expander, + IRBuilder<> &Builder); + bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander); + +public: + LoopPredication(ScalarEvolution *SE) : SE(SE){}; + bool runOnLoop(Loop *L); +}; + +class LoopPredicationLegacyPass : public LoopPass { +public: + static char ID; + LoopPredicationLegacyPass() : LoopPass(ID) { + initializeLoopPredicationLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + getLoopAnalysisUsage(AU); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM) override { + if (skipLoop(L)) + return false; + auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + LoopPredication LP(SE); + return LP.runOnLoop(L); + } +}; + +char LoopPredicationLegacyPass::ID = 0; +} // end namespace llvm + +INITIALIZE_PASS_BEGIN(LoopPredicationLegacyPass, "loop-predication", + "Loop predication", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopPass) +INITIALIZE_PASS_END(LoopPredicationLegacyPass, "loop-predication", + "Loop predication", false, false) + +Pass *llvm::createLoopPredicationPass() { + return new LoopPredicationLegacyPass(); +} + +PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &U) { + LoopPredication LP(&AR.SE); + if (!LP.runOnLoop(&L)) + return PreservedAnalyses::all(); + + return getLoopPassPreservedAnalyses(); +} + +Optional<LoopPredication::LoopICmp> +LoopPredication::parseLoopICmp(ICmpInst *ICI) { + ICmpInst::Predicate Pred = ICI->getPredicate(); + + Value *LHS = ICI->getOperand(0); + Value *RHS = ICI->getOperand(1); + const SCEV *LHSS = SE->getSCEV(LHS); + if (isa<SCEVCouldNotCompute>(LHSS)) + return None; + const SCEV *RHSS = SE->getSCEV(RHS); + if (isa<SCEVCouldNotCompute>(RHSS)) + return None; + + // Canonicalize RHS to be loop invariant bound, LHS - a loop computable IV + if (SE->isLoopInvariant(LHSS, L)) { + std::swap(LHS, RHS); + std::swap(LHSS, RHSS); + Pred = ICmpInst::getSwappedPredicate(Pred); + } + + const 
SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHSS); + if (!AR || AR->getLoop() != L) + return None; + + return LoopICmp(Pred, AR, RHSS); +} + +Value *LoopPredication::expandCheck(SCEVExpander &Expander, + IRBuilder<> &Builder, + ICmpInst::Predicate Pred, const SCEV *LHS, + const SCEV *RHS, Instruction *InsertAt) { + Type *Ty = LHS->getType(); + assert(Ty == RHS->getType() && "expandCheck operands have different types?"); + Value *LHSV = Expander.expandCodeFor(LHS, Ty, InsertAt); + Value *RHSV = Expander.expandCodeFor(RHS, Ty, InsertAt); + return Builder.CreateICmp(Pred, LHSV, RHSV); +} + +/// If ICI can be widened to a loop invariant condition emits the loop +/// invariant condition in the loop preheader and return it, otherwise +/// returns None. +Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI, + SCEVExpander &Expander, + IRBuilder<> &Builder) { + DEBUG(dbgs() << "Analyzing ICmpInst condition:\n"); + DEBUG(ICI->dump()); + + auto RangeCheck = parseLoopICmp(ICI); + if (!RangeCheck) { + DEBUG(dbgs() << "Failed to parse the loop latch condition!\n"); + return None; + } + + ICmpInst::Predicate Pred = RangeCheck->Pred; + const SCEVAddRecExpr *IndexAR = RangeCheck->IV; + const SCEV *RHSS = RangeCheck->Limit; + + auto CanExpand = [this](const SCEV *S) { + return SE->isLoopInvariant(S, L) && isSafeToExpand(S, *SE); + }; + if (!CanExpand(RHSS)) + return None; + + DEBUG(dbgs() << "IndexAR: "); + DEBUG(IndexAR->dump()); + + bool IsIncreasing = false; + if (!SE->isMonotonicPredicate(IndexAR, Pred, IsIncreasing)) + return None; + + // If the predicate is increasing the condition can change from false to true + // as the loop progresses, in this case take the value on the first iteration + // for the widened check. Otherwise the condition can change from true to + // false as the loop progresses, so take the value on the last iteration. + const SCEV *NewLHSS = IsIncreasing + ? IndexAR->getStart() + : SE->getSCEVAtScope(IndexAR, L->getParentLoop()); + if (NewLHSS == IndexAR) { + DEBUG(dbgs() << "Can't compute NewLHSS!\n"); + return None; + } + + DEBUG(dbgs() << "NewLHSS: "); + DEBUG(NewLHSS->dump()); + + if (!CanExpand(NewLHSS)) + return None; + + DEBUG(dbgs() << "NewLHSS is loop invariant and safe to expand. Expand!\n"); + + Instruction *InsertAt = Preheader->getTerminator(); + return expandCheck(Expander, Builder, Pred, NewLHSS, RHSS, InsertAt); +} + +bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard, + SCEVExpander &Expander) { + DEBUG(dbgs() << "Processing guard:\n"); + DEBUG(Guard->dump()); + + IRBuilder<> Builder(cast<Instruction>(Preheader->getTerminator())); + + // The guard condition is expected to be in form of: + // cond1 && cond2 && cond3 ... + // Iterate over subconditions looking for for icmp conditions which can be + // widened across loop iterations. Widening these conditions remember the + // resulting list of subconditions in Checks vector. 
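In the same spirit as the example in this file's header comment, the effect of widenGuardConditions on a compound guard can be pictured as follows (illustrative pseudocode, not taken from a test in this patch):

    guard(i < len && other)          // "i < len" varies with the loop
      ==>  wide = (n - 1 < len)      // equivalent invariant check, emitted once
                                     // in the loop preheader
           guard(wide && other)      // variant subcondition replaced; the
                                     // non-icmp subcondition "other" is kept as is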
+ SmallVector<Value *, 4> Worklist(1, Guard->getOperand(0)); + SmallPtrSet<Value *, 4> Visited; + + SmallVector<Value *, 4> Checks; + + unsigned NumWidened = 0; + do { + Value *Condition = Worklist.pop_back_val(); + if (!Visited.insert(Condition).second) + continue; + + Value *LHS, *RHS; + using namespace llvm::PatternMatch; + if (match(Condition, m_And(m_Value(LHS), m_Value(RHS)))) { + Worklist.push_back(LHS); + Worklist.push_back(RHS); + continue; + } + + if (ICmpInst *ICI = dyn_cast<ICmpInst>(Condition)) { + if (auto NewRangeCheck = widenICmpRangeCheck(ICI, Expander, Builder)) { + Checks.push_back(NewRangeCheck.getValue()); + NumWidened++; + continue; + } + } + + // Save the condition as is if we can't widen it + Checks.push_back(Condition); + } while (Worklist.size() != 0); + + if (NumWidened == 0) + return false; + + // Emit the new guard condition + Builder.SetInsertPoint(Guard); + Value *LastCheck = nullptr; + for (auto *Check : Checks) + if (!LastCheck) + LastCheck = Check; + else + LastCheck = Builder.CreateAnd(LastCheck, Check); + Guard->setOperand(0, LastCheck); + + DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n"); + return true; +} + +bool LoopPredication::runOnLoop(Loop *Loop) { + L = Loop; + + DEBUG(dbgs() << "Analyzing "); + DEBUG(L->dump()); + + Module *M = L->getHeader()->getModule(); + + // There is nothing to do if the module doesn't use guards + auto *GuardDecl = + M->getFunction(Intrinsic::getName(Intrinsic::experimental_guard)); + if (!GuardDecl || GuardDecl->use_empty()) + return false; + + DL = &M->getDataLayout(); + + Preheader = L->getLoopPreheader(); + if (!Preheader) + return false; + + // Collect all the guards into a vector and process later, so as not + // to invalidate the instruction iterator. + SmallVector<IntrinsicInst *, 4> Guards; + for (const auto BB : L->blocks()) + for (auto &I : *BB) + if (auto *II = dyn_cast<IntrinsicInst>(&I)) + if (II->getIntrinsicID() == Intrinsic::experimental_guard) + Guards.push_back(II); + + if (Guards.empty()) + return false; + + SCEVExpander Expander(*SE, *DL, "loop-predication"); + + bool Changed = false; + for (auto *Guard : Guards) + Changed |= widenGuardConditions(Guard, Expander); + + return Changed; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp index 86058fe..fc0216e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp @@ -11,10 +11,9 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -31,6 +30,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" @@ -557,7 +557,7 @@ bool LoopReroll::isLoopControlIV(Loop *L, Instruction *IV) { Instruction *UUser = dyn_cast<Instruction>(UU); // Skip SExt if we are extending an nsw value // TODO: Allow ZExt too - if (BO->hasNoSignedWrap() && UUser && UUser->getNumUses() == 1 && + if (BO->hasNoSignedWrap() && UUser && UUser->hasOneUse() && isa<SExtInst>(UUser)) UUser = 
dyn_cast<Instruction>(*(UUser->user_begin())); if (!isCompareUsedByBranch(UUser)) @@ -852,7 +852,7 @@ collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) { for (auto &KV : Roots) { if (KV.first == 0) continue; - if (KV.second->getNumUses() != NumBaseUses) { + if (!KV.second->hasNUses(NumBaseUses)) { DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: " << "#Base=" << NumBaseUses << ", #Root=" << KV.second->getNumUses() << "\n"); @@ -867,7 +867,7 @@ void LoopReroll::DAGRootTracker:: findRootsRecursive(Instruction *I, SmallInstructionSet SubsumedInsts) { // Does the user look like it could be part of a root set? // All its users must be simple arithmetic ops. - if (I->getNumUses() > IL_MaxRerollIterations) + if (I->hasNUsesOrMore(IL_MaxRerollIterations + 1)) return; if (I != IV && findRootsBase(I, SubsumedInsts)) diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp index cc83069..3506ac3 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -58,13 +58,14 @@ class LoopRotate { AssumptionCache *AC; DominatorTree *DT; ScalarEvolution *SE; + const SimplifyQuery &SQ; public: LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI, const TargetTransformInfo *TTI, AssumptionCache *AC, - DominatorTree *DT, ScalarEvolution *SE) - : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE) { - } + DominatorTree *DT, ScalarEvolution *SE, const SimplifyQuery &SQ) + : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE), + SQ(SQ) {} bool processLoop(Loop *L); private: @@ -79,7 +80,8 @@ private: /// to merge the two values. Do this now. static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, BasicBlock *OrigPreheader, - ValueToValueMapTy &ValueMap) { + ValueToValueMapTy &ValueMap, + SmallVectorImpl<PHINode*> *InsertedPHIs) { // Remove PHI node entries that are no longer live. BasicBlock::iterator I, E = OrigHeader->end(); for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I) @@ -87,7 +89,7 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, // Now fix up users of the instructions in OrigHeader, inserting PHI nodes // as necessary. - SSAUpdater SSA; + SSAUpdater SSA(InsertedPHIs); for (I = OrigHeader->begin(); I != E; ++I) { Value *OrigHeaderVal = &*I; @@ -174,6 +176,38 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, } } +/// Propagate dbg.value intrinsics through the newly inserted Phis. +static void insertDebugValues(BasicBlock *OrigHeader, + SmallVectorImpl<PHINode*> &InsertedPHIs) { + ValueToValueMapTy DbgValueMap; + + // Map existing PHI nodes to their dbg.values. + for (auto &I : *OrigHeader) { + if (auto DbgII = dyn_cast<DbgInfoIntrinsic>(&I)) { + if (auto *Loc = dyn_cast_or_null<PHINode>(DbgII->getVariableLocation())) + DbgValueMap.insert({Loc, DbgII}); + } + } + + // Then iterate through the new PHIs and look to see if they use one of the + // previously mapped PHIs. If so, insert a new dbg.value intrinsic that will + // propagate the info through the new PHI. 
+ LLVMContext &C = OrigHeader->getContext(); + for (auto PHI : InsertedPHIs) { + for (auto VI : PHI->operand_values()) { + auto V = DbgValueMap.find(VI); + if (V != DbgValueMap.end()) { + auto *DbgII = cast<DbgInfoIntrinsic>(V->second); + Instruction *NewDbgII = DbgII->clone(); + auto PhiMAV = MetadataAsValue::get(C, ValueAsMetadata::get(PHI)); + NewDbgII->setOperand(0, PhiMAV); + BasicBlock *Parent = PHI->getParent(); + NewDbgII->insertBefore(Parent->getFirstNonPHIOrDbgOrLifetime()); + } + } + } +} + /// Rotate loop LP. Return true if the loop is rotated. /// /// \param SimplifiedLatch is true if the latch was just folded into the final @@ -278,8 +312,6 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { for (; PHINode *PN = dyn_cast<PHINode>(I); ++I) ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader); - const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); - // For the rest of the instructions, either hoist to the OrigPreheader if // possible or create a clone in the OldPreHeader if not. TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator(); @@ -309,14 +341,13 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // With the operands remapped, see if the instruction constant folds or is // otherwise simplifyable. This commonly occurs because the entry from PHI // nodes allows icmps and other instructions to fold. - // FIXME: Provide TLI, DT, AC to SimplifyInstruction. - Value *V = SimplifyInstruction(C, DL); + Value *V = SimplifyInstruction(C, SQ); if (V && LI->replacementPreservesLCSSAForm(C, V)) { // If so, then delete the temporary instruction and stick the folded value // in the map. ValueMap[Inst] = V; if (!C->mayHaveSideEffects()) { - delete C; + C->deleteValue(); C = nullptr; } } else { @@ -347,9 +378,18 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // remove the corresponding incoming values from the PHI nodes in OrigHeader. LoopEntryBranch->eraseFromParent(); + + SmallVector<PHINode*, 2> InsertedPHIs; // If there were any uses of instructions in the duplicated block outside the // loop, update them, inserting PHI nodes as required - RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap); + RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, + &InsertedPHIs); + + // Attach dbg.value intrinsics to the new phis if that phi uses a value that + // previously had debug metadata attached. This keeps the debug info + // up-to-date in the loop body. + if (!InsertedPHIs.empty()) + insertDebugValues(OrigHeader, InsertedPHIs); // NewHeader is now the header of the loop. L->moveToHeader(NewHeader); @@ -445,10 +485,22 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { DomTreeNode *Node = HeaderChildren[I]; BasicBlock *BB = Node->getBlock(); - pred_iterator PI = pred_begin(BB); - BasicBlock *NearestDom = *PI; - for (pred_iterator PE = pred_end(BB); PI != PE; ++PI) - NearestDom = DT->findNearestCommonDominator(NearestDom, *PI); + BasicBlock *NearestDom = nullptr; + for (BasicBlock *Pred : predecessors(BB)) { + // Consider only reachable basic blocks. + if (!DT->getNode(Pred)) + continue; + + if (!NearestDom) { + NearestDom = Pred; + continue; + } + + NearestDom = DT->findNearestCommonDominator(NearestDom, Pred); + assert(NearestDom && "No NearestCommonDominator found"); + } + + assert(NearestDom && "Nearest dominator not found"); // Remember if this changes the DomTree. 
if (Node->getIDom()->getBlock() != NearestDom) { @@ -629,11 +681,15 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &) { int Threshold = EnableHeaderDuplication ? DefaultRotationThreshold : 0; - LoopRotate LR(Threshold, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE); + const DataLayout &DL = L.getHeader()->getModule()->getDataLayout(); + const SimplifyQuery SQ = getBestSimplifyQuery(AR, DL); + LoopRotate LR(Threshold, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE, + SQ); bool Changed = LR.processLoop(&L); if (!Changed) return PreservedAnalyses::all(); + return getLoopPassPreservedAnalyses(); } @@ -671,7 +727,8 @@ public: auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>(); auto *SE = SEWP ? &SEWP->getSE() : nullptr; - LoopRotate LR(MaxHeaderSize, LI, TTI, AC, DT, SE); + const SimplifyQuery SQ = getBestSimplifyQuery(*this, F); + LoopRotate LR(MaxHeaderSize, LI, TTI, AC, DT, SE, SQ); return LR.processLoop(L); } }; diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp index 1606121..35c05e8 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp @@ -40,7 +40,7 @@ static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI) { bool Changed = false; // Copy blocks into a temporary array to avoid iterator invalidation issues // as we remove them. - SmallVector<WeakVH, 16> Blocks(L.blocks()); + SmallVector<WeakTrackingVH, 16> Blocks(L.blocks()); for (auto &Block : Blocks) { // Attempt to merge blocks in the trivial case. Don't modify blocks which @@ -69,6 +69,7 @@ PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM, LPMUpdater &) { if (!simplifyLoopCFG(L, AR.DT, AR.LI)) return PreservedAnalyses::all(); + return getLoopPassPreservedAnalyses(); } diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp index f3f4152..c9d55b4 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopSink.cpp @@ -1,4 +1,4 @@ -//===-- LoopSink.cpp - Loop Sink Pass ------------------------===// +//===-- LoopSink.cpp - Loop Sink Pass -------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -28,8 +28,10 @@ // InsertBBs = UseBBs - DomBBs + BB // For BB in InsertBBs: // Insert I at BB's beginning +// //===----------------------------------------------------------------------===// +#include "llvm/Transforms/Scalar/LoopSink.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" @@ -297,6 +299,42 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI, return Changed; } +PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) { + LoopInfo &LI = FAM.getResult<LoopAnalysis>(F); + // Nothing to do if there are no loops. + if (LI.empty()) + return PreservedAnalyses::all(); + + AAResults &AA = FAM.getResult<AAManager>(F); + DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F); + BlockFrequencyInfo &BFI = FAM.getResult<BlockFrequencyAnalysis>(F); + + // We want to do a postorder walk over the loops. Since loops are a tree this + // is equivalent to a reversed preorder walk and preorder is easy to compute + // without recursion. 
Since we reverse the preorder, we will visit siblings + // in reverse program order. This isn't expected to matter at all but is more + // consistent with sinking algorithms which generally work bottom-up. + SmallVector<Loop *, 4> PreorderLoops = LI.getLoopsInPreorder(); + + bool Changed = false; + do { + Loop &L = *PreorderLoops.pop_back_val(); + + // Note that we don't pass SCEV here because it is only used to invalidate + // loops in SCEV and we don't preserve (or request) SCEV at all making that + // unnecessary. + Changed |= sinkLoopInvariantInstructions(L, AA, LI, DT, BFI, + /*ScalarEvolution*/ nullptr); + } while (!PreorderLoops.empty()); + + if (!Changed) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); + return PA; +} + namespace { struct LegacyLoopSinkPass : public LoopPass { static char ID; diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 194587a..3638da1 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -129,6 +129,24 @@ static cl::opt<bool> EnablePhiElim( "enable-lsr-phielim", cl::Hidden, cl::init(true), cl::desc("Enable LSR phi elimination")); +// The flag adds instruction count to solutions cost comparision. +static cl::opt<bool> InsnsCost( + "lsr-insns-cost", cl::Hidden, cl::init(false), + cl::desc("Add instruction count to a LSR cost model")); + +// Flag to choose how to narrow complex lsr solution +static cl::opt<bool> LSRExpNarrow( + "lsr-exp-narrow", cl::Hidden, cl::init(false), + cl::desc("Narrow LSR complex solution using" + " expectation of registers number")); + +// Flag to narrow search space by filtering non-optimal formulae with +// the same ScaledReg and Scale. +static cl::opt<bool> FilterSameScaledReg( + "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true), + cl::desc("Narrow LSR search space by filtering non-optimal formulae" + " with the same ScaledReg and Scale")); + #ifndef NDEBUG // Stress test IV chain generation. static cl::opt<bool> StressIVChain( @@ -181,10 +199,11 @@ void RegSortData::print(raw_ostream &OS) const { OS << "[NumUses=" << UsedByIndices.count() << ']'; } -LLVM_DUMP_METHOD -void RegSortData::dump() const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void RegSortData::dump() const { print(errs()); errs() << '\n'; } +#endif namespace { @@ -295,9 +314,13 @@ struct Formula { /// canonical representation of a formula is /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty(). + /// 3. The reg containing recurrent expr related with currect loop in the + /// formula should be put in the ScaledReg. /// #1 enforces that the scaled register is always used when at least two /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2. /// #2 enforces that 1 * reg is reg. + /// #3 ensures invariant regs with respect to current loop can be combined + /// together in LSR codegen. /// This invariant can be temporarly broken while building a formula. /// However, every formula inserted into the LSRInstance must be in canonical /// form. 
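A small illustration of the new canonical-form rule #3 above, in the reg() notation used by these comments (the value a and loop L are hypothetical):

    // For a use in loop L:
    //   ScaledReg = {0,+,1}<L>, Scale = 1, BaseRegs = { reg(a) }      // canonical
    //   ScaledReg = reg(a),     Scale = 1, BaseRegs = { {0,+,1}<L> }  // violates rule #3
    // canonicalize(L) swaps the current-loop addrec into ScaledReg so that the
    // loop-invariant part, reg(a), can be combined across formulae in codegen.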
@@ -318,12 +341,14 @@ struct Formula { void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE); - bool isCanonical() const; + bool isCanonical(const Loop &L) const; - void canonicalize(); + void canonicalize(const Loop &L); bool unscale(); + bool hasZeroEnd() const; + size_t getNumRegs() const; Type *getType() const; @@ -410,16 +435,35 @@ void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) { BaseRegs.push_back(Sum); HasBaseReg = true; } - canonicalize(); + canonicalize(*L); } /// \brief Check whether or not this formula statisfies the canonical /// representation. /// \see Formula::BaseRegs. -bool Formula::isCanonical() const { - if (ScaledReg) - return Scale != 1 || !BaseRegs.empty(); - return BaseRegs.size() <= 1; +bool Formula::isCanonical(const Loop &L) const { + if (!ScaledReg) + return BaseRegs.size() <= 1; + + if (Scale != 1) + return true; + + if (Scale == 1 && BaseRegs.empty()) + return false; + + const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg); + if (SAR && SAR->getLoop() == &L) + return true; + + // If ScaledReg is not a recurrent expr, or it is but its loop is not current + // loop, meanwhile BaseRegs contains a recurrent expr reg related with current + // loop, we want to swap the reg in BaseRegs with ScaledReg. + auto I = + find_if(make_range(BaseRegs.begin(), BaseRegs.end()), [&](const SCEV *S) { + return isa<const SCEVAddRecExpr>(S) && + (cast<SCEVAddRecExpr>(S)->getLoop() == &L); + }); + return I == BaseRegs.end(); } /// \brief Helper method to morph a formula into its canonical representation. @@ -428,21 +472,33 @@ bool Formula::isCanonical() const { /// field. Otherwise, we would have to do special cases everywhere in LSR /// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ... /// On the other hand, 1*reg should be canonicalized into reg. -void Formula::canonicalize() { - if (isCanonical()) +void Formula::canonicalize(const Loop &L) { + if (isCanonical(L)) return; // So far we did not need this case. This is easy to implement but it is // useless to maintain dead code. Beside it could hurt compile time. assert(!BaseRegs.empty() && "1*reg => reg, should not be needed."); + // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg. - ScaledReg = BaseRegs.back(); - BaseRegs.pop_back(); - Scale = 1; - size_t BaseRegsSize = BaseRegs.size(); - size_t Try = 0; - // If ScaledReg is an invariant, try to find a variant expression. - while (Try < BaseRegsSize && !isa<SCEVAddRecExpr>(ScaledReg)) - std::swap(ScaledReg, BaseRegs[Try++]); + if (!ScaledReg) { + ScaledReg = BaseRegs.back(); + BaseRegs.pop_back(); + Scale = 1; + } + + // If ScaledReg is an invariant with respect to L, find the reg from + // BaseRegs containing the recurrent expr related with Loop L. Swap the + // reg with ScaledReg. + const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg); + if (!SAR || SAR->getLoop() != &L) { + auto I = find_if(make_range(BaseRegs.begin(), BaseRegs.end()), + [&](const SCEV *S) { + return isa<const SCEVAddRecExpr>(S) && + (cast<SCEVAddRecExpr>(S)->getLoop() == &L); + }); + if (I != BaseRegs.end()) + std::swap(ScaledReg, *I); + } } /// \brief Get rid of the scale in the formula. @@ -458,6 +514,14 @@ bool Formula::unscale() { return true; } +bool Formula::hasZeroEnd() const { + if (UnfoldedOffset || BaseOffset) + return false; + if (BaseRegs.size() != 1 || ScaledReg) + return false; + return true; +} + /// Return the total number of register operands used by this formula. 
This does /// not include register uses implied by non-constant addrec strides. size_t Formula::getNumRegs() const { @@ -534,10 +598,11 @@ void Formula::print(raw_ostream &OS) const { } } -LLVM_DUMP_METHOD -void Formula::dump() const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void Formula::dump() const { print(errs()); errs() << '\n'; } +#endif /// Return true if the given addrec can be sign-extended without changing its /// value. @@ -711,7 +776,7 @@ static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) { static bool isAddressUse(Instruction *Inst, Value *OperandVal) { bool isAddress = isa<LoadInst>(Inst); if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - if (SI->getOperand(1) == OperandVal) + if (SI->getPointerOperand() == OperandVal) isAddress = true; } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { // Addressing modes can also be folded into prefetches and a variety @@ -723,6 +788,12 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) { isAddress = true; break; } + } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) { + if (RMW->getPointerOperand() == OperandVal) + isAddress = true; + } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) { + if (CmpX->getPointerOperand() == OperandVal) + isAddress = true; } return isAddress; } @@ -735,6 +806,10 @@ static MemAccessTy getAccessType(const Instruction *Inst) { AccessTy.AddrSpace = SI->getPointerAddressSpace(); } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) { AccessTy.AddrSpace = LI->getPointerAddressSpace(); + } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) { + AccessTy.AddrSpace = RMW->getPointerAddressSpace(); + } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) { + AccessTy.AddrSpace = CmpX->getPointerAddressSpace(); } // All pointers have the same requirements, so canonicalize them to an @@ -832,7 +907,7 @@ static bool isHighCostExpansion(const SCEV *S, /// If any of the instructions is the specified set are trivially dead, delete /// them and see if this makes any of their operands subsequently dead. static bool -DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) { +DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakTrackingVH> &DeadInsts) { bool Changed = false; while (!DeadInsts.empty()) { @@ -875,44 +950,44 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F); // Get the cost of the scaling factor used in F for LU. static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, - const LSRUse &LU, const Formula &F); + const LSRUse &LU, const Formula &F, + const Loop &L); namespace { /// This class is used to measure and compare candidate formulae. class Cost { - /// TODO: Some of these could be merged. Also, a lexical ordering - /// isn't always optimal. 
- unsigned NumRegs; - unsigned AddRecCost; - unsigned NumIVMuls; - unsigned NumBaseAdds; - unsigned ImmCost; - unsigned SetupCost; - unsigned ScaleCost; + TargetTransformInfo::LSRCost C; public: - Cost() - : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0), - SetupCost(0), ScaleCost(0) {} + Cost() { + C.Insns = 0; + C.NumRegs = 0; + C.AddRecCost = 0; + C.NumIVMuls = 0; + C.NumBaseAdds = 0; + C.ImmCost = 0; + C.SetupCost = 0; + C.ScaleCost = 0; + } - bool operator<(const Cost &Other) const; + bool isLess(Cost &Other, const TargetTransformInfo &TTI); void Lose(); #ifndef NDEBUG // Once any of the metrics loses, they must all remain losers. bool isValid() { - return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds - | ImmCost | SetupCost | ScaleCost) != ~0u) - || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds - & ImmCost & SetupCost & ScaleCost) == ~0u); + return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds + | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u) + || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds + & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u); } #endif bool isLoser() { assert(isValid() && "invalid cost"); - return NumRegs == ~0u; + return C.NumRegs == ~0u; } void RateFormula(const TargetTransformInfo &TTI, @@ -1067,7 +1142,8 @@ public: } bool HasFormulaWithSameRegs(const Formula &F) const; - bool InsertFormula(const Formula &F); + float getNotSelectedProbability(const SCEV *Reg) const; + bool InsertFormula(const Formula &F, const Loop &L); void DeleteFormula(Formula &F); void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses); @@ -1083,20 +1159,26 @@ void Cost::RateRegister(const SCEV *Reg, const Loop *L, ScalarEvolution &SE, DominatorTree &DT) { if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) { - // If this is an addrec for another loop, don't second-guess its addrec phi - // nodes. LSR isn't currently smart enough to reason about more than one - // loop at a time. LSR has already run on inner loops, will not run on outer - // loops, and cannot be expected to change sibling loops. + // If this is an addrec for another loop, it should be an invariant + // with respect to L since L is the innermost loop (at least + // for now LSR only handles innermost loops). if (AR->getLoop() != L) { // If the AddRec exists, consider it's register free and leave it alone. if (isExistingPhi(AR, SE)) return; - // Otherwise, do not consider this formula at all. - Lose(); + // It is bad to allow LSR for current loop to add induction variables + // for its sibling loops. + if (!AR->getLoop()->contains(L)) { + Lose(); + return; + } + + // Otherwise, it will be an invariant with respect to Loop L. + ++C.NumRegs; return; } - AddRecCost += 1; /// TODO: This should be a function of the stride. + C.AddRecCost += 1; /// TODO: This should be a function of the stride. // Add the step value register, if it needs one. // TODO: The non-affine case isn't precisely modeled here. @@ -1108,7 +1190,7 @@ void Cost::RateRegister(const SCEV *Reg, } } } - ++NumRegs; + ++C.NumRegs; // Rough heuristic; favor registers which don't require extra setup // instructions in the preheader. 
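A concrete reading of the new addrec handling in RateRegister above (the loop names are hypothetical): while optimizing the inner j-loop of a nest for(i) { for(j) { ... } }, an addrec of the enclosing i-loop is invariant in the j-loop and is now simply counted as one register, whereas an addrec belonging to a sibling loop that does not contain the j-loop still makes the formula lose, since LSR must not introduce induction variables for sibling loops.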
@@ -1117,9 +1199,9 @@ void Cost::RateRegister(const SCEV *Reg, !(isa<SCEVAddRecExpr>(Reg) && (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) || isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart())))) - ++SetupCost; + ++C.SetupCost; - NumIVMuls += isa<SCEVMulExpr>(Reg) && + C.NumIVMuls += isa<SCEVMulExpr>(Reg) && SE.hasComputableLoopEvolution(Reg, L); } @@ -1150,8 +1232,11 @@ void Cost::RateFormula(const TargetTransformInfo &TTI, ScalarEvolution &SE, DominatorTree &DT, const LSRUse &LU, SmallPtrSetImpl<const SCEV *> *LoserRegs) { - assert(F.isCanonical() && "Cost is accurate only for canonical formula"); + assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula"); // Tally up the registers. + unsigned PrevAddRecCost = C.AddRecCost; + unsigned PrevNumRegs = C.NumRegs; + unsigned PrevNumBaseAdds = C.NumBaseAdds; if (const SCEV *ScaledReg = F.ScaledReg) { if (VisitedRegs.count(ScaledReg)) { Lose(); @@ -1176,73 +1261,113 @@ void Cost::RateFormula(const TargetTransformInfo &TTI, if (NumBaseParts > 1) // Do not count the base and a possible second register if the target // allows to fold 2 registers. - NumBaseAdds += + C.NumBaseAdds += NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(TTI, LU, F))); - NumBaseAdds += (F.UnfoldedOffset != 0); + C.NumBaseAdds += (F.UnfoldedOffset != 0); // Accumulate non-free scaling amounts. - ScaleCost += getScalingFactorCost(TTI, LU, F); + C.ScaleCost += getScalingFactorCost(TTI, LU, F, *L); // Tally up the non-zero immediates. for (const LSRFixup &Fixup : LU.Fixups) { int64_t O = Fixup.Offset; int64_t Offset = (uint64_t)O + F.BaseOffset; if (F.BaseGV) - ImmCost += 64; // Handle symbolic values conservatively. + C.ImmCost += 64; // Handle symbolic values conservatively. // TODO: This should probably be the pointer size. else if (Offset != 0) - ImmCost += APInt(64, Offset, true).getMinSignedBits(); + C.ImmCost += APInt(64, Offset, true).getMinSignedBits(); // Check with target if this offset with this instruction is // specifically not supported. if ((isa<LoadInst>(Fixup.UserInst) || isa<StoreInst>(Fixup.UserInst)) && !TTI.isFoldableMemAccessOffset(Fixup.UserInst, Offset)) - NumBaseAdds++; + C.NumBaseAdds++; + } + + // If we don't count instruction cost exit here. + if (!InsnsCost) { + assert(isValid() && "invalid cost"); + return; + } + + // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as + // additional instruction (at least fill). + unsigned TTIRegNum = TTI.getNumberOfRegisters(false) - 1; + if (C.NumRegs > TTIRegNum) { + // Cost already exceeded TTIRegNum, then only newly added register can add + // new instructions. + if (PrevNumRegs > TTIRegNum) + C.Insns += (C.NumRegs - PrevNumRegs); + else + C.Insns += (C.NumRegs - TTIRegNum); } + + // If ICmpZero formula ends with not 0, it could not be replaced by + // just add or sub. We'll need to compare final result of AddRec. + // That means we'll need an additional instruction. + // For -10 + {0, +, 1}: + // i = i + 1; + // cmp i, 10 + // + // For {-10, +, 1}: + // i = i + 1; + if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd()) + C.Insns++; + // Each new AddRec adds 1 instruction to calculation. + C.Insns += (C.AddRecCost - PrevAddRecCost); + + // BaseAdds adds instructions for unfolded registers. + if (LU.Kind != LSRUse::ICmpZero) + C.Insns += C.NumBaseAdds - PrevNumBaseAdds; assert(isValid() && "invalid cost"); } /// Set this cost to a losing value. 
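A worked example of the lsr-insns-cost accounting above, with hypothetical numbers: if TTI.getNumberOfRegisters(false) is 8, then TTIRegNum is 7; a formula that raises the running register count from 6 to 9 adds 9 - 7 = 2 to C.Insns (two registers beyond what the target can hold, hence expected fills), while a later formula raising it from 9 to 10 adds only 10 - 9 = 1, because the earlier overflow has already been charged.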
void Cost::Lose() { - NumRegs = ~0u; - AddRecCost = ~0u; - NumIVMuls = ~0u; - NumBaseAdds = ~0u; - ImmCost = ~0u; - SetupCost = ~0u; - ScaleCost = ~0u; + C.Insns = ~0u; + C.NumRegs = ~0u; + C.AddRecCost = ~0u; + C.NumIVMuls = ~0u; + C.NumBaseAdds = ~0u; + C.ImmCost = ~0u; + C.SetupCost = ~0u; + C.ScaleCost = ~0u; } /// Choose the lower cost. -bool Cost::operator<(const Cost &Other) const { - return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost, - ImmCost, SetupCost) < - std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls, - Other.NumBaseAdds, Other.ScaleCost, Other.ImmCost, - Other.SetupCost); +bool Cost::isLess(Cost &Other, const TargetTransformInfo &TTI) { + if (InsnsCost.getNumOccurrences() > 0 && InsnsCost && + C.Insns != Other.C.Insns) + return C.Insns < Other.C.Insns; + return TTI.isLSRCostLess(C, Other.C); } void Cost::print(raw_ostream &OS) const { - OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s"); - if (AddRecCost != 0) - OS << ", with addrec cost " << AddRecCost; - if (NumIVMuls != 0) - OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s"); - if (NumBaseAdds != 0) - OS << ", plus " << NumBaseAdds << " base add" - << (NumBaseAdds == 1 ? "" : "s"); - if (ScaleCost != 0) - OS << ", plus " << ScaleCost << " scale cost"; - if (ImmCost != 0) - OS << ", plus " << ImmCost << " imm cost"; - if (SetupCost != 0) - OS << ", plus " << SetupCost << " setup cost"; + if (InsnsCost) + OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s "); + OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s"); + if (C.AddRecCost != 0) + OS << ", with addrec cost " << C.AddRecCost; + if (C.NumIVMuls != 0) + OS << ", plus " << C.NumIVMuls << " IV mul" + << (C.NumIVMuls == 1 ? "" : "s"); + if (C.NumBaseAdds != 0) + OS << ", plus " << C.NumBaseAdds << " base add" + << (C.NumBaseAdds == 1 ? "" : "s"); + if (C.ScaleCost != 0) + OS << ", plus " << C.ScaleCost << " scale cost"; + if (C.ImmCost != 0) + OS << ", plus " << C.ImmCost << " imm cost"; + if (C.SetupCost != 0) + OS << ", plus " << C.SetupCost << " setup cost"; } -LLVM_DUMP_METHOD -void Cost::dump() const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void Cost::dump() const { print(errs()); errs() << '\n'; } +#endif LSRFixup::LSRFixup() : UserInst(nullptr), OperandValToReplace(nullptr), @@ -1285,10 +1410,11 @@ void LSRFixup::print(raw_ostream &OS) const { OS << ", Offset=" << Offset; } -LLVM_DUMP_METHOD -void LSRFixup::dump() const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LSRFixup::dump() const { print(errs()); errs() << '\n'; } +#endif /// Test whether this use as a formula which has the same registers as the given /// formula. @@ -1300,10 +1426,19 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { return Uniquifier.count(Key); } +/// The function returns a probability of selecting formula without Reg. +float LSRUse::getNotSelectedProbability(const SCEV *Reg) const { + unsigned FNum = 0; + for (const Formula &F : Formulae) + if (F.referencesReg(Reg)) + FNum++; + return ((float)(Formulae.size() - FNum)) / Formulae.size(); +} + /// If the given formula has not yet been inserted, add it to the list, and /// return true. Return false otherwise. The formula must be in canonical form. 
-bool LSRUse::InsertFormula(const Formula &F) { - assert(F.isCanonical() && "Invalid canonical representation"); +bool LSRUse::InsertFormula(const Formula &F, const Loop &L) { + assert(F.isCanonical(L) && "Invalid canonical representation"); if (!Formulae.empty() && RigidFormula) return false; @@ -1391,10 +1526,11 @@ void LSRUse::print(raw_ostream &OS) const { OS << ", widest fixup type: " << *WidestFixupType; } -LLVM_DUMP_METHOD -void LSRUse::dump() const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LSRUse::dump() const { print(errs()); errs() << '\n'; } +#endif static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, @@ -1472,7 +1608,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, int64_t MinOffset, int64_t MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, - const Formula &F) { + const Formula &F, const Loop &L) { // For the purpose of isAMCompletelyFolded either having a canonical formula // or a scale not equal to zero is correct. // Problems may arise from non canonical formulae having a scale == 0. @@ -1480,7 +1616,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, // However, when we generate the scaled formulae, we first check that the // scaling factor is profitable before computing the actual ScaledReg for // compile time sake. - assert((F.isCanonical() || F.Scale != 0)); + assert((F.isCanonical(L) || F.Scale != 0)); return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale); } @@ -1515,14 +1651,15 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, } static unsigned getScalingFactorCost(const TargetTransformInfo &TTI, - const LSRUse &LU, const Formula &F) { + const LSRUse &LU, const Formula &F, + const Loop &L) { if (!F.Scale) return 0; // If the use is not completely folded in that instruction, we will have to // pay an extra cost only for scale != 1. 
if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, - LU.AccessTy, F)) + LU.AccessTy, F, L)) return F.Scale != 1; switch (LU.Kind) { @@ -1718,7 +1855,7 @@ class LSRInstance { void FinalizeChain(IVChain &Chain); void CollectChains(); void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, - SmallVectorImpl<WeakVH> &DeadInsts); + SmallVectorImpl<WeakTrackingVH> &DeadInsts); void CollectInterestingTypesAndFactors(); void CollectFixupsAndInitialFormulae(); @@ -1772,6 +1909,8 @@ class LSRInstance { void NarrowSearchSpaceByDetectingSupersets(); void NarrowSearchSpaceByCollapsingUnrolledCode(); void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(); + void NarrowSearchSpaceByFilterFormulaWithSameScaledReg(); + void NarrowSearchSpaceByDeletingCostlyFormulas(); void NarrowSearchSpaceByPickingWinnerRegs(); void NarrowSearchSpaceUsingHeuristics(); @@ -1792,19 +1931,15 @@ class LSRInstance { const LSRUse &LU, SCEVExpander &Rewriter) const; - Value *Expand(const LSRUse &LU, const LSRFixup &LF, - const Formula &F, - BasicBlock::iterator IP, - SCEVExpander &Rewriter, - SmallVectorImpl<WeakVH> &DeadInsts) const; + Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F, + BasicBlock::iterator IP, SCEVExpander &Rewriter, + SmallVectorImpl<WeakTrackingVH> &DeadInsts) const; void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF, - const Formula &F, - SCEVExpander &Rewriter, - SmallVectorImpl<WeakVH> &DeadInsts) const; - void Rewrite(const LSRUse &LU, const LSRFixup &LF, - const Formula &F, + const Formula &F, SCEVExpander &Rewriter, + SmallVectorImpl<WeakTrackingVH> &DeadInsts) const; + void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F, SCEVExpander &Rewriter, - SmallVectorImpl<WeakVH> &DeadInsts) const; + SmallVectorImpl<WeakTrackingVH> &DeadInsts) const; void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution); public: @@ -2191,7 +2326,7 @@ LSRInstance::OptimizeLoopTermCond() { dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) { const ConstantInt *C = D->getValue(); // Stride of one or negative one can have reuse with non-addresses. - if (C->isOne() || C->isAllOnesValue()) + if (C->isOne() || C->isMinusOne()) goto decline_post_inc; // Avoid weird situations. if (C->getValue().getMinSignedBits() >= 64 || @@ -2492,7 +2627,12 @@ static Value *getWideOperand(Value *Oper) { static bool isCompatibleIVType(Value *LVal, Value *RVal) { Type *LType = LVal->getType(); Type *RType = RVal->getType(); - return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy()); + return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy() && + // Different address spaces means (possibly) + // different types of the pointer implementation, + // e.g. i16 vs i32 so disallow that. + (LType->getPointerAddressSpace() == + RType->getPointerAddressSpace())); } /// Return an approximation of this SCEV expression's "base", or NULL for any @@ -2881,7 +3021,7 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, /// Generate an add or subtract for each IVInc in a chain to materialize the IV /// user's operand from the previous IV user's operand. void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, - SmallVectorImpl<WeakVH> &DeadInsts) { + SmallVectorImpl<WeakTrackingVH> &DeadInsts) { // Find the new IVOperand for the head of the chain. It may have been replaced // by LSR. 
const IVInc &Head = Chain.Incs[0]; @@ -2989,8 +3129,10 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { User::op_iterator UseI = find(UserInst->operands(), U.getOperandValToReplace()); assert(UseI != UserInst->op_end() && "cannot find IV operand"); - if (IVIncSet.count(UseI)) + if (IVIncSet.count(UseI)) { + DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n'); continue; + } LSRUse::KindType Kind = LSRUse::Basic; MemAccessTy AccessTy; @@ -3025,8 +3167,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) { // S is normalized, so normalize N before folding it into S // to keep the result normalized. - N = TransformForPostIncUse(Normalize, N, CI, nullptr, - TmpPostIncLoops, SE, DT); + N = normalizeForPostIncUse(N, TmpPostIncLoops, SE); Kind = LSRUse::ICmpZero; S = SE.getMinusSCEV(N, S); } @@ -3108,7 +3249,8 @@ bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) { // Do not insert formula that we will not be able to expand. assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) && "Formula is illegal"); - if (!LU.InsertFormula(F)) + + if (!LU.InsertFormula(F, *L)) return false; CountRegisters(F, LUIdx); @@ -3347,7 +3489,7 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, F.BaseRegs.push_back(*J); // We may have changed the number of register in base regs, adjust the // formula accordingly. - F.canonicalize(); + F.canonicalize(*L); if (InsertFormula(LU, LUIdx, F)) // If that formula hadn't been seen before, recurse to find more like @@ -3359,7 +3501,7 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, /// Split out subexpressions from adds and the bases of addrecs. void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base, unsigned Depth) { - assert(Base.isCanonical() && "Input must be in the canonical form"); + assert(Base.isCanonical(*L) && "Input must be in the canonical form"); // Arbitrarily cap recursion to protect compile time. if (Depth >= 3) return; @@ -3400,7 +3542,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, // rather than proceed with zero in a register. if (!Sum->isZero()) { F.BaseRegs.push_back(Sum); - F.canonicalize(); + F.canonicalize(*L); (void)InsertFormula(LU, LUIdx, F); } } @@ -3457,7 +3599,7 @@ void LSRInstance::GenerateConstantOffsetsImpl( F.ScaledReg = nullptr; } else F.deleteBaseReg(F.BaseRegs[Idx]); - F.canonicalize(); + F.canonicalize(*L); } else if (IsScaledReg) F.ScaledReg = NewG; else @@ -3620,10 +3762,10 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { if (LU.Kind == LSRUse::ICmpZero && !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV) continue; - // For each addrec base reg, apply the scale, if possible. - for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) - if (const SCEVAddRecExpr *AR = - dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) { + // For each addrec base reg, if its loop is current loop, apply the scale. + for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]); + if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) { const SCEV *FactorS = SE.getConstant(IntTy, Factor); if (FactorS->isZero()) continue; @@ -3637,11 +3779,17 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { // The canonical representation of 1*reg is reg, which is already in // Base. 
In that case, do not try to insert the formula, it will be // rejected anyway. - if (F.Scale == 1 && F.BaseRegs.empty()) + if (F.Scale == 1 && (F.BaseRegs.empty() || + (AR->getLoop() != L && LU.AllFixupsOutsideLoop))) continue; + // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate + // non canonical Formula with ScaledReg's loop not being L. + if (F.Scale == 1 && LU.AllFixupsOutsideLoop) + F.canonicalize(*L); (void)InsertFormula(LU, LUIdx, F); } } + } } } @@ -3668,6 +3816,7 @@ void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) { if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses)) continue; + F.canonicalize(*L); (void)InsertFormula(LU, LUIdx, F); } } @@ -3697,10 +3846,11 @@ void WorkItem::print(raw_ostream &OS) const { << " , add offset " << Imm; } -LLVM_DUMP_METHOD -void WorkItem::dump() const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void WorkItem::dump() const { print(errs()); errs() << '\n'; } +#endif /// Look for registers which are a constant distance apart and try to form reuse /// opportunities between them. @@ -3764,8 +3914,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // Compute the difference between the two. int64_t Imm = (uint64_t)JImm - M->first; - for (int LUIdx = UsedByIndices.find_first(); LUIdx != -1; - LUIdx = UsedByIndices.find_next(LUIdx)) + for (unsigned LUIdx : UsedByIndices.set_bits()) // Make a memo of this use, offset, and register tuple. if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second) WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg)); @@ -3821,7 +3970,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { continue; // OK, looks good. - NewF.canonicalize(); + NewF.canonicalize(*this->L); (void)InsertFormula(LU, LUIdx, NewF); } else { // Use the immediate in a base register. @@ -3853,7 +4002,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { goto skip_formula; // Ok, looks good. - NewF.canonicalize(); + NewF.canonicalize(*this->L); (void)InsertFormula(LU, LUIdx, NewF); break; skip_formula:; @@ -3967,7 +4116,7 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { Cost CostBest; Regs.clear(); CostBest.RateFormula(TTI, Best, Regs, VisitedRegs, L, SE, DT, LU); - if (CostF < CostBest) + if (CostF.isLess(CostBest, TTI)) std::swap(F, Best); DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs()); dbgs() << "\n" @@ -4165,6 +4314,242 @@ void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){ } } +/// If a LSRUse has multiple formulae with the same ScaledReg and Scale. +/// Pick the best one and delete the others. +/// This narrowing heuristic is to keep as many formulae with different +/// Scale and ScaledReg pair as possible while narrowing the search space. +/// The benefit is that it is more likely to find out a better solution +/// from a formulae set with more Scale and ScaledReg variations than +/// a formulae set with the same Scale and ScaledReg. The picking winner +/// reg heurstic will often keep the formulae with the same Scale and +/// ScaledReg and filter others, and we want to avoid that if possible. +void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() { + if (EstimateSearchSpaceComplexity() < ComplexityLimit) + return; + + DEBUG(dbgs() << "The search space is too complex.\n" + "Narrowing the search space by choosing the best Formula " + "from the Formulae with the same Scale and ScaledReg.\n"); + + // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse. 
+ typedef DenseMap<std::pair<const SCEV *, int64_t>, size_t> BestFormulaeTy; + BestFormulaeTy BestFormulae; +#ifndef NDEBUG + bool ChangedFormulae = false; +#endif + DenseSet<const SCEV *> VisitedRegs; + SmallPtrSet<const SCEV *, 16> Regs; + + for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { + LSRUse &LU = Uses[LUIdx]; + DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n'); + + // Return true if Formula FA is better than Formula FB. + auto IsBetterThan = [&](Formula &FA, Formula &FB) { + // First we will try to choose the Formula with fewer new registers. + // For a register used by current Formula, the more the register is + // shared among LSRUses, the less we increase the register number + // counter of the formula. + size_t FARegNum = 0; + for (const SCEV *Reg : FA.BaseRegs) { + const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg); + FARegNum += (NumUses - UsedByIndices.count() + 1); + } + size_t FBRegNum = 0; + for (const SCEV *Reg : FB.BaseRegs) { + const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg); + FBRegNum += (NumUses - UsedByIndices.count() + 1); + } + if (FARegNum != FBRegNum) + return FARegNum < FBRegNum; + + // If the new register numbers are the same, choose the Formula with + // less Cost. + Cost CostFA, CostFB; + Regs.clear(); + CostFA.RateFormula(TTI, FA, Regs, VisitedRegs, L, SE, DT, LU); + Regs.clear(); + CostFB.RateFormula(TTI, FB, Regs, VisitedRegs, L, SE, DT, LU); + return CostFA.isLess(CostFB, TTI); + }; + + bool Any = false; + for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms; + ++FIdx) { + Formula &F = LU.Formulae[FIdx]; + if (!F.ScaledReg) + continue; + auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx}); + if (P.second) + continue; + + Formula &Best = LU.Formulae[P.first->second]; + if (IsBetterThan(F, Best)) + std::swap(F, Best); + DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs()); + dbgs() << "\n" + " in favor of formula "; + Best.print(dbgs()); dbgs() << '\n'); +#ifndef NDEBUG + ChangedFormulae = true; +#endif + LU.DeleteFormula(F); + --FIdx; + --NumForms; + Any = true; + } + if (Any) + LU.RecomputeRegs(LUIdx, RegUses); + + // Reset this to prepare for the next use. + BestFormulae.clear(); + } + + DEBUG(if (ChangedFormulae) { + dbgs() << "\n" + "After filtering out undesirable candidates:\n"; + print_uses(dbgs()); + }); +} + +/// The function delete formulas with high registers number expectation. +/// Assuming we don't know the value of each formula (already delete +/// all inefficient), generate probability of not selecting for each +/// register. +/// For example, +/// Use1: +/// reg(a) + reg({0,+,1}) +/// reg(a) + reg({-1,+,1}) + 1 +/// reg({a,+,1}) +/// Use2: +/// reg(b) + reg({0,+,1}) +/// reg(b) + reg({-1,+,1}) + 1 +/// reg({b,+,1}) +/// Use3: +/// reg(c) + reg(b) + reg({0,+,1}) +/// reg(c) + reg({b,+,1}) +/// +/// Probability of not selecting +/// Use1 Use2 Use3 +/// reg(a) (1/3) * 1 * 1 +/// reg(b) 1 * (1/3) * (1/2) +/// reg({0,+,1}) (2/3) * (2/3) * (1/2) +/// reg({-1,+,1}) (2/3) * (2/3) * 1 +/// reg({a,+,1}) (2/3) * 1 * 1 +/// reg({b,+,1}) 1 * (2/3) * (2/3) +/// reg(c) 1 * 1 * 0 +/// +/// Now count registers number mathematical expectation for each formula: +/// Note that for each use we exclude probability if not selecting for the use. +/// For example for Use1 probability for reg(a) would be just 1 * 1 (excluding +/// probabilty 1/3 of not selecting for Use1). 
+/// Use1: +/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted +/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted +/// reg({a,+,1}) 1 +/// Use2: +/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted +/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted +/// reg({b,+,1}) 2/3 +/// Use3: +/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted +/// reg(c) + reg({b,+,1}) 1 + 2/3 + +void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() { + if (EstimateSearchSpaceComplexity() < ComplexityLimit) + return; + // Ok, we have too many of formulae on our hands to conveniently handle. + // Use a rough heuristic to thin out the list. + + // Set of Regs wich will be 100% used in final solution. + // Used in each formula of a solution (in example above this is reg(c)). + // We can skip them in calculations. + SmallPtrSet<const SCEV *, 4> UniqRegs; + DEBUG(dbgs() << "The search space is too complex.\n"); + + // Map each register to probability of not selecting + DenseMap <const SCEV *, float> RegNumMap; + for (const SCEV *Reg : RegUses) { + if (UniqRegs.count(Reg)) + continue; + float PNotSel = 1; + for (const LSRUse &LU : Uses) { + if (!LU.Regs.count(Reg)) + continue; + float P = LU.getNotSelectedProbability(Reg); + if (P != 0.0) + PNotSel *= P; + else + UniqRegs.insert(Reg); + } + RegNumMap.insert(std::make_pair(Reg, PNotSel)); + } + + DEBUG(dbgs() << "Narrowing the search space by deleting costly formulas\n"); + + // Delete formulas where registers number expectation is high. + for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { + LSRUse &LU = Uses[LUIdx]; + // If nothing to delete - continue. + if (LU.Formulae.size() < 2) + continue; + // This is temporary solution to test performance. Float should be + // replaced with round independent type (based on integers) to avoid + // different results for different target builds. + float FMinRegNum = LU.Formulae[0].getNumRegs(); + float FMinARegNum = LU.Formulae[0].getNumRegs(); + size_t MinIdx = 0; + for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) { + Formula &F = LU.Formulae[i]; + float FRegNum = 0; + float FARegNum = 0; + for (const SCEV *BaseReg : F.BaseRegs) { + if (UniqRegs.count(BaseReg)) + continue; + FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg); + if (isa<SCEVAddRecExpr>(BaseReg)) + FARegNum += + RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg); + } + if (const SCEV *ScaledReg = F.ScaledReg) { + if (!UniqRegs.count(ScaledReg)) { + FRegNum += + RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg); + if (isa<SCEVAddRecExpr>(ScaledReg)) + FARegNum += + RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg); + } + } + if (FMinRegNum > FRegNum || + (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) { + FMinRegNum = FRegNum; + FMinARegNum = FARegNum; + MinIdx = i; + } + } + DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs()); + dbgs() << " with min reg num " << FMinRegNum << '\n'); + if (MinIdx != 0) + std::swap(LU.Formulae[MinIdx], LU.Formulae[0]); + while (LU.Formulae.size() != 1) { + DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs()); + dbgs() << '\n'); + LU.Formulae.pop_back(); + } + LU.RecomputeRegs(LUIdx, RegUses); + assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula"); + Formula &F = LU.Formulae[0]; + DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n'); + // When we choose the formula, the regs become unique. 
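
The expected-register-count heuristic documented in the comment block above can be reproduced with a few lines of standalone C++. The sketch below is illustrative only: Formula, Use, and notSelected are invented names, not the LSR code, and each candidate formula of a use is assumed equally likely to be picked. It prints the per-formula expectations for the Use1/Use2/Use3 example (the contribution of a register to a formula is the probability that no other use selects that register).

// Standalone sketch of the "probability of not selecting" metric.
// Not the LSR implementation; all names here are invented.
#include <cstdio>
#include <set>
#include <string>
#include <vector>

using Formula = std::set<std::string>; // register names appearing in a formula
using Use = std::vector<Formula>;      // candidate formulae for one use

// P(U does not select R): fraction of U's formulae that do not contain R,
// assuming each formula of U is equally likely to win.
static double notSelected(const Use &U, const std::string &R) {
  double Without = 0;
  for (const Formula &F : U)
    if (!F.count(R))
      Without += 1;
  return Without / U.size();
}

int main() {
  std::vector<Use> Uses = {
      /*Use1*/ {{"a", "{0,+,1}"}, {"a", "{-1,+,1}"}, {"{a,+,1}"}},
      /*Use2*/ {{"b", "{0,+,1}"}, {"b", "{-1,+,1}"}, {"{b,+,1}"}},
      /*Use3*/ {{"c", "b", "{0,+,1}"}, {"c", "{b,+,1}"}}};

  for (size_t I = 0; I != Uses.size(); ++I) {
    std::printf("Use%zu:\n", I + 1);
    for (const Formula &F : Uses[I]) {
      double Expected = 0;
      for (const std::string &R : F) {
        double P = 1;
        for (size_t J = 0; J != Uses.size(); ++J)
          if (J != I)
            P *= notSelected(Uses[J], R); // chance no other use provides R
        Expected += P;
      }
      // The heuristic keeps only the formula with the smallest expectation.
      std::printf("  formula with %zu regs -> expected new regs %.3f\n",
                  F.size(), Expected);
    }
  }
  return 0;
}
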
+ UniqRegs.insert(F.BaseRegs.begin(), F.BaseRegs.end()); + if (F.ScaledReg) + UniqRegs.insert(F.ScaledReg); + } + DEBUG(dbgs() << "After pre-selection:\n"; + print_uses(dbgs())); +} + + /// Pick a register which seems likely to be profitable, and then in any use /// which has any reference to that register, delete all formulae which do not /// reference that register. @@ -4237,7 +4622,12 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() { NarrowSearchSpaceByDetectingSupersets(); NarrowSearchSpaceByCollapsingUnrolledCode(); NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(); - NarrowSearchSpaceByPickingWinnerRegs(); + if (FilterSameScaledReg) + NarrowSearchSpaceByFilterFormulaWithSameScaledReg(); + if (LSRExpNarrow) + NarrowSearchSpaceByDeletingCostlyFormulas(); + else + NarrowSearchSpaceByPickingWinnerRegs(); } /// This is the recursive solver. @@ -4294,7 +4684,7 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution, NewCost = CurCost; NewRegs = CurRegs; NewCost.RateFormula(TTI, F, NewRegs, VisitedRegs, L, SE, DT, LU); - if (NewCost < SolutionCost) { + if (NewCost.isLess(SolutionCost, TTI)) { Workspace.push_back(&F); if (Workspace.size() != Uses.size()) { SolveRecurse(Solution, SolutionCost, Workspace, NewCost, @@ -4476,12 +4866,10 @@ LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP, /// Emit instructions for the leading candidate expression for this LSRUse (this /// is called "expanding"). -Value *LSRInstance::Expand(const LSRUse &LU, - const LSRFixup &LF, - const Formula &F, - BasicBlock::iterator IP, +Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF, + const Formula &F, BasicBlock::iterator IP, SCEVExpander &Rewriter, - SmallVectorImpl<WeakVH> &DeadInsts) const { + SmallVectorImpl<WeakTrackingVH> &DeadInsts) const { if (LU.RigidFormula) return LF.OperandValToReplace; @@ -4515,11 +4903,7 @@ Value *LSRInstance::Expand(const LSRUse &LU, assert(!Reg->isZero() && "Zero allocated in a base register!"); // If we're expanding for a post-inc user, make the post-inc adjustment. - PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops); - Reg = TransformForPostIncUse(Denormalize, Reg, - LF.UserInst, LF.OperandValToReplace, - Loops, SE, DT); - + Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE); Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr))); } @@ -4530,9 +4914,7 @@ Value *LSRInstance::Expand(const LSRUse &LU, // If we're expanding for a post-inc user, make the post-inc adjustment. PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops); - ScaledS = TransformForPostIncUse(Denormalize, ScaledS, - LF.UserInst, LF.OperandValToReplace, - Loops, SE, DT); + ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE); if (LU.Kind == LSRUse::ICmpZero) { // Expand ScaleReg as if it was part of the base regs. @@ -4662,12 +5044,9 @@ Value *LSRInstance::Expand(const LSRUse &LU, /// Helper for Rewrite. PHI nodes are special because the use of their operands /// effectively happens in their predecessor blocks, so the expression may need /// to be expanded in multiple places. 
-void LSRInstance::RewriteForPHI(PHINode *PN, - const LSRUse &LU, - const LSRFixup &LF, - const Formula &F, - SCEVExpander &Rewriter, - SmallVectorImpl<WeakVH> &DeadInsts) const { +void LSRInstance::RewriteForPHI( + PHINode *PN, const LSRUse &LU, const LSRFixup &LF, const Formula &F, + SCEVExpander &Rewriter, SmallVectorImpl<WeakTrackingVH> &DeadInsts) const { DenseMap<BasicBlock *, Value *> Inserted; for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) if (PN->getIncomingValue(i) == LF.OperandValToReplace) { @@ -4739,11 +5118,9 @@ void LSRInstance::RewriteForPHI(PHINode *PN, /// Emit instructions for the leading candidate expression for this LSRUse (this /// is called "expanding"), and update the UserInst to reference the newly /// expanded value. -void LSRInstance::Rewrite(const LSRUse &LU, - const LSRFixup &LF, - const Formula &F, - SCEVExpander &Rewriter, - SmallVectorImpl<WeakVH> &DeadInsts) const { +void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF, + const Formula &F, SCEVExpander &Rewriter, + SmallVectorImpl<WeakTrackingVH> &DeadInsts) const { // First, find an insertion point that dominates UserInst. For PHI nodes, // find the nearest block which dominates all the relevant uses. if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) { @@ -4781,7 +5158,7 @@ void LSRInstance::ImplementSolution( const SmallVectorImpl<const Formula *> &Solution) { // Keep track of instructions we may have made dead, so that // we can remove them after we are done working. - SmallVector<WeakVH, 16> DeadInsts; + SmallVector<WeakTrackingVH, 16> DeadInsts; SCEVExpander Rewriter(SE, L->getHeader()->getModule()->getDataLayout(), "lsr"); @@ -4975,10 +5352,11 @@ void LSRInstance::print(raw_ostream &OS) const { print_uses(OS); } -LLVM_DUMP_METHOD -void LSRInstance::dump() const { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void LSRInstance::dump() const { print(errs()); errs() << '\n'; } +#endif namespace { @@ -5030,7 +5408,7 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, // Remove any extra phis created by processing inner loops. 
Changed |= DeleteDeadPHIs(L->getHeader()); if (EnablePhiElim && L->isLoopSimplifyForm()) { - SmallVector<WeakVH, 16> DeadInsts; + SmallVector<WeakTrackingVH, 16> DeadInsts; const DataLayout &DL = L->getHeader()->getModule()->getDataLayout(); SCEVExpander Rewriter(SE, DL, "lsr"); #ifndef NDEBUG diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index c7f9122..530a684 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -44,7 +44,11 @@ using namespace llvm; static cl::opt<unsigned> UnrollThreshold("unroll-threshold", cl::Hidden, - cl::desc("The baseline cost threshold for loop unrolling")); + cl::desc("The cost threshold for loop unrolling")); + +static cl::opt<unsigned> UnrollPartialThreshold( + "unroll-partial-threshold", cl::Hidden, + cl::desc("The cost threshold for partial loop unrolling")); static cl::opt<unsigned> UnrollMaxPercentThresholdBoost( "unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden, @@ -106,10 +110,19 @@ static cl::opt<unsigned> FlatLoopTripCountThreshold( "aggressively unrolled.")); static cl::opt<bool> - UnrollAllowPeeling("unroll-allow-peeling", cl::Hidden, + UnrollAllowPeeling("unroll-allow-peeling", cl::init(true), cl::Hidden, cl::desc("Allows loops to be peeled when the dynamic " "trip count is known to be low.")); +// This option isn't ever intended to be enabled, it serves to allow +// experiments to check the assumptions about when this kind of revisit is +// necessary. +static cl::opt<bool> UnrollRevisitChildLoops( + "unroll-revisit-child-loops", cl::Hidden, + cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. " + "This shouldn't typically be needed as child loops (or their " + "clones) were already visited.")); + /// A magic value for use with the Threshold parameter to indicate /// that the loop unroll should be performed regardless of how much /// code expansion would result. @@ -118,16 +131,17 @@ static const unsigned NoThreshold = UINT_MAX; /// Gather the various unrolling parameters based on the defaults, compiler /// flags, TTI overrides and user specified parameters. static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( - Loop *L, const TargetTransformInfo &TTI, Optional<unsigned> UserThreshold, - Optional<unsigned> UserCount, Optional<bool> UserAllowPartial, - Optional<bool> UserRuntime, Optional<bool> UserUpperBound) { + Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, int OptLevel, + Optional<unsigned> UserThreshold, Optional<unsigned> UserCount, + Optional<bool> UserAllowPartial, Optional<bool> UserRuntime, + Optional<bool> UserUpperBound) { TargetTransformInfo::UnrollingPreferences UP; // Set up the defaults - UP.Threshold = 150; + UP.Threshold = OptLevel > 2 ? 
300 : 150; UP.MaxPercentThresholdBoost = 400; UP.OptSizeThreshold = 0; - UP.PartialThreshold = UP.Threshold; + UP.PartialThreshold = 150; UP.PartialOptSizeThreshold = 0; UP.Count = 0; UP.PeelCount = 0; @@ -141,10 +155,10 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( UP.AllowExpensiveTripCount = false; UP.Force = false; UP.UpperBound = false; - UP.AllowPeeling = false; + UP.AllowPeeling = true; // Override with any target specific settings - TTI.getUnrollingPreferences(L, UP); + TTI.getUnrollingPreferences(L, SE, UP); // Apply size attributes if (L->getHeader()->getParent()->optForSize()) { @@ -153,10 +167,10 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( } // Apply any user values specified by cl::opt - if (UnrollThreshold.getNumOccurrences() > 0) { + if (UnrollThreshold.getNumOccurrences() > 0) UP.Threshold = UnrollThreshold; - UP.PartialThreshold = UnrollThreshold; - } + if (UnrollPartialThreshold.getNumOccurrences() > 0) + UP.PartialThreshold = UnrollPartialThreshold; if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0) UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost; if (UnrollMaxCount.getNumOccurrences() > 0) @@ -495,7 +509,7 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT, KnownSucc = SI->getSuccessor(0); else if (ConstantInt *SimpleCondVal = dyn_cast<ConstantInt>(SimpleCond)) - KnownSucc = SI->findCaseValue(SimpleCondVal).getCaseSuccessor(); + KnownSucc = SI->findCaseValue(SimpleCondVal)->getCaseSuccessor(); } } if (KnownSucc) { @@ -685,7 +699,7 @@ static uint64_t getUnrolledLoopSize( // Calculates unroll count and writes it to UP.Count. static bool computeUnrollCount( Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, - ScalarEvolution *SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount, + ScalarEvolution &SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount, unsigned &TripMultiple, unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) { // Check for explicit Count. @@ -756,7 +770,7 @@ static bool computeUnrollCount( // helps to remove a significant number of instructions. // To check that, run additional analysis on the loop. if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost( - L, FullUnrollTripCount, DT, *SE, TTI, + L, FullUnrollTripCount, DT, SE, TTI, UP.Threshold * UP.MaxPercentThresholdBoost / 100)) { unsigned Boost = getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost); @@ -770,7 +784,15 @@ static bool computeUnrollCount( } } - // 4rd priority is partial unrolling. + // 4th priority is loop peeling + computePeelCount(L, LoopSize, UP, TripCount); + if (UP.PeelCount) { + UP.Runtime = false; + UP.Count = 1; + return ExplicitUnroll; + } + + // 5th priority is partial unrolling. // Try partial unroll only when TripCount could be staticaly calculated. 
if (TripCount) { UP.Partial |= ExplicitUnroll; @@ -814,6 +836,8 @@ static bool computeUnrollCount( } else { UP.Count = TripCount; } + if (UP.Count > UP.MaxCount) + UP.Count = UP.MaxCount; if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount && UP.Count != TripCount) ORE->emit( @@ -833,14 +857,6 @@ static bool computeUnrollCount( << "Unable to fully unroll loop as directed by unroll(full) pragma " "because loop has a runtime trip count."); - // 5th priority is loop peeling - computePeelCount(L, LoopSize, UP); - if (UP.PeelCount) { - UP.Runtime = false; - UP.Count = 1; - return ExplicitUnroll; - } - // 6th priority is runtime unrolling. // Don't unroll a runtime trip count loop when it is disabled. if (HasRuntimeUnrollDisablePragma(L)) { @@ -912,9 +928,9 @@ static bool computeUnrollCount( } static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, - ScalarEvolution *SE, const TargetTransformInfo &TTI, + ScalarEvolution &SE, const TargetTransformInfo &TTI, AssumptionCache &AC, OptimizationRemarkEmitter &ORE, - bool PreserveLCSSA, + bool PreserveLCSSA, int OptLevel, Optional<unsigned> ProvidedCount, Optional<unsigned> ProvidedThreshold, Optional<bool> ProvidedAllowPartial, @@ -934,8 +950,8 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, bool NotDuplicatable; bool Convergent; TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( - L, TTI, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial, - ProvidedRuntime, ProvidedUpperBound); + L, SE, TTI, OptLevel, ProvidedThreshold, ProvidedCount, + ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound); // Exit early if unrolling is disabled. if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0)) return false; @@ -963,8 +979,8 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, if (!ExitingBlock || !L->isLoopExiting(ExitingBlock)) ExitingBlock = L->getExitingBlock(); if (ExitingBlock) { - TripCount = SE->getSmallConstantTripCount(L, ExitingBlock); - TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock); + TripCount = SE.getSmallConstantTripCount(L, ExitingBlock); + TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock); } // If the loop contains a convergent operation, the prelude we'd add @@ -986,8 +1002,8 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, // count. bool MaxOrZero = false; if (!TripCount) { - MaxTripCount = SE->getSmallConstantMaxTripCount(L); - MaxOrZero = SE->isBackedgeTakenCountMaxOrZero(L); + MaxTripCount = SE.getSmallConstantMaxTripCount(L); + MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L); // We can unroll by the upper bound amount if it's generally allowed or if // we know that the loop is executed either the upper bound or zero times. // (MaxOrZero unrolling keeps only the first loop test, so the number of @@ -1016,7 +1032,7 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, // Unroll the loop. 
if (!UnrollLoop(L, UP.Count, TripCount, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount, UseUpperBound, MaxOrZero, - TripMultiple, UP.PeelCount, LI, SE, &DT, &AC, &ORE, + TripMultiple, UP.PeelCount, LI, &SE, &DT, &AC, &ORE, PreserveLCSSA)) return false; @@ -1034,16 +1050,17 @@ namespace { class LoopUnroll : public LoopPass { public: static char ID; // Pass ID, replacement for typeid - LoopUnroll(Optional<unsigned> Threshold = None, + LoopUnroll(int OptLevel = 2, Optional<unsigned> Threshold = None, Optional<unsigned> Count = None, Optional<bool> AllowPartial = None, Optional<bool> Runtime = None, Optional<bool> UpperBound = None) - : LoopPass(ID), ProvidedCount(std::move(Count)), + : LoopPass(ID), OptLevel(OptLevel), ProvidedCount(std::move(Count)), ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial), ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound) { initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); } + int OptLevel; Optional<unsigned> ProvidedCount; Optional<unsigned> ProvidedThreshold; Optional<bool> ProvidedAllowPartial; @@ -1058,7 +1075,7 @@ public: auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); - ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); @@ -1068,7 +1085,7 @@ public: OptimizationRemarkEmitter ORE(&F); bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); - return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, + return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, OptLevel, ProvidedCount, ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound); @@ -1094,26 +1111,27 @@ INITIALIZE_PASS_DEPENDENCY(LoopPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false) -Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial, - int Runtime, int UpperBound) { +Pass *llvm::createLoopUnrollPass(int OptLevel, int Threshold, int Count, + int AllowPartial, int Runtime, + int UpperBound) { // TODO: It would make more sense for this function to take the optionals // directly, but that's dangerous since it would silently break out of tree // callers. - return new LoopUnroll(Threshold == -1 ? None : Optional<unsigned>(Threshold), - Count == -1 ? None : Optional<unsigned>(Count), - AllowPartial == -1 ? None - : Optional<bool>(AllowPartial), - Runtime == -1 ? None : Optional<bool>(Runtime), - UpperBound == -1 ? None : Optional<bool>(UpperBound)); + return new LoopUnroll( + OptLevel, Threshold == -1 ? None : Optional<unsigned>(Threshold), + Count == -1 ? None : Optional<unsigned>(Count), + AllowPartial == -1 ? None : Optional<bool>(AllowPartial), + Runtime == -1 ? None : Optional<bool>(Runtime), + UpperBound == -1 ? 
None : Optional<bool>(UpperBound)); } -Pass *llvm::createSimpleLoopUnrollPass() { - return llvm::createLoopUnrollPass(-1, -1, 0, 0, 0); +Pass *llvm::createSimpleLoopUnrollPass(int OptLevel) { + return llvm::createLoopUnrollPass(OptLevel, -1, -1, 0, 0, 0); } PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, - LPMUpdater &) { + LPMUpdater &Updater) { const auto &FAM = AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager(); Function *F = L.getHeader()->getParent(); @@ -1124,12 +1142,84 @@ PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM, report_fatal_error("LoopUnrollPass: OptimizationRemarkEmitterAnalysis not " "cached at a higher level"); - bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, &AR.SE, AR.TTI, AR.AC, *ORE, - /*PreserveLCSSA*/ true, ProvidedCount, - ProvidedThreshold, ProvidedAllowPartial, - ProvidedRuntime, ProvidedUpperBound); - + // Keep track of the previous loop structure so we can identify new loops + // created by unrolling. + Loop *ParentL = L.getParentLoop(); + SmallPtrSet<Loop *, 4> OldLoops; + if (ParentL) + OldLoops.insert(ParentL->begin(), ParentL->end()); + else + OldLoops.insert(AR.LI.begin(), AR.LI.end()); + + // The API here is quite complex to call, but there are only two interesting + // states we support: partial and full (or "simple") unrolling. However, to + // enable these things we actually pass "None" in for the optional to avoid + // providing an explicit choice. + Optional<bool> AllowPartialParam, RuntimeParam, UpperBoundParam; + if (!AllowPartialUnrolling) + AllowPartialParam = RuntimeParam = UpperBoundParam = false; + bool Changed = tryToUnrollLoop( + &L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE, + /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None, + /*Threshold*/ None, AllowPartialParam, RuntimeParam, UpperBoundParam); if (!Changed) return PreservedAnalyses::all(); + + // The parent must not be damaged by unrolling! +#ifndef NDEBUG + if (ParentL) + ParentL->verifyLoop(); +#endif + + // Unrolling can do several things to introduce new loops into a loop nest: + // - Partial unrolling clones child loops within the current loop. If it + // uses a remainder, then it can also create any number of sibling loops. + // - Full unrolling clones child loops within the current loop but then + // removes the current loop making all of the children appear to be new + // sibling loops. + // - Loop peeling can directly introduce new sibling loops by peeling one + // iteration. + // + // When a new loop appears as a sibling loop, either from peeling an + // iteration or fully unrolling, its nesting structure has fundamentally + // changed and we want to revisit it to reflect that. + // + // When unrolling has removed the current loop, we need to tell the + // infrastructure that it is gone. + // + // Finally, we support a debugging/testing mode where we revisit child loops + // as well. These are not expected to require further optimizations as either + // they or the loop they were cloned from have been directly visited already. + // But the debugging mode allows us to check this assumption. + bool IsCurrentLoopValid = false; + SmallVector<Loop *, 4> SibLoops; + if (ParentL) + SibLoops.append(ParentL->begin(), ParentL->end()); + else + SibLoops.append(AR.LI.begin(), AR.LI.end()); + erase_if(SibLoops, [&](Loop *SibLoop) { + if (SibLoop == &L) { + IsCurrentLoopValid = true; + return true; + } + + // Otherwise erase the loop from the list if it was in the old loops. 
+ return OldLoops.count(SibLoop) != 0; + }); + Updater.addSiblingLoops(SibLoops); + + if (!IsCurrentLoopValid) { + Updater.markLoopAsDeleted(L); + } else { + // We can only walk child loops if the current loop remained valid. + if (UnrollRevisitChildLoops) { + // Walk *all* of the child loops. This is a highly speculative mode + // anyways so look for any simplifications that arose from partial + // unrolling or peeling off of iterations. + SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end()); + Updater.addChildLoops(ChildLoops); + } + } + return getLoopPassPreservedAnalyses(); } diff --git a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp index 76fe918..d0c96fa 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -8,7 +8,7 @@ //===----------------------------------------------------------------------===// // // This pass transforms loops that contain branches on loop-invariant conditions -// to have multiple loops. For example, it turns the left into the right code: +// to multiple loops. For example, it turns the left into the right code: // // for (...) if (lic) // A for (...) @@ -26,32 +26,34 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" +#include "llvm/Analysis/BlockFrequencyInfoImpl.h" +#include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CodeMetrics.h" +#include "llvm/Analysis/DivergenceAnalysis.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/Analysis/BlockFrequencyInfoImpl.h" -#include "llvm/Analysis/BlockFrequencyInfo.h" -#include "llvm/Analysis/BranchProbabilityInfo.h" -#include "llvm/Support/BranchProbability.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/BranchProbability.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" @@ -77,19 +79,6 @@ static cl::opt<unsigned> Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"), cl::init(100), cl::Hidden); -static cl::opt<bool> -LoopUnswitchWithBlockFrequency("loop-unswitch-with-block-frequency", - cl::init(false), cl::Hidden, - cl::desc("Enable the use of the block frequency analysis to access PGO " - "heuristics to minimize code growth in cold regions.")); - -static cl::opt<unsigned> -ColdnessThreshold("loop-unswitch-coldness-threshold", cl::init(1), cl::Hidden, - cl::desc("Coldness threshold in percentage. 
The loop header frequency " - "(relative to the entry frequency) is compared with this " - "threshold to determine if non-trivial unswitching should be " - "enabled.")); - namespace { class LUAnalysisCache { @@ -174,13 +163,6 @@ namespace { LUAnalysisCache BranchesInfo; - bool EnabledPGO; - - // BFI and ColdEntryFreq are only used when PGO and - // LoopUnswitchWithBlockFrequency are enabled. - BlockFrequencyInfo BFI; - BlockFrequency ColdEntryFreq; - bool OptimizeForSize; bool redoLoop; @@ -199,12 +181,14 @@ namespace { // NewBlocks contained cloned copy of basic blocks from LoopBlocks. std::vector<BasicBlock*> NewBlocks; + bool hasBranchDivergence; + public: static char ID; // Pass ID, replacement for typeid - explicit LoopUnswitch(bool Os = false) : + explicit LoopUnswitch(bool Os = false, bool hasBranchDivergence = false) : LoopPass(ID), OptimizeForSize(Os), redoLoop(false), currentLoop(nullptr), DT(nullptr), loopHeader(nullptr), - loopPreheader(nullptr) { + loopPreheader(nullptr), hasBranchDivergence(hasBranchDivergence) { initializeLoopUnswitchPass(*PassRegistry::getPassRegistry()); } @@ -217,6 +201,8 @@ namespace { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<TargetTransformInfoWrapperPass>(); + if (hasBranchDivergence) + AU.addRequired<DivergenceAnalysis>(); getLoopAnalysisUsage(AU); } @@ -255,6 +241,11 @@ namespace { TerminatorInst *TI); void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L); + + /// Given that the Invariant is not equal to Val. Simplify instructions + /// in the loop. + Value *SimplifyInstructionWithNotEqual(Instruction *Inst, Value *Invariant, + Constant *Val); }; } @@ -381,16 +372,35 @@ INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops", INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(LoopPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis) INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops", false, false) -Pass *llvm::createLoopUnswitchPass(bool Os) { - return new LoopUnswitch(Os); +Pass *llvm::createLoopUnswitchPass(bool Os, bool hasBranchDivergence) { + return new LoopUnswitch(Os, hasBranchDivergence); } +/// Operator chain lattice. +enum OperatorChain { + OC_OpChainNone, ///< There is no operator. + OC_OpChainOr, ///< There are only ORs. + OC_OpChainAnd, ///< There are only ANDs. + OC_OpChainMixed ///< There are ANDs and ORs. +}; + /// Cond is a condition that occurs in L. If it is invariant in the loop, or has /// an invariant piece, return the invariant. Otherwise, return null. +// +/// NOTE: FindLIVLoopCondition will not return a partial LIV by walking up a +/// mixed operator chain, as we can not reliably find a value which will simplify +/// the operator chain. If the chain is AND-only or OR-only, we can use 0 or ~0 +/// to simplify the chain. +/// +/// NOTE: In case a partial LIV and a mixed operator chain, we may be able to +/// simplify the condition itself to a loop variant condition, but at the +/// cost of creating an entirely new loop. static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, + OperatorChain &ParentChain, DenseMap<Value *, Value *> &Cache) { auto CacheIt = Cache.find(Cond); if (CacheIt != Cache.end()) @@ -414,21 +424,53 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, return Cond; } + // Walk up the operator chain to find partial invariant conditions. 
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Cond)) if (BO->getOpcode() == Instruction::And || BO->getOpcode() == Instruction::Or) { - // If either the left or right side is invariant, we can unswitch on this, - // which will cause the branch to go away in one loop and the condition to - // simplify in the other one. - if (Value *LHS = - FindLIVLoopCondition(BO->getOperand(0), L, Changed, Cache)) { - Cache[Cond] = LHS; - return LHS; + // Given the previous operator, compute the current operator chain status. + OperatorChain NewChain; + switch (ParentChain) { + case OC_OpChainNone: + NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd : + OC_OpChainOr; + break; + case OC_OpChainOr: + NewChain = BO->getOpcode() == Instruction::Or ? OC_OpChainOr : + OC_OpChainMixed; + break; + case OC_OpChainAnd: + NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd : + OC_OpChainMixed; + break; + case OC_OpChainMixed: + NewChain = OC_OpChainMixed; + break; } - if (Value *RHS = - FindLIVLoopCondition(BO->getOperand(1), L, Changed, Cache)) { - Cache[Cond] = RHS; - return RHS; + + // If we reach a Mixed state, we do not want to keep walking up as we can not + // reliably find a value that will simplify the chain. With this check, we + // will return null on the first sight of mixed chain and the caller will + // either backtrack to find partial LIV in other operand or return null. + if (NewChain != OC_OpChainMixed) { + // Update the current operator chain type before we search up the chain. + ParentChain = NewChain; + // If either the left or right side is invariant, we can unswitch on this, + // which will cause the branch to go away in one loop and the condition to + // simplify in the other one. + if (Value *LHS = FindLIVLoopCondition(BO->getOperand(0), L, Changed, + ParentChain, Cache)) { + Cache[Cond] = LHS; + return LHS; + } + // We did not manage to find a partial LIV in operand(0). Backtrack and try + // operand(1). + ParentChain = NewChain; + if (Value *RHS = FindLIVLoopCondition(BO->getOperand(1), L, Changed, + ParentChain, Cache)) { + Cache[Cond] = RHS; + return RHS; + } } } @@ -436,9 +478,21 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed, return nullptr; } -static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) { +/// Cond is a condition that occurs in L. If it is invariant in the loop, or has +/// an invariant piece, return the invariant along with the operator chain type. +/// Otherwise, return null. +static std::pair<Value *, OperatorChain> FindLIVLoopCondition(Value *Cond, + Loop *L, + bool &Changed) { DenseMap<Value *, Value *> Cache; - return FindLIVLoopCondition(Cond, L, Changed, Cache); + OperatorChain OpChain = OC_OpChainNone; + Value *FCond = FindLIVLoopCondition(Cond, L, Changed, OpChain, Cache); + + // In case we do find a LIV, it can not be obtained by walking up a mixed + // operator chain. 
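
As an aside to the chain-walking comments above, the OC_OpChain* lattice boils down to a four-state transition function. The standalone sketch below uses invented names (OpChain, step) rather than the LoopUnswitch code; it shows how an AND-only or OR-only chain survives the walk, while any mix of the two collapses to Mixed and the walk gives up.

// Minimal sketch of the operator-chain lattice; not the in-tree code.
#include <cassert>

enum class OpChain { None, Or, And, Mixed };

// Transition applied at every AND/OR seen while walking up the condition.
static OpChain step(OpChain Prev, bool IsAnd) {
  switch (Prev) {
  case OpChain::None:  return IsAnd ? OpChain::And : OpChain::Or;
  case OpChain::And:   return IsAnd ? OpChain::And : OpChain::Mixed;
  case OpChain::Or:    return IsAnd ? OpChain::Mixed : OpChain::Or;
  case OpChain::Mixed: return OpChain::Mixed;
  }
  return OpChain::Mixed;
}

int main() {
  // (inv & a & b): AND-only chain, so forcing inv to 0 makes the whole
  // chain 0 and the switch's 0 case can be unswitched out.
  OpChain C = OpChain::None;
  C = step(C, /*IsAnd=*/true);
  C = step(C, /*IsAnd=*/true);
  assert(C == OpChain::And);

  // (inv & a) | b: mixes AND and OR; no single constant simplifies the
  // chain, so no partial loop-invariant condition is returned.
  OpChain D = OpChain::None;
  D = step(D, /*IsAnd=*/false); // outermost OR
  D = step(D, /*IsAnd=*/true);  // inner AND
  assert(D == OpChain::Mixed);
  return 0;
}
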
+ assert((!FCond || OpChain != OC_OpChainMixed) && + "Do not expect a partial LIV with mixed operator chain"); + return {FCond, OpChain}; } bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) { @@ -457,19 +511,6 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) { if (SanitizeMemory) computeLoopSafetyInfo(&SafetyInfo, L); - EnabledPGO = F->getEntryCount().hasValue(); - - if (LoopUnswitchWithBlockFrequency && EnabledPGO) { - BranchProbabilityInfo BPI(*F, *LI); - BFI.calculate(*L->getHeader()->getParent(), BPI, *LI); - - // Use BranchProbability to compute a minimum frequency based on - // function entry baseline frequency. Loops with headers below this - // frequency are considered as cold. - const BranchProbability ColdProb(ColdnessThreshold, 100); - ColdEntryFreq = BlockFrequency(BFI.getEntryFreq()) * ColdProb; - } - bool Changed = false; do { assert(currentLoop->isLCSSAForm(*DT)); @@ -581,19 +622,9 @@ bool LoopUnswitch::processCurrentLoop() { loopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize)) return false; - if (LoopUnswitchWithBlockFrequency && EnabledPGO) { - // Compute the weighted frequency of the hottest block in the - // loop (loopHeader in this case since inner loops should be - // processed before outer loop). If it is less than ColdFrequency, - // we should not unswitch. - BlockFrequency LoopEntryFreq = BFI.getBlockFreq(loopHeader); - if (LoopEntryFreq < ColdEntryFreq) - return false; - } - for (IntrinsicInst *Guard : Guards) { Value *LoopCond = - FindLIVLoopCondition(Guard->getOperand(0), currentLoop, Changed); + FindLIVLoopCondition(Guard->getOperand(0), currentLoop, Changed).first; if (LoopCond && UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) { // NB! Unswitching (if successful) could have erased some of the @@ -634,7 +665,7 @@ bool LoopUnswitch::processCurrentLoop() { // See if this, or some part of it, is loop invariant. If so, we can // unswitch on it if we desire. Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), - currentLoop, Changed); + currentLoop, Changed).first; if (LoopCond && UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context), TI)) { ++NumBranches; @@ -642,24 +673,48 @@ bool LoopUnswitch::processCurrentLoop() { } } } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { - Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), - currentLoop, Changed); + Value *SC = SI->getCondition(); + Value *LoopCond; + OperatorChain OpChain; + std::tie(LoopCond, OpChain) = + FindLIVLoopCondition(SC, currentLoop, Changed); + unsigned NumCases = SI->getNumCases(); if (LoopCond && NumCases) { // Find a value to unswitch on: // FIXME: this should chose the most expensive case! // FIXME: scan for a case with a non-critical edge? Constant *UnswitchVal = nullptr; - - // Do not process same value again and again. - // At this point we have some cases already unswitched and - // some not yet unswitched. Let's find the first not yet unswitched one. - for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); - i != e; ++i) { - Constant *UnswitchValCandidate = i.getCaseValue(); - if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) { - UnswitchVal = UnswitchValCandidate; - break; + // Find a case value such that at least one case value is unswitched + // out. + if (OpChain == OC_OpChainAnd) { + // If the chain only has ANDs and the switch has a case value of 0. + // Dropping in a 0 to the chain will unswitch out the 0-casevalue. 
+ auto *AllZero = cast<ConstantInt>(Constant::getNullValue(SC->getType())); + if (BranchesInfo.isUnswitched(SI, AllZero)) + continue; + // We are unswitching 0 out. + UnswitchVal = AllZero; + } else if (OpChain == OC_OpChainOr) { + // If the chain only has ORs and the switch has a case value of ~0. + // Dropping in a ~0 to the chain will unswitch out the ~0-casevalue. + auto *AllOne = cast<ConstantInt>(Constant::getAllOnesValue(SC->getType())); + if (BranchesInfo.isUnswitched(SI, AllOne)) + continue; + // We are unswitching ~0 out. + UnswitchVal = AllOne; + } else { + assert(OpChain == OC_OpChainNone && + "Expect to unswitch on trivial chain"); + // Do not process same value again and again. + // At this point we have some cases already unswitched and + // some not yet unswitched. Let's find the first not yet unswitched one. + for (auto Case : SI->cases()) { + Constant *UnswitchValCandidate = Case.getCaseValue(); + if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) { + UnswitchVal = UnswitchValCandidate; + break; + } } } @@ -668,6 +723,11 @@ bool LoopUnswitch::processCurrentLoop() { if (UnswitchIfProfitable(LoopCond, UnswitchVal)) { ++NumSwitches; + // In case of a full LIV, UnswitchVal is the value we unswitched out. + // In case of a partial LIV, we only unswitch when its an AND-chain + // or OR-chain. In both cases switch input value simplifies to + // UnswitchVal. + BranchesInfo.setUnswitched(SI, UnswitchVal); return true; } } @@ -678,7 +738,7 @@ bool LoopUnswitch::processCurrentLoop() { BBI != E; ++BBI) if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) { Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), - currentLoop, Changed); + currentLoop, Changed).first; if (LoopCond && UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) { ++NumSelects; @@ -753,6 +813,15 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val, << ". Cost too high.\n"); return false; } + if (hasBranchDivergence && + getAnalysis<DivergenceAnalysis>().isDivergent(LoopCond)) { + DEBUG(dbgs() << "NOT unswitching loop %" + << currentLoop->getHeader()->getName() + << " at non-trivial condition '" << *Val + << "' == " << *LoopCond << "\n" + << ". Condition is divergent.\n"); + return false; + } UnswitchNontrivialCondition(LoopCond, Val, currentLoop, TI); return true; @@ -762,7 +831,12 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val, /// mapping the blocks with the specified map. static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM, LoopInfo *LI, LPPassManager *LPM) { - Loop &New = LPM->addLoop(PL); + Loop &New = *new Loop(); + if (PL) + PL->addChildLoop(&New); + else + LI->addTopLevelLoop(&New); + LPM->addLoop(New); // Add all of the blocks in L to the new loop. for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); @@ -899,7 +973,6 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) { if (I.mayHaveSideEffects()) return false; - // FIXME: add check for constant foldable switch instructions. if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) { if (BI->isUnconditional()) { CurrentBB = BI->getSuccessor(0); @@ -911,7 +984,16 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) { // Found a trivial condition candidate: non-foldable conditional branch. break; } + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) { + // At this point, any constant-foldable instructions should have probably + // been folded. 
+ ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition()); + if (!Cond) + break; + // Find the target block we are definitely going to. + CurrentBB = SI->findCaseValue(Cond)->getCaseSuccessor(); } else { + // We do not understand these terminator instructions. break; } @@ -929,7 +1011,7 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) { return false; Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), - currentLoop, Changed); + currentLoop, Changed).first; // Unswitch only if the trivial condition itself is an LIV (not // partial LIV which could occur in and/or) @@ -960,7 +1042,7 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) { } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) { // If this isn't switching on an invariant condition, we can't unswitch it. Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), - currentLoop, Changed); + currentLoop, Changed).first; // Unswitch only if the trivial condition itself is an LIV (not // partial LIV which could occur in and/or) @@ -973,13 +1055,12 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) { // this. // Note that we can't trivially unswitch on the default case or // on already unswitched cases. - for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); - i != e; ++i) { + for (auto Case : SI->cases()) { BasicBlock *LoopExitCandidate; - if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop, - i.getCaseSuccessor()))) { + if ((LoopExitCandidate = + isTrivialLoopExitBlock(currentLoop, Case.getCaseSuccessor()))) { // Okay, we found a trivial case, remember the value that is trivial. - ConstantInt *CaseVal = i.getCaseValue(); + ConstantInt *CaseVal = Case.getCaseValue(); // Check that it was not unswitched before, since already unswitched // trivial vals are looks trivial too. @@ -998,6 +1079,9 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) { UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB, nullptr); + + // We are only unswitching full LIV. + BranchesInfo.setUnswitched(SI, CondVal); ++NumSwitches; return true; } @@ -1152,11 +1236,12 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, LoopProcessWorklist.push_back(NewLoop); redoLoop = true; - // Keep a WeakVH holding onto LIC. If the first call to RewriteLoopBody + // Keep a WeakTrackingVH holding onto LIC. If the first call to + // RewriteLoopBody // deletes the instruction (for example by simplifying a PHI that feeds into // the condition that we're unswitching on), we don't rewrite the second // iteration. - WeakVH LICHandle(LIC); + WeakTrackingVH LICHandle(LIC); // Now we rewrite the original code to know that the condition is true and the // new code to know that the condition is false. @@ -1183,7 +1268,7 @@ static void RemoveFromWorklist(Instruction *I, static void ReplaceUsesOfWith(Instruction *I, Value *V, std::vector<Instruction*> &Worklist, Loop *L, LPPassManager *LPM) { - DEBUG(dbgs() << "Replace with '" << *V << "': " << *I); + DEBUG(dbgs() << "Replace with '" << *V << "': " << *I << "\n"); // Add uses to the worklist, which may be dead now. 
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) @@ -1196,7 +1281,8 @@ static void ReplaceUsesOfWith(Instruction *I, Value *V, LPM->deleteSimpleAnalysisValue(I, L); RemoveFromWorklist(I, Worklist); I->replaceAllUsesWith(V); - I->eraseFromParent(); + if (!I->mayHaveSideEffects()) + I->eraseFromParent(); ++NumSimplify; } @@ -1253,18 +1339,38 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, if (!UI || !L->contains(UI)) continue; - Worklist.push_back(UI); + // At this point, we know LIC is definitely not Val. Try to use some simple + // logic to simplify the user w.r.t. to the context. + if (Value *Replacement = SimplifyInstructionWithNotEqual(UI, LIC, Val)) { + if (LI->replacementPreservesLCSSAForm(UI, Replacement)) { + // This in-loop instruction has been simplified w.r.t. its context, + // i.e. LIC != Val, make sure we propagate its replacement value to + // all its users. + // + // We can not yet delete UI, the LIC user, yet, because that would invalidate + // the LIC->users() iterator !. However, we can make this instruction + // dead by replacing all its users and push it onto the worklist so that + // it can be properly deleted and its operands simplified. + UI->replaceAllUsesWith(Replacement); + } + } - // TODO: We could do other simplifications, for example, turning - // 'icmp eq LIC, Val' -> false. + // This is a LIC user, push it into the worklist so that SimplifyCode can + // attempt to simplify it. + Worklist.push_back(UI); // If we know that LIC is not Val, use this info to simplify code. SwitchInst *SI = dyn_cast<SwitchInst>(UI); if (!SI || !isa<ConstantInt>(Val)) continue; - SwitchInst::CaseIt DeadCase = SI->findCaseValue(cast<ConstantInt>(Val)); + // NOTE: if a case value for the switch is unswitched out, we record it + // after the unswitch finishes. We can not record it here as the switch + // is not a direct user of the partial LIV. + SwitchInst::CaseHandle DeadCase = + *SI->findCaseValue(cast<ConstantInt>(Val)); // Default case is live for multiple values. - if (DeadCase == SI->case_default()) continue; + if (DeadCase == *SI->case_default()) + continue; // Found a dead case value. Don't remove PHI nodes in the // successor if they become single-entry, those PHI nodes may @@ -1274,8 +1380,6 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC, BasicBlock *SISucc = DeadCase.getCaseSuccessor(); BasicBlock *Latch = L->getLoopLatch(); - BranchesInfo.setUnswitched(SI, Val); - if (!SI->findCaseDest(SISucc)) continue; // Edge is critical. // If the DeadCase successor dominates the loop latch, then the // transformation isn't safe since it will delete the sole predecessor edge @@ -1334,7 +1438,7 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { // Simple DCE. if (isInstructionTriviallyDead(I)) { - DEBUG(dbgs() << "Remove dead instruction '" << *I); + DEBUG(dbgs() << "Remove dead instruction '" << *I << "\n"); // Add uses to the worklist, which may be dead now. for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) @@ -1397,3 +1501,27 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { } } } + +/// Simple simplifications we can do given the information that Cond is +/// definitely not equal to Val. 
+Value *LoopUnswitch::SimplifyInstructionWithNotEqual(Instruction *Inst, + Value *Invariant, + Constant *Val) { + // icmp eq cond, val -> false + ICmpInst *CI = dyn_cast<ICmpInst>(Inst); + if (CI && CI->isEquality()) { + Value *Op0 = CI->getOperand(0); + Value *Op1 = CI->getOperand(1); + if ((Op0 == Invariant && Op1 == Val) || (Op0 == Val && Op1 == Invariant)) { + LLVMContext &Ctx = Inst->getContext(); + if (CI->getPredicate() == CmpInst::ICMP_EQ) + return ConstantInt::getFalse(Ctx); + else + return ConstantInt::getTrue(Ctx); + } + } + + // FIXME: there may be other opportunities, e.g. comparison with floating + // point, or Invariant - Val != 0, etc. + return nullptr; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp index 08e60b1..6f77c5b 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LowerAtomic.cpp @@ -155,8 +155,7 @@ public: } bool runOnFunction(Function &F) override { - if (skipFunction(F)) - return false; + // Don't skip optnone functions; atomics still need to be lowered. FunctionAnalysisManager DummyFAM; auto PA = Impl.run(F, DummyFAM); return !PA.areAllPreserved(); diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp index 52975ef..46f8a35 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp @@ -14,6 +14,7 @@ #include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" @@ -67,11 +68,11 @@ static bool handleSwitchExpect(SwitchInst &SI) { if (!ExpectedValue) return false; - SwitchInst::CaseIt Case = SI.findCaseValue(ExpectedValue); + SwitchInst::CaseHandle Case = *SI.findCaseValue(ExpectedValue); unsigned n = SI.getNumCases(); // +1 for default case. SmallVector<uint32_t, 16> Weights(n + 1, UnlikelyBranchWeight); - if (Case == SI.case_default()) + if (Case == *SI.case_default()) Weights[0] = LikelyBranchWeight; else Weights[Case.getCaseIndex() + 1] = LikelyBranchWeight; @@ -83,6 +84,151 @@ static bool handleSwitchExpect(SwitchInst &SI) { return true; } +/// Handler for PHINodes that define the value argument to an +/// @llvm.expect call. +/// +/// If the operand of the phi has a constant value and it 'contradicts' +/// with the expected value of phi def, then the corresponding incoming +/// edge of the phi is unlikely to be taken. Using that information, +/// the branch probability info for the originating branch can be inferred. +static void handlePhiDef(CallInst *Expect) { + Value &Arg = *Expect->getArgOperand(0); + ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(Expect->getArgOperand(1)); + if (!ExpectedValue) + return; + const APInt &ExpectedPhiValue = ExpectedValue->getValue(); + + // Walk up in backward a list of instructions that + // have 'copy' semantics by 'stripping' the copies + // until a PHI node or an instruction of unknown kind + // is reached. Negation via xor is also handled. 
+ // + // C = PHI(...); + // B = C; + // A = B; + // D = __builtin_expect(A, 0); + // + Value *V = &Arg; + SmallVector<Instruction *, 4> Operations; + while (!isa<PHINode>(V)) { + if (ZExtInst *ZExt = dyn_cast<ZExtInst>(V)) { + V = ZExt->getOperand(0); + Operations.push_back(ZExt); + continue; + } + + if (SExtInst *SExt = dyn_cast<SExtInst>(V)) { + V = SExt->getOperand(0); + Operations.push_back(SExt); + continue; + } + + BinaryOperator *BinOp = dyn_cast<BinaryOperator>(V); + if (!BinOp || BinOp->getOpcode() != Instruction::Xor) + return; + + ConstantInt *CInt = dyn_cast<ConstantInt>(BinOp->getOperand(1)); + if (!CInt) + return; + + V = BinOp->getOperand(0); + Operations.push_back(BinOp); + } + + // Executes the recorded operations on input 'Value'. + auto ApplyOperations = [&](const APInt &Value) { + APInt Result = Value; + for (auto Op : llvm::reverse(Operations)) { + switch (Op->getOpcode()) { + case Instruction::Xor: + Result ^= cast<ConstantInt>(Op->getOperand(1))->getValue(); + break; + case Instruction::ZExt: + Result = Result.zext(Op->getType()->getIntegerBitWidth()); + break; + case Instruction::SExt: + Result = Result.sext(Op->getType()->getIntegerBitWidth()); + break; + default: + llvm_unreachable("Unexpected operation"); + } + } + return Result; + }; + + auto *PhiDef = dyn_cast<PHINode>(V); + + // Get the first dominating conditional branch of the operand + // i's incoming block. + auto GetDomConditional = [&](unsigned i) -> BranchInst * { + BasicBlock *BB = PhiDef->getIncomingBlock(i); + BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()); + if (BI && BI->isConditional()) + return BI; + BB = BB->getSinglePredecessor(); + if (!BB) + return nullptr; + BI = dyn_cast<BranchInst>(BB->getTerminator()); + if (!BI || BI->isUnconditional()) + return nullptr; + return BI; + }; + + // Now walk through all Phi operands to find phi oprerands with values + // conflicting with the expected phi output value. Any such operand + // indicates the incoming edge to that operand is unlikely. + for (unsigned i = 0, e = PhiDef->getNumIncomingValues(); i != e; ++i) { + + Value *PhiOpnd = PhiDef->getIncomingValue(i); + ConstantInt *CI = dyn_cast<ConstantInt>(PhiOpnd); + if (!CI) + continue; + + // Not an interesting case when IsUnlikely is false -- we can not infer + // anything useful when the operand value matches the expected phi + // output. + if (ExpectedPhiValue == ApplyOperations(CI->getValue())) + continue; + + BranchInst *BI = GetDomConditional(i); + if (!BI) + continue; + + MDBuilder MDB(PhiDef->getContext()); + + // There are two situations in which an operand of the PhiDef comes + // from a given successor of a branch instruction BI. + // 1) When the incoming block of the operand is the successor block; + // 2) When the incoming block is BI's enclosing block and the + // successor is the PhiDef's enclosing block. + // + // Returns true if the operand which comes from OpndIncomingBB + // comes from outgoing edge of BI that leads to Succ block. + auto *OpndIncomingBB = PhiDef->getIncomingBlock(i); + auto IsOpndComingFromSuccessor = [&](BasicBlock *Succ) { + if (OpndIncomingBB == Succ) + // If this successor is the incoming block for this + // Phi operand, then this successor does lead to the Phi. + return true; + if (OpndIncomingBB == BI->getParent() && Succ == PhiDef->getParent()) + // Otherwise, if the edge is directly from the branch + // to the Phi, this successor is the one feeding this + // Phi operand. 
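A minimal source-level sketch of the pattern handlePhiDef looks for, with invented identifiers and under the assumption that mem2reg turns 'v' into a PHI of the two arms:

  long use(long);
  long f(bool rare) {
    long v;
    if (rare)
      v = 7;    // contradicts the expected value 0: this incoming edge is
                // the one that gets annotated as unlikely
    else
      v = 0;    // matches the expectation: no inference for this edge
    return use(__builtin_expect(v, 0));
  }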
+ return true; + return false; + }; + + if (IsOpndComingFromSuccessor(BI->getSuccessor(1))) + BI->setMetadata( + LLVMContext::MD_prof, + MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight)); + else if (IsOpndComingFromSuccessor(BI->getSuccessor(0))) + BI->setMetadata( + LLVMContext::MD_prof, + MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight)); + } +} + // Handle both BranchInst and SelectInst. template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) { @@ -98,10 +244,18 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) { CallInst *CI; ICmpInst *CmpI = dyn_cast<ICmpInst>(BSI.getCondition()); + CmpInst::Predicate Predicate; + ConstantInt *CmpConstOperand = nullptr; if (!CmpI) { CI = dyn_cast<CallInst>(BSI.getCondition()); + Predicate = CmpInst::ICMP_NE; } else { - if (CmpI->getPredicate() != CmpInst::ICMP_NE) + Predicate = CmpI->getPredicate(); + if (Predicate != CmpInst::ICMP_NE && Predicate != CmpInst::ICMP_EQ) + return false; + + CmpConstOperand = dyn_cast<ConstantInt>(CmpI->getOperand(1)); + if (!CmpConstOperand) return false; CI = dyn_cast<CallInst>(CmpI->getOperand(0)); } @@ -109,6 +263,13 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) { if (!CI) return false; + uint64_t ValueComparedTo = 0; + if (CmpConstOperand) { + if (CmpConstOperand->getBitWidth() > 64) + return false; + ValueComparedTo = CmpConstOperand->getZExtValue(); + } + Function *Fn = CI->getCalledFunction(); if (!Fn || Fn->getIntrinsicID() != Intrinsic::expect) return false; @@ -121,9 +282,8 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) { MDBuilder MDB(CI->getContext()); MDNode *Node; - // If expect value is equal to 1 it means that we are more likely to take - // branch 0, in other case more likely is branch 1. 
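The following sketch (invented identifiers) shows the two comparison forms the updated handleBrSelExpect accepts, an explicit '==' or '!=' against a constant rather than using the expect call directly as the branch condition:

  void likely_path();
  void unlikely_path();
  void f(long x, long y) {
    if (__builtin_expect(x, 0) == 0)   // expected value equals the compared
      likely_path();                   // constant and the predicate is '==':
                                       // the taken edge is weighted likely
    if (__builtin_expect(y, 0) != 0)   // same constant but predicate is '!=':
      unlikely_path();                 // the taken edge is weighted unlikely
  }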
- if (ExpectedValue->isOne()) + if ((ExpectedValue->getZExtValue() == ValueComparedTo) == + (Predicate == CmpInst::ICMP_EQ)) Node = MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight); else Node = MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight); @@ -173,6 +333,10 @@ static bool lowerExpectIntrinsic(Function &F) { Function *Fn = CI->getCalledFunction(); if (Fn && Fn->getIntrinsicID() == Intrinsic::expect) { + // Before erasing the llvm.expect, walk backward to find + // phi that define llvm.expect's first arg, and + // infer branch probability: + handlePhiDef(CI); Value *Exp = CI->getArgOperand(0); CI->replaceAllUsesWith(Exp); CI->eraseFromParent(); diff --git a/contrib/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp b/contrib/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp index 4f41371..070114a 100644 --- a/contrib/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp @@ -17,10 +17,10 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" diff --git a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 1b59014..7896396 100644 --- a/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -13,19 +13,48 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/MemCpyOptimizer.h" -#include "llvm/Transforms/Scalar.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Analysis/MemoryLocation.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include <algorithm> +#include <cassert> +#include <cstdint> + using namespace llvm; #define DEBUG_TYPE "memcpyopt" @@ -119,6 +148,7 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, return true; } +namespace { /// Represents a range of memset'd bytes with the ByteVal value. 
/// This allows us to analyze stores like: @@ -130,7 +160,6 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, /// the first store, we make a range [1, 2). The second store extends the range /// to [0, 2). The third makes a new range [2, 3). The fourth store joins the /// two ranges into [0, 3) which is memset'able. -namespace { struct MemsetRange { // Start/End - A semi range that describes the span that this range covers. // The range is closed at the start and open at the end: [Start, End). @@ -148,7 +177,8 @@ struct MemsetRange { bool isProfitableToUseMemset(const DataLayout &DL) const; }; -} // end anon namespace + +} // end anonymous namespace bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const { // If we found more than 4 stores to merge or 16 bytes, use memset. @@ -192,13 +222,14 @@ bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const { return TheStores.size() > NumPointerStores+NumByteStores; } - namespace { + class MemsetRanges { /// A sorted list of the memset ranges. SmallVector<MemsetRange, 8> Ranges; typedef SmallVectorImpl<MemsetRange>::iterator range_iterator; const DataLayout &DL; + public: MemsetRanges(const DataLayout &DL) : DL(DL) {} @@ -231,8 +262,7 @@ public: }; -} // end anon namespace - +} // end anonymous namespace /// Add a new store to the MemsetRanges data structure. This adds a /// new range for the specified store at the specified offset, merging into @@ -299,48 +329,36 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr, //===----------------------------------------------------------------------===// namespace { - class MemCpyOptLegacyPass : public FunctionPass { - MemCpyOptPass Impl; - public: - static char ID; // Pass identification, replacement for typeid - MemCpyOptLegacyPass() : FunctionPass(ID) { - initializeMemCpyOptLegacyPassPass(*PassRegistry::getPassRegistry()); - } - bool runOnFunction(Function &F) override; - - private: - // This transformation requires dominator postdominator info - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesCFG(); - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<MemoryDependenceWrapperPass>(); - AU.addRequired<AAResultsWrapperPass>(); - AU.addRequired<TargetLibraryInfoWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - AU.addPreserved<MemoryDependenceWrapperPass>(); - } +class MemCpyOptLegacyPass : public FunctionPass { + MemCpyOptPass Impl; - // Helper functions - bool processStore(StoreInst *SI, BasicBlock::iterator &BBI); - bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI); - bool processMemCpy(MemCpyInst *M); - bool processMemMove(MemMoveInst *M); - bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc, - uint64_t cpyLen, unsigned cpyAlign, CallInst *C); - bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep); - bool processMemSetMemCpyDependence(MemCpyInst *M, MemSetInst *MDep); - bool performMemCpyToMemSetOptzn(MemCpyInst *M, MemSetInst *MDep); - bool processByValArgument(CallSite CS, unsigned ArgNo); - Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr, - Value *ByteVal); - - bool iterateOnFunction(Function &F); - }; +public: + static char ID; // Pass identification, replacement for typeid - char MemCpyOptLegacyPass::ID = 0; -} + MemCpyOptLegacyPass() : FunctionPass(ID) { + initializeMemCpyOptLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override; 
+ +private: + // This transformation requires dominator postdominator info + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<MemoryDependenceWrapperPass>(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + AU.addPreserved<MemoryDependenceWrapperPass>(); + } +}; + +char MemCpyOptLegacyPass::ID = 0; + +} // end anonymous namespace /// The public interface to this file... FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOptLegacyPass(); } @@ -523,14 +541,15 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P, if (Args.erase(C)) NeedLift = true; else if (MayAlias) { - NeedLift = any_of(MemLocs, [C, &AA](const MemoryLocation &ML) { + NeedLift = llvm::any_of(MemLocs, [C, &AA](const MemoryLocation &ML) { return AA.getModRefInfo(C, ML); }); if (!NeedLift) - NeedLift = any_of(CallSites, [C, &AA](const ImmutableCallSite &CS) { - return AA.getModRefInfo(C, CS); - }); + NeedLift = + llvm::any_of(CallSites, [C, &AA](const ImmutableCallSite &CS) { + return AA.getModRefInfo(C, CS); + }); } if (!NeedLift) @@ -567,7 +586,7 @@ static bool moveUp(AliasAnalysis &AA, StoreInst *SI, Instruction *P, } // We made it, we need to lift - for (auto *I : reverse(ToLift)) { + for (auto *I : llvm::reverse(ToLift)) { DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n"); I->moveBefore(P); } @@ -761,7 +780,6 @@ bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) { return false; } - /// Takes a memcpy and a call that it depends on, /// and checks for the possibility of a call slot optimization by having /// the call write its result directly into the destination of the memcpy. @@ -914,6 +932,17 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest, if (MR != MRI_NoModRef) return false; + // We can't create address space casts here because we don't know if they're + // safe for the target. + if (cpySrc->getType()->getPointerAddressSpace() != + cpyDest->getType()->getPointerAddressSpace()) + return false; + for (unsigned i = 0; i < CS.arg_size(); ++i) + if (CS.getArgument(i)->stripPointerCasts() == cpySrc && + cpySrc->getType()->getPointerAddressSpace() != + CS.getArgument(i)->getType()->getPointerAddressSpace()) + return false; + // All the checks have passed, so do the transformation. bool changedArgument = false; for (unsigned i = 0; i < CS.arg_size(); ++i) @@ -1240,7 +1269,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M) { bool MemCpyOptPass::processMemMove(MemMoveInst *M) { AliasAnalysis &AA = LookupAliasAnalysis(); - if (!TLI->has(LibFunc::memmove)) + if (!TLI->has(LibFunc_memmove)) return false; // See if the pointers alias. @@ -1294,7 +1323,7 @@ bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) { // Get the alignment of the byval. If the call doesn't specify the alignment, // then it is some target specific value that we can't know. 
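The call-slot optimization that performCallSlotOptzn (described above) looks for is easier to picture with a small, purely illustrative example; 'produce', 'consume', and 'S' are invented names:

  #include <cstring>
  struct S { char bytes[64]; };
  void produce(S *out);
  void consume(S *dest) {
    S tmp;
    produce(&tmp);
    // When the safety checks pass (including the new address-space checks),
    // the call is redirected to write into 'dest' and this copy goes away:
    std::memcpy(dest, &tmp, sizeof tmp);
  }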
- unsigned ByValAlign = CS.getParamAlignment(ArgNo+1); + unsigned ByValAlign = CS.getParamAlignment(ArgNo); if (ByValAlign == 0) return false; // If it is greater than the memcpy, then we check to see if we can force the @@ -1306,6 +1335,11 @@ bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) { CS.getInstruction(), &AC, &DT) < ByValAlign) return false; + // The address space of the memcpy source must match the byval argument + if (MDep->getSource()->getType()->getPointerAddressSpace() != + ByValArg->getType()->getPointerAddressSpace()) + return false; + // Verify that the copied-from memory doesn't change in between the memcpy and // the byval call. // memcpy(a <- b) @@ -1375,7 +1409,6 @@ bool MemCpyOptPass::iterateOnFunction(Function &F) { } PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) { - auto &MD = AM.getResult<MemoryDependenceAnalysis>(F); auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); @@ -1393,7 +1426,9 @@ PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) { LookupAssumptionCache, LookupDomTree); if (!MadeChange) return PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); PA.preserve<GlobalsAA>(); PA.preserve<MemoryDependenceAnalysis>(); return PA; @@ -1414,10 +1449,10 @@ bool MemCpyOptPass::runImpl( // If we don't have at least memset and memcpy, there is little point of doing // anything here. These are required by a freestanding implementation, so if // even they are disabled, there is no point in trying hard. - if (!TLI->has(LibFunc::memset) || !TLI->has(LibFunc::memcpy)) + if (!TLI->has(LibFunc_memset) || !TLI->has(LibFunc_memcpy)) return false; - while (1) { + while (true) { if (!iterateOnFunction(F)) break; MadeChange = true; diff --git a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp index 6a64c6b..6727cf0 100644 --- a/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp @@ -19,6 +19,8 @@ // thinks it safe to do so. This optimization helps with eg. hiding load // latencies, triggering if-conversion, and reducing static code size. // +// NOTE: This code no longer performs load hoisting, it is subsumed by GVNHoist. 
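Since the note above narrows this pass to store sinking, here is a hedged source-level sketch (invented identifiers) of the transformation that remains:

  // Two stores to the same location in the arms of a diamond ...
  void before(bool c, int a, int b, int *p) {
    if (c)
      *p = a;
    else
      *p = b;
  }
  // ... are sunk into the join block as a single store of a PHI-selected
  // value (the PHI that getPHIOperand builds further down in this file):
  void after(bool c, int a, int b, int *p) {
    int v = c ? a : b;
    *p = v;
  }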
+// //===----------------------------------------------------------------------===// // // @@ -87,7 +89,6 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; @@ -118,16 +119,6 @@ private: void removeInstruction(Instruction *Inst); BasicBlock *getDiamondTail(BasicBlock *BB); bool isDiamondHead(BasicBlock *BB); - // Routines for hoisting loads - bool isLoadHoistBarrierInRange(const Instruction &Start, - const Instruction &End, LoadInst *LI, - bool SafeToLoadUnconditionally); - LoadInst *canHoistFromBlock(BasicBlock *BB, LoadInst *LI); - void hoistInstruction(BasicBlock *BB, Instruction *HoistCand, - Instruction *ElseInst); - bool isSafeToHoist(Instruction *I) const; - bool hoistLoad(BasicBlock *BB, LoadInst *HoistCand, LoadInst *ElseInst); - bool mergeLoads(BasicBlock *BB); // Routines for sinking stores StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI); PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1); @@ -188,169 +179,6 @@ bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) { return true; } -/// -/// \brief True when instruction is a hoist barrier for a load -/// -/// Whenever an instruction could possibly modify the value -/// being loaded or protect against the load from happening -/// it is considered a hoist barrier. -/// -bool MergedLoadStoreMotion::isLoadHoistBarrierInRange( - const Instruction &Start, const Instruction &End, LoadInst *LI, - bool SafeToLoadUnconditionally) { - if (!SafeToLoadUnconditionally) - for (const Instruction &Inst : - make_range(Start.getIterator(), End.getIterator())) - if (!isGuaranteedToTransferExecutionToSuccessor(&Inst)) - return true; - MemoryLocation Loc = MemoryLocation::get(LI); - return AA->canInstructionRangeModRef(Start, End, Loc, MRI_Mod); -} - -/// -/// \brief Decide if a load can be hoisted -/// -/// When there is a load in \p BB to the same address as \p LI -/// and it can be hoisted from \p BB, return that load. -/// Otherwise return Null. 
-/// -LoadInst *MergedLoadStoreMotion::canHoistFromBlock(BasicBlock *BB1, - LoadInst *Load0) { - BasicBlock *BB0 = Load0->getParent(); - BasicBlock *Head = BB0->getSinglePredecessor(); - bool SafeToLoadUnconditionally = isSafeToLoadUnconditionally( - Load0->getPointerOperand(), Load0->getAlignment(), - Load0->getModule()->getDataLayout(), - /*ScanFrom=*/Head->getTerminator()); - for (BasicBlock::iterator BBI = BB1->begin(), BBE = BB1->end(); BBI != BBE; - ++BBI) { - Instruction *Inst = &*BBI; - - // Only merge and hoist loads when their result in used only in BB - auto *Load1 = dyn_cast<LoadInst>(Inst); - if (!Load1 || Inst->isUsedOutsideOfBlock(BB1)) - continue; - - MemoryLocation Loc0 = MemoryLocation::get(Load0); - MemoryLocation Loc1 = MemoryLocation::get(Load1); - if (Load0->isSameOperationAs(Load1) && AA->isMustAlias(Loc0, Loc1) && - !isLoadHoistBarrierInRange(BB1->front(), *Load1, Load1, - SafeToLoadUnconditionally) && - !isLoadHoistBarrierInRange(BB0->front(), *Load0, Load0, - SafeToLoadUnconditionally)) { - return Load1; - } - } - return nullptr; -} - -/// -/// \brief Merge two equivalent instructions \p HoistCand and \p ElseInst into -/// \p BB -/// -/// BB is the head of a diamond -/// -void MergedLoadStoreMotion::hoistInstruction(BasicBlock *BB, - Instruction *HoistCand, - Instruction *ElseInst) { - DEBUG(dbgs() << " Hoist Instruction into BB \n"; BB->dump(); - dbgs() << "Instruction Left\n"; HoistCand->dump(); dbgs() << "\n"; - dbgs() << "Instruction Right\n"; ElseInst->dump(); dbgs() << "\n"); - // Hoist the instruction. - assert(HoistCand->getParent() != BB); - - // Intersect optional metadata. - HoistCand->andIRFlags(ElseInst); - HoistCand->dropUnknownNonDebugMetadata(); - - // Prepend point for instruction insert - Instruction *HoistPt = BB->getTerminator(); - - // Merged instruction - Instruction *HoistedInst = HoistCand->clone(); - - // Hoist instruction. - HoistedInst->insertBefore(HoistPt); - - HoistCand->replaceAllUsesWith(HoistedInst); - removeInstruction(HoistCand); - // Replace the else block instruction. - ElseInst->replaceAllUsesWith(HoistedInst); - removeInstruction(ElseInst); -} - -/// -/// \brief Return true if no operand of \p I is defined in I's parent block -/// -bool MergedLoadStoreMotion::isSafeToHoist(Instruction *I) const { - BasicBlock *Parent = I->getParent(); - for (Use &U : I->operands()) - if (auto *Instr = dyn_cast<Instruction>(&U)) - if (Instr->getParent() == Parent) - return false; - return true; -} - -/// -/// \brief Merge two equivalent loads and GEPs and hoist into diamond head -/// -bool MergedLoadStoreMotion::hoistLoad(BasicBlock *BB, LoadInst *L0, - LoadInst *L1) { - // Only one definition? - auto *A0 = dyn_cast<Instruction>(L0->getPointerOperand()); - auto *A1 = dyn_cast<Instruction>(L1->getPointerOperand()); - if (A0 && A1 && A0->isIdenticalTo(A1) && isSafeToHoist(A0) && - A0->hasOneUse() && (A0->getParent() == L0->getParent()) && - A1->hasOneUse() && (A1->getParent() == L1->getParent()) && - isa<GetElementPtrInst>(A0)) { - DEBUG(dbgs() << "Hoist Instruction into BB \n"; BB->dump(); - dbgs() << "Instruction Left\n"; L0->dump(); dbgs() << "\n"; - dbgs() << "Instruction Right\n"; L1->dump(); dbgs() << "\n"); - hoistInstruction(BB, A0, A1); - hoistInstruction(BB, L0, L1); - return true; - } - return false; -} - -/// -/// \brief Try to hoist two loads to same address into diamond header -/// -/// Starting from a diamond head block, iterate over the instructions in one -/// successor block and try to match a load in the second successor. 
-/// -bool MergedLoadStoreMotion::mergeLoads(BasicBlock *BB) { - bool MergedLoads = false; - assert(isDiamondHead(BB)); - BranchInst *BI = cast<BranchInst>(BB->getTerminator()); - BasicBlock *Succ0 = BI->getSuccessor(0); - BasicBlock *Succ1 = BI->getSuccessor(1); - // #Instructions in Succ1 for Compile Time Control - int Size1 = Succ1->size(); - int NLoads = 0; - for (BasicBlock::iterator BBI = Succ0->begin(), BBE = Succ0->end(); - BBI != BBE;) { - Instruction *I = &*BBI; - ++BBI; - - // Don't move non-simple (atomic, volatile) loads. - auto *L0 = dyn_cast<LoadInst>(I); - if (!L0 || !L0->isSimple() || L0->isUsedOutsideOfBlock(Succ0)) - continue; - - ++NLoads; - if (NLoads * Size1 >= MagicCompileTimeControl) - break; - if (LoadInst *L1 = canHoistFromBlock(Succ1, L0)) { - bool Res = hoistLoad(BB, L0, L1); - MergedLoads |= Res; - // Don't attempt to hoist above loads that had not been hoisted. - if (!Res) - break; - } - } - return MergedLoads; -} /// /// \brief True when instruction is a sink barrier for a store @@ -410,7 +238,7 @@ PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0, &BB->front()); NewPN->addIncoming(Opd1, S0->getParent()); NewPN->addIncoming(Opd2, S1->getParent()); - if (MD && NewPN->getType()->getScalarType()->isPointerTy()) + if (MD && NewPN->getType()->isPtrOrPtrVectorTy()) MD->invalidateCachedPointerInfo(NewPN); return NewPN; } @@ -534,7 +362,6 @@ bool MergedLoadStoreMotion::run(Function &F, MemoryDependenceResults *MD, // Hoist equivalent loads and sink stores // outside diamonds when possible if (isDiamondHead(BB)) { - Changed |= mergeLoads(BB); Changed |= mergeStores(getDiamondTail(BB)); } } @@ -596,8 +423,8 @@ MergedLoadStoreMotionPass::run(Function &F, FunctionAnalysisManager &AM) { if (!Impl.run(F, MD, AA)) return PreservedAnalyses::all(); - // FIXME: This should also 'preserve the CFG'. PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); PA.preserve<GlobalsAA>(); PA.preserve<MemoryDependenceAnalysis>(); return PA; diff --git a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp index 0a3bf7b..d0bfe36 100644 --- a/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/NaryReassociate.cpp @@ -156,20 +156,12 @@ PreservedAnalyses NaryReassociatePass::run(Function &F, auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F); auto *TTI = &AM.getResult<TargetIRAnalysis>(F); - bool Changed = runImpl(F, AC, DT, SE, TLI, TTI); - - // FIXME: We need to invalidate this to avoid PR28400. Is there a better - // solution? - AM.invalidate<ScalarEvolutionAnalysis>(F); - - if (!Changed) + if (!runImpl(F, AC, DT, SE, TLI, TTI)) return PreservedAnalyses::all(); - // FIXME: This should also 'preserve the CFG'. PreservedAnalyses PA; - PA.preserve<DominatorTreeAnalysis>(); + PA.preserveSet<CFGAnalyses>(); PA.preserve<ScalarEvolutionAnalysis>(); - PA.preserve<TargetLibraryAnalysis>(); return PA; } @@ -219,7 +211,8 @@ bool NaryReassociatePass::doOneIteration(Function &F) { Changed = true; SE->forgetValue(&*I); I->replaceAllUsesWith(NewI); - // If SeenExprs constains I's WeakVH, that entry will be replaced with + // If SeenExprs constains I's WeakTrackingVH, that entry will be + // replaced with // nullptr. RecursivelyDeleteTriviallyDeadInstructions(&*I, TLI); I = NewI->getIterator(); @@ -227,7 +220,7 @@ bool NaryReassociatePass::doOneIteration(Function &F) { // Add the rewritten instruction to SeenExprs; the original instruction // is deleted. 
const SCEV *NewSCEV = SE->getSCEV(&*I); - SeenExprs[NewSCEV].push_back(WeakVH(&*I)); + SeenExprs[NewSCEV].push_back(WeakTrackingVH(&*I)); // Ideally, NewSCEV should equal OldSCEV because tryReassociate(I) // is equivalent to I. However, ScalarEvolution::getSCEV may // weaken nsw causing NewSCEV not to equal OldSCEV. For example, suppose @@ -247,7 +240,7 @@ bool NaryReassociatePass::doOneIteration(Function &F) { // // This improvement is exercised in @reassociate_gep_nsw in nary-gep.ll. if (NewSCEV != OldSCEV) - SeenExprs[OldSCEV].push_back(WeakVH(&*I)); + SeenExprs[OldSCEV].push_back(WeakTrackingVH(&*I)); } } } @@ -502,7 +495,8 @@ NaryReassociatePass::findClosestMatchingDominator(const SCEV *CandidateExpr, // future instruction either. Therefore, we pop it out of the stack. This // optimization makes the algorithm O(n). while (!Candidates.empty()) { - // Candidates stores WeakVHs, so a candidate can be nullptr if it's removed + // Candidates stores WeakTrackingVHs, so a candidate can be nullptr if it's + // removed // during rewriting. if (Value *Candidate = Candidates.back()) { Instruction *CandidateInstruction = cast<Instruction>(Candidate); diff --git a/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp b/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp index 57e6e3d..9d01856 100644 --- a/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/NewGVN.cpp @@ -17,6 +17,37 @@ /// "A Sparse Algorithm for Predicated Global Value Numbering" from /// Karthik Gargi. /// +/// A brief overview of the algorithm: The algorithm is essentially the same as +/// the standard RPO value numbering algorithm (a good reference is the paper +/// "SCC based value numbering" by L. Taylor Simpson) with one major difference: +/// The RPO algorithm proceeds, on every iteration, to process every reachable +/// block and every instruction in that block. This is because the standard RPO +/// algorithm does not track what things have the same value number, it only +/// tracks what the value number of a given operation is (the mapping is +/// operation -> value number). Thus, when a value number of an operation +/// changes, it must reprocess everything to ensure all uses of a value number +/// get updated properly. In constrast, the sparse algorithm we use *also* +/// tracks what operations have a given value number (IE it also tracks the +/// reverse mapping from value number -> operations with that value number), so +/// that it only needs to reprocess the instructions that are affected when +/// something's value number changes. The vast majority of complexity and code +/// in this file is devoted to tracking what value numbers could change for what +/// instructions when various things happen. The rest of the algorithm is +/// devoted to performing symbolic evaluation, forward propagation, and +/// simplification of operations based on the value numbers deduced so far +/// +/// In order to make the GVN mostly-complete, we use a technique derived from +/// "Detection of Redundant Expressions: A Complete and Polynomial-time +/// Algorithm in SSA" by R.R. Pai. The source of incompleteness in most SSA +/// based GVN algorithms is related to their inability to detect equivalence +/// between phi of ops (IE phi(a+b, c+d)) and op of phis (phi(a,c) + phi(b, d)). +/// We resolve this issue by generating the equivalent "phi of ops" form for +/// each op of phis we see, in a way that only takes polynomial time to resolve. +/// +/// We also do not perform elimination by using any published algorithm. 
All +/// published algorithms are O(Instructions). Instead, we use a technique that +/// is O(number of operations with the same value number), enabling us to skip +/// trying to eliminate things that have unique value numbers. //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/NewGVN.h" @@ -30,7 +61,6 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SparseBitVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/TinyPtrVector.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -40,13 +70,10 @@ #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryBuiltins.h" -#include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/MemoryLocation.h" -#include "llvm/Analysis/PHITransAddr.h" +#include "llvm/Analysis/MemorySSA.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/GlobalVariable.h" @@ -55,24 +82,25 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/PatternMatch.h" -#include "llvm/IR/PredIteratorCache.h" #include "llvm/IR/Type.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/DebugCounter.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/GVNExpression.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/MemorySSA.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/Transforms/Utils/PredicateInfo.h" +#include "llvm/Transforms/Utils/VNCoercion.h" +#include <numeric> #include <unordered_map> #include <utility> #include <vector> using namespace llvm; using namespace PatternMatch; using namespace llvm::GVNExpression; - +using namespace llvm::VNCoercion; #define DEBUG_TYPE "newgvn" STATISTIC(NumGVNInstrDeleted, "Number of instructions deleted"); @@ -85,8 +113,19 @@ STATISTIC(NumGVNLeaderChanges, "Number of leader changes"); STATISTIC(NumGVNSortedLeaderChanges, "Number of sorted leader changes"); STATISTIC(NumGVNAvoidedSortedLeaderChanges, "Number of avoided sorted leader changes"); -STATISTIC(NumGVNNotMostDominatingLeader, - "Number of times a member dominated it's new classes' leader"); +STATISTIC(NumGVNDeadStores, "Number of redundant/dead stores eliminated"); +STATISTIC(NumGVNPHIOfOpsCreated, "Number of PHI of ops created"); +STATISTIC(NumGVNPHIOfOpsEliminations, + "Number of things eliminated using PHI of ops"); +DEBUG_COUNTER(VNCounter, "newgvn-vn", + "Controls which instructions are value numbered") +DEBUG_COUNTER(PHIOfOpsCounter, "newgvn-phi", + "Controls which instructions we create phi of ops for") +// Currently store defining access refinement is too slow due to basicaa being +// egregiously slow. This flag lets us keep it working while we work on this +// issue. +static cl::opt<bool> EnableStoreRefinement("enable-store-refinement", + cl::init(false), cl::Hidden); //===----------------------------------------------------------------------===// // GVN Pass @@ -105,6 +144,79 @@ PHIExpression::~PHIExpression() = default; } } +// Tarjan's SCC finding algorithm with Nuutila's improvements +// SCCIterator is actually fairly complex for the simple thing we want. 
+// It also wants to hand us SCC's that are unrelated to the phi node we ask +// about, and have us process them there or risk redoing work. +// Graph traits over a filter iterator also doesn't work that well here. +// This SCC finder is specialized to walk use-def chains, and only follows +// instructions, +// not generic values (arguments, etc). +struct TarjanSCC { + + TarjanSCC() : Components(1) {} + + void Start(const Instruction *Start) { + if (Root.lookup(Start) == 0) + FindSCC(Start); + } + + const SmallPtrSetImpl<const Value *> &getComponentFor(const Value *V) const { + unsigned ComponentID = ValueToComponent.lookup(V); + + assert(ComponentID > 0 && + "Asking for a component for a value we never processed"); + return Components[ComponentID]; + } + +private: + void FindSCC(const Instruction *I) { + Root[I] = ++DFSNum; + // Store the DFS Number we had before it possibly gets incremented. + unsigned int OurDFS = DFSNum; + for (auto &Op : I->operands()) { + if (auto *InstOp = dyn_cast<Instruction>(Op)) { + if (Root.lookup(Op) == 0) + FindSCC(InstOp); + if (!InComponent.count(Op)) + Root[I] = std::min(Root.lookup(I), Root.lookup(Op)); + } + } + // See if we really were the root of a component, by seeing if we still have + // our DFSNumber. If we do, we are the root of the component, and we have + // completed a component. If we do not, we are not the root of a component, + // and belong on the component stack. + if (Root.lookup(I) == OurDFS) { + unsigned ComponentID = Components.size(); + Components.resize(Components.size() + 1); + auto &Component = Components.back(); + Component.insert(I); + DEBUG(dbgs() << "Component root is " << *I << "\n"); + InComponent.insert(I); + ValueToComponent[I] = ComponentID; + // Pop a component off the stack and label it. + while (!Stack.empty() && Root.lookup(Stack.back()) >= OurDFS) { + auto *Member = Stack.back(); + DEBUG(dbgs() << "Component member is " << *Member << "\n"); + Component.insert(Member); + InComponent.insert(Member); + ValueToComponent[Member] = ComponentID; + Stack.pop_back(); + } + } else { + // Part of a component, push to stack + Stack.push_back(I); + } + } + unsigned int DFSNum = 1; + SmallPtrSet<const Value *, 8> InComponent; + DenseMap<const Value *, unsigned int> Root; + SmallVector<const Value *, 8> Stack; + // Store the components as vector of ptr sets, because we need the topo order + // of SCC's, but not individual member order + SmallVector<SmallPtrSet<const Value *, 8>, 8> Components; + DenseMap<const Value *, unsigned> ValueToComponent; +}; // Congruence classes represent the set of expressions/instructions // that are all the same *during some scope in the function*. // That is, because of the way we perform equality propagation, and @@ -115,46 +227,166 @@ PHIExpression::~PHIExpression() = default; // For any Value in the Member set, it is valid to replace any dominated member // with that Value. // -// Every congruence class has a leader, and the leader is used to -// symbolize instructions in a canonical way (IE every operand of an -// instruction that is a member of the same congruence class will -// always be replaced with leader during symbolization). -// To simplify symbolization, we keep the leader as a constant if class can be -// proved to be a constant value. -// Otherwise, the leader is a randomly chosen member of the value set, it does -// not matter which one is chosen. -// Each congruence class also has a defining expression, -// though the expression may be null. 
If it exists, it can be used for forward -// propagation and reassociation of values. -// -struct CongruenceClass { - using MemberSet = SmallPtrSet<Value *, 4>; +// Every congruence class has a leader, and the leader is used to symbolize +// instructions in a canonical way (IE every operand of an instruction that is a +// member of the same congruence class will always be replaced with leader +// during symbolization). To simplify symbolization, we keep the leader as a +// constant if class can be proved to be a constant value. Otherwise, the +// leader is the member of the value set with the smallest DFS number. Each +// congruence class also has a defining expression, though the expression may be +// null. If it exists, it can be used for forward propagation and reassociation +// of values. + +// For memory, we also track a representative MemoryAccess, and a set of memory +// members for MemoryPhis (which have no real instructions). Note that for +// memory, it seems tempting to try to split the memory members into a +// MemoryCongruenceClass or something. Unfortunately, this does not work +// easily. The value numbering of a given memory expression depends on the +// leader of the memory congruence class, and the leader of memory congruence +// class depends on the value numbering of a given memory expression. This +// leads to wasted propagation, and in some cases, missed optimization. For +// example: If we had value numbered two stores together before, but now do not, +// we move them to a new value congruence class. This in turn will move at one +// of the memorydefs to a new memory congruence class. Which in turn, affects +// the value numbering of the stores we just value numbered (because the memory +// congruence class is part of the value number). So while theoretically +// possible to split them up, it turns out to be *incredibly* complicated to get +// it to work right, because of the interdependency. While structurally +// slightly messier, it is algorithmically much simpler and faster to do what we +// do here, and track them both at once in the same class. +// Note: The default iterators for this class iterate over values +class CongruenceClass { +public: + using MemberType = Value; + using MemberSet = SmallPtrSet<MemberType *, 4>; + using MemoryMemberType = MemoryPhi; + using MemoryMemberSet = SmallPtrSet<const MemoryMemberType *, 2>; + + explicit CongruenceClass(unsigned ID) : ID(ID) {} + CongruenceClass(unsigned ID, Value *Leader, const Expression *E) + : ID(ID), RepLeader(Leader), DefiningExpr(E) {} + unsigned getID() const { return ID; } + // True if this class has no members left. This is mainly used for assertion + // purposes, and for skipping empty classes. + bool isDead() const { + // If it's both dead from a value perspective, and dead from a memory + // perspective, it's really dead. 
+ return empty() && memory_empty(); + } + // Leader functions + Value *getLeader() const { return RepLeader; } + void setLeader(Value *Leader) { RepLeader = Leader; } + const std::pair<Value *, unsigned int> &getNextLeader() const { + return NextLeader; + } + void resetNextLeader() { NextLeader = {nullptr, ~0}; } + + void addPossibleNextLeader(std::pair<Value *, unsigned int> LeaderPair) { + if (LeaderPair.second < NextLeader.second) + NextLeader = LeaderPair; + } + + Value *getStoredValue() const { return RepStoredValue; } + void setStoredValue(Value *Leader) { RepStoredValue = Leader; } + const MemoryAccess *getMemoryLeader() const { return RepMemoryAccess; } + void setMemoryLeader(const MemoryAccess *Leader) { RepMemoryAccess = Leader; } + + // Forward propagation info + const Expression *getDefiningExpr() const { return DefiningExpr; } + + // Value member set + bool empty() const { return Members.empty(); } + unsigned size() const { return Members.size(); } + MemberSet::const_iterator begin() const { return Members.begin(); } + MemberSet::const_iterator end() const { return Members.end(); } + void insert(MemberType *M) { Members.insert(M); } + void erase(MemberType *M) { Members.erase(M); } + void swap(MemberSet &Other) { Members.swap(Other); } + + // Memory member set + bool memory_empty() const { return MemoryMembers.empty(); } + unsigned memory_size() const { return MemoryMembers.size(); } + MemoryMemberSet::const_iterator memory_begin() const { + return MemoryMembers.begin(); + } + MemoryMemberSet::const_iterator memory_end() const { + return MemoryMembers.end(); + } + iterator_range<MemoryMemberSet::const_iterator> memory() const { + return make_range(memory_begin(), memory_end()); + } + void memory_insert(const MemoryMemberType *M) { MemoryMembers.insert(M); } + void memory_erase(const MemoryMemberType *M) { MemoryMembers.erase(M); } + + // Store count + unsigned getStoreCount() const { return StoreCount; } + void incStoreCount() { ++StoreCount; } + void decStoreCount() { + assert(StoreCount != 0 && "Store count went negative"); + --StoreCount; + } + + // True if this class has no memory members. + bool definesNoMemory() const { return StoreCount == 0 && memory_empty(); } + + // Return true if two congruence classes are equivalent to each other. This + // means + // that every field but the ID number and the dead field are equivalent. + bool isEquivalentTo(const CongruenceClass *Other) const { + if (!Other) + return false; + if (this == Other) + return true; + + if (std::tie(StoreCount, RepLeader, RepStoredValue, RepMemoryAccess) != + std::tie(Other->StoreCount, Other->RepLeader, Other->RepStoredValue, + Other->RepMemoryAccess)) + return false; + if (DefiningExpr != Other->DefiningExpr) + if (!DefiningExpr || !Other->DefiningExpr || + *DefiningExpr != *Other->DefiningExpr) + return false; + // We need some ordered set + std::set<Value *> AMembers(Members.begin(), Members.end()); + std::set<Value *> BMembers(Members.begin(), Members.end()); + return AMembers == BMembers; + } + +private: unsigned ID; // Representative leader. Value *RepLeader = nullptr; + // The most dominating leader after our current leader, because the member set + // is not sorted and is expensive to keep sorted all the time. + std::pair<Value *, unsigned int> NextLeader = {nullptr, ~0U}; + // If this is represented by a store, the value of the store. + Value *RepStoredValue = nullptr; + // If this class contains MemoryDefs or MemoryPhis, this is the leading memory + // access. 
+ const MemoryAccess *RepMemoryAccess = nullptr; // Defining Expression. const Expression *DefiningExpr = nullptr; // Actual members of this class. MemberSet Members; - - // True if this class has no members left. This is mainly used for assertion - // purposes, and for skipping empty classes. - bool Dead = false; - + // This is the set of MemoryPhis that exist in the class. MemoryDefs and + // MemoryUses have real instructions representing them, so we only need to + // track MemoryPhis here. + MemoryMemberSet MemoryMembers; // Number of stores in this congruence class. // This is used so we can detect store equivalence changes properly. int StoreCount = 0; - - // The most dominating leader after our current leader, because the member set - // is not sorted and is expensive to keep sorted all the time. - std::pair<Value *, unsigned int> NextLeader = {nullptr, ~0U}; - - explicit CongruenceClass(unsigned ID) : ID(ID) {} - CongruenceClass(unsigned ID, Value *Leader, const Expression *E) - : ID(ID), RepLeader(Leader), DefiningExpr(E) {} }; namespace llvm { +struct ExactEqualsExpression { + const Expression &E; + explicit ExactEqualsExpression(const Expression &E) : E(E) {} + hash_code getComputedHash() const { return E.getComputedHash(); } + bool operator==(const Expression &Other) const { + return E.exactlyEquals(Other); + } +}; + template <> struct DenseMapInfo<const Expression *> { static const Expression *getEmptyKey() { auto Val = static_cast<uintptr_t>(-1); @@ -166,51 +398,144 @@ template <> struct DenseMapInfo<const Expression *> { Val <<= PointerLikeTypeTraits<const Expression *>::NumLowBitsAvailable; return reinterpret_cast<const Expression *>(Val); } - static unsigned getHashValue(const Expression *V) { - return static_cast<unsigned>(V->getHashValue()); + static unsigned getHashValue(const Expression *E) { + return E->getComputedHash(); } + static unsigned getHashValue(const ExactEqualsExpression &E) { + return E.getComputedHash(); + } + static bool isEqual(const ExactEqualsExpression &LHS, const Expression *RHS) { + if (RHS == getTombstoneKey() || RHS == getEmptyKey()) + return false; + return LHS == *RHS; + } + static bool isEqual(const Expression *LHS, const Expression *RHS) { if (LHS == RHS) return true; if (LHS == getTombstoneKey() || RHS == getTombstoneKey() || LHS == getEmptyKey() || RHS == getEmptyKey()) return false; + // Compare hashes before equality. This is *not* what the hashtable does, + // since it is computing it modulo the number of buckets, whereas we are + // using the full hash keyspace. Since the hashes are precomputed, this + // check is *much* faster than equality. + if (LHS->getComputedHash() != RHS->getComputedHash()) + return false; return *LHS == *RHS; } }; } // end namespace llvm -class NewGVN : public FunctionPass { +namespace { +class NewGVN { + Function &F; DominatorTree *DT; - const DataLayout *DL; const TargetLibraryInfo *TLI; - AssumptionCache *AC; AliasAnalysis *AA; MemorySSA *MSSA; MemorySSAWalker *MSSAWalker; - BumpPtrAllocator ExpressionAllocator; - ArrayRecycler<Value *> ArgRecycler; + const DataLayout &DL; + std::unique_ptr<PredicateInfo> PredInfo; + + // These are the only two things the create* functions should have + // side-effects on due to allocating memory. 
+ mutable BumpPtrAllocator ExpressionAllocator; + mutable ArrayRecycler<Value *> ArgRecycler; + mutable TarjanSCC SCCFinder; + const SimplifyQuery SQ; + + // Number of function arguments, used by ranking + unsigned int NumFuncArgs; + + // RPOOrdering of basic blocks + DenseMap<const DomTreeNode *, unsigned> RPOOrdering; // Congruence class info. - CongruenceClass *InitialClass; + + // This class is called INITIAL in the paper. It is the class everything + // startsout in, and represents any value. Being an optimistic analysis, + // anything in the TOP class has the value TOP, which is indeterminate and + // equivalent to everything. + CongruenceClass *TOPClass; std::vector<CongruenceClass *> CongruenceClasses; unsigned NextCongruenceNum; // Value Mappings. DenseMap<Value *, CongruenceClass *> ValueToClass; DenseMap<Value *, const Expression *> ValueToExpression; + // Value PHI handling, used to make equivalence between phi(op, op) and + // op(phi, phi). + // These mappings just store various data that would normally be part of the + // IR. + DenseSet<const Instruction *> PHINodeUses; + // Map a temporary instruction we created to a parent block. + DenseMap<const Value *, BasicBlock *> TempToBlock; + // Map between the temporary phis we created and the real instructions they + // are known equivalent to. + DenseMap<const Value *, PHINode *> RealToTemp; + // In order to know when we should re-process instructions that have + // phi-of-ops, we track the set of expressions that they needed as + // leaders. When we discover new leaders for those expressions, we process the + // associated phi-of-op instructions again in case they have changed. The + // other way they may change is if they had leaders, and those leaders + // disappear. However, at the point they have leaders, there are uses of the + // relevant operands in the created phi node, and so they will get reprocessed + // through the normal user marking we perform. + mutable DenseMap<const Value *, SmallPtrSet<Value *, 2>> AdditionalUsers; + DenseMap<const Expression *, SmallPtrSet<Instruction *, 2>> + ExpressionToPhiOfOps; + // Map from basic block to the temporary operations we created + DenseMap<const BasicBlock *, SmallVector<PHINode *, 8>> PHIOfOpsPHIs; + // Map from temporary operation to MemoryAccess. + DenseMap<const Instruction *, MemoryUseOrDef *> TempToMemory; + // Set of all temporary instructions we created. + DenseSet<Instruction *> AllTempInstructions; + + // Mapping from predicate info we used to the instructions we used it with. + // In order to correctly ensure propagation, we must keep track of what + // comparisons we used, so that when the values of the comparisons change, we + // propagate the information to the places we used the comparison. + mutable DenseMap<const Value *, SmallPtrSet<Instruction *, 2>> + PredicateToUsers; + // the same reasoning as PredicateToUsers. When we skip MemoryAccesses for + // stores, we no longer can rely solely on the def-use chains of MemorySSA. + mutable DenseMap<const MemoryAccess *, SmallPtrSet<MemoryAccess *, 2>> + MemoryToUsers; // A table storing which memorydefs/phis represent a memory state provably // equivalent to another memory state. // We could use the congruence class machinery, but the MemoryAccess's are // abstract memory states, so they can only ever be equivalent to each other, // and not to constants, etc. 
- DenseMap<const MemoryAccess *, MemoryAccess *> MemoryAccessEquiv; - + DenseMap<const MemoryAccess *, CongruenceClass *> MemoryAccessToClass; + + // We could, if we wanted, build MemoryPhiExpressions and + // MemoryVariableExpressions, etc, and value number them the same way we value + // number phi expressions. For the moment, this seems like overkill. They + // can only exist in one of three states: they can be TOP (equal to + // everything), Equivalent to something else, or unique. Because we do not + // create expressions for them, we need to simulate leader change not just + // when they change class, but when they change state. Note: We can do the + // same thing for phis, and avoid having phi expressions if we wanted, We + // should eventually unify in one direction or the other, so this is a little + // bit of an experiment in which turns out easier to maintain. + enum MemoryPhiState { MPS_Invalid, MPS_TOP, MPS_Equivalent, MPS_Unique }; + DenseMap<const MemoryPhi *, MemoryPhiState> MemoryPhiState; + + enum InstCycleState { ICS_Unknown, ICS_CycleFree, ICS_Cycle }; + mutable DenseMap<const Instruction *, InstCycleState> InstCycleState; // Expression to class mapping. using ExpressionClassMap = DenseMap<const Expression *, CongruenceClass *>; ExpressionClassMap ExpressionToClass; + // We have a single expression that represents currently DeadExpressions. + // For dead expressions we can prove will stay dead, we mark them with + // DFS number zero. However, it's possible in the case of phi nodes + // for us to assume/prove all arguments are dead during fixpointing. + // We use DeadExpression for that case. + DeadExpression *SingletonDeadExpression = nullptr; + // Which values have changed as a result of leader changes. SmallPtrSet<Value *, 8> LeaderChanges; @@ -231,8 +556,6 @@ class NewGVN : public FunctionPass { BitVector TouchedInstructions; DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange; - DenseMap<const DomTreeNode *, std::pair<unsigned, unsigned>> - DominatedInstRange; #ifndef NDEBUG // Debugging for how many times each block and instruction got processed. @@ -240,56 +563,47 @@ class NewGVN : public FunctionPass { #endif // DFS info. - DenseMap<const BasicBlock *, std::pair<int, int>> DFSDomMap; + // This contains a mapping from Instructions to DFS numbers. + // The numbering starts at 1. An instruction with DFS number zero + // means that the instruction is dead. DenseMap<const Value *, unsigned> InstrDFS; + + // This contains the mapping DFS numbers to instructions. SmallVector<Value *, 32> DFSToInstr; // Deletion info. SmallPtrSet<Instruction *, 8> InstructionsToErase; public: - static char ID; // Pass identification, replacement for typeid. - NewGVN() : FunctionPass(ID) { - initializeNewGVNPass(*PassRegistry::getPassRegistry()); + NewGVN(Function &F, DominatorTree *DT, AssumptionCache *AC, + TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA, + const DataLayout &DL) + : F(F), DT(DT), TLI(TLI), AA(AA), MSSA(MSSA), DL(DL), + PredInfo(make_unique<PredicateInfo>(F, *DT, *AC)), SQ(DL, TLI, DT, AC) { } - - bool runOnFunction(Function &F) override; - bool runGVN(Function &F, DominatorTree *DT, AssumptionCache *AC, - TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA); + bool runGVN(); private: - // This transformation requires dominator postdominator info. 
- void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired<AssumptionCacheTracker>(); - AU.addRequired<DominatorTreeWrapperPass>(); - AU.addRequired<TargetLibraryInfoWrapperPass>(); - AU.addRequired<MemorySSAWrapperPass>(); - AU.addRequired<AAResultsWrapperPass>(); - - AU.addPreserved<DominatorTreeWrapperPass>(); - AU.addPreserved<GlobalsAAWrapperPass>(); - } - // Expression handling. - const Expression *createExpression(Instruction *, const BasicBlock *); - const Expression *createBinaryExpression(unsigned, Type *, Value *, Value *, - const BasicBlock *); - PHIExpression *createPHIExpression(Instruction *); - const VariableExpression *createVariableExpression(Value *); - const ConstantExpression *createConstantExpression(Constant *); - const Expression *createVariableOrConstant(Value *V, const BasicBlock *B); - const UnknownExpression *createUnknownExpression(Instruction *); - const StoreExpression *createStoreExpression(StoreInst *, MemoryAccess *, - const BasicBlock *); + const Expression *createExpression(Instruction *) const; + const Expression *createBinaryExpression(unsigned, Type *, Value *, + Value *) const; + PHIExpression *createPHIExpression(Instruction *, bool &HasBackEdge, + bool &OriginalOpsConstant) const; + const DeadExpression *createDeadExpression() const; + const VariableExpression *createVariableExpression(Value *) const; + const ConstantExpression *createConstantExpression(Constant *) const; + const Expression *createVariableOrConstant(Value *V) const; + const UnknownExpression *createUnknownExpression(Instruction *) const; + const StoreExpression *createStoreExpression(StoreInst *, + const MemoryAccess *) const; LoadExpression *createLoadExpression(Type *, Value *, LoadInst *, - MemoryAccess *, const BasicBlock *); - - const CallExpression *createCallExpression(CallInst *, MemoryAccess *, - const BasicBlock *); + const MemoryAccess *) const; + const CallExpression *createCallExpression(CallInst *, + const MemoryAccess *) const; const AggregateValueExpression * - createAggregateValueExpression(Instruction *, const BasicBlock *); - bool setBasicExpressionInfo(Instruction *, BasicExpression *, - const BasicBlock *); + createAggregateValueExpression(Instruction *) const; + bool setBasicExpressionInfo(Instruction *, BasicExpression *) const; // Congruence class handling. CongruenceClass *createCongruenceClass(Value *Leader, const Expression *E) { @@ -298,13 +612,28 @@ private: return result; } + CongruenceClass *createMemoryClass(MemoryAccess *MA) { + auto *CC = createCongruenceClass(nullptr, nullptr); + CC->setMemoryLeader(MA); + return CC; + } + CongruenceClass *ensureLeaderOfMemoryClass(MemoryAccess *MA) { + auto *CC = getMemoryClass(MA); + if (CC->getMemoryLeader() != MA) + CC = createMemoryClass(MA); + return CC; + } + CongruenceClass *createSingletonCongruenceClass(Value *Member) { CongruenceClass *CClass = createCongruenceClass(Member, nullptr); - CClass->Members.insert(Member); + CClass->insert(Member); ValueToClass[Member] = CClass; return CClass; } void initializeCongruenceClasses(Function &F); + const Expression *makePossiblePhiOfOps(Instruction *, + SmallPtrSetImpl<Value *> &); + void addPhiOfOps(PHINode *Op, BasicBlock *BB, Instruction *ExistingValue); // Value number an Instruction or MemoryPhi. void valueNumberMemoryPhi(MemoryPhi *); @@ -312,78 +641,128 @@ private: // Symbolic evaluation. 
const Expression *checkSimplificationResults(Expression *, Instruction *, - Value *); - const Expression *performSymbolicEvaluation(Value *, const BasicBlock *); - const Expression *performSymbolicLoadEvaluation(Instruction *, - const BasicBlock *); - const Expression *performSymbolicStoreEvaluation(Instruction *, - const BasicBlock *); - const Expression *performSymbolicCallEvaluation(Instruction *, - const BasicBlock *); - const Expression *performSymbolicPHIEvaluation(Instruction *, - const BasicBlock *); - bool setMemoryAccessEquivTo(MemoryAccess *From, MemoryAccess *To); - const Expression *performSymbolicAggrValueEvaluation(Instruction *, - const BasicBlock *); + Value *) const; + const Expression *performSymbolicEvaluation(Value *, + SmallPtrSetImpl<Value *> &) const; + const Expression *performSymbolicLoadCoercion(Type *, Value *, LoadInst *, + Instruction *, + MemoryAccess *) const; + const Expression *performSymbolicLoadEvaluation(Instruction *) const; + const Expression *performSymbolicStoreEvaluation(Instruction *) const; + const Expression *performSymbolicCallEvaluation(Instruction *) const; + const Expression *performSymbolicPHIEvaluation(Instruction *) const; + const Expression *performSymbolicAggrValueEvaluation(Instruction *) const; + const Expression *performSymbolicCmpEvaluation(Instruction *) const; + const Expression *performSymbolicPredicateInfoEvaluation(Instruction *) const; // Congruence finding. - // Templated to allow them to work both on BB's and BB-edges. - template <class T> - Value *lookupOperandLeader(Value *, const User *, const T &) const; + bool someEquivalentDominates(const Instruction *, const Instruction *) const; + Value *lookupOperandLeader(Value *) const; void performCongruenceFinding(Instruction *, const Expression *); - void moveValueToNewCongruenceClass(Instruction *, CongruenceClass *, - CongruenceClass *); + void moveValueToNewCongruenceClass(Instruction *, const Expression *, + CongruenceClass *, CongruenceClass *); + void moveMemoryToNewCongruenceClass(Instruction *, MemoryAccess *, + CongruenceClass *, CongruenceClass *); + Value *getNextValueLeader(CongruenceClass *) const; + const MemoryAccess *getNextMemoryLeader(CongruenceClass *) const; + bool setMemoryClass(const MemoryAccess *From, CongruenceClass *To); + CongruenceClass *getMemoryClass(const MemoryAccess *MA) const; + const MemoryAccess *lookupMemoryLeader(const MemoryAccess *) const; + bool isMemoryAccessTOP(const MemoryAccess *) const; + + // Ranking + unsigned int getRank(const Value *) const; + bool shouldSwapOperands(const Value *, const Value *) const; + // Reachability handling. void updateReachableEdge(BasicBlock *, BasicBlock *); void processOutgoingEdges(TerminatorInst *, BasicBlock *); - bool isOnlyReachableViaThisEdge(const BasicBlockEdge &) const; - Value *findConditionEquivalence(Value *, BasicBlock *) const; - MemoryAccess *lookupMemoryAccessEquiv(MemoryAccess *) const; + Value *findConditionEquivalence(Value *) const; // Elimination. 
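// ---------------------------------------------------------------------------
// Illustrative aside (not part of this patch): a sketch of the DFS-ordered
// elimination that convertClassToDFSOrdered/eliminateInstructions perform.
// Members of a class are visited in dominator-tree DFS order with a stack of
// candidate leaders; anything whose DFS interval is enclosed by the current
// leader's interval is dominated by an equivalent value and can be replaced.
// Hypothetical names, std:: containers, and hand-picked intervals.
#include <cassert>
#include <vector>

struct ToyMember {
  unsigned DFSIn, DFSOut; // dominator-tree DFS interval of the defining block
  int Id;
};

static bool dominates(const ToyMember &A, const ToyMember &B) {
  return A.DFSIn <= B.DFSIn && A.DFSOut >= B.DFSOut;
}

// Returns the IDs of members that would be replaced by a dominating leader.
static std::vector<int> eliminate(const std::vector<ToyMember> &SortedByDFS) {
  std::vector<int> Replaced;
  std::vector<ToyMember> LeaderStack;
  for (const ToyMember &M : SortedByDFS) {
    while (!LeaderStack.empty() && !dominates(LeaderStack.back(), M))
      LeaderStack.pop_back();   // left that leader's subtree
    if (!LeaderStack.empty())
      Replaced.push_back(M.Id); // dominated by an equivalent leader
    else
      LeaderStack.push_back(M); // becomes the leader for its own subtree
  }
  return Replaced;
}

int main() {
  // A leader spanning [1,10] dominates a member at [2,3]; a member at [11,12]
  // is outside that subtree and becomes a leader of its own.
  auto Replaced = eliminate({{1, 10, 0}, {2, 3, 1}, {11, 12, 2}});
  assert(Replaced.size() == 1 && Replaced[0] == 1);
}
// ---------------------------------------------------------------------------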
struct ValueDFS; - void convertDenseToDFSOrdered(CongruenceClass::MemberSet &, - SmallVectorImpl<ValueDFS> &); + void convertClassToDFSOrdered(const CongruenceClass &, + SmallVectorImpl<ValueDFS> &, + DenseMap<const Value *, unsigned int> &, + SmallPtrSetImpl<Instruction *> &) const; + void convertClassToLoadsAndStores(const CongruenceClass &, + SmallVectorImpl<ValueDFS> &) const; bool eliminateInstructions(Function &); void replaceInstruction(Instruction *, Value *); void markInstructionForDeletion(Instruction *); void deleteInstructionsInBlock(BasicBlock *); + Value *findPhiOfOpsLeader(const Expression *E, const BasicBlock *BB) const; // New instruction creation. void handleNewInstruction(Instruction *){}; // Various instruction touch utilities + template <typename Map, typename KeyType, typename Func> + void for_each_found(Map &, const KeyType &, Func); + template <typename Map, typename KeyType> + void touchAndErase(Map &, const KeyType &); void markUsersTouched(Value *); - void markMemoryUsersTouched(MemoryAccess *); - void markLeaderChangeTouched(CongruenceClass *CC); + void markMemoryUsersTouched(const MemoryAccess *); + void markMemoryDefTouched(const MemoryAccess *); + void markPredicateUsersTouched(Instruction *); + void markValueLeaderChangeTouched(CongruenceClass *CC); + void markMemoryLeaderChangeTouched(CongruenceClass *CC); + void markPhiOfOpsChanged(const Expression *E); + void addPredicateUsers(const PredicateBase *, Instruction *) const; + void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const; + void addAdditionalUsers(Value *To, Value *User) const; + + // Main loop of value numbering + void iterateTouchedInstructions(); // Utilities. void cleanupTables(); std::pair<unsigned, unsigned> assignDFSNumbers(BasicBlock *, unsigned); - void updateProcessedCount(Value *V); + void updateProcessedCount(const Value *V); void verifyMemoryCongruency() const; - bool singleReachablePHIPath(const MemoryAccess *, const MemoryAccess *) const; -}; - -char NewGVN::ID = 0; + void verifyIterationSettled(Function &F); + void verifyStoreExpressions() const; + bool singleReachablePHIPath(SmallPtrSet<const MemoryAccess *, 8> &, + const MemoryAccess *, const MemoryAccess *) const; + BasicBlock *getBlockForValue(Value *V) const; + void deleteExpression(const Expression *E) const; + MemoryUseOrDef *getMemoryAccess(const Instruction *) const; + MemoryAccess *getDefiningAccess(const MemoryAccess *) const; + MemoryPhi *getMemoryAccess(const BasicBlock *) const; + template <class T, class Range> T *getMinDFSOfRange(const Range &) const; + unsigned InstrToDFSNum(const Value *V) const { + assert(isa<Instruction>(V) && "This should not be used for MemoryAccesses"); + return InstrDFS.lookup(V); + } -// createGVNPass - The public interface to this file. -FunctionPass *llvm::createNewGVNPass() { return new NewGVN(); } + unsigned InstrToDFSNum(const MemoryAccess *MA) const { + return MemoryToDFSNum(MA); + } + Value *InstrFromDFSNum(unsigned DFSNum) { return DFSToInstr[DFSNum]; } + // Given a MemoryAccess, return the relevant instruction DFS number. Note: + // This deliberately takes a value so it can be used with Use's, which will + // auto-convert to Value's but not to MemoryAccess's. + unsigned MemoryToDFSNum(const Value *MA) const { + assert(isa<MemoryAccess>(MA) && + "This should not be used with instructions"); + return isa<MemoryUseOrDef>(MA) + ? 
InstrToDFSNum(cast<MemoryUseOrDef>(MA)->getMemoryInst()) + : InstrDFS.lookup(MA); + } + bool isCycleFree(const Instruction *) const; + bool isBackedge(BasicBlock *From, BasicBlock *To) const; + // Debug counter info. When verifying, we have to reset the value numbering + // debug counter to the same state it started in to get the same results. + std::pair<int, int> StartingVNCounter; +}; +} // end anonymous namespace template <typename T> static bool equalsLoadStoreHelper(const T &LHS, const Expression &RHS) { - if ((!isa<LoadExpression>(RHS) && !isa<StoreExpression>(RHS)) || - !LHS.BasicExpression::equals(RHS)) { + if (!isa<LoadExpression>(RHS) && !isa<StoreExpression>(RHS)) return false; - } else if (const auto *L = dyn_cast<LoadExpression>(&RHS)) { - if (LHS.getDefiningAccess() != L->getDefiningAccess()) - return false; - } else if (const auto *S = dyn_cast<StoreExpression>(&RHS)) { - if (LHS.getDefiningAccess() != S->getDefiningAccess()) - return false; - } - return true; + return LHS.MemoryExpression::equals(RHS); } bool LoadExpression::equals(const Expression &Other) const { @@ -391,7 +770,22 @@ bool LoadExpression::equals(const Expression &Other) const { } bool StoreExpression::equals(const Expression &Other) const { - return equalsLoadStoreHelper(*this, Other); + if (!equalsLoadStoreHelper(*this, Other)) + return false; + // Make sure that store vs store includes the value operand. + if (const auto *S = dyn_cast<StoreExpression>(&Other)) + if (getStoredValue() != S->getStoredValue()) + return false; + return true; +} + +// Determine if the edge From->To is a backedge +bool NewGVN::isBackedge(BasicBlock *From, BasicBlock *To) const { + if (From == To) + return true; + auto *FromDTN = DT->getNode(From); + auto *ToDTN = DT->getNode(To); + return RPOOrdering.lookup(FromDTN) >= RPOOrdering.lookup(ToDTN); } #ifndef NDEBUG @@ -400,17 +794,45 @@ static std::string getBlockName(const BasicBlock *B) { } #endif -INITIALIZE_PASS_BEGIN(NewGVN, "newgvn", "Global Value Numbering", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) -INITIALIZE_PASS_END(NewGVN, "newgvn", "Global Value Numbering", false, false) +// Get a MemoryAccess for an instruction, fake or real. +MemoryUseOrDef *NewGVN::getMemoryAccess(const Instruction *I) const { + auto *Result = MSSA->getMemoryAccess(I); + return Result ? Result : TempToMemory.lookup(I); +} -PHIExpression *NewGVN::createPHIExpression(Instruction *I) { - BasicBlock *PHIBlock = I->getParent(); +// Get a MemoryPhi for a basic block. These are all real. +MemoryPhi *NewGVN::getMemoryAccess(const BasicBlock *BB) const { + return MSSA->getMemoryAccess(BB); +} + +// Get the basic block from an instruction/memory value. +BasicBlock *NewGVN::getBlockForValue(Value *V) const { + if (auto *I = dyn_cast<Instruction>(V)) { + auto *Parent = I->getParent(); + if (Parent) + return Parent; + Parent = TempToBlock.lookup(V); + assert(Parent && "Every fake instruction should have a block"); + return Parent; + } + + auto *MP = dyn_cast<MemoryPhi>(V); + assert(MP && "Should have been an instruction or a MemoryPhi"); + return MP->getBlock(); +} + +// Delete a definitely dead expression, so it can be reused by the expression +// allocator. 
Some of these are not in creation functions, so we have to accept +// const versions. +void NewGVN::deleteExpression(const Expression *E) const { + assert(isa<BasicExpression>(E)); + auto *BE = cast<BasicExpression>(E); + const_cast<BasicExpression *>(BE)->deallocateOperands(ArgRecycler); + ExpressionAllocator.Deallocate(E); +} +PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge, + bool &OriginalOpsConstant) const { + BasicBlock *PHIBlock = getBlockForValue(I); auto *PN = cast<PHINode>(I); auto *E = new (ExpressionAllocator) PHIExpression(PN->getNumOperands(), PHIBlock); @@ -419,28 +841,47 @@ PHIExpression *NewGVN::createPHIExpression(Instruction *I) { E->setType(I->getType()); E->setOpcode(I->getOpcode()); - auto ReachablePhiArg = [&](const Use &U) { - return ReachableBlocks.count(PN->getIncomingBlock(U)); - }; - - // Filter out unreachable operands - auto Filtered = make_filter_range(PN->operands(), ReachablePhiArg); - + // NewGVN assumes the operands of a PHI node are in a consistent order across + // PHIs. LLVM doesn't seem to always guarantee this. While we need to fix + // this in LLVM at some point we don't want GVN to find wrong congruences. + // Therefore, here we sort uses in predecessor order. + // We're sorting the values by pointer. In theory this might be cause of + // non-determinism, but here we don't rely on the ordering for anything + // significant, e.g. we don't create new instructions based on it so we're + // fine. + SmallVector<const Use *, 4> PHIOperands; + for (const Use &U : PN->operands()) + PHIOperands.push_back(&U); + std::sort(PHIOperands.begin(), PHIOperands.end(), + [&](const Use *U1, const Use *U2) { + return PN->getIncomingBlock(*U1) < PN->getIncomingBlock(*U2); + }); + + // Filter out unreachable phi operands. + auto Filtered = make_filter_range(PHIOperands, [&](const Use *U) { + if (*U == PN) + return false; + if (!ReachableEdges.count({PN->getIncomingBlock(*U), PHIBlock})) + return false; + // Things in TOPClass are equivalent to everything. + if (ValueToClass.lookup(*U) == TOPClass) + return false; + return lookupOperandLeader(*U) != PN; + }); std::transform(Filtered.begin(), Filtered.end(), op_inserter(E), - [&](const Use &U) -> Value * { - // Don't try to transform self-defined phis. - if (U == PN) - return PN; - const BasicBlockEdge BBE(PN->getIncomingBlock(U), PHIBlock); - return lookupOperandLeader(U, I, BBE); + [&](const Use *U) -> Value * { + auto *BB = PN->getIncomingBlock(*U); + HasBackedge = HasBackedge || isBackedge(BB, PHIBlock); + OriginalOpsConstant = + OriginalOpsConstant && isa<Constant>(*U); + return lookupOperandLeader(*U); }); return E; } // Set basic expression info (Arguments, type, opcode) for Expression // E from Instruction I in block B. -bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E, - const BasicBlock *B) { +bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) const { bool AllConstant = true; if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) E->setType(GEP->getSourceElementType()); @@ -452,8 +893,8 @@ bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E, // Transform the operand array into an operand leader array, and keep track of // whether all members are constant. 
std::transform(I->op_begin(), I->op_end(), op_inserter(E), [&](Value *O) { - auto Operand = lookupOperandLeader(O, I, B); - AllConstant &= isa<Constant>(Operand); + auto Operand = lookupOperandLeader(O); + AllConstant = AllConstant && isa<Constant>(Operand); return Operand; }); @@ -461,8 +902,8 @@ bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E, } const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T, - Value *Arg1, Value *Arg2, - const BasicBlock *B) { + Value *Arg1, + Value *Arg2) const { auto *E = new (ExpressionAllocator) BasicExpression(2); E->setType(T); @@ -473,14 +914,13 @@ const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T, // of their operands get the same value number by sorting the operand value // numbers. Since all commutative instructions have two operands it is more // efficient to sort by hand rather than using, say, std::sort. - if (Arg1 > Arg2) + if (shouldSwapOperands(Arg1, Arg2)) std::swap(Arg1, Arg2); } - E->op_push_back(lookupOperandLeader(Arg1, nullptr, B)); - E->op_push_back(lookupOperandLeader(Arg2, nullptr, B)); + E->op_push_back(lookupOperandLeader(Arg1)); + E->op_push_back(lookupOperandLeader(Arg2)); - Value *V = SimplifyBinOp(Opcode, E->getOperand(0), E->getOperand(1), *DL, TLI, - DT, AC); + Value *V = SimplifyBinOp(Opcode, E->getOperand(0), E->getOperand(1), SQ); if (const Expression *SimplifiedE = checkSimplificationResults(E, nullptr, V)) return SimplifiedE; return E; @@ -492,7 +932,8 @@ const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T, // TODO: Once finished, this should not take an Instruction, we only // use it for printing. const Expression *NewGVN::checkSimplificationResults(Expression *E, - Instruction *I, Value *V) { + Instruction *I, + Value *V) const { if (!V) return nullptr; if (auto *C = dyn_cast<Constant>(V)) { @@ -502,40 +943,40 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E, NumGVNOpsSimplified++; assert(isa<BasicExpression>(E) && "We should always have had a basic expression here"); - - cast<BasicExpression>(E)->deallocateOperands(ArgRecycler); - ExpressionAllocator.Deallocate(E); + deleteExpression(E); return createConstantExpression(C); } else if (isa<Argument>(V) || isa<GlobalVariable>(V)) { if (I) DEBUG(dbgs() << "Simplified " << *I << " to " << " variable " << *V << "\n"); - cast<BasicExpression>(E)->deallocateOperands(ArgRecycler); - ExpressionAllocator.Deallocate(E); + deleteExpression(E); return createVariableExpression(V); } CongruenceClass *CC = ValueToClass.lookup(V); - if (CC && CC->DefiningExpr) { + if (CC && CC->getDefiningExpr()) { + // If we simplified to something else, we need to communicate + // that we're users of the value we simplified to. + if (I != V) { + // Don't add temporary instructions to the user lists. 
+ if (!AllTempInstructions.count(I)) + addAdditionalUsers(V, I); + } + if (I) DEBUG(dbgs() << "Simplified " << *I << " to " - << " expression " << *V << "\n"); + << " expression " << *CC->getDefiningExpr() << "\n"); NumGVNOpsSimplified++; - assert(isa<BasicExpression>(E) && - "We should always have had a basic expression here"); - cast<BasicExpression>(E)->deallocateOperands(ArgRecycler); - ExpressionAllocator.Deallocate(E); - return CC->DefiningExpr; + deleteExpression(E); + return CC->getDefiningExpr(); } return nullptr; } -const Expression *NewGVN::createExpression(Instruction *I, - const BasicBlock *B) { - +const Expression *NewGVN::createExpression(Instruction *I) const { auto *E = new (ExpressionAllocator) BasicExpression(I->getNumOperands()); - bool AllConstant = setBasicExpressionInfo(I, E, B); + bool AllConstant = setBasicExpressionInfo(I, E); if (I->isCommutative()) { // Ensure that commutative instructions that only differ by a permutation @@ -543,7 +984,7 @@ const Expression *NewGVN::createExpression(Instruction *I, // numbers. Since all commutative instructions have two operands it is more // efficient to sort by hand rather than using, say, std::sort. assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!"); - if (E->getOperand(0) > E->getOperand(1)) + if (shouldSwapOperands(E->getOperand(0), E->getOperand(1))) E->swapOperands(0, 1); } @@ -559,48 +1000,43 @@ const Expression *NewGVN::createExpression(Instruction *I, // Sort the operand value numbers so x<y and y>x get the same value // number. CmpInst::Predicate Predicate = CI->getPredicate(); - if (E->getOperand(0) > E->getOperand(1)) { + if (shouldSwapOperands(E->getOperand(0), E->getOperand(1))) { E->swapOperands(0, 1); Predicate = CmpInst::getSwappedPredicate(Predicate); } E->setOpcode((CI->getOpcode() << 8) | Predicate); // TODO: 25% of our time is spent in SimplifyCmpInst with pointer operands - // TODO: Since we noop bitcasts, we may need to check types before - // simplifying, so that we don't end up simplifying based on a wrong - // type assumption. 
We should clean this up so we can use constants of the - // wrong type - assert(I->getOperand(0)->getType() == I->getOperand(1)->getType() && "Wrong types on cmp instruction"); - if ((E->getOperand(0)->getType() == I->getOperand(0)->getType() && - E->getOperand(1)->getType() == I->getOperand(1)->getType())) { - Value *V = SimplifyCmpInst(Predicate, E->getOperand(0), E->getOperand(1), - *DL, TLI, DT, AC); - if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V)) - return SimplifiedE; - } + assert((E->getOperand(0)->getType() == I->getOperand(0)->getType() && + E->getOperand(1)->getType() == I->getOperand(1)->getType())); + Value *V = + SimplifyCmpInst(Predicate, E->getOperand(0), E->getOperand(1), SQ); + if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V)) + return SimplifiedE; } else if (isa<SelectInst>(I)) { if (isa<Constant>(E->getOperand(0)) || - (E->getOperand(1)->getType() == I->getOperand(1)->getType() && - E->getOperand(2)->getType() == I->getOperand(2)->getType())) { + E->getOperand(0) == E->getOperand(1)) { + assert(E->getOperand(1)->getType() == I->getOperand(1)->getType() && + E->getOperand(2)->getType() == I->getOperand(2)->getType()); Value *V = SimplifySelectInst(E->getOperand(0), E->getOperand(1), - E->getOperand(2), *DL, TLI, DT, AC); + E->getOperand(2), SQ); if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V)) return SimplifiedE; } } else if (I->isBinaryOp()) { - Value *V = SimplifyBinOp(E->getOpcode(), E->getOperand(0), E->getOperand(1), - *DL, TLI, DT, AC); + Value *V = + SimplifyBinOp(E->getOpcode(), E->getOperand(0), E->getOperand(1), SQ); if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V)) return SimplifiedE; } else if (auto *BI = dyn_cast<BitCastInst>(I)) { - Value *V = SimplifyInstruction(BI, *DL, TLI, DT, AC); + Value *V = + SimplifyCastInst(BI->getOpcode(), BI->getOperand(0), BI->getType(), SQ); if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V)) return SimplifiedE; } else if (isa<GetElementPtrInst>(I)) { - Value *V = SimplifyGEPInst(E->getType(), - ArrayRef<Value *>(E->op_begin(), E->op_end()), - *DL, TLI, DT, AC); + Value *V = SimplifyGEPInst( + E->getType(), ArrayRef<Value *>(E->op_begin(), E->op_end()), SQ); if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V)) return SimplifiedE; } else if (AllConstant) { @@ -615,7 +1051,7 @@ const Expression *NewGVN::createExpression(Instruction *I, for (Value *Arg : E->operands()) C.emplace_back(cast<Constant>(Arg)); - if (Value *V = ConstantFoldInstOperands(I, C, *DL, TLI)) + if (Value *V = ConstantFoldInstOperands(I, C, DL, TLI)) if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V)) return SimplifiedE; } @@ -623,18 +1059,18 @@ const Expression *NewGVN::createExpression(Instruction *I, } const AggregateValueExpression * -NewGVN::createAggregateValueExpression(Instruction *I, const BasicBlock *B) { +NewGVN::createAggregateValueExpression(Instruction *I) const { if (auto *II = dyn_cast<InsertValueInst>(I)) { auto *E = new (ExpressionAllocator) AggregateValueExpression(I->getNumOperands(), II->getNumIndices()); - setBasicExpressionInfo(I, E, B); + setBasicExpressionInfo(I, E); E->allocateIntOperands(ExpressionAllocator); std::copy(II->idx_begin(), II->idx_end(), int_op_inserter(E)); return E; } else if (auto *EI = dyn_cast<ExtractValueInst>(I)) { auto *E = new (ExpressionAllocator) AggregateValueExpression(I->getNumOperands(), EI->getNumIndices()); - setBasicExpressionInfo(EI, E, B); + 
setBasicExpressionInfo(EI, E); E->allocateIntOperands(ExpressionAllocator); std::copy(EI->idx_begin(), EI->idx_end(), int_op_inserter(E)); return E; @@ -642,67 +1078,120 @@ NewGVN::createAggregateValueExpression(Instruction *I, const BasicBlock *B) { llvm_unreachable("Unhandled type of aggregate value operation"); } -const VariableExpression *NewGVN::createVariableExpression(Value *V) { +const DeadExpression *NewGVN::createDeadExpression() const { + // DeadExpression has no arguments and all DeadExpression's are the same, + // so we only need one of them. + return SingletonDeadExpression; +} + +const VariableExpression *NewGVN::createVariableExpression(Value *V) const { auto *E = new (ExpressionAllocator) VariableExpression(V); E->setOpcode(V->getValueID()); return E; } -const Expression *NewGVN::createVariableOrConstant(Value *V, - const BasicBlock *B) { - auto Leader = lookupOperandLeader(V, nullptr, B); - if (auto *C = dyn_cast<Constant>(Leader)) +const Expression *NewGVN::createVariableOrConstant(Value *V) const { + if (auto *C = dyn_cast<Constant>(V)) return createConstantExpression(C); - return createVariableExpression(Leader); + return createVariableExpression(V); } -const ConstantExpression *NewGVN::createConstantExpression(Constant *C) { +const ConstantExpression *NewGVN::createConstantExpression(Constant *C) const { auto *E = new (ExpressionAllocator) ConstantExpression(C); E->setOpcode(C->getValueID()); return E; } -const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) { +const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) const { auto *E = new (ExpressionAllocator) UnknownExpression(I); E->setOpcode(I->getOpcode()); return E; } -const CallExpression *NewGVN::createCallExpression(CallInst *CI, - MemoryAccess *HV, - const BasicBlock *B) { +const CallExpression * +NewGVN::createCallExpression(CallInst *CI, const MemoryAccess *MA) const { // FIXME: Add operand bundles for calls. auto *E = - new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, HV); - setBasicExpressionInfo(CI, E, B); + new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, MA); + setBasicExpressionInfo(CI, E); return E; } +// Return true if some equivalent of instruction Inst dominates instruction U. +bool NewGVN::someEquivalentDominates(const Instruction *Inst, + const Instruction *U) const { + auto *CC = ValueToClass.lookup(Inst); + // This must be an instruction because we are only called from phi nodes + // in the case that the value it needs to check against is an instruction. + + // The most likely candiates for dominance are the leader and the next leader. + // The leader or nextleader will dominate in all cases where there is an + // equivalent that is higher up in the dom tree. + // We can't *only* check them, however, because the + // dominator tree could have an infinite number of non-dominating siblings + // with instructions that are in the right congruence class. + // A + // B C D E F G + // | + // H + // Instruction U could be in H, with equivalents in every other sibling. + // Depending on the rpo order picked, the leader could be the equivalent in + // any of these siblings. 
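// ---------------------------------------------------------------------------
// Illustrative aside (not part of this patch): a tiny worked version of the
// tree sketched above, using DFS intervals for dominance. It only shows why
// checking the leader and next leader is not enough: the leader can sit in a
// non-dominating sibling while some other class member does dominate, which is
// what the fallback scan over all members catches. Intervals are hand-picked
// and all names are hypothetical.
#include <cassert>

struct Interval { unsigned In, Out; };
static bool dominates(Interval A, Interval B) {
  return A.In <= B.In && A.Out >= B.Out;
}

int main() {
  // Root A; C and D are siblings under A; H is a child of C.
  Interval A{1, 14}, C{4, 7}, D{8, 9}, H{5, 6};
  // If the class leader lives in D and another member lives in A, the leader
  // does not dominate an instruction in H, but the member in A does.
  assert(!dominates(D, H));
  assert(dominates(A, H));
  assert(dominates(C, H));
}
// ---------------------------------------------------------------------------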
+ if (!CC) + return false; + if (DT->dominates(cast<Instruction>(CC->getLeader()), U)) + return true; + if (CC->getNextLeader().first && + DT->dominates(cast<Instruction>(CC->getNextLeader().first), U)) + return true; + return llvm::any_of(*CC, [&](const Value *Member) { + return Member != CC->getLeader() && + DT->dominates(cast<Instruction>(Member), U); + }); +} + // See if we have a congruence class and leader for this operand, and if so, // return it. Otherwise, return the operand itself. -template <class T> -Value *NewGVN::lookupOperandLeader(Value *V, const User *U, const T &B) const { +Value *NewGVN::lookupOperandLeader(Value *V) const { CongruenceClass *CC = ValueToClass.lookup(V); - if (CC && (CC != InitialClass)) - return CC->RepLeader; + if (CC) { + // Everything in TOP is represented by undef, as it can be any value. + // We do have to make sure we get the type right though, so we can't set the + // RepLeader to undef. + if (CC == TOPClass) + return UndefValue::get(V->getType()); + return CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader(); + } + return V; } -MemoryAccess *NewGVN::lookupMemoryAccessEquiv(MemoryAccess *MA) const { - MemoryAccess *Result = MemoryAccessEquiv.lookup(MA); - return Result ? Result : MA; +const MemoryAccess *NewGVN::lookupMemoryLeader(const MemoryAccess *MA) const { + auto *CC = getMemoryClass(MA); + assert(CC->getMemoryLeader() && + "Every MemoryAccess should be mapped to a congruence class with a " + "representative memory access"); + return CC->getMemoryLeader(); +} + +// Return true if the MemoryAccess is really equivalent to everything. This is +// equivalent to the lattice value "TOP" in most lattices. This is the initial +// state of all MemoryAccesses. +bool NewGVN::isMemoryAccessTOP(const MemoryAccess *MA) const { + return getMemoryClass(MA) == TOPClass; } LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp, - LoadInst *LI, MemoryAccess *DA, - const BasicBlock *B) { - auto *E = new (ExpressionAllocator) LoadExpression(1, LI, DA); + LoadInst *LI, + const MemoryAccess *MA) const { + auto *E = + new (ExpressionAllocator) LoadExpression(1, LI, lookupMemoryLeader(MA)); E->allocateOperands(ArgRecycler, ExpressionAllocator); E->setType(LoadType); // Give store and loads same opcode so they value number together. E->setOpcode(0); - E->op_push_back(lookupOperandLeader(PointerOp, LI, B)); + E->op_push_back(PointerOp); if (LI) E->setAlignment(LI->getAlignment()); @@ -712,17 +1201,17 @@ LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp, return E; } -const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI, - MemoryAccess *DA, - const BasicBlock *B) { - auto *E = - new (ExpressionAllocator) StoreExpression(SI->getNumOperands(), SI, DA); +const StoreExpression * +NewGVN::createStoreExpression(StoreInst *SI, const MemoryAccess *MA) const { + auto *StoredValueLeader = lookupOperandLeader(SI->getValueOperand()); + auto *E = new (ExpressionAllocator) + StoreExpression(SI->getNumOperands(), SI, StoredValueLeader, MA); E->allocateOperands(ArgRecycler, ExpressionAllocator); E->setType(SI->getValueOperand()->getType()); // Give store and loads same opcode so they value number together. E->setOpcode(0); - E->op_push_back(lookupOperandLeader(SI->getPointerOperand(), SI, B)); + E->op_push_back(lookupOperandLeader(SI->getPointerOperand())); // TODO: Value number heap versions. 
We may be able to discover // things alias analysis can't on it's own (IE that a store and a @@ -730,44 +1219,136 @@ const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI, return E; } -// Utility function to check whether the congruence class has a member other -// than the given instruction. -bool hasMemberOtherThanUs(const CongruenceClass *CC, Instruction *I) { - // Either it has more than one store, in which case it must contain something - // other than us (because it's indexed by value), or if it only has one store - // right now, that member should not be us. - return CC->StoreCount > 1 || CC->Members.count(I) == 0; -} - -const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I, - const BasicBlock *B) { +const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const { // Unlike loads, we never try to eliminate stores, so we do not check if they // are simple and avoid value numbering them. auto *SI = cast<StoreInst>(I); - MemoryAccess *StoreAccess = MSSA->getMemoryAccess(SI); - // See if we are defined by a previous store expression, it already has a - // value, and it's the same value as our current store. FIXME: Right now, we - // only do this for simple stores, we should expand to cover memcpys, etc. + auto *StoreAccess = getMemoryAccess(SI); + // Get the expression, if any, for the RHS of the MemoryDef. + const MemoryAccess *StoreRHS = StoreAccess->getDefiningAccess(); + if (EnableStoreRefinement) + StoreRHS = MSSAWalker->getClobberingMemoryAccess(StoreAccess); + // If we bypassed the use-def chains, make sure we add a use. + if (StoreRHS != StoreAccess->getDefiningAccess()) + addMemoryUsers(StoreRHS, StoreAccess); + StoreRHS = lookupMemoryLeader(StoreRHS); + // If we are defined by ourselves, use the live on entry def. + if (StoreRHS == StoreAccess) + StoreRHS = MSSA->getLiveOnEntryDef(); + if (SI->isSimple()) { - // Get the expression, if any, for the RHS of the MemoryDef. - MemoryAccess *StoreRHS = lookupMemoryAccessEquiv( - cast<MemoryDef>(StoreAccess)->getDefiningAccess()); - const Expression *OldStore = createStoreExpression(SI, StoreRHS, B); - CongruenceClass *CC = ExpressionToClass.lookup(OldStore); - // Basically, check if the congruence class the store is in is defined by a - // store that isn't us, and has the same value. MemorySSA takes care of - // ensuring the store has the same memory state as us already. - if (CC && CC->DefiningExpr && isa<StoreExpression>(CC->DefiningExpr) && - CC->RepLeader == lookupOperandLeader(SI->getValueOperand(), SI, B) && - hasMemberOtherThanUs(CC, I)) - return createStoreExpression(SI, StoreRHS, B); + // See if we are defined by a previous store expression, it already has a + // value, and it's the same value as our current store. FIXME: Right now, we + // only do this for simple stores, we should expand to cover memcpys, etc. + const auto *LastStore = createStoreExpression(SI, StoreRHS); + const auto *LastCC = ExpressionToClass.lookup(LastStore); + // We really want to check whether the expression we matched was a store. No + // easy way to do that. However, we can check that the class we found has a + // store, which, assuming the value numbering state is not corrupt, is + // sufficient, because we must also be equivalent to that store's expression + // for it to be in the same class as the load. 
+ if (LastCC && LastCC->getStoredValue() == LastStore->getStoredValue()) + return LastStore; + // Also check if our value operand is defined by a load of the same memory + // location, and the memory state is the same as it was then (otherwise, it + // could have been overwritten later. See test32 in + // transforms/DeadStoreElimination/simple.ll). + if (auto *LI = dyn_cast<LoadInst>(LastStore->getStoredValue())) + if ((lookupOperandLeader(LI->getPointerOperand()) == + LastStore->getOperand(0)) && + (lookupMemoryLeader(getMemoryAccess(LI)->getDefiningAccess()) == + StoreRHS)) + return LastStore; + deleteExpression(LastStore); + } + + // If the store is not equivalent to anything, value number it as a store that + // produces a unique memory state (instead of using it's MemoryUse, we use + // it's MemoryDef). + return createStoreExpression(SI, StoreAccess); +} + +// See if we can extract the value of a loaded pointer from a load, a store, or +// a memory instruction. +const Expression * +NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, + LoadInst *LI, Instruction *DepInst, + MemoryAccess *DefiningAccess) const { + assert((!LI || LI->isSimple()) && "Not a simple load"); + if (auto *DepSI = dyn_cast<StoreInst>(DepInst)) { + // Can't forward from non-atomic to atomic without violating memory model. + // Also don't need to coerce if they are the same type, we will just + // propogate.. + if (LI->isAtomic() > DepSI->isAtomic() || + LoadType == DepSI->getValueOperand()->getType()) + return nullptr; + int Offset = analyzeLoadFromClobberingStore(LoadType, LoadPtr, DepSI, DL); + if (Offset >= 0) { + if (auto *C = dyn_cast<Constant>( + lookupOperandLeader(DepSI->getValueOperand()))) { + DEBUG(dbgs() << "Coercing load from store " << *DepSI << " to constant " + << *C << "\n"); + return createConstantExpression( + getConstantStoreValueForLoad(C, Offset, LoadType, DL)); + } + } + + } else if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) { + // Can't forward from non-atomic to atomic without violating memory model. + if (LI->isAtomic() > DepLI->isAtomic()) + return nullptr; + int Offset = analyzeLoadFromClobberingLoad(LoadType, LoadPtr, DepLI, DL); + if (Offset >= 0) { + // We can coerce a constant load into a load + if (auto *C = dyn_cast<Constant>(lookupOperandLeader(DepLI))) + if (auto *PossibleConstant = + getConstantLoadValueForLoad(C, Offset, LoadType, DL)) { + DEBUG(dbgs() << "Coercing load from load " << *LI << " to constant " + << *PossibleConstant << "\n"); + return createConstantExpression(PossibleConstant); + } + } + + } else if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInst)) { + int Offset = analyzeLoadFromClobberingMemInst(LoadType, LoadPtr, DepMI, DL); + if (Offset >= 0) { + if (auto *PossibleConstant = + getConstantMemInstValueForLoad(DepMI, Offset, LoadType, DL)) { + DEBUG(dbgs() << "Coercing load from meminst " << *DepMI + << " to constant " << *PossibleConstant << "\n"); + return createConstantExpression(PossibleConstant); + } + } + } + + // All of the below are only true if the loaded pointer is produced + // by the dependent instruction. + if (LoadPtr != lookupOperandLeader(DepInst) && + !AA->isMustAlias(LoadPtr, DepInst)) + return nullptr; + // If this load really doesn't depend on anything, then we must be loading an + // undef value. This can happen when loading for a fresh allocation with no + // intervening stores, for example. Note that this is only true in the case + // that the result of the allocation is pointer equal to the load ptr. 
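// ---------------------------------------------------------------------------
// Illustrative aside (not part of this patch): a much-simplified model of the
// coercion step above. When a load is clobbered by a store of a known constant
// and the analysis reports a byte offset, the loaded value can be recomputed
// from the stored bytes. This toy version only handles a 32-bit load from a
// 64-bit store on a little-endian host; the real helpers are type- and
// endianness-aware, and the function name here is hypothetical.
#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t loadFromStoredConstant(uint64_t StoredValue,
                                       unsigned ByteOffset) {
  unsigned char Bytes[sizeof(uint64_t)];
  std::memcpy(Bytes, &StoredValue, sizeof(Bytes)); // assumes little-endian
  uint32_t Result;
  assert(ByteOffset + sizeof(Result) <= sizeof(Bytes) && "load past the store");
  std::memcpy(&Result, Bytes + ByteOffset, sizeof(Result));
  return Result;
}

int main() {
  // A 64-bit store of 0x0123456789abcdef followed by a 32-bit load at byte
  // offset 4 observes the high half of the constant.
  assert(loadFromStoredConstant(0x0123456789abcdefULL, 4) == 0x01234567u);
}
// ---------------------------------------------------------------------------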
+ if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI)) { + return createConstantExpression(UndefValue::get(LoadType)); + } + // If this load occurs either right after a lifetime begin, + // then the loaded value is undefined. + else if (auto *II = dyn_cast<IntrinsicInst>(DepInst)) { + if (II->getIntrinsicID() == Intrinsic::lifetime_start) + return createConstantExpression(UndefValue::get(LoadType)); + } + // If this load follows a calloc (which zero initializes memory), + // then the loaded value is zero + else if (isCallocLikeFn(DepInst, TLI)) { + return createConstantExpression(Constant::getNullValue(LoadType)); } - return createStoreExpression(SI, StoreAccess, B); + return nullptr; } -const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I, - const BasicBlock *B) { +const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const { auto *LI = cast<LoadInst>(I); // We can eliminate in favor of non-simple loads, but we won't be able to @@ -775,12 +1356,13 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I, if (!LI->isSimple()) return nullptr; - Value *LoadAddressLeader = lookupOperandLeader(LI->getPointerOperand(), I, B); + Value *LoadAddressLeader = lookupOperandLeader(LI->getPointerOperand()); // Load of undef is undef. if (isa<UndefValue>(LoadAddressLeader)) return createConstantExpression(UndefValue::get(LI->getType())); - - MemoryAccess *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(I); + MemoryAccess *OriginalAccess = getMemoryAccess(I); + MemoryAccess *DefiningAccess = + MSSAWalker->getClobberingMemoryAccess(OriginalAccess); if (!MSSA->isLiveOnEntryDef(DefiningAccess)) { if (auto *MD = dyn_cast<MemoryDef>(DefiningAccess)) { @@ -788,88 +1370,263 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I, // If the defining instruction is not reachable, replace with undef. if (!ReachableBlocks.count(DefiningInst->getParent())) return createConstantExpression(UndefValue::get(LI->getType())); + // This will handle stores and memory insts. We only do if it the + // defining access has a different type, or it is a pointer produced by + // certain memory operations that cause the memory to have a fixed value + // (IE things like calloc). + if (const auto *CoercionResult = + performSymbolicLoadCoercion(LI->getType(), LoadAddressLeader, LI, + DefiningInst, DefiningAccess)) + return CoercionResult; } } - const Expression *E = - createLoadExpression(LI->getType(), LI->getPointerOperand(), LI, - lookupMemoryAccessEquiv(DefiningAccess), B); + const Expression *E = createLoadExpression(LI->getType(), LoadAddressLeader, + LI, DefiningAccess); return E; } +const Expression * +NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const { + auto *PI = PredInfo->getPredicateInfoFor(I); + if (!PI) + return nullptr; + + DEBUG(dbgs() << "Found predicate info from instruction !\n"); + + auto *PWC = dyn_cast<PredicateWithCondition>(PI); + if (!PWC) + return nullptr; + + auto *CopyOf = I->getOperand(0); + auto *Cond = PWC->Condition; + + // If this a copy of the condition, it must be either true or false depending + // on the predicate info type and edge + if (CopyOf == Cond) { + // We should not need to add predicate users because the predicate info is + // already a use of this operand. 
+ if (isa<PredicateAssume>(PI)) + return createConstantExpression(ConstantInt::getTrue(Cond->getType())); + if (auto *PBranch = dyn_cast<PredicateBranch>(PI)) { + if (PBranch->TrueEdge) + return createConstantExpression(ConstantInt::getTrue(Cond->getType())); + return createConstantExpression(ConstantInt::getFalse(Cond->getType())); + } + if (auto *PSwitch = dyn_cast<PredicateSwitch>(PI)) + return createConstantExpression(cast<Constant>(PSwitch->CaseValue)); + } + + // Not a copy of the condition, so see what the predicates tell us about this + // value. First, though, we check to make sure the value is actually a copy + // of one of the condition operands. It's possible, in certain cases, for it + // to be a copy of a predicateinfo copy. In particular, if two branch + // operations use the same condition, and one branch dominates the other, we + // will end up with a copy of a copy. This is currently a small deficiency in + // predicateinfo. What will end up happening here is that we will value + // number both copies the same anyway. + + // Everything below relies on the condition being a comparison. + auto *Cmp = dyn_cast<CmpInst>(Cond); + if (!Cmp) + return nullptr; + + if (CopyOf != Cmp->getOperand(0) && CopyOf != Cmp->getOperand(1)) { + DEBUG(dbgs() << "Copy is not of any condition operands!\n"); + return nullptr; + } + Value *FirstOp = lookupOperandLeader(Cmp->getOperand(0)); + Value *SecondOp = lookupOperandLeader(Cmp->getOperand(1)); + bool SwappedOps = false; + // Sort the ops + if (shouldSwapOperands(FirstOp, SecondOp)) { + std::swap(FirstOp, SecondOp); + SwappedOps = true; + } + CmpInst::Predicate Predicate = + SwappedOps ? Cmp->getSwappedPredicate() : Cmp->getPredicate(); + + if (isa<PredicateAssume>(PI)) { + // If the comparison is true when the operands are equal, then we know the + // operands are equal, because assumes must always be true. + if (CmpInst::isTrueWhenEqual(Predicate)) { + addPredicateUsers(PI, I); + addAdditionalUsers(Cmp->getOperand(0), I); + return createVariableOrConstant(FirstOp); + } + } + if (const auto *PBranch = dyn_cast<PredicateBranch>(PI)) { + // If we are *not* a copy of the comparison, we may equal to the other + // operand when the predicate implies something about equality of + // operations. In particular, if the comparison is true/false when the + // operands are equal, and we are on the right edge, we know this operation + // is equal to something. + if ((PBranch->TrueEdge && Predicate == CmpInst::ICMP_EQ) || + (!PBranch->TrueEdge && Predicate == CmpInst::ICMP_NE)) { + addPredicateUsers(PI, I); + addAdditionalUsers(Cmp->getOperand(0), I); + return createVariableOrConstant(FirstOp); + } + // Handle the special case of floating point. + if (((PBranch->TrueEdge && Predicate == CmpInst::FCMP_OEQ) || + (!PBranch->TrueEdge && Predicate == CmpInst::FCMP_UNE)) && + isa<ConstantFP>(FirstOp) && !cast<ConstantFP>(FirstOp)->isZero()) { + addPredicateUsers(PI, I); + addAdditionalUsers(Cmp->getOperand(0), I); + return createConstantExpression(cast<Constant>(FirstOp)); + } + } + return nullptr; +} + // Evaluate read only and pure calls, and create an expression result. 
-const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I, - const BasicBlock *B) { +const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const { auto *CI = cast<CallInst>(I); - if (AA->doesNotAccessMemory(CI)) - return createCallExpression(CI, nullptr, B); - if (AA->onlyReadsMemory(CI)) { + if (auto *II = dyn_cast<IntrinsicInst>(I)) { + // Instrinsics with the returned attribute are copies of arguments. + if (auto *ReturnedValue = II->getReturnedArgOperand()) { + if (II->getIntrinsicID() == Intrinsic::ssa_copy) + if (const auto *Result = performSymbolicPredicateInfoEvaluation(I)) + return Result; + return createVariableOrConstant(ReturnedValue); + } + } + if (AA->doesNotAccessMemory(CI)) { + return createCallExpression(CI, TOPClass->getMemoryLeader()); + } else if (AA->onlyReadsMemory(CI)) { MemoryAccess *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(CI); - return createCallExpression(CI, lookupMemoryAccessEquiv(DefiningAccess), B); + return createCallExpression(CI, DefiningAccess); } return nullptr; } -// Update the memory access equivalence table to say that From is equal to To, +// Retrieve the memory class for a given MemoryAccess. +CongruenceClass *NewGVN::getMemoryClass(const MemoryAccess *MA) const { + + auto *Result = MemoryAccessToClass.lookup(MA); + assert(Result && "Should have found memory class"); + return Result; +} + +// Update the MemoryAccess equivalence table to say that From is equal to To, // and return true if this is different from what already existed in the table. -bool NewGVN::setMemoryAccessEquivTo(MemoryAccess *From, MemoryAccess *To) { - DEBUG(dbgs() << "Setting " << *From << " equivalent to "); - if (!To) - DEBUG(dbgs() << "itself"); - else - DEBUG(dbgs() << *To); - DEBUG(dbgs() << "\n"); - auto LookupResult = MemoryAccessEquiv.find(From); +bool NewGVN::setMemoryClass(const MemoryAccess *From, + CongruenceClass *NewClass) { + assert(NewClass && + "Every MemoryAccess should be getting mapped to a non-null class"); + DEBUG(dbgs() << "Setting " << *From); + DEBUG(dbgs() << " equivalent to congruence class "); + DEBUG(dbgs() << NewClass->getID() << " with current MemoryAccess leader "); + DEBUG(dbgs() << *NewClass->getMemoryLeader() << "\n"); + + auto LookupResult = MemoryAccessToClass.find(From); bool Changed = false; // If it's already in the table, see if the value changed. - if (LookupResult != MemoryAccessEquiv.end()) { - if (To && LookupResult->second != To) { + if (LookupResult != MemoryAccessToClass.end()) { + auto *OldClass = LookupResult->second; + if (OldClass != NewClass) { + // If this is a phi, we have to handle memory member updates. + if (auto *MP = dyn_cast<MemoryPhi>(From)) { + OldClass->memory_erase(MP); + NewClass->memory_insert(MP); + // This may have killed the class if it had no non-memory members + if (OldClass->getMemoryLeader() == From) { + if (OldClass->definesNoMemory()) { + OldClass->setMemoryLeader(nullptr); + } else { + OldClass->setMemoryLeader(getNextMemoryLeader(OldClass)); + DEBUG(dbgs() << "Memory class leader change for class " + << OldClass->getID() << " to " + << *OldClass->getMemoryLeader() + << " due to removal of a memory member " << *From + << "\n"); + markMemoryLeaderChangeTouched(OldClass); + } + } + } // It wasn't equivalent before, and now it is. - LookupResult->second = To; - Changed = true; - } else if (!To) { - // It used to be equivalent to something, and now it's not. 
- MemoryAccessEquiv.erase(LookupResult); + LookupResult->second = NewClass; Changed = true; } - } else { - assert(!To && - "Memory equivalence should never change from nothing to something"); } return Changed; } + +// Determine if a instruction is cycle-free. That means the values in the +// instruction don't depend on any expressions that can change value as a result +// of the instruction. For example, a non-cycle free instruction would be v = +// phi(0, v+1). +bool NewGVN::isCycleFree(const Instruction *I) const { + // In order to compute cycle-freeness, we do SCC finding on the instruction, + // and see what kind of SCC it ends up in. If it is a singleton, it is + // cycle-free. If it is not in a singleton, it is only cycle free if the + // other members are all phi nodes (as they do not compute anything, they are + // copies). + auto ICS = InstCycleState.lookup(I); + if (ICS == ICS_Unknown) { + SCCFinder.Start(I); + auto &SCC = SCCFinder.getComponentFor(I); + // It's cycle free if it's size 1 or or the SCC is *only* phi nodes. + if (SCC.size() == 1) + InstCycleState.insert({I, ICS_CycleFree}); + else { + bool AllPhis = + llvm::all_of(SCC, [](const Value *V) { return isa<PHINode>(V); }); + ICS = AllPhis ? ICS_CycleFree : ICS_Cycle; + for (auto *Member : SCC) + if (auto *MemberPhi = dyn_cast<PHINode>(Member)) + InstCycleState.insert({MemberPhi, ICS}); + } + } + if (ICS == ICS_Cycle) + return false; + return true; +} + // Evaluate PHI nodes symbolically, and create an expression result. -const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I, - const BasicBlock *B) { - auto *E = cast<PHIExpression>(createPHIExpression(I)); +const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const { + // True if one of the incoming phi edges is a backedge. + bool HasBackedge = false; + // All constant tracks the state of whether all the *original* phi operands + // This is really shorthand for "this phi cannot cycle due to forward + // change in value of the phi is guaranteed not to later change the value of + // the phi. IE it can't be v = phi(undef, v+1) + bool AllConstant = true; + auto *E = + cast<PHIExpression>(createPHIExpression(I, HasBackedge, AllConstant)); // We match the semantics of SimplifyPhiNode from InstructionSimplify here. - - // See if all arguaments are the same. + // See if all arguments are the same. // We track if any were undef because they need special handling. bool HasUndef = false; - auto Filtered = make_filter_range(E->operands(), [&](const Value *Arg) { - if (Arg == I) - return false; + auto Filtered = make_filter_range(E->operands(), [&](Value *Arg) { if (isa<UndefValue>(Arg)) { HasUndef = true; return false; } return true; }); - // If we are left with no operands, it's undef + // If we are left with no operands, it's dead. if (Filtered.begin() == Filtered.end()) { - DEBUG(dbgs() << "Simplified PHI node " << *I << " to undef" - << "\n"); - E->deallocateOperands(ArgRecycler); - ExpressionAllocator.Deallocate(E); - return createConstantExpression(UndefValue::get(I->getType())); + // If it has undef at this point, it means there are no-non-undef arguments, + // and thus, the value of the phi node must be undef. 
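// ---------------------------------------------------------------------------
// Illustrative aside (not part of this patch): a self-contained sketch of the
// cycle-freeness test described above. The def-use dependences of an
// instruction are treated as a graph, its strongly connected component is
// computed, and the instruction counts as cycle-free when that component is a
// singleton or contains only phis (e.g. v = phi(0, v+1) is *not* cycle-free
// because its SCC also contains the add). std:: containers and hypothetical
// names throughout.
#include <algorithm>
#include <cassert>
#include <functional>
#include <vector>

// An edge U -> V means instruction U uses the value defined by V.
struct ToyGraph {
  std::vector<std::vector<int>> Uses;
  std::vector<bool> IsPhi;
};

// Tarjan's SCC over the def-use graph; fills Comps and returns, per node, the
// index of the component it belongs to.
static std::vector<int> findSCCs(const ToyGraph &G,
                                 std::vector<std::vector<int>> &Comps) {
  int N = static_cast<int>(G.Uses.size()), Counter = 0;
  std::vector<int> Index(N, -1), Low(N, 0), Comp(N, -1), Stack;
  std::vector<bool> OnStack(N, false);
  std::function<void(int)> Visit = [&](int U) {
    Index[U] = Low[U] = Counter++;
    Stack.push_back(U);
    OnStack[U] = true;
    for (int V : G.Uses[U]) {
      if (Index[V] < 0) {
        Visit(V);
        Low[U] = std::min(Low[U], Low[V]);
      } else if (OnStack[V]) {
        Low[U] = std::min(Low[U], Index[V]);
      }
    }
    if (Low[U] == Index[U]) { // U is the root of a component
      Comps.emplace_back();
      int V;
      do {
        V = Stack.back();
        Stack.pop_back();
        OnStack[V] = false;
        Comp[V] = static_cast<int>(Comps.size()) - 1;
        Comps.back().push_back(V);
      } while (V != U);
    }
  };
  for (int U = 0; U < N; ++U)
    if (Index[U] < 0)
      Visit(U);
  return Comp;
}

// Cycle-free in the sense used above: singleton SCC, or an SCC of only phis.
static bool isCycleFreeToy(const ToyGraph &G, int I) {
  std::vector<std::vector<int>> Comps;
  std::vector<int> Comp = findSCCs(G, Comps);
  const std::vector<int> &SCC = Comps[Comp[I]];
  return SCC.size() == 1 ||
         std::all_of(SCC.begin(), SCC.end(),
                     [&](int V) { return G.IsPhi[V]; });
}

int main() {
  // Node 0: v = phi(0, t); node 1: t = add(v, 1); node 2: an unrelated value.
  ToyGraph G;
  G.Uses = {{1}, {0}, {}};
  G.IsPhi = {true, false, false};
  assert(!isCycleFreeToy(G, 0)); // v cycles through a non-phi (the add)
  assert(isCycleFreeToy(G, 2));  // singleton SCC => cycle-free
}
// ---------------------------------------------------------------------------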
+ if (HasUndef) { + DEBUG(dbgs() << "PHI Node " << *I + << " has no non-undef arguments, valuing it as undef\n"); + return createConstantExpression(UndefValue::get(I->getType())); + } + + DEBUG(dbgs() << "No arguments of PHI node " << *I << " are live\n"); + deleteExpression(E); + return createDeadExpression(); } + unsigned NumOps = 0; Value *AllSameValue = *(Filtered.begin()); ++Filtered.begin(); // Can't use std::equal here, sadly, because filter.begin moves. - if (llvm::all_of(Filtered, [AllSameValue](const Value *V) { - return V == AllSameValue; + if (llvm::all_of(Filtered, [&](Value *Arg) { + ++NumOps; + return Arg == AllSameValue; })) { // In LLVM's non-standard representation of phi nodes, it's possible to have // phi nodes with cycles (IE dependent on other phis that are .... dependent @@ -881,27 +1638,38 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I, // We also special case undef, so that if we have an undef, we can't use the // common value unless it dominates the phi block. if (HasUndef) { + // If we have undef and at least one other value, this is really a + // multivalued phi, and we need to know if it's cycle free in order to + // evaluate whether we can ignore the undef. The other parts of this are + // just shortcuts. If there is no backedge, or all operands are + // constants, or all operands are ignored but the undef, it also must be + // cycle free. + if (!AllConstant && HasBackedge && NumOps > 0 && + !isa<UndefValue>(AllSameValue) && !isCycleFree(I)) + return E; + // Only have to check for instructions if (auto *AllSameInst = dyn_cast<Instruction>(AllSameValue)) - if (!DT->dominates(AllSameInst, I)) + if (!someEquivalentDominates(AllSameInst, I)) return E; } - + // Can't simplify to something that comes later in the iteration. + // Otherwise, when and if it changes congruence class, we will never catch + // up. We will always be a class behind it. + if (isa<Instruction>(AllSameValue) && + InstrToDFSNum(AllSameValue) > InstrToDFSNum(I)) + return E; NumGVNPhisAllSame++; DEBUG(dbgs() << "Simplified PHI node " << *I << " to " << *AllSameValue << "\n"); - E->deallocateOperands(ArgRecycler); - ExpressionAllocator.Deallocate(E); - if (auto *C = dyn_cast<Constant>(AllSameValue)) - return createConstantExpression(C); - return createVariableExpression(AllSameValue); + deleteExpression(E); + return createVariableOrConstant(AllSameValue); } return E; } const Expression * -NewGVN::performSymbolicAggrValueEvaluation(Instruction *I, - const BasicBlock *B) { +NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) const { if (auto *EI = dyn_cast<ExtractValueInst>(I)) { auto *II = dyn_cast<IntrinsicInst>(EI->getAggregateOperand()); if (II && EI->getNumIndices() == 1 && *EI->idx_begin() == 0) { @@ -931,19 +1699,140 @@ NewGVN::performSymbolicAggrValueEvaluation(Instruction *I, // expression. assert(II->getNumArgOperands() == 2 && "Expect two args for recognised intrinsics."); - return createBinaryExpression(Opcode, EI->getType(), - II->getArgOperand(0), - II->getArgOperand(1), B); + return createBinaryExpression( + Opcode, EI->getType(), II->getArgOperand(0), II->getArgOperand(1)); } } } - return createAggregateValueExpression(I, B); + return createAggregateValueExpression(I); +} +const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const { + auto *CI = dyn_cast<CmpInst>(I); + // See if our operands are equal to those of a previous predicate, and if so, + // if it implies true or false. 
+ auto Op0 = lookupOperandLeader(CI->getOperand(0)); + auto Op1 = lookupOperandLeader(CI->getOperand(1)); + auto OurPredicate = CI->getPredicate(); + if (shouldSwapOperands(Op0, Op1)) { + std::swap(Op0, Op1); + OurPredicate = CI->getSwappedPredicate(); + } + + // Avoid processing the same info twice + const PredicateBase *LastPredInfo = nullptr; + // See if we know something about the comparison itself, like it is the target + // of an assume. + auto *CmpPI = PredInfo->getPredicateInfoFor(I); + if (dyn_cast_or_null<PredicateAssume>(CmpPI)) + return createConstantExpression(ConstantInt::getTrue(CI->getType())); + + if (Op0 == Op1) { + // This condition does not depend on predicates, no need to add users + if (CI->isTrueWhenEqual()) + return createConstantExpression(ConstantInt::getTrue(CI->getType())); + else if (CI->isFalseWhenEqual()) + return createConstantExpression(ConstantInt::getFalse(CI->getType())); + } + + // NOTE: Because we are comparing both operands here and below, and using + // previous comparisons, we rely on fact that predicateinfo knows to mark + // comparisons that use renamed operands as users of the earlier comparisons. + // It is *not* enough to just mark predicateinfo renamed operands as users of + // the earlier comparisons, because the *other* operand may have changed in a + // previous iteration. + // Example: + // icmp slt %a, %b + // %b.0 = ssa.copy(%b) + // false branch: + // icmp slt %c, %b.0 + + // %c and %a may start out equal, and thus, the code below will say the second + // %icmp is false. c may become equal to something else, and in that case the + // %second icmp *must* be reexamined, but would not if only the renamed + // %operands are considered users of the icmp. + + // *Currently* we only check one level of comparisons back, and only mark one + // level back as touched when changes appen . If you modify this code to look + // back farther through comparisons, you *must* mark the appropriate + // comparisons as users in PredicateInfo.cpp, or you will cause bugs. See if + // we know something just from the operands themselves + + // See if our operands have predicate info, so that we may be able to derive + // something from a previous comparison. + for (const auto &Op : CI->operands()) { + auto *PI = PredInfo->getPredicateInfoFor(Op); + if (const auto *PBranch = dyn_cast_or_null<PredicateBranch>(PI)) { + if (PI == LastPredInfo) + continue; + LastPredInfo = PI; + + // TODO: Along the false edge, we may know more things too, like icmp of + // same operands is false. + // TODO: We only handle actual comparison conditions below, not and/or. + auto *BranchCond = dyn_cast<CmpInst>(PBranch->Condition); + if (!BranchCond) + continue; + auto *BranchOp0 = lookupOperandLeader(BranchCond->getOperand(0)); + auto *BranchOp1 = lookupOperandLeader(BranchCond->getOperand(1)); + auto BranchPredicate = BranchCond->getPredicate(); + if (shouldSwapOperands(BranchOp0, BranchOp1)) { + std::swap(BranchOp0, BranchOp1); + BranchPredicate = BranchCond->getSwappedPredicate(); + } + if (BranchOp0 == Op0 && BranchOp1 == Op1) { + if (PBranch->TrueEdge) { + // If we know the previous predicate is true and we are in the true + // edge then we may be implied true or false. 
+ if (CmpInst::isImpliedTrueByMatchingCmp(BranchPredicate, + OurPredicate)) { + addPredicateUsers(PI, I); + return createConstantExpression( + ConstantInt::getTrue(CI->getType())); + } + + if (CmpInst::isImpliedFalseByMatchingCmp(BranchPredicate, + OurPredicate)) { + addPredicateUsers(PI, I); + return createConstantExpression( + ConstantInt::getFalse(CI->getType())); + } + + } else { + // Just handle the ne and eq cases, where if we have the same + // operands, we may know something. + if (BranchPredicate == OurPredicate) { + addPredicateUsers(PI, I); + // Same predicate, same ops,we know it was false, so this is false. + return createConstantExpression( + ConstantInt::getFalse(CI->getType())); + } else if (BranchPredicate == + CmpInst::getInversePredicate(OurPredicate)) { + addPredicateUsers(PI, I); + // Inverse predicate, we know the other was false, so this is true. + return createConstantExpression( + ConstantInt::getTrue(CI->getType())); + } + } + } + } + } + // Create expression will take care of simplifyCmpInst + return createExpression(I); +} + +// Return true if V is a value that will always be available (IE can +// be placed anywhere) in the function. We don't do globals here +// because they are often worse to put in place. +// TODO: Separate cost from availability +static bool alwaysAvailable(Value *V) { + return isa<Constant>(V) || isa<Argument>(V); } // Substitute and symbolize the value before value numbering. -const Expression *NewGVN::performSymbolicEvaluation(Value *V, - const BasicBlock *B) { +const Expression * +NewGVN::performSymbolicEvaluation(Value *V, + SmallPtrSetImpl<Value *> &Visited) const { const Expression *E = nullptr; if (auto *C = dyn_cast<Constant>(V)) E = createConstantExpression(C); @@ -957,24 +1846,27 @@ const Expression *NewGVN::performSymbolicEvaluation(Value *V, switch (I->getOpcode()) { case Instruction::ExtractValue: case Instruction::InsertValue: - E = performSymbolicAggrValueEvaluation(I, B); + E = performSymbolicAggrValueEvaluation(I); break; case Instruction::PHI: - E = performSymbolicPHIEvaluation(I, B); + E = performSymbolicPHIEvaluation(I); break; case Instruction::Call: - E = performSymbolicCallEvaluation(I, B); + E = performSymbolicCallEvaluation(I); break; case Instruction::Store: - E = performSymbolicStoreEvaluation(I, B); + E = performSymbolicStoreEvaluation(I); break; case Instruction::Load: - E = performSymbolicLoadEvaluation(I, B); + E = performSymbolicLoadEvaluation(I); break; case Instruction::BitCast: { - E = createExpression(I, B); + E = createExpression(I); + } break; + case Instruction::ICmp: + case Instruction::FCmp: { + E = performSymbolicCmpEvaluation(I); } break; - case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: @@ -993,8 +1885,6 @@ const Expression *NewGVN::performSymbolicEvaluation(Value *V, case Instruction::And: case Instruction::Or: case Instruction::Xor: - case Instruction::ICmp: - case Instruction::FCmp: case Instruction::Trunc: case Instruction::ZExt: case Instruction::SExt: @@ -1011,7 +1901,7 @@ const Expression *NewGVN::performSymbolicEvaluation(Value *V, case Instruction::InsertElement: case Instruction::ShuffleVector: case Instruction::GetElementPtr: - E = createExpression(I, B); + E = createExpression(I); break; default: return nullptr; @@ -1020,147 +1910,308 @@ const Expression *NewGVN::performSymbolicEvaluation(Value *V, return E; } -// There is an edge from 'Src' to 'Dst'. Return true if every path from -// the entry block to 'Dst' passes via this edge. 
In particular 'Dst' -// must not be reachable via another edge from 'Src'. -bool NewGVN::isOnlyReachableViaThisEdge(const BasicBlockEdge &E) const { +// Look up a container in a map, and then call a function for each thing in the +// found container. +template <typename Map, typename KeyType, typename Func> +void NewGVN::for_each_found(Map &M, const KeyType &Key, Func F) { + const auto Result = M.find_as(Key); + if (Result != M.end()) + for (typename Map::mapped_type::value_type Mapped : Result->second) + F(Mapped); +} + +// Look up a container of values/instructions in a map, and touch all the +// instructions in the container. Then erase value from the map. +template <typename Map, typename KeyType> +void NewGVN::touchAndErase(Map &M, const KeyType &Key) { + const auto Result = M.find_as(Key); + if (Result != M.end()) { + for (const typename Map::mapped_type::value_type Mapped : Result->second) + TouchedInstructions.set(InstrToDFSNum(Mapped)); + M.erase(Result); + } +} - // While in theory it is interesting to consider the case in which Dst has - // more than one predecessor, because Dst might be part of a loop which is - // only reachable from Src, in practice it is pointless since at the time - // GVN runs all such loops have preheaders, which means that Dst will have - // been changed to have only one predecessor, namely Src. - const BasicBlock *Pred = E.getEnd()->getSinglePredecessor(); - const BasicBlock *Src = E.getStart(); - assert((!Pred || Pred == Src) && "No edge between these basic blocks!"); - (void)Src; - return Pred != nullptr; +void NewGVN::addAdditionalUsers(Value *To, Value *User) const { + if (isa<Instruction>(To)) + AdditionalUsers[To].insert(User); } void NewGVN::markUsersTouched(Value *V) { // Now mark the users as touched. for (auto *User : V->users()) { assert(isa<Instruction>(User) && "Use of value not within an instruction?"); - TouchedInstructions.set(InstrDFS[User]); + TouchedInstructions.set(InstrToDFSNum(User)); } + touchAndErase(AdditionalUsers, V); } -void NewGVN::markMemoryUsersTouched(MemoryAccess *MA) { - for (auto U : MA->users()) { - if (auto *MUD = dyn_cast<MemoryUseOrDef>(U)) - TouchedInstructions.set(InstrDFS[MUD->getMemoryInst()]); - else - TouchedInstructions.set(InstrDFS[U]); - } +void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const { + DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n"); + MemoryToUsers[To].insert(U); +} + +void NewGVN::markMemoryDefTouched(const MemoryAccess *MA) { + TouchedInstructions.set(MemoryToDFSNum(MA)); +} + +void NewGVN::markMemoryUsersTouched(const MemoryAccess *MA) { + if (isa<MemoryUse>(MA)) + return; + for (auto U : MA->users()) + TouchedInstructions.set(MemoryToDFSNum(U)); + touchAndErase(MemoryToUsers, MA); +} + +// Add I to the set of users of a given predicate. +void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) const { + // Don't add temporary instructions to the user lists. + if (AllTempInstructions.count(I)) + return; + + if (auto *PBranch = dyn_cast<PredicateBranch>(PB)) + PredicateToUsers[PBranch->Condition].insert(I); + else if (auto *PAssume = dyn_cast<PredicateBranch>(PB)) + PredicateToUsers[PAssume->Condition].insert(I); +} + +// Touch all the predicates that depend on this instruction. +void NewGVN::markPredicateUsersTouched(Instruction *I) { + touchAndErase(PredicateToUsers, I); +} + +// Mark users affected by a memory leader change. 
+void NewGVN::markMemoryLeaderChangeTouched(CongruenceClass *CC) { + for (auto M : CC->memory()) + markMemoryDefTouched(M); } // Touch the instructions that need to be updated after a congruence class has a // leader change, and mark changed values. -void NewGVN::markLeaderChangeTouched(CongruenceClass *CC) { - for (auto M : CC->Members) { +void NewGVN::markValueLeaderChangeTouched(CongruenceClass *CC) { + for (auto M : *CC) { if (auto *I = dyn_cast<Instruction>(M)) - TouchedInstructions.set(InstrDFS[I]); + TouchedInstructions.set(InstrToDFSNum(I)); LeaderChanges.insert(M); } } +// Give a range of things that have instruction DFS numbers, this will return +// the member of the range with the smallest dfs number. +template <class T, class Range> +T *NewGVN::getMinDFSOfRange(const Range &R) const { + std::pair<T *, unsigned> MinDFS = {nullptr, ~0U}; + for (const auto X : R) { + auto DFSNum = InstrToDFSNum(X); + if (DFSNum < MinDFS.second) + MinDFS = {X, DFSNum}; + } + return MinDFS.first; +} + +// This function returns the MemoryAccess that should be the next leader of +// congruence class CC, under the assumption that the current leader is going to +// disappear. +const MemoryAccess *NewGVN::getNextMemoryLeader(CongruenceClass *CC) const { + // TODO: If this ends up to slow, we can maintain a next memory leader like we + // do for regular leaders. + // Make sure there will be a leader to find + assert(!CC->definesNoMemory() && "Can't get next leader if there is none"); + if (CC->getStoreCount() > 0) { + if (auto *NL = dyn_cast_or_null<StoreInst>(CC->getNextLeader().first)) + return getMemoryAccess(NL); + // Find the store with the minimum DFS number. + auto *V = getMinDFSOfRange<Value>(make_filter_range( + *CC, [&](const Value *V) { return isa<StoreInst>(V); })); + return getMemoryAccess(cast<StoreInst>(V)); + } + assert(CC->getStoreCount() == 0); + + // Given our assertion, hitting this part must mean + // !OldClass->memory_empty() + if (CC->memory_size() == 1) + return *CC->memory_begin(); + return getMinDFSOfRange<const MemoryPhi>(CC->memory()); +} + +// This function returns the next value leader of a congruence class, under the +// assumption that the current leader is going away. This should end up being +// the next most dominating member. +Value *NewGVN::getNextValueLeader(CongruenceClass *CC) const { + // We don't need to sort members if there is only 1, and we don't care about + // sorting the TOP class because everything either gets out of it or is + // unreachable. + + if (CC->size() == 1 || CC == TOPClass) { + return *(CC->begin()); + } else if (CC->getNextLeader().first) { + ++NumGVNAvoidedSortedLeaderChanges; + return CC->getNextLeader().first; + } else { + ++NumGVNSortedLeaderChanges; + // NOTE: If this ends up to slow, we can maintain a dual structure for + // member testing/insertion, or keep things mostly sorted, and sort only + // here, or use SparseBitVector or .... + return getMinDFSOfRange<Value>(*CC); + } +} + +// Move a MemoryAccess, currently in OldClass, to NewClass, including updates to +// the memory members, etc for the move. +// +// The invariants of this function are: +// +// - I must be moving to NewClass from OldClass +// - The StoreCount of OldClass and NewClass is expected to have been updated +// for I already if it is is a store. +// - The OldClass memory leader has not been updated yet if I was the leader. 
+void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I, + MemoryAccess *InstMA, + CongruenceClass *OldClass, + CongruenceClass *NewClass) { + // If the leader is I, and we had a represenative MemoryAccess, it should + // be the MemoryAccess of OldClass. + assert((!InstMA || !OldClass->getMemoryLeader() || + OldClass->getLeader() != I || + MemoryAccessToClass.lookup(OldClass->getMemoryLeader()) == + MemoryAccessToClass.lookup(InstMA)) && + "Representative MemoryAccess mismatch"); + // First, see what happens to the new class + if (!NewClass->getMemoryLeader()) { + // Should be a new class, or a store becoming a leader of a new class. + assert(NewClass->size() == 1 || + (isa<StoreInst>(I) && NewClass->getStoreCount() == 1)); + NewClass->setMemoryLeader(InstMA); + // Mark it touched if we didn't just create a singleton + DEBUG(dbgs() << "Memory class leader change for class " << NewClass->getID() + << " due to new memory instruction becoming leader\n"); + markMemoryLeaderChangeTouched(NewClass); + } + setMemoryClass(InstMA, NewClass); + // Now, fixup the old class if necessary + if (OldClass->getMemoryLeader() == InstMA) { + if (!OldClass->definesNoMemory()) { + OldClass->setMemoryLeader(getNextMemoryLeader(OldClass)); + DEBUG(dbgs() << "Memory class leader change for class " + << OldClass->getID() << " to " + << *OldClass->getMemoryLeader() + << " due to removal of old leader " << *InstMA << "\n"); + markMemoryLeaderChangeTouched(OldClass); + } else + OldClass->setMemoryLeader(nullptr); + } +} + // Move a value, currently in OldClass, to be part of NewClass -// Update OldClass for the move (including changing leaders, etc) -void NewGVN::moveValueToNewCongruenceClass(Instruction *I, +// Update OldClass and NewClass for the move (including changing leaders, etc). +void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E, CongruenceClass *OldClass, CongruenceClass *NewClass) { - DEBUG(dbgs() << "New congruence class for " << I << " is " << NewClass->ID - << "\n"); - - if (I == OldClass->NextLeader.first) - OldClass->NextLeader = {nullptr, ~0U}; - - // It's possible, though unlikely, for us to discover equivalences such - // that the current leader does not dominate the old one. - // This statistic tracks how often this happens. - // We assert on phi nodes when this happens, currently, for debugging, because - // we want to make sure we name phi node cycles properly. - if (isa<Instruction>(NewClass->RepLeader) && NewClass->RepLeader && - I != NewClass->RepLeader && - DT->properlyDominates( - I->getParent(), - cast<Instruction>(NewClass->RepLeader)->getParent())) { - ++NumGVNNotMostDominatingLeader; - assert(!isa<PHINode>(I) && - "New class for instruction should not be dominated by instruction"); - } - - if (NewClass->RepLeader != I) { - auto DFSNum = InstrDFS.lookup(I); - if (DFSNum < NewClass->NextLeader.second) - NewClass->NextLeader = {I, DFSNum}; - } - - OldClass->Members.erase(I); - NewClass->Members.insert(I); - if (isa<StoreInst>(I)) { - --OldClass->StoreCount; - assert(OldClass->StoreCount >= 0); - ++NewClass->StoreCount; - assert(NewClass->StoreCount > 0); + if (I == OldClass->getNextLeader().first) + OldClass->resetNextLeader(); + + OldClass->erase(I); + NewClass->insert(I); + + if (NewClass->getLeader() != I) + NewClass->addPossibleNextLeader({I, InstrToDFSNum(I)}); + // Handle our special casing of stores. + if (auto *SI = dyn_cast<StoreInst>(I)) { + OldClass->decStoreCount(); + // Okay, so when do we want to make a store a leader of a class? 
+ // If we have a store defined by an earlier load, we want the earlier load + // to lead the class. + // If we have a store defined by something else, we want the store to lead + // the class so everything else gets the "something else" as a value. + // If we have a store as the single member of the class, we want the store + // as the leader + if (NewClass->getStoreCount() == 0 && !NewClass->getStoredValue()) { + // If it's a store expression we are using, it means we are not equivalent + // to something earlier. + if (auto *SE = dyn_cast<StoreExpression>(E)) { + NewClass->setStoredValue(SE->getStoredValue()); + markValueLeaderChangeTouched(NewClass); + // Shift the new class leader to be the store + DEBUG(dbgs() << "Changing leader of congruence class " + << NewClass->getID() << " from " << *NewClass->getLeader() + << " to " << *SI << " because store joined class\n"); + // If we changed the leader, we have to mark it changed because we don't + // know what it will do to symbolic evaluation. + NewClass->setLeader(SI); + } + // We rely on the code below handling the MemoryAccess change. + } + NewClass->incStoreCount(); } + // True if there is no memory instructions left in a class that had memory + // instructions before. + // If it's not a memory use, set the MemoryAccess equivalence + auto *InstMA = dyn_cast_or_null<MemoryDef>(getMemoryAccess(I)); + if (InstMA) + moveMemoryToNewCongruenceClass(I, InstMA, OldClass, NewClass); ValueToClass[I] = NewClass; // See if we destroyed the class or need to swap leaders. - if (OldClass->Members.empty() && OldClass != InitialClass) { - if (OldClass->DefiningExpr) { - OldClass->Dead = true; - DEBUG(dbgs() << "Erasing expression " << OldClass->DefiningExpr + if (OldClass->empty() && OldClass != TOPClass) { + if (OldClass->getDefiningExpr()) { + DEBUG(dbgs() << "Erasing expression " << *OldClass->getDefiningExpr() << " from table\n"); - ExpressionToClass.erase(OldClass->DefiningExpr); + // We erase it as an exact expression to make sure we don't just erase an + // equivalent one. + auto Iter = ExpressionToClass.find_as( + ExactEqualsExpression(*OldClass->getDefiningExpr())); + if (Iter != ExpressionToClass.end()) + ExpressionToClass.erase(Iter); +#ifdef EXPENSIVE_CHECKS + assert( + (*OldClass->getDefiningExpr() != *E || ExpressionToClass.lookup(E)) && + "We erased the expression we just inserted, which should not happen"); +#endif } - } else if (OldClass->RepLeader == I) { + } else if (OldClass->getLeader() == I) { // When the leader changes, the value numbering of // everything may change due to symbolization changes, so we need to // reprocess. - DEBUG(dbgs() << "Leader change!\n"); + DEBUG(dbgs() << "Value class leader change for class " << OldClass->getID() + << "\n"); ++NumGVNLeaderChanges; - // We don't need to sort members if there is only 1, and we don't care about - // sorting the initial class because everything either gets out of it or is - // unreachable. - if (OldClass->Members.size() == 1 || OldClass == InitialClass) { - OldClass->RepLeader = *(OldClass->Members.begin()); - } else if (OldClass->NextLeader.first) { - ++NumGVNAvoidedSortedLeaderChanges; - OldClass->RepLeader = OldClass->NextLeader.first; - OldClass->NextLeader = {nullptr, ~0U}; - } else { - ++NumGVNSortedLeaderChanges; - // TODO: If this ends up to slow, we can maintain a dual structure for - // member testing/insertion, or keep things mostly sorted, and sort only - // here, or .... 
- std::pair<Value *, unsigned> MinDFS = {nullptr, ~0U}; - for (const auto X : OldClass->Members) { - auto DFSNum = InstrDFS.lookup(X); - if (DFSNum < MinDFS.second) - MinDFS = {X, DFSNum}; - } - OldClass->RepLeader = MinDFS.first; + // Destroy the stored value if there are no more stores to represent it. + // Note that this is basically clean up for the expression removal that + // happens below. If we remove stores from a class, we may leave it as a + // class of equivalent memory phis. + if (OldClass->getStoreCount() == 0) { + if (OldClass->getStoredValue()) + OldClass->setStoredValue(nullptr); } - markLeaderChangeTouched(OldClass); + OldClass->setLeader(getNextValueLeader(OldClass)); + OldClass->resetNextLeader(); + markValueLeaderChangeTouched(OldClass); } } +// For a given expression, mark the phi of ops instructions that could have +// changed as a result. +void NewGVN::markPhiOfOpsChanged(const Expression *E) { + touchAndErase(ExpressionToPhiOfOps, ExactEqualsExpression(*E)); +} + // Perform congruence finding on a given value numbering expression. void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) { - ValueToExpression[I] = E; // This is guaranteed to return something, since it will at least find - // INITIAL. + // TOP. - CongruenceClass *IClass = ValueToClass[I]; + CongruenceClass *IClass = ValueToClass.lookup(I); assert(IClass && "Should have found a IClass"); // Dead classes should have been eliminated from the mapping. - assert(!IClass->Dead && "Found a dead class"); + assert(!IClass->isDead() && "Found a dead class"); - CongruenceClass *EClass; + CongruenceClass *EClass = nullptr; if (const auto *VE = dyn_cast<VariableExpression>(E)) { - EClass = ValueToClass[VE->getVariableValue()]; - } else { + EClass = ValueToClass.lookup(VE->getVariableValue()); + } else if (isa<DeadExpression>(E)) { + EClass = TOPClass; + } + if (!EClass) { auto lookupResult = ExpressionToClass.insert({E, nullptr}); // If it's not in the value table, create a new congruence class. @@ -1171,80 +2222,73 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) { // Constants and variables should always be made the leader. if (const auto *CE = dyn_cast<ConstantExpression>(E)) { - NewClass->RepLeader = CE->getConstantValue(); + NewClass->setLeader(CE->getConstantValue()); } else if (const auto *SE = dyn_cast<StoreExpression>(E)) { StoreInst *SI = SE->getStoreInst(); - NewClass->RepLeader = - lookupOperandLeader(SI->getValueOperand(), SI, SI->getParent()); + NewClass->setLeader(SI); + NewClass->setStoredValue(SE->getStoredValue()); + // The RepMemoryAccess field will be filled in properly by the + // moveValueToNewCongruenceClass call. 
} else { - NewClass->RepLeader = I; + NewClass->setLeader(I); } assert(!isa<VariableExpression>(E) && "VariableExpression should have been handled already"); EClass = NewClass; DEBUG(dbgs() << "Created new congruence class for " << *I - << " using expression " << *E << " at " << NewClass->ID - << " and leader " << *(NewClass->RepLeader) << "\n"); - DEBUG(dbgs() << "Hash value was " << E->getHashValue() << "\n"); + << " using expression " << *E << " at " << NewClass->getID() + << " and leader " << *(NewClass->getLeader())); + if (NewClass->getStoredValue()) + DEBUG(dbgs() << " and stored value " << *(NewClass->getStoredValue())); + DEBUG(dbgs() << "\n"); } else { EClass = lookupResult.first->second; if (isa<ConstantExpression>(E)) - assert(isa<Constant>(EClass->RepLeader) && + assert((isa<Constant>(EClass->getLeader()) || + (EClass->getStoredValue() && + isa<Constant>(EClass->getStoredValue()))) && "Any class with a constant expression should have a " "constant leader"); assert(EClass && "Somehow don't have an eclass"); - assert(!EClass->Dead && "We accidentally looked up a dead class"); + assert(!EClass->isDead() && "We accidentally looked up a dead class"); } } bool ClassChanged = IClass != EClass; bool LeaderChanged = LeaderChanges.erase(I); if (ClassChanged || LeaderChanged) { - DEBUG(dbgs() << "Found class " << EClass->ID << " for expression " << E + DEBUG(dbgs() << "New class " << EClass->getID() << " for expression " << *E << "\n"); + if (ClassChanged) { + moveValueToNewCongruenceClass(I, E, IClass, EClass); + markPhiOfOpsChanged(E); + } - if (ClassChanged) - moveValueToNewCongruenceClass(I, IClass, EClass); markUsersTouched(I); - if (MemoryAccess *MA = MSSA->getMemoryAccess(I)) { - // If this is a MemoryDef, we need to update the equivalence table. If - // we determined the expression is congruent to a different memory - // state, use that different memory state. If we determined it didn't, - // we update that as well. Right now, we only support store - // expressions. - if (!isa<MemoryUse>(MA) && isa<StoreExpression>(E) && - EClass->Members.size() != 1) { - auto *DefAccess = cast<StoreExpression>(E)->getDefiningAccess(); - setMemoryAccessEquivTo(MA, DefAccess != MA ? DefAccess : nullptr); - } else { - setMemoryAccessEquivTo(MA, nullptr); - } + if (MemoryAccess *MA = getMemoryAccess(I)) markMemoryUsersTouched(MA); - } - } else if (auto *SI = dyn_cast<StoreInst>(I)) { - // There is, sadly, one complicating thing for stores. Stores do not - // produce values, only consume them. However, in order to make loads and - // stores value number the same, we ignore the value operand of the store. - // But the value operand will still be the leader of our class, and thus, it - // may change. Because the store is a use, the store will get reprocessed, - // but nothing will change about it, and so nothing above will catch it - // (since the class will not change). In order to make sure everything ends - // up okay, we need to recheck the leader of the class. Since stores of - // different values value number differently due to different memorydefs, we - // are guaranteed the leader is always the same between stores in the same - // class. 
- DEBUG(dbgs() << "Checking store leader\n"); - auto ProperLeader = - lookupOperandLeader(SI->getValueOperand(), SI, SI->getParent()); - if (EClass->RepLeader != ProperLeader) { - DEBUG(dbgs() << "Store leader changed, fixing\n"); - EClass->RepLeader = ProperLeader; - markLeaderChangeTouched(EClass); - markMemoryUsersTouched(MSSA->getMemoryAccess(SI)); + if (auto *CI = dyn_cast<CmpInst>(I)) + markPredicateUsersTouched(CI); + } + // If we changed the class of the store, we want to ensure nothing finds the + // old store expression. In particular, loads do not compare against stored + // value, so they will find old store expressions (and associated class + // mappings) if we leave them in the table. + if (ClassChanged && isa<StoreInst>(I)) { + auto *OldE = ValueToExpression.lookup(I); + // It could just be that the old class died. We don't want to erase it if we + // just moved classes. + if (OldE && isa<StoreExpression>(OldE) && *E != *OldE) { + // Erase this as an exact expression to ensure we don't erase expressions + // equivalent to it. + auto Iter = ExpressionToClass.find_as(ExactEqualsExpression(*OldE)); + if (Iter != ExpressionToClass.end()) + ExpressionToClass.erase(Iter); } } + ValueToExpression[I] = E; } // Process the fact that Edge (from, to) is reachable, including marking @@ -1266,25 +2310,26 @@ void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) { // impact predicates. Otherwise, only mark the phi nodes as touched, as // they are the only thing that depend on new edges. Anything using their // values will get propagated to if necessary. - if (MemoryAccess *MemPhi = MSSA->getMemoryAccess(To)) - TouchedInstructions.set(InstrDFS[MemPhi]); + if (MemoryAccess *MemPhi = getMemoryAccess(To)) + TouchedInstructions.set(InstrToDFSNum(MemPhi)); auto BI = To->begin(); while (isa<PHINode>(BI)) { - TouchedInstructions.set(InstrDFS[&*BI]); + TouchedInstructions.set(InstrToDFSNum(&*BI)); ++BI; } + for_each_found(PHIOfOpsPHIs, To, [&](const PHINode *I) { + TouchedInstructions.set(InstrToDFSNum(I)); + }); } } } // Given a predicate condition (from a switch, cmp, or whatever) and a block, // see if we know some constant value for it already. -Value *NewGVN::findConditionEquivalence(Value *Cond, BasicBlock *B) const { - auto Result = lookupOperandLeader(Cond, nullptr, B); - if (isa<Constant>(Result)) - return Result; - return nullptr; +Value *NewGVN::findConditionEquivalence(Value *Cond) const { + auto Result = lookupOperandLeader(Cond); + return isa<Constant>(Result) ? Result : nullptr; } // Process the outgoing edges of a block for reachability. 
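Before the processOutgoingEdges hunk below, a rough sketch of the reachability rule it implements (sketchProcessBranch, leaderOf, and markEdgeReachable are invented stand-ins for the pass's internals, assumed only for illustration): once the branch condition value-numbers to a constant, only the matching successor edge is treated as live.

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Casting.h"

// Sketch: a conditional branch whose condition has a constant leader
// contributes exactly one reachable outgoing edge.
static void sketchProcessBranch(
    llvm::BranchInst *BR,
    llvm::function_ref<llvm::Value *(llvm::Value *)> leaderOf,
    llvm::function_ref<void(llvm::BasicBlock *, llvm::BasicBlock *)>
        markEdgeReachable) {
  llvm::BasicBlock *B = BR->getParent();
  if (!BR->isConditional()) {
    markEdgeReachable(B, BR->getSuccessor(0));
    return;
  }
  auto *C =
      llvm::dyn_cast_or_null<llvm::ConstantInt>(leaderOf(BR->getCondition()));
  if (C) {
    // Successor 0 is the true destination, successor 1 the false one.
    markEdgeReachable(B, C->isOne() ? BR->getSuccessor(0) : BR->getSuccessor(1));
    return;
  }
  // Condition unknown: conservatively both edges stay reachable.
  markEdgeReachable(B, BR->getSuccessor(0));
  markEdgeReachable(B, BR->getSuccessor(1));
}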
@@ -1293,10 +2338,10 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) { BranchInst *BR; if ((BR = dyn_cast<BranchInst>(TI)) && BR->isConditional()) { Value *Cond = BR->getCondition(); - Value *CondEvaluated = findConditionEquivalence(Cond, B); + Value *CondEvaluated = findConditionEquivalence(Cond); if (!CondEvaluated) { if (auto *I = dyn_cast<Instruction>(Cond)) { - const Expression *E = createExpression(I, B); + const Expression *E = createExpression(I); if (const auto *CE = dyn_cast<ConstantExpression>(E)) { CondEvaluated = CE->getConstantValue(); } @@ -1329,13 +2374,13 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) { SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges; Value *SwitchCond = SI->getCondition(); - Value *CondEvaluated = findConditionEquivalence(SwitchCond, B); + Value *CondEvaluated = findConditionEquivalence(SwitchCond); // See if we were able to turn this switch statement into a constant. if (CondEvaluated && isa<ConstantInt>(CondEvaluated)) { auto *CondVal = cast<ConstantInt>(CondEvaluated); // We should be able to get case value for this. - auto CaseVal = SI->findCaseValue(CondVal); - if (CaseVal.getCaseSuccessor() == SI->getDefaultDest()) { + auto Case = *SI->findCaseValue(CondVal); + if (Case.getCaseSuccessor() == SI->getDefaultDest()) { // We proved the value is outside of the range of the case. // We can't do anything other than mark the default dest as reachable, // and go home. @@ -1343,7 +2388,7 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) { return; } // Now get where it goes and mark it reachable. - BasicBlock *TargetBlock = CaseVal.getCaseSuccessor(); + BasicBlock *TargetBlock = Case.getCaseSuccessor(); updateReachableEdge(B, TargetBlock); } else { for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) { @@ -1361,45 +2406,215 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) { } // This also may be a memory defining terminator, in which case, set it - // equivalent to nothing. - if (MemoryAccess *MA = MSSA->getMemoryAccess(TI)) - setMemoryAccessEquivTo(MA, nullptr); + // equivalent only to itself. + // + auto *MA = getMemoryAccess(TI); + if (MA && !isa<MemoryUse>(MA)) { + auto *CC = ensureLeaderOfMemoryClass(MA); + if (setMemoryClass(MA, CC)) + markMemoryUsersTouched(MA); + } } } -// The algorithm initially places the values of the routine in the INITIAL -// congruence -// class. The leader of INITIAL is the undetermined value `TOP`. -// When the algorithm has finished, values still in INITIAL are unreachable. +void NewGVN::addPhiOfOps(PHINode *Op, BasicBlock *BB, + Instruction *ExistingValue) { + InstrDFS[Op] = InstrToDFSNum(ExistingValue); + AllTempInstructions.insert(Op); + PHIOfOpsPHIs[BB].push_back(Op); + TempToBlock[Op] = BB; + RealToTemp[ExistingValue] = Op; +} + +static bool okayForPHIOfOps(const Instruction *I) { + return isa<BinaryOperator>(I) || isa<SelectInst>(I) || isa<CmpInst>(I) || + isa<LoadInst>(I); +} + +// When we see an instruction that is an op of phis, generate the equivalent phi +// of ops form. +const Expression * +NewGVN::makePossiblePhiOfOps(Instruction *I, + SmallPtrSetImpl<Value *> &Visited) { + if (!okayForPHIOfOps(I)) + return nullptr; + + if (!Visited.insert(I).second) + return nullptr; + // For now, we require the instruction be cycle free because we don't + // *always* create a phi of ops for instructions that could be done as phi + // of ops, we only do it if we think it is useful. 
If we did do it all the + // time, we could remove the cycle free check. + if (!isCycleFree(I)) + return nullptr; + + unsigned IDFSNum = InstrToDFSNum(I); + SmallPtrSet<const Value *, 8> ProcessedPHIs; + // TODO: We don't do phi translation on memory accesses because it's + // complicated. For a load, we'd need to be able to simulate a new memoryuse, + // which we don't have a good way of doing ATM. + auto *MemAccess = getMemoryAccess(I); + // If the memory operation is defined by a memory operation this block that + // isn't a MemoryPhi, transforming the pointer backwards through a scalar phi + // can't help, as it would still be killed by that memory operation. + if (MemAccess && !isa<MemoryPhi>(MemAccess->getDefiningAccess()) && + MemAccess->getDefiningAccess()->getBlock() == I->getParent()) + return nullptr; + + // Convert op of phis to phi of ops + for (auto &Op : I->operands()) { + // TODO: We can't handle expressions that must be recursively translated + // IE + // a = phi (b, c) + // f = use a + // g = f + phi of something + // To properly make a phi of ops for g, we'd have to properly translate and + // use the instruction for f. We should add this by splitting out the + // instruction creation we do below. + if (isa<Instruction>(Op) && PHINodeUses.count(cast<Instruction>(Op))) + return nullptr; + if (!isa<PHINode>(Op)) + continue; + auto *OpPHI = cast<PHINode>(Op); + // No point in doing this for one-operand phis. + if (OpPHI->getNumOperands() == 1) + continue; + if (!DebugCounter::shouldExecute(PHIOfOpsCounter)) + return nullptr; + SmallVector<std::pair<Value *, BasicBlock *>, 4> Ops; + auto *PHIBlock = getBlockForValue(OpPHI); + for (auto PredBB : OpPHI->blocks()) { + Value *FoundVal = nullptr; + // We could just skip unreachable edges entirely but it's tricky to do + // with rewriting existing phi nodes. + if (ReachableEdges.count({PredBB, PHIBlock})) { + // Clone the instruction, create an expression from it, and see if we + // have a leader. + Instruction *ValueOp = I->clone(); + if (MemAccess) + TempToMemory.insert({ValueOp, MemAccess}); + + for (auto &Op : ValueOp->operands()) { + Op = Op->DoPHITranslation(PHIBlock, PredBB); + // When this operand changes, it could change whether there is a + // leader for us or not. + addAdditionalUsers(Op, I); + } + // Make sure it's marked as a temporary instruction. + AllTempInstructions.insert(ValueOp); + // and make sure anything that tries to add it's DFS number is + // redirected to the instruction we are making a phi of ops + // for. 
+ InstrDFS.insert({ValueOp, IDFSNum}); + const Expression *E = performSymbolicEvaluation(ValueOp, Visited); + InstrDFS.erase(ValueOp); + AllTempInstructions.erase(ValueOp); + ValueOp->deleteValue(); + if (MemAccess) + TempToMemory.erase(ValueOp); + if (!E) + return nullptr; + FoundVal = findPhiOfOpsLeader(E, PredBB); + if (!FoundVal) { + ExpressionToPhiOfOps[E].insert(I); + return nullptr; + } + if (auto *SI = dyn_cast<StoreInst>(FoundVal)) + FoundVal = SI->getValueOperand(); + } else { + DEBUG(dbgs() << "Skipping phi of ops operand for incoming block " + << getBlockName(PredBB) + << " because the block is unreachable\n"); + FoundVal = UndefValue::get(I->getType()); + } + + Ops.push_back({FoundVal, PredBB}); + DEBUG(dbgs() << "Found phi of ops operand " << *FoundVal << " in " + << getBlockName(PredBB) << "\n"); + } + auto *ValuePHI = RealToTemp.lookup(I); + bool NewPHI = false; + if (!ValuePHI) { + ValuePHI = PHINode::Create(I->getType(), OpPHI->getNumOperands()); + addPhiOfOps(ValuePHI, PHIBlock, I); + NewPHI = true; + NumGVNPHIOfOpsCreated++; + } + if (NewPHI) { + for (auto PHIOp : Ops) + ValuePHI->addIncoming(PHIOp.first, PHIOp.second); + } else { + unsigned int i = 0; + for (auto PHIOp : Ops) { + ValuePHI->setIncomingValue(i, PHIOp.first); + ValuePHI->setIncomingBlock(i, PHIOp.second); + ++i; + } + } + + DEBUG(dbgs() << "Created phi of ops " << *ValuePHI << " for " << *I + << "\n"); + return performSymbolicEvaluation(ValuePHI, Visited); + } + return nullptr; +} + +// The algorithm initially places the values of the routine in the TOP +// congruence class. The leader of TOP is the undetermined value `undef`. +// When the algorithm has finished, values still in TOP are unreachable. void NewGVN::initializeCongruenceClasses(Function &F) { - // FIXME now i can't remember why this is 2 - NextCongruenceNum = 2; - // Initialize all other instructions to be in INITIAL class. - CongruenceClass::MemberSet InitialValues; - InitialClass = createCongruenceClass(nullptr, nullptr); - for (auto &B : F) { - if (auto *MP = MSSA->getMemoryAccess(&B)) - MemoryAccessEquiv.insert({MP, MSSA->getLiveOnEntryDef()}); - - for (auto &I : B) { - InitialValues.insert(&I); - ValueToClass[&I] = InitialClass; - // All memory accesses are equivalent to live on entry to start. They must - // be initialized to something so that initial changes are noticed. For - // the maximal answer, we initialize them all to be the same as - // liveOnEntry. Note that to save time, we only initialize the - // MemoryDef's for stores and all MemoryPhis to be equal. Right now, no - // other expression can generate a memory equivalence. If we start - // handling memcpy/etc, we can expand this. - if (isa<StoreInst>(&I)) { - MemoryAccessEquiv.insert( - {MSSA->getMemoryAccess(&I), MSSA->getLiveOnEntryDef()}); - ++InitialClass->StoreCount; - assert(InitialClass->StoreCount > 0); + NextCongruenceNum = 0; + + // Note that even though we use the live on entry def as a representative + // MemoryAccess, it is *not* the same as the actual live on entry def. We + // have no real equivalemnt to undef for MemoryAccesses, and so we really + // should be checking whether the MemoryAccess is top if we want to know if it + // is equivalent to everything. Otherwise, what this really signifies is that + // the access "it reaches all the way back to the beginning of the function" + + // Initialize all other instructions to be in TOP class. 
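The makePossiblePhiOfOps hunk completed above relies on Value::DoPHITranslation to restate each operand in a predecessor's terms before searching for a leader there. A compressed sketch of that inner step follows (translateThroughPred is an invented helper name; the real code additionally tracks temporary instructions and memory state):

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Use.h"

// Sketch: clone I and rewrite any phi operands to their incoming values for
// Pred, producing the operation we would like to find a leader for in Pred.
static llvm::Instruction *translateThroughPred(llvm::Instruction *I,
                                               llvm::BasicBlock *PhiBlock,
                                               llvm::BasicBlock *Pred) {
  llvm::Instruction *Clone = I->clone();
  for (llvm::Use &Op : Clone->operands())
    Op = Op->DoPHITranslation(PhiBlock, Pred);
  return Clone; // the caller value-numbers this temporary and then deletes it
}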
+ TOPClass = createCongruenceClass(nullptr, nullptr); + TOPClass->setMemoryLeader(MSSA->getLiveOnEntryDef()); + // The live on entry def gets put into it's own class + MemoryAccessToClass[MSSA->getLiveOnEntryDef()] = + createMemoryClass(MSSA->getLiveOnEntryDef()); + + for (auto DTN : nodes(DT)) { + BasicBlock *BB = DTN->getBlock(); + // All MemoryAccesses are equivalent to live on entry to start. They must + // be initialized to something so that initial changes are noticed. For + // the maximal answer, we initialize them all to be the same as + // liveOnEntry. + auto *MemoryBlockDefs = MSSA->getBlockDefs(BB); + if (MemoryBlockDefs) + for (const auto &Def : *MemoryBlockDefs) { + MemoryAccessToClass[&Def] = TOPClass; + auto *MD = dyn_cast<MemoryDef>(&Def); + // Insert the memory phis into the member list. + if (!MD) { + const MemoryPhi *MP = cast<MemoryPhi>(&Def); + TOPClass->memory_insert(MP); + MemoryPhiState.insert({MP, MPS_TOP}); + } + + if (MD && isa<StoreInst>(MD->getMemoryInst())) + TOPClass->incStoreCount(); } + for (auto &I : *BB) { + // TODO: Move to helper + if (isa<PHINode>(&I)) + for (auto *U : I.users()) + if (auto *UInst = dyn_cast<Instruction>(U)) + if (InstrToDFSNum(UInst) != 0 && okayForPHIOfOps(UInst)) + PHINodeUses.insert(UInst); + // Don't insert void terminators into the class. We don't value number + // them, and they just end up sitting in TOP. + if (isa<TerminatorInst>(I) && I.getType()->isVoidTy()) + continue; + TOPClass->insert(&I); + ValueToClass[&I] = TOPClass; } } - InitialClass->Members.swap(InitialValues); // Initialize arguments to be in their own unique congruence classes for (auto &FA : F.args()) @@ -1408,45 +2623,79 @@ void NewGVN::initializeCongruenceClasses(Function &F) { void NewGVN::cleanupTables() { for (unsigned i = 0, e = CongruenceClasses.size(); i != e; ++i) { - DEBUG(dbgs() << "Congruence class " << CongruenceClasses[i]->ID << " has " - << CongruenceClasses[i]->Members.size() << " members\n"); + DEBUG(dbgs() << "Congruence class " << CongruenceClasses[i]->getID() + << " has " << CongruenceClasses[i]->size() << " members\n"); // Make sure we delete the congruence class (probably worth switching to // a unique_ptr at some point. delete CongruenceClasses[i]; CongruenceClasses[i] = nullptr; } + // Destroy the value expressions + SmallVector<Instruction *, 8> TempInst(AllTempInstructions.begin(), + AllTempInstructions.end()); + AllTempInstructions.clear(); + + // We have to drop all references for everything first, so there are no uses + // left as we delete them. + for (auto *I : TempInst) { + I->dropAllReferences(); + } + + while (!TempInst.empty()) { + auto *I = TempInst.back(); + TempInst.pop_back(); + I->deleteValue(); + } + ValueToClass.clear(); ArgRecycler.clear(ExpressionAllocator); ExpressionAllocator.Reset(); CongruenceClasses.clear(); ExpressionToClass.clear(); ValueToExpression.clear(); + RealToTemp.clear(); + AdditionalUsers.clear(); + ExpressionToPhiOfOps.clear(); + TempToBlock.clear(); + TempToMemory.clear(); + PHIOfOpsPHIs.clear(); ReachableBlocks.clear(); ReachableEdges.clear(); #ifndef NDEBUG ProcessedCount.clear(); #endif - DFSDomMap.clear(); InstrDFS.clear(); InstructionsToErase.clear(); - DFSToInstr.clear(); BlockInstRange.clear(); TouchedInstructions.clear(); - DominatedInstRange.clear(); - MemoryAccessEquiv.clear(); + MemoryAccessToClass.clear(); + PredicateToUsers.clear(); + MemoryToUsers.clear(); } +// Assign local DFS number mapping to instructions, and leave space for Value +// PHI's. 
std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B, unsigned Start) { unsigned End = Start; - if (MemoryAccess *MemPhi = MSSA->getMemoryAccess(B)) { + if (MemoryAccess *MemPhi = getMemoryAccess(B)) { InstrDFS[MemPhi] = End++; DFSToInstr.emplace_back(MemPhi); } + // Then the real block goes next. for (auto &I : *B) { + // There's no need to call isInstructionTriviallyDead more than once on + // an instruction. Therefore, once we know that an instruction is dead + // we change its DFS number so that it doesn't get value numbered. + if (isInstructionTriviallyDead(&I, TLI)) { + InstrDFS[&I] = 0; + DEBUG(dbgs() << "Skipping trivially dead instruction " << I << "\n"); + markInstructionForDeletion(&I); + continue; + } InstrDFS[&I] = End++; DFSToInstr.emplace_back(&I); } @@ -1457,12 +2706,12 @@ std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B, return std::make_pair(Start, End); } -void NewGVN::updateProcessedCount(Value *V) { +void NewGVN::updateProcessedCount(const Value *V) { #ifndef NDEBUG if (ProcessedCount.count(V) == 0) { ProcessedCount.insert({V, 1}); } else { - ProcessedCount[V] += 1; + ++ProcessedCount[V]; assert(ProcessedCount[V] < 100 && "Seem to have processed the same Value a lot"); } @@ -1471,27 +2720,35 @@ void NewGVN::updateProcessedCount(Value *V) { // Evaluate MemoryPhi nodes symbolically, just like PHI nodes void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) { // If all the arguments are the same, the MemoryPhi has the same value as the - // argument. - // Filter out unreachable blocks from our operands. + // argument. Filter out unreachable blocks and self phis from our operands. + // TODO: We could do cycle-checking on the memory phis to allow valueizing for + // self-phi checking. + const BasicBlock *PHIBlock = MP->getBlock(); auto Filtered = make_filter_range(MP->operands(), [&](const Use &U) { - return ReachableBlocks.count(MP->getIncomingBlock(U)); + return cast<MemoryAccess>(U) != MP && + !isMemoryAccessTOP(cast<MemoryAccess>(U)) && + ReachableEdges.count({MP->getIncomingBlock(U), PHIBlock}); }); - - assert(Filtered.begin() != Filtered.end() && - "We should not be processing a MemoryPhi in a completely " - "unreachable block"); + // If all that is left is nothing, our memoryphi is undef. We keep it as + // InitialClass. Note: The only case this should happen is if we have at + // least one self-argument. + if (Filtered.begin() == Filtered.end()) { + if (setMemoryClass(MP, TOPClass)) + markMemoryUsersTouched(MP); + return; + } // Transform the remaining operands into operand leaders. // FIXME: mapped_iterator should have a range version. auto LookupFunc = [&](const Use &U) { - return lookupMemoryAccessEquiv(cast<MemoryAccess>(U)); + return lookupMemoryLeader(cast<MemoryAccess>(U)); }; auto MappedBegin = map_iterator(Filtered.begin(), LookupFunc); auto MappedEnd = map_iterator(Filtered.end(), LookupFunc); // and now check if all the elements are equal. // Sadly, we can't use std::equals since these are random access iterators. - MemoryAccess *AllSameValue = *MappedBegin; + const auto *AllSameValue = *MappedBegin; ++MappedBegin; bool AllEqual = std::all_of( MappedBegin, MappedEnd, @@ -1501,8 +2758,18 @@ void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) { DEBUG(dbgs() << "Memory Phi value numbered to " << *AllSameValue << "\n"); else DEBUG(dbgs() << "Memory Phi value numbered to itself\n"); - - if (setMemoryAccessEquivTo(MP, AllEqual ? AllSameValue : nullptr)) + // If it's equal to something, it's in that class. 
Otherwise, it has to be in + // a class where it is the leader (other things may be equivalent to it, but + // it needs to start off in its own class, which means it must have been the + // leader, and it can't have stopped being the leader because it was never + // removed). + CongruenceClass *CC = + AllEqual ? getMemoryClass(AllSameValue) : ensureLeaderOfMemoryClass(MP); + auto OldState = MemoryPhiState.lookup(MP); + assert(OldState != MPS_Invalid && "Invalid memory phi state"); + auto NewState = AllEqual ? MPS_Equivalent : MPS_Unique; + MemoryPhiState[MP] = NewState; + if (setMemoryClass(MP, CC) || OldState != NewState) markMemoryUsersTouched(MP); } @@ -1510,13 +2777,23 @@ void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) { // congruence finding, and updating mappings. void NewGVN::valueNumberInstruction(Instruction *I) { DEBUG(dbgs() << "Processing instruction " << *I << "\n"); - if (isInstructionTriviallyDead(I, TLI)) { - DEBUG(dbgs() << "Skipping unused instruction\n"); - markInstructionForDeletion(I); - return; - } if (!I->isTerminator()) { - const auto *Symbolized = performSymbolicEvaluation(I, I->getParent()); + const Expression *Symbolized = nullptr; + SmallPtrSet<Value *, 2> Visited; + if (DebugCounter::shouldExecute(VNCounter)) { + Symbolized = performSymbolicEvaluation(I, Visited); + // Make a phi of ops if necessary + if (Symbolized && !isa<ConstantExpression>(Symbolized) && + !isa<VariableExpression>(Symbolized) && PHINodeUses.count(I)) { + auto *PHIE = makePossiblePhiOfOps(I, Visited); + if (PHIE) + Symbolized = PHIE; + } + + } else { + // Mark the instruction as unused so we don't value number it again. + InstrDFS[I] = 0; + } // If we couldn't come up with a symbolic expression, use the unknown // expression if (Symbolized == nullptr) @@ -1524,7 +2801,8 @@ void NewGVN::valueNumberInstruction(Instruction *I) { performCongruenceFinding(I, Symbolized); } else { // Handle terminators that return values. All of them produce values we - // don't currently understand. + // don't currently understand. We don't place non-value producing + // terminators in a class. if (!I->getType()->isVoidTy()) { auto *Symbolized = createUnknownExpression(I); performCongruenceFinding(I, Symbolized); @@ -1535,76 +2813,126 @@ void NewGVN::valueNumberInstruction(Instruction *I) { // Check if there is a path, using single or equal argument phi nodes, from // First to Second. 
-bool NewGVN::singleReachablePHIPath(const MemoryAccess *First, - const MemoryAccess *Second) const { +bool NewGVN::singleReachablePHIPath( + SmallPtrSet<const MemoryAccess *, 8> &Visited, const MemoryAccess *First, + const MemoryAccess *Second) const { if (First == Second) return true; - - if (auto *FirstDef = dyn_cast<MemoryUseOrDef>(First)) { - auto *DefAccess = FirstDef->getDefiningAccess(); - return singleReachablePHIPath(DefAccess, Second); - } else { - auto *MP = cast<MemoryPhi>(First); - auto ReachableOperandPred = [&](const Use &U) { - return ReachableBlocks.count(MP->getIncomingBlock(U)); - }; - auto FilteredPhiArgs = - make_filter_range(MP->operands(), ReachableOperandPred); - SmallVector<const Value *, 32> OperandList; - std::copy(FilteredPhiArgs.begin(), FilteredPhiArgs.end(), - std::back_inserter(OperandList)); - bool Okay = OperandList.size() == 1; - if (!Okay) - Okay = std::equal(OperandList.begin(), OperandList.end(), - OperandList.begin()); - if (Okay) - return singleReachablePHIPath(cast<MemoryAccess>(OperandList[0]), Second); + if (MSSA->isLiveOnEntryDef(First)) return false; + + // This is not perfect, but as we're just verifying here, we can live with + // the loss of precision. The real solution would be that of doing strongly + // connected component finding in this routine, and it's probably not worth + // the complexity for the time being. So, we just keep a set of visited + // MemoryAccess and return true when we hit a cycle. + if (Visited.count(First)) + return true; + Visited.insert(First); + + const auto *EndDef = First; + for (auto *ChainDef : optimized_def_chain(First)) { + if (ChainDef == Second) + return true; + if (MSSA->isLiveOnEntryDef(ChainDef)) + return false; + EndDef = ChainDef; } + auto *MP = cast<MemoryPhi>(EndDef); + auto ReachableOperandPred = [&](const Use &U) { + return ReachableEdges.count({MP->getIncomingBlock(U), MP->getBlock()}); + }; + auto FilteredPhiArgs = + make_filter_range(MP->operands(), ReachableOperandPred); + SmallVector<const Value *, 32> OperandList; + std::copy(FilteredPhiArgs.begin(), FilteredPhiArgs.end(), + std::back_inserter(OperandList)); + bool Okay = OperandList.size() == 1; + if (!Okay) + Okay = + std::equal(OperandList.begin(), OperandList.end(), OperandList.begin()); + if (Okay) + return singleReachablePHIPath(Visited, cast<MemoryAccess>(OperandList[0]), + Second); + return false; } // Verify the that the memory equivalence table makes sense relative to the // congruence classes. Note that this checking is not perfect, and is currently -// subject to very rare false negatives. It is only useful for testing/debugging. +// subject to very rare false negatives. It is only useful for +// testing/debugging. 
void NewGVN::verifyMemoryCongruency() const { - // Anything equivalent in the memory access table should be in the same +#ifndef NDEBUG + // Verify that the memory table equivalence and memory member set match + for (const auto *CC : CongruenceClasses) { + if (CC == TOPClass || CC->isDead()) + continue; + if (CC->getStoreCount() != 0) { + assert((CC->getStoredValue() || !isa<StoreInst>(CC->getLeader())) && + "Any class with a store as a leader should have a " + "representative stored value"); + assert(CC->getMemoryLeader() && + "Any congruence class with a store should have a " + "representative access"); + } + + if (CC->getMemoryLeader()) + assert(MemoryAccessToClass.lookup(CC->getMemoryLeader()) == CC && + "Representative MemoryAccess does not appear to be reverse " + "mapped properly"); + for (auto M : CC->memory()) + assert(MemoryAccessToClass.lookup(M) == CC && + "Memory member does not appear to be reverse mapped properly"); + } + + // Anything equivalent in the MemoryAccess table should be in the same // congruence class. // Filter out the unreachable and trivially dead entries, because they may // never have been updated if the instructions were not processed. auto ReachableAccessPred = - [&](const std::pair<const MemoryAccess *, MemoryAccess *> Pair) { + [&](const std::pair<const MemoryAccess *, CongruenceClass *> Pair) { bool Result = ReachableBlocks.count(Pair.first->getBlock()); - if (!Result) + if (!Result || MSSA->isLiveOnEntryDef(Pair.first) || + MemoryToDFSNum(Pair.first) == 0) return false; if (auto *MemDef = dyn_cast<MemoryDef>(Pair.first)) return !isInstructionTriviallyDead(MemDef->getMemoryInst()); + + // We could have phi nodes which operands are all trivially dead, + // so we don't process them. + if (auto *MemPHI = dyn_cast<MemoryPhi>(Pair.first)) { + for (auto &U : MemPHI->incoming_values()) { + if (Instruction *I = dyn_cast<Instruction>(U.get())) { + if (!isInstructionTriviallyDead(I)) + return true; + } + } + return false; + } + return true; }; - auto Filtered = make_filter_range(MemoryAccessEquiv, ReachableAccessPred); + auto Filtered = make_filter_range(MemoryAccessToClass, ReachableAccessPred); for (auto KV : Filtered) { - assert(KV.first != KV.second && - "We added a useless equivalence to the memory equivalence table"); - // Unreachable instructions may not have changed because we never process - // them. - if (!ReachableBlocks.count(KV.first->getBlock())) - continue; if (auto *FirstMUD = dyn_cast<MemoryUseOrDef>(KV.first)) { - auto *SecondMUD = dyn_cast<MemoryUseOrDef>(KV.second); - if (FirstMUD && SecondMUD) - assert((singleReachablePHIPath(FirstMUD, SecondMUD) || - ValueToClass.lookup(FirstMUD->getMemoryInst()) == - ValueToClass.lookup(SecondMUD->getMemoryInst())) && - "The instructions for these memory operations should have " - "been in the same congruence class or reachable through" - "a single argument phi"); + auto *SecondMUD = dyn_cast<MemoryUseOrDef>(KV.second->getMemoryLeader()); + if (FirstMUD && SecondMUD) { + SmallPtrSet<const MemoryAccess *, 8> VisitedMAS; + assert((singleReachablePHIPath(VisitedMAS, FirstMUD, SecondMUD) || + ValueToClass.lookup(FirstMUD->getMemoryInst()) == + ValueToClass.lookup(SecondMUD->getMemoryInst())) && + "The instructions for these memory operations should have " + "been in the same congruence class or reachable through" + "a single argument phi"); + } } else if (auto *FirstMP = dyn_cast<MemoryPhi>(KV.first)) { - // We can only sanely verify that MemoryDefs in the operand list all have // the same class. 
auto ReachableOperandPred = [&](const Use &U) { - return ReachableBlocks.count(FirstMP->getIncomingBlock(U)) && + return ReachableEdges.count( + {FirstMP->getIncomingBlock(U), FirstMP->getBlock()}) && isa<MemoryDef>(U); }; @@ -1622,35 +2950,179 @@ void NewGVN::verifyMemoryCongruency() const { "All MemoryPhi arguments should be in the same class"); } } +#endif +} + +// Verify that the sparse propagation we did actually found the maximal fixpoint +// We do this by storing the value to class mapping, touching all instructions, +// and redoing the iteration to see if anything changed. +void NewGVN::verifyIterationSettled(Function &F) { +#ifndef NDEBUG + DEBUG(dbgs() << "Beginning iteration verification\n"); + if (DebugCounter::isCounterSet(VNCounter)) + DebugCounter::setCounterValue(VNCounter, StartingVNCounter); + + // Note that we have to store the actual classes, as we may change existing + // classes during iteration. This is because our memory iteration propagation + // is not perfect, and so may waste a little work. But it should generate + // exactly the same congruence classes we have now, with different IDs. + std::map<const Value *, CongruenceClass> BeforeIteration; + + for (auto &KV : ValueToClass) { + if (auto *I = dyn_cast<Instruction>(KV.first)) + // Skip unused/dead instructions. + if (InstrToDFSNum(I) == 0) + continue; + BeforeIteration.insert({KV.first, *KV.second}); + } + + TouchedInstructions.set(); + TouchedInstructions.reset(0); + iterateTouchedInstructions(); + DenseSet<std::pair<const CongruenceClass *, const CongruenceClass *>> + EqualClasses; + for (const auto &KV : ValueToClass) { + if (auto *I = dyn_cast<Instruction>(KV.first)) + // Skip unused/dead instructions. + if (InstrToDFSNum(I) == 0) + continue; + // We could sink these uses, but i think this adds a bit of clarity here as + // to what we are comparing. + auto *BeforeCC = &BeforeIteration.find(KV.first)->second; + auto *AfterCC = KV.second; + // Note that the classes can't change at this point, so we memoize the set + // that are equal. + if (!EqualClasses.count({BeforeCC, AfterCC})) { + assert(BeforeCC->isEquivalentTo(AfterCC) && + "Value number changed after main loop completed!"); + EqualClasses.insert({BeforeCC, AfterCC}); + } + } +#endif +} + +// Verify that for each store expression in the expression to class mapping, +// only the latest appears, and multiple ones do not appear. +// Because loads do not use the stored value when doing equality with stores, +// if we don't erase the old store expressions from the table, a load can find +// a no-longer valid StoreExpression. +void NewGVN::verifyStoreExpressions() const { +#ifndef NDEBUG + // This is the only use of this, and it's not worth defining a complicated + // densemapinfo hash/equality function for it. + std::set< + std::pair<const Value *, + std::tuple<const Value *, const CongruenceClass *, Value *>>> + StoreExpressionSet; + for (const auto &KV : ExpressionToClass) { + if (auto *SE = dyn_cast<StoreExpression>(KV.first)) { + // Make sure a version that will conflict with loads is not already there + auto Res = StoreExpressionSet.insert( + {SE->getOperand(0), std::make_tuple(SE->getMemoryLeader(), KV.second, + SE->getStoredValue())}); + bool Okay = Res.second; + // It's okay to have the same expression already in there if it is + // identical in nature. + // This can happen when the leader of the stored value changes over time. 
+ if (!Okay) + Okay = (std::get<1>(Res.first->second) == KV.second) && + (lookupOperandLeader(std::get<2>(Res.first->second)) == + lookupOperandLeader(SE->getStoredValue())); + assert(Okay && "Stored expression conflict exists in expression table"); + auto *ValueExpr = ValueToExpression.lookup(SE->getStoreInst()); + assert(ValueExpr && ValueExpr->equals(*SE) && + "StoreExpression in ExpressionToClass is not latest " + "StoreExpression for value"); + } + } +#endif +} + +// This is the main value numbering loop, it iterates over the initial touched +// instruction set, propagating value numbers, marking things touched, etc, +// until the set of touched instructions is completely empty. +void NewGVN::iterateTouchedInstructions() { + unsigned int Iterations = 0; + // Figure out where touchedinstructions starts + int FirstInstr = TouchedInstructions.find_first(); + // Nothing set, nothing to iterate, just return. + if (FirstInstr == -1) + return; + const BasicBlock *LastBlock = getBlockForValue(InstrFromDFSNum(FirstInstr)); + while (TouchedInstructions.any()) { + ++Iterations; + // Walk through all the instructions in all the blocks in RPO. + // TODO: As we hit a new block, we should push and pop equalities into a + // table lookupOperandLeader can use, to catch things PredicateInfo + // might miss, like edge-only equivalences. + for (unsigned InstrNum : TouchedInstructions.set_bits()) { + + // This instruction was found to be dead. We don't bother looking + // at it again. + if (InstrNum == 0) { + TouchedInstructions.reset(InstrNum); + continue; + } + + Value *V = InstrFromDFSNum(InstrNum); + const BasicBlock *CurrBlock = getBlockForValue(V); + + // If we hit a new block, do reachability processing. + if (CurrBlock != LastBlock) { + LastBlock = CurrBlock; + bool BlockReachable = ReachableBlocks.count(CurrBlock); + const auto &CurrInstRange = BlockInstRange.lookup(CurrBlock); + + // If it's not reachable, erase any touched instructions and move on. + if (!BlockReachable) { + TouchedInstructions.reset(CurrInstRange.first, CurrInstRange.second); + DEBUG(dbgs() << "Skipping instructions in block " + << getBlockName(CurrBlock) + << " because it is unreachable\n"); + continue; + } + updateProcessedCount(CurrBlock); + } + // Reset after processing (because we may mark ourselves as touched when + // we propagate equalities). + TouchedInstructions.reset(InstrNum); + + if (auto *MP = dyn_cast<MemoryPhi>(V)) { + DEBUG(dbgs() << "Processing MemoryPhi " << *MP << "\n"); + valueNumberMemoryPhi(MP); + } else if (auto *I = dyn_cast<Instruction>(V)) { + valueNumberInstruction(I); + } else { + llvm_unreachable("Should have been a MemoryPhi or Instruction"); + } + updateProcessedCount(V); + } + } + NumGVNMaxIterations = std::max(NumGVNMaxIterations.getValue(), Iterations); } // This is the main transformation entry point. -bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC, - TargetLibraryInfo *_TLI, AliasAnalysis *_AA, - MemorySSA *_MSSA) { +bool NewGVN::runGVN() { + if (DebugCounter::isCounterSet(VNCounter)) + StartingVNCounter = DebugCounter::getCounterValue(VNCounter); bool Changed = false; - DT = _DT; - AC = _AC; - TLI = _TLI; - AA = _AA; - MSSA = _MSSA; - DL = &F.getParent()->getDataLayout(); + NumFuncArgs = F.arg_size(); MSSAWalker = MSSA->getWalker(); + SingletonDeadExpression = new (ExpressionAllocator) DeadExpression(); // Count number of instructions for sizing of hash tables, and come // up with a global dfs numbering for instructions. 
unsigned ICount = 1; // Add an empty instruction to account for the fact that we start at 1 DFSToInstr.emplace_back(nullptr); - // Note: We want RPO traversal of the blocks, which is not quite the same as - // dominator tree order, particularly with regard whether backedges get - // visited first or second, given a block with multiple successors. + // Note: We want ideal RPO traversal of the blocks, which is not quite the + // same as dominator tree order, particularly with regard whether backedges + // get visited first or second, given a block with multiple successors. // If we visit in the wrong order, we will end up performing N times as many // iterations. // The dominator tree does guarantee that, for a given dom tree node, it's // parent must occur before it in the RPO ordering. Thus, we only need to sort // the siblings. - DenseMap<const DomTreeNode *, unsigned> RPOOrdering; ReversePostOrderTraversal<Function *> RPOT(&F); unsigned Counter = 0; for (auto &B : RPOT) { @@ -1663,33 +3135,21 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC, auto *Node = DT->getNode(B); if (Node->getChildren().size() > 1) std::sort(Node->begin(), Node->end(), - [&RPOOrdering](const DomTreeNode *A, const DomTreeNode *B) { + [&](const DomTreeNode *A, const DomTreeNode *B) { return RPOOrdering[A] < RPOOrdering[B]; }); } // Now a standard depth first ordering of the domtree is equivalent to RPO. - auto DFI = df_begin(DT->getRootNode()); - for (auto DFE = df_end(DT->getRootNode()); DFI != DFE; ++DFI) { - BasicBlock *B = DFI->getBlock(); + for (auto DTN : depth_first(DT->getRootNode())) { + BasicBlock *B = DTN->getBlock(); const auto &BlockRange = assignDFSNumbers(B, ICount); BlockInstRange.insert({B, BlockRange}); ICount += BlockRange.second - BlockRange.first; } - - // Handle forward unreachable blocks and figure out which blocks - // have single preds. - for (auto &B : F) { - // Assign numbers to unreachable blocks. - if (!DFI.nodeVisited(DT->getNode(&B))) { - const auto &BlockRange = assignDFSNumbers(&B, ICount); - BlockInstRange.insert({&B, BlockRange}); - ICount += BlockRange.second - BlockRange.first; - } - } + initializeCongruenceClasses(F); TouchedInstructions.resize(ICount); - DominatedInstRange.reserve(F.size()); // Ensure we don't end up resizing the expressionToClass map, as // that can be quite expensive. At most, we have one expression per // instruction. @@ -1698,65 +3158,15 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC, // Initialize the touched instructions to include the entry block. const auto &InstRange = BlockInstRange.lookup(&F.getEntryBlock()); TouchedInstructions.set(InstRange.first, InstRange.second); + DEBUG(dbgs() << "Block " << getBlockName(&F.getEntryBlock()) + << " marked reachable\n"); ReachableBlocks.insert(&F.getEntryBlock()); - initializeCongruenceClasses(F); - - unsigned int Iterations = 0; - // We start out in the entry block. - BasicBlock *LastBlock = &F.getEntryBlock(); - while (TouchedInstructions.any()) { - ++Iterations; - // Walk through all the instructions in all the blocks in RPO. 
- for (int InstrNum = TouchedInstructions.find_first(); InstrNum != -1; - InstrNum = TouchedInstructions.find_next(InstrNum)) { - assert(InstrNum != 0 && "Bit 0 should never be set, something touched an " - "instruction not in the lookup table"); - Value *V = DFSToInstr[InstrNum]; - BasicBlock *CurrBlock = nullptr; - - if (auto *I = dyn_cast<Instruction>(V)) - CurrBlock = I->getParent(); - else if (auto *MP = dyn_cast<MemoryPhi>(V)) - CurrBlock = MP->getBlock(); - else - llvm_unreachable("DFSToInstr gave us an unknown type of instruction"); - - // If we hit a new block, do reachability processing. - if (CurrBlock != LastBlock) { - LastBlock = CurrBlock; - bool BlockReachable = ReachableBlocks.count(CurrBlock); - const auto &CurrInstRange = BlockInstRange.lookup(CurrBlock); - - // If it's not reachable, erase any touched instructions and move on. - if (!BlockReachable) { - TouchedInstructions.reset(CurrInstRange.first, CurrInstRange.second); - DEBUG(dbgs() << "Skipping instructions in block " - << getBlockName(CurrBlock) - << " because it is unreachable\n"); - continue; - } - updateProcessedCount(CurrBlock); - } - - if (auto *MP = dyn_cast<MemoryPhi>(V)) { - DEBUG(dbgs() << "Processing MemoryPhi " << *MP << "\n"); - valueNumberMemoryPhi(MP); - } else if (auto *I = dyn_cast<Instruction>(V)) { - valueNumberInstruction(I); - } else { - llvm_unreachable("Should have been a MemoryPhi or Instruction"); - } - updateProcessedCount(V); - // Reset after processing (because we may mark ourselves as touched when - // we propagate equalities). - TouchedInstructions.reset(InstrNum); - } - } - NumGVNMaxIterations = std::max(NumGVNMaxIterations.getValue(), Iterations); -#ifndef NDEBUG + iterateTouchedInstructions(); verifyMemoryCongruency(); -#endif + verifyIterationSettled(F); + verifyStoreExpressions(); + Changed |= eliminateInstructions(F); // Delete all instructions marked for deletion. @@ -1764,7 +3174,8 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC, if (!ToErase->use_empty()) ToErase->replaceAllUsesWith(UndefValue::get(ToErase->getType())); - ToErase->eraseFromParent(); + if (ToErase->getParent()) + ToErase->eraseFromParent(); } // Delete all unreachable blocks. @@ -1783,59 +3194,15 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC, return Changed; } -bool NewGVN::runOnFunction(Function &F) { - if (skipFunction(F)) - return false; - return runGVN(F, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), - &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F), - &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(), - &getAnalysis<AAResultsWrapperPass>().getAAResults(), - &getAnalysis<MemorySSAWrapperPass>().getMSSA()); -} - -PreservedAnalyses NewGVNPass::run(Function &F, AnalysisManager<Function> &AM) { - NewGVN Impl; - - // Apparently the order in which we get these results matter for - // the old GVN (see Chandler's comment in GVN.cpp). I'll keep - // the same order here, just in case. 
- auto &AC = AM.getResult<AssumptionAnalysis>(F); - auto &DT = AM.getResult<DominatorTreeAnalysis>(F); - auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); - auto &AA = AM.getResult<AAManager>(F); - auto &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA(); - bool Changed = Impl.runGVN(F, &DT, &AC, &TLI, &AA, &MSSA); - if (!Changed) - return PreservedAnalyses::all(); - PreservedAnalyses PA; - PA.preserve<DominatorTreeAnalysis>(); - PA.preserve<GlobalsAA>(); - return PA; -} - -// Return true if V is a value that will always be available (IE can -// be placed anywhere) in the function. We don't do globals here -// because they are often worse to put in place. -// TODO: Separate cost from availability -static bool alwaysAvailable(Value *V) { - return isa<Constant>(V) || isa<Argument>(V); -} - -// Get the basic block from an instruction/value. -static BasicBlock *getBlockForValue(Value *V) { - if (auto *I = dyn_cast<Instruction>(V)) - return I->getParent(); - return nullptr; -} - struct NewGVN::ValueDFS { int DFSIn = 0; int DFSOut = 0; int LocalNum = 0; - // Only one of these will be set. - Value *Val = nullptr; + // Only one of Def and U will be set. + // The bool in the Def tells us whether the Def is the stored value of a + // store. + PointerIntPair<Value *, 1, bool> Def; Use *U = nullptr; - bool operator<(const ValueDFS &Other) const { // It's not enough that any given field be less than - we have sets // of fields that need to be evaluated together to give a proper ordering. @@ -1875,89 +3242,163 @@ struct NewGVN::ValueDFS { // but .val and .u. // It does not matter what order we replace these operands in. // You will always end up with the same IR, and this is guaranteed. - return std::tie(DFSIn, DFSOut, LocalNum, Val, U) < - std::tie(Other.DFSIn, Other.DFSOut, Other.LocalNum, Other.Val, + return std::tie(DFSIn, DFSOut, LocalNum, Def, U) < + std::tie(Other.DFSIn, Other.DFSOut, Other.LocalNum, Other.Def, Other.U); } }; -void NewGVN::convertDenseToDFSOrdered( - CongruenceClass::MemberSet &Dense, - SmallVectorImpl<ValueDFS> &DFSOrderedSet) { +// This function converts the set of members for a congruence class from values, +// to sets of defs and uses with associated DFS info. The total number of +// reachable uses for each value is stored in UseCount, and instructions that +// seem +// dead (have no non-dead uses) are stored in ProbablyDead. +void NewGVN::convertClassToDFSOrdered( + const CongruenceClass &Dense, SmallVectorImpl<ValueDFS> &DFSOrderedSet, + DenseMap<const Value *, unsigned int> &UseCounts, + SmallPtrSetImpl<Instruction *> &ProbablyDead) const { for (auto D : Dense) { // First add the value. BasicBlock *BB = getBlockForValue(D); // Constants are handled prior to ever calling this function, so // we should only be left with instructions as members. assert(BB && "Should have figured out a basic block for value"); - ValueDFS VD; - - std::pair<int, int> DFSPair = DFSDomMap[BB]; - assert(DFSPair.first != -1 && DFSPair.second != -1 && "Invalid DFS Pair"); - VD.DFSIn = DFSPair.first; - VD.DFSOut = DFSPair.second; - VD.Val = D; - // If it's an instruction, use the real local dfs number. 
- if (auto *I = dyn_cast<Instruction>(D)) - VD.LocalNum = InstrDFS[I]; - else - llvm_unreachable("Should have been an instruction"); - - DFSOrderedSet.emplace_back(VD); + ValueDFS VDDef; + DomTreeNode *DomNode = DT->getNode(BB); + VDDef.DFSIn = DomNode->getDFSNumIn(); + VDDef.DFSOut = DomNode->getDFSNumOut(); + // If it's a store, use the leader of the value operand, if it's always + // available, or the value operand. TODO: We could do dominance checks to + // find a dominating leader, but not worth it ATM. + if (auto *SI = dyn_cast<StoreInst>(D)) { + auto Leader = lookupOperandLeader(SI->getValueOperand()); + if (alwaysAvailable(Leader)) { + VDDef.Def.setPointer(Leader); + } else { + VDDef.Def.setPointer(SI->getValueOperand()); + VDDef.Def.setInt(true); + } + } else { + VDDef.Def.setPointer(D); + } + assert(isa<Instruction>(D) && + "The dense set member should always be an instruction"); + Instruction *Def = cast<Instruction>(D); + VDDef.LocalNum = InstrToDFSNum(D); + DFSOrderedSet.push_back(VDDef); + // If there is a phi node equivalent, add it + if (auto *PN = RealToTemp.lookup(Def)) { + auto *PHIE = + dyn_cast_or_null<PHIExpression>(ValueToExpression.lookup(Def)); + if (PHIE) { + VDDef.Def.setInt(false); + VDDef.Def.setPointer(PN); + VDDef.LocalNum = 0; + DFSOrderedSet.push_back(VDDef); + } + } - // Now add the users. - for (auto &U : D->uses()) { + unsigned int UseCount = 0; + // Now add the uses. + for (auto &U : Def->uses()) { if (auto *I = dyn_cast<Instruction>(U.getUser())) { - ValueDFS VD; + // Don't try to replace into dead uses + if (InstructionsToErase.count(I)) + continue; + ValueDFS VDUse; // Put the phi node uses in the incoming block. BasicBlock *IBlock; if (auto *P = dyn_cast<PHINode>(I)) { IBlock = P->getIncomingBlock(U); // Make phi node users appear last in the incoming block // they are from. - VD.LocalNum = InstrDFS.size() + 1; + VDUse.LocalNum = InstrDFS.size() + 1; } else { - IBlock = I->getParent(); - VD.LocalNum = InstrDFS[I]; + IBlock = getBlockForValue(I); + VDUse.LocalNum = InstrToDFSNum(I); } - std::pair<int, int> DFSPair = DFSDomMap[IBlock]; - VD.DFSIn = DFSPair.first; - VD.DFSOut = DFSPair.second; - VD.U = &U; - DFSOrderedSet.emplace_back(VD); + + // Skip uses in unreachable blocks, as we're going + // to delete them. + if (ReachableBlocks.count(IBlock) == 0) + continue; + + DomTreeNode *DomNode = DT->getNode(IBlock); + VDUse.DFSIn = DomNode->getDFSNumIn(); + VDUse.DFSOut = DomNode->getDFSNumOut(); + VDUse.U = &U; + ++UseCount; + DFSOrderedSet.emplace_back(VDUse); } } + + // If there are no uses, it's probably dead (but it may have side-effects, + // so not definitely dead. Otherwise, store the number of uses so we can + // track if it becomes dead later). + if (UseCount == 0) + ProbablyDead.insert(Def); + else + UseCounts[Def] = UseCount; } } -static void patchReplacementInstruction(Instruction *I, Value *Repl) { - // Patch the replacement so that it is not more restrictive than the value - // being replaced. - auto *Op = dyn_cast<BinaryOperator>(I); - auto *ReplOp = dyn_cast<BinaryOperator>(Repl); +// This function converts the set of members for a congruence class from values, +// to the set of defs for loads and stores, with associated DFS info. 
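Editor's note: the DFSIn/DFSOut pair attached to each def and use in convertClassToDFSOrdered above is the standard dominator-tree interval encoding: A dominates B exactly when A's interval encloses B's, so sorting by (DFSIn, DFSOut, LocalNum) places dominating definitions ahead of the uses they can replace. A small self-contained illustration of the encoding; the numbers are invented for the example.

#include <cassert>
#include <tuple>

struct DFSSpan {
  int DFSIn;    // dominator-tree DFS entry number
  int DFSOut;   // dominator-tree DFS exit number
  int LocalNum; // position inside the basic block

  bool operator<(const DFSSpan &O) const {
    return std::tie(DFSIn, DFSOut, LocalNum) <
           std::tie(O.DFSIn, O.DFSOut, O.LocalNum);
  }
};

// A dominates B iff A's DFS interval encloses B's.
inline bool dominates(const DFSSpan &A, const DFSSpan &B) {
  return A.DFSIn <= B.DFSIn && B.DFSOut <= A.DFSOut;
}

int main() {
  DFSSpan Root{1, 8, 0}, Child{2, 5, 0}, Sibling{6, 7, 0};
  assert(dominates(Root, Child) && dominates(Root, Sibling));
  assert(!dominates(Child, Sibling) && !dominates(Sibling, Child));
  assert(Child < Sibling); // dominators / earlier intervals sort first
}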
+void NewGVN::convertClassToLoadsAndStores( + const CongruenceClass &Dense, + SmallVectorImpl<ValueDFS> &LoadsAndStores) const { + for (auto D : Dense) { + if (!isa<LoadInst>(D) && !isa<StoreInst>(D)) + continue; - if (Op && ReplOp) - ReplOp->andIRFlags(Op); + BasicBlock *BB = getBlockForValue(D); + ValueDFS VD; + DomTreeNode *DomNode = DT->getNode(BB); + VD.DFSIn = DomNode->getDFSNumIn(); + VD.DFSOut = DomNode->getDFSNumOut(); + VD.Def.setPointer(D); - if (auto *ReplInst = dyn_cast<Instruction>(Repl)) { - // FIXME: If both the original and replacement value are part of the - // same control-flow region (meaning that the execution of one - // guarentees the executation of the other), then we can combine the - // noalias scopes here and do better than the general conservative - // answer used in combineMetadata(). + // If it's an instruction, use the real local dfs number. + if (auto *I = dyn_cast<Instruction>(D)) + VD.LocalNum = InstrToDFSNum(I); + else + llvm_unreachable("Should have been an instruction"); - // In general, GVN unifies expressions over different control-flow - // regions, and so we need a conservative combination of the noalias - // scopes. - unsigned KnownIDs[] = { - LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, - LLVMContext::MD_noalias, LLVMContext::MD_range, - LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load, - LLVMContext::MD_invariant_group}; - combineMetadata(ReplInst, I, KnownIDs); + LoadsAndStores.emplace_back(VD); } } +static void patchReplacementInstruction(Instruction *I, Value *Repl) { + auto *ReplInst = dyn_cast<Instruction>(Repl); + if (!ReplInst) + return; + + // Patch the replacement so that it is not more restrictive than the value + // being replaced. + // Note that if 'I' is a load being replaced by some operation, + // for example, by an arithmetic operation, then andIRFlags() + // would just erase all math flags from the original arithmetic + // operation, which is clearly not wanted and not needed. + if (!isa<LoadInst>(I)) + ReplInst->andIRFlags(I); + + // FIXME: If both the original and replacement value are part of the + // same control-flow region (meaning that the execution of one + // guarantees the execution of the other), then we can combine the + // noalias scopes here and do better than the general conservative + // answer used in combineMetadata(). + + // In general, GVN unifies expressions over different control-flow + // regions, and so we need a conservative combination of the noalias + // scopes. + static const unsigned KnownIDs[] = { + LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope, + LLVMContext::MD_noalias, LLVMContext::MD_range, + LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load, + LLVMContext::MD_invariant_group}; + combineMetadata(ReplInst, I, KnownIDs); +} + static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) { patchReplacementInstruction(I, Repl); I->replaceAllUsesWith(Repl); @@ -1967,10 +3408,6 @@ void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) { DEBUG(dbgs() << " BasicBlock Dead:" << *BB); ++NumGVNBlocksDeleted; - // Check to see if there are non-terminating instructions to delete. - if (isa<TerminatorInst>(BB->begin())) - return; - // Delete the instructions backwards, as it has a reduced likelihood of having // to update as many def-use and use-def chains. Start after the terminator. 
auto StartPoint = BB->rbegin(); @@ -1987,6 +3424,11 @@ void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) { Inst.eraseFromParent(); ++NumGVNInstrDeleted; } + // Now insert something that simplifycfg will turn into an unreachable. + Type *Int8Ty = Type::getInt8Ty(BB->getContext()); + new StoreInst(UndefValue::get(Int8Ty), + Constant::getNullValue(Int8Ty->getPointerTo()), + BB->getTerminator()); } void NewGVN::markInstructionForDeletion(Instruction *I) { @@ -2042,6 +3484,37 @@ private: }; } +// Given a value and a basic block we are trying to see if it is available in, +// see if the value has a leader available in that block. +Value *NewGVN::findPhiOfOpsLeader(const Expression *E, + const BasicBlock *BB) const { + // It would already be constant if we could make it constant + if (auto *CE = dyn_cast<ConstantExpression>(E)) + return CE->getConstantValue(); + if (auto *VE = dyn_cast<VariableExpression>(E)) + return VE->getVariableValue(); + + auto *CC = ExpressionToClass.lookup(E); + if (!CC) + return nullptr; + if (alwaysAvailable(CC->getLeader())) + return CC->getLeader(); + + for (auto Member : *CC) { + auto *MemberInst = dyn_cast<Instruction>(Member); + // Anything that isn't an instruction is always available. + if (!MemberInst) + return Member; + // If we are looking for something in the same block as the member, it must + // be a leader because this function is looking for operands for a phi node. + if (MemberInst->getParent() == BB || + DT->dominates(MemberInst->getParent(), BB)) { + return Member; + } + } + return nullptr; +} + bool NewGVN::eliminateInstructions(Function &F) { // This is a non-standard eliminator. The normal way to eliminate is // to walk the dominator tree in order, keeping track of available @@ -2072,73 +3545,91 @@ bool NewGVN::eliminateInstructions(Function &F) { // DFS numbers are updated, we compute some ourselves. DT->updateDFSNumbers(); - for (auto &B : F) { - if (!ReachableBlocks.count(&B)) { - for (const auto S : successors(&B)) { - for (auto II = S->begin(); isa<PHINode>(II); ++II) { - auto &Phi = cast<PHINode>(*II); - DEBUG(dbgs() << "Replacing incoming value of " << *II << " for block " - << getBlockName(&B) - << " with undef due to it being unreachable\n"); - for (auto &Operand : Phi.incoming_values()) - if (Phi.getIncomingBlock(Operand) == &B) - Operand.set(UndefValue::get(Phi.getType())); - } + // Go through all of our phi nodes, and kill the arguments associated with + // unreachable edges. + auto ReplaceUnreachablePHIArgs = [&](PHINode &PHI, BasicBlock *BB) { + for (auto &Operand : PHI.incoming_values()) + if (!ReachableEdges.count({PHI.getIncomingBlock(Operand), BB})) { + DEBUG(dbgs() << "Replacing incoming value of " << PHI << " for block " + << getBlockName(PHI.getIncomingBlock(Operand)) + << " with undef due to it being unreachable\n"); + Operand.set(UndefValue::get(PHI.getType())); } + }; + SmallPtrSet<BasicBlock *, 8> BlocksWithPhis; + for (auto &B : F) + if ((!B.empty() && isa<PHINode>(*B.begin())) || + (PHIOfOpsPHIs.find(&B) != PHIOfOpsPHIs.end())) + BlocksWithPhis.insert(&B); + DenseMap<const BasicBlock *, unsigned> ReachablePredCount; + for (auto KV : ReachableEdges) + ReachablePredCount[KV.getEnd()]++; + for (auto *BB : BlocksWithPhis) + // TODO: It would be faster to use getNumIncomingBlocks() on a phi node in + // the block and subtract the pred count, but it's more complicated. 
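Editor's note: the bookkeeping above (BlocksWithPhis plus a ReachablePredCount derived from ReachableEdges) identifies blocks that lost at least one incoming edge to unreachability; their PHIs then receive undef for the values arriving over those edges, as the code that follows shows. Below is a rough standalone model of that bookkeeping, with toy Block/Phi types standing in for the IR classes.

#include <cstddef>
#include <map>
#include <set>
#include <utility>
#include <vector>

struct Block; // toy stand-in for a basic block

struct Phi {
  std::vector<Block *> IncomingBlocks; // one entry per incoming edge
  std::vector<bool> IncomingIsUndef;   // parallel flags: value replaced by undef
};

using Edge = std::pair<Block *, Block *>; // (predecessor, successor)

// For every block whose reachable-predecessor count is smaller than its total
// predecessor count, mark PHI operands arriving over unreachable edges as undef.
void patchPhisForUnreachableEdges(
    const std::map<Block *, std::vector<Block *>> &Preds,
    const std::set<Edge> &ReachableEdges,
    std::map<Block *, std::vector<Phi *>> &PhisInBlock) {
  std::map<Block *, unsigned> ReachablePredCount;
  for (const Edge &E : ReachableEdges)
    ++ReachablePredCount[E.second];

  for (auto &BlockAndPhis : PhisInBlock) {
    Block *BB = BlockAndPhis.first;
    auto PredIt = Preds.find(BB);
    std::size_t TotalPreds = PredIt == Preds.end() ? 0 : PredIt->second.size();
    if (ReachablePredCount[BB] == TotalPreds)
      continue; // every incoming edge is reachable, nothing to patch
    for (Phi *P : BlockAndPhis.second)
      for (std::size_t I = 0; I < P->IncomingBlocks.size(); ++I)
        if (!ReachableEdges.count({P->IncomingBlocks[I], BB}))
          P->IncomingIsUndef[I] = true; // stands in for Operand.set(UndefValue)
  }
}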
+ if (ReachablePredCount.lookup(BB) != + unsigned(std::distance(pred_begin(BB), pred_end(BB)))) { + for (auto II = BB->begin(); isa<PHINode>(II); ++II) { + auto &PHI = cast<PHINode>(*II); + ReplaceUnreachablePHIArgs(PHI, BB); + } + for_each_found(PHIOfOpsPHIs, BB, [&](PHINode *PHI) { + ReplaceUnreachablePHIArgs(*PHI, BB); + }); } - DomTreeNode *Node = DT->getNode(&B); - if (Node) - DFSDomMap[&B] = {Node->getDFSNumIn(), Node->getDFSNumOut()}; - } - for (CongruenceClass *CC : CongruenceClasses) { - // FIXME: We should eventually be able to replace everything still - // in the initial class with undef, as they should be unreachable. - // Right now, initial still contains some things we skip value - // numbering of (UNREACHABLE's, for example). - if (CC == InitialClass || CC->Dead) + // Map to store the use counts + DenseMap<const Value *, unsigned int> UseCounts; + for (auto *CC : reverse(CongruenceClasses)) { + DEBUG(dbgs() << "Eliminating in congruence class " << CC->getID() << "\n"); + // Track the equivalent store info so we can decide whether to try + // dead store elimination. + SmallVector<ValueDFS, 8> PossibleDeadStores; + SmallPtrSet<Instruction *, 8> ProbablyDead; + if (CC->isDead() || CC->empty()) continue; - assert(CC->RepLeader && "We should have had a leader"); + // Everything still in the TOP class is unreachable or dead. + if (CC == TOPClass) { + for (auto M : *CC) { + auto *VTE = ValueToExpression.lookup(M); + if (VTE && isa<DeadExpression>(VTE)) + markInstructionForDeletion(cast<Instruction>(M)); + assert((!ReachableBlocks.count(cast<Instruction>(M)->getParent()) || + InstructionsToErase.count(cast<Instruction>(M))) && + "Everything in TOP should be unreachable or dead at this " + "point"); + } + continue; + } + assert(CC->getLeader() && "We should have had a leader"); // If this is a leader that is always available, and it's a // constant or has no equivalences, just replace everything with // it. We then update the congruence class with whatever members // are left. - if (alwaysAvailable(CC->RepLeader)) { - SmallPtrSet<Value *, 4> MembersLeft; - for (auto M : CC->Members) { - + Value *Leader = + CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader(); + if (alwaysAvailable(Leader)) { + CongruenceClass::MemberSet MembersLeft; + for (auto M : *CC) { Value *Member = M; - // Void things have no uses we can replace. - if (Member == CC->RepLeader || Member->getType()->isVoidTy()) { + if (Member == Leader || !isa<Instruction>(Member) || + Member->getType()->isVoidTy()) { MembersLeft.insert(Member); continue; } - - DEBUG(dbgs() << "Found replacement " << *(CC->RepLeader) << " for " - << *Member << "\n"); - // Due to equality propagation, these may not always be - // instructions, they may be real values. We don't really - // care about trying to replace the non-instructions. - if (auto *I = dyn_cast<Instruction>(Member)) { - assert(CC->RepLeader != I && - "About to accidentally remove our leader"); - replaceInstruction(I, CC->RepLeader); - AnythingReplaced = true; - - continue; - } else { - MembersLeft.insert(I); - } + DEBUG(dbgs() << "Found replacement " << *(Leader) << " for " << *Member + << "\n"); + auto *I = cast<Instruction>(Member); + assert(Leader != I && "About to accidentally remove our leader"); + replaceInstruction(I, Leader); + AnythingReplaced = true; } - CC->Members.swap(MembersLeft); - + CC->swap(MembersLeft); } else { - DEBUG(dbgs() << "Eliminating in congruence class " << CC->ID << "\n"); // If this is a singleton, we can skip it. 
- if (CC->Members.size() != 1) { - + if (CC->size() != 1 || RealToTemp.lookup(Leader)) { // This is a stack because equality replacement/etc may place // constants in the middle of the member list, and we want to use // those constant values in preference to the current leader, over @@ -2147,23 +3638,34 @@ bool NewGVN::eliminateInstructions(Function &F) { // Convert the members to DFS ordered sets and then merge them. SmallVector<ValueDFS, 8> DFSOrderedSet; - convertDenseToDFSOrdered(CC->Members, DFSOrderedSet); + convertClassToDFSOrdered(*CC, DFSOrderedSet, UseCounts, ProbablyDead); // Sort the whole thing. std::sort(DFSOrderedSet.begin(), DFSOrderedSet.end()); - for (auto &VD : DFSOrderedSet) { int MemberDFSIn = VD.DFSIn; int MemberDFSOut = VD.DFSOut; - Value *Member = VD.Val; - Use *MemberUse = VD.U; - - if (Member) { - // We ignore void things because we can't get a value from them. - // FIXME: We could actually use this to kill dead stores that are - // dominated by equivalent earlier stores. - if (Member->getType()->isVoidTy()) - continue; + Value *Def = VD.Def.getPointer(); + bool FromStore = VD.Def.getInt(); + Use *U = VD.U; + // We ignore void things because we can't get a value from them. + if (Def && Def->getType()->isVoidTy()) + continue; + auto *DefInst = dyn_cast_or_null<Instruction>(Def); + if (DefInst && AllTempInstructions.count(DefInst)) { + auto *PN = cast<PHINode>(DefInst); + + // If this is a value phi and that's the expression we used, insert + // it into the program + // remove from temp instruction list. + AllTempInstructions.erase(PN); + auto *DefBlock = getBlockForValue(Def); + DEBUG(dbgs() << "Inserting fully real phi of ops" << *Def + << " into block " + << getBlockName(getBlockForValue(Def)) << "\n"); + PN->insertBefore(&DefBlock->front()); + Def = PN; + NumGVNPHIOfOpsEliminations++; } if (EliminationStack.empty()) { @@ -2189,69 +3691,251 @@ bool NewGVN::eliminateInstructions(Function &F) { // start using, we also push. // Otherwise, we walk along, processing members who are // dominated by this scope, and eliminate them. - bool ShouldPush = - Member && (EliminationStack.empty() || isa<Constant>(Member)); + bool ShouldPush = Def && EliminationStack.empty(); bool OutOfScope = !EliminationStack.isInScope(MemberDFSIn, MemberDFSOut); if (OutOfScope || ShouldPush) { // Sync to our current scope. EliminationStack.popUntilDFSScope(MemberDFSIn, MemberDFSOut); - ShouldPush |= Member && EliminationStack.empty(); + bool ShouldPush = Def && EliminationStack.empty(); if (ShouldPush) { - EliminationStack.push_back(Member, MemberDFSIn, MemberDFSOut); + EliminationStack.push_back(Def, MemberDFSIn, MemberDFSOut); + } + } + + // Skip the Def's, we only want to eliminate on their uses. But mark + // dominated defs as dead. + if (Def) { + // For anything in this case, what and how we value number + // guarantees that any side-effets that would have occurred (ie + // throwing, etc) can be proven to either still occur (because it's + // dominated by something that has the same side-effects), or never + // occur. Otherwise, we would not have been able to prove it value + // equivalent to something else. For these things, we can just mark + // it all dead. Note that this is different from the "ProbablyDead" + // set, which may not be dominated by anything, and thus, are only + // easy to prove dead if they are also side-effect free. Note that + // because stores are put in terms of the stored value, we skip + // stored values here. 
If the stored value is really dead, it will + // still be marked for deletion when we process it in its own class. + if (!EliminationStack.empty() && Def != EliminationStack.back() && + isa<Instruction>(Def) && !FromStore) + markInstructionForDeletion(cast<Instruction>(Def)); + continue; + } + // At this point, we know it is a Use we are trying to possibly + // replace. + + assert(isa<Instruction>(U->get()) && + "Current def should have been an instruction"); + assert(isa<Instruction>(U->getUser()) && + "Current user should have been an instruction"); + + // If the thing we are replacing into is already marked to be dead, + // this use is dead. Note that this is true regardless of whether + // we have anything dominating the use or not. We do this here + // because we are already walking all the uses anyway. + Instruction *InstUse = cast<Instruction>(U->getUser()); + if (InstructionsToErase.count(InstUse)) { + auto &UseCount = UseCounts[U->get()]; + if (--UseCount == 0) { + ProbablyDead.insert(cast<Instruction>(U->get())); } } // If we get to this point, and the stack is empty we must have a use - // with nothing we can use to eliminate it, just skip it. + // with nothing we can use to eliminate this use, so just skip it. if (EliminationStack.empty()) continue; - // Skip the Value's, we only want to eliminate on their uses. - if (Member) - continue; - Value *Result = EliminationStack.back(); + Value *DominatingLeader = EliminationStack.back(); + + auto *II = dyn_cast<IntrinsicInst>(DominatingLeader); + if (II && II->getIntrinsicID() == Intrinsic::ssa_copy) + DominatingLeader = II->getOperand(0); // Don't replace our existing users with ourselves. - if (MemberUse->get() == Result) + if (U->get() == DominatingLeader) continue; - - DEBUG(dbgs() << "Found replacement " << *Result << " for " - << *MemberUse->get() << " in " << *(MemberUse->getUser()) - << "\n"); + DEBUG(dbgs() << "Found replacement " << *DominatingLeader << " for " + << *U->get() << " in " << *(U->getUser()) << "\n"); // If we replaced something in an instruction, handle the patching of - // metadata. - if (auto *ReplacedInst = dyn_cast<Instruction>(MemberUse->get())) - patchReplacementInstruction(ReplacedInst, Result); - - assert(isa<Instruction>(MemberUse->getUser())); - MemberUse->set(Result); + // metadata. Skip this if we are replacing predicateinfo with its + // original operand, as we already know we can just drop it. + auto *ReplacedInst = cast<Instruction>(U->get()); + auto *PI = PredInfo->getPredicateInfoFor(ReplacedInst); + if (!PI || DominatingLeader != PI->OriginalOp) + patchReplacementInstruction(ReplacedInst, DominatingLeader); + U->set(DominatingLeader); + // This is now a use of the dominating leader, which means if the + // dominating leader was dead, it's now live! + auto &LeaderUseCount = UseCounts[DominatingLeader]; + // It's about to be alive again. + if (LeaderUseCount == 0 && isa<Instruction>(DominatingLeader)) + ProbablyDead.erase(cast<Instruction>(DominatingLeader)); + if (LeaderUseCount == 0 && II) + ProbablyDead.insert(II); + ++LeaderUseCount; AnythingReplaced = true; } } } + // At this point, anything still in the ProbablyDead set is actually dead if + // would be trivially dead. + for (auto *I : ProbablyDead) + if (wouldInstructionBeTriviallyDead(I)) + markInstructionForDeletion(I); + // Cleanup the congruence class. 
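Editor's note: the UseCounts/ProbablyDead bookkeeping above is a small reference-counting scheme layered on top of elimination. When a user is itself erased, its operand loses a use; when a value becomes the dominating leader for a newly rewritten use, it gains one; only values that finish a class sweep with zero reachable uses and would be trivially dead get marked for deletion. A compact standalone sketch of that accounting, with a toy Value type in place of the IR classes.

#include <map>
#include <set>

struct Value; // toy stand-in for an IR value/instruction

struct LivenessAccounting {
  std::map<Value *, unsigned> UseCounts;
  std::set<Value *> ProbablyDead;

  // A use of V disappeared (for example, its user was marked for deletion).
  void droppedUse(Value *V) {
    unsigned &Count = UseCounts[V];
    if (Count > 0 && --Count == 0)
      ProbablyDead.insert(V);
  }

  // V became the dominating leader for a replaced use: it is live again.
  void gainedUse(Value *V) {
    if (UseCounts[V]++ == 0)
      ProbablyDead.erase(V);
  }
};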
- SmallPtrSet<Value *, 4> MembersLeft; - for (Value *Member : CC->Members) { - if (Member->getType()->isVoidTy()) { + CongruenceClass::MemberSet MembersLeft; + for (auto *Member : *CC) + if (!isa<Instruction>(Member) || + !InstructionsToErase.count(cast<Instruction>(Member))) MembersLeft.insert(Member); - continue; - } - - if (auto *MemberInst = dyn_cast<Instruction>(Member)) { - if (isInstructionTriviallyDead(MemberInst)) { - // TODO: Don't mark loads of undefs. - markInstructionForDeletion(MemberInst); - continue; + CC->swap(MembersLeft); + + // If we have possible dead stores to look at, try to eliminate them. + if (CC->getStoreCount() > 0) { + convertClassToLoadsAndStores(*CC, PossibleDeadStores); + std::sort(PossibleDeadStores.begin(), PossibleDeadStores.end()); + ValueDFSStack EliminationStack; + for (auto &VD : PossibleDeadStores) { + int MemberDFSIn = VD.DFSIn; + int MemberDFSOut = VD.DFSOut; + Instruction *Member = cast<Instruction>(VD.Def.getPointer()); + if (EliminationStack.empty() || + !EliminationStack.isInScope(MemberDFSIn, MemberDFSOut)) { + // Sync to our current scope. + EliminationStack.popUntilDFSScope(MemberDFSIn, MemberDFSOut); + if (EliminationStack.empty()) { + EliminationStack.push_back(Member, MemberDFSIn, MemberDFSOut); + continue; + } } + // We already did load elimination, so nothing to do here. + if (isa<LoadInst>(Member)) + continue; + assert(!EliminationStack.empty()); + Instruction *Leader = cast<Instruction>(EliminationStack.back()); + (void)Leader; + assert(DT->dominates(Leader->getParent(), Member->getParent())); + // Member is dominater by Leader, and thus dead + DEBUG(dbgs() << "Marking dead store " << *Member + << " that is dominated by " << *Leader << "\n"); + markInstructionForDeletion(Member); + CC->erase(Member); + ++NumGVNDeadStores; } - MembersLeft.insert(Member); } - CC->Members.swap(MembersLeft); } - return AnythingReplaced; } + +// This function provides global ranking of operations so that we can place them +// in a canonical order. Note that rank alone is not necessarily enough for a +// complete ordering, as constants all have the same rank. However, generally, +// we will simplify an operation with all constants so that it doesn't matter +// what order they appear in. +unsigned int NewGVN::getRank(const Value *V) const { + // Prefer constants to undef to anything else + // Undef is a constant, have to check it first. + // Prefer smaller constants to constantexprs + if (isa<ConstantExpr>(V)) + return 2; + if (isa<UndefValue>(V)) + return 1; + if (isa<Constant>(V)) + return 0; + else if (auto *A = dyn_cast<Argument>(V)) + return 3 + A->getArgNo(); + + // Need to shift the instruction DFS by number of arguments + 3 to account for + // the constant and argument ranking above. + unsigned Result = InstrToDFSNum(V); + if (Result > 0) + return 4 + NumFuncArgs + Result; + // Unreachable or something else, just return a really large number. + return ~0; +} + +// This is a function that says whether two commutative operations should +// have their order swapped when canonicalizing. +bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const { + // Because we only care about a total ordering, and don't rewrite expressions + // in this order, we order by rank, which will give a strict weak ordering to + // everything but constants, and then we order by pointer address. 
+ return std::make_pair(getRank(A), A) > std::make_pair(getRank(B), B); +} + +namespace { +class NewGVNLegacyPass : public FunctionPass { +public: + static char ID; // Pass identification, replacement for typeid. + NewGVNLegacyPass() : FunctionPass(ID) { + initializeNewGVNLegacyPassPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F) override; + +private: + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + AU.addRequired<MemorySSAWrapperPass>(); + AU.addRequired<AAResultsWrapperPass>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addPreserved<GlobalsAAWrapperPass>(); + } +}; +} // namespace + +bool NewGVNLegacyPass::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + return NewGVN(F, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(), + &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F), + &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(), + &getAnalysis<AAResultsWrapperPass>().getAAResults(), + &getAnalysis<MemorySSAWrapperPass>().getMSSA(), + F.getParent()->getDataLayout()) + .runGVN(); +} + +INITIALIZE_PASS_BEGIN(NewGVNLegacyPass, "newgvn", "Global Value Numbering", + false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_END(NewGVNLegacyPass, "newgvn", "Global Value Numbering", false, + false) + +char NewGVNLegacyPass::ID = 0; + +// createGVNPass - The public interface to this file. +FunctionPass *llvm::createNewGVNPass() { return new NewGVNLegacyPass(); } + +PreservedAnalyses NewGVNPass::run(Function &F, AnalysisManager<Function> &AM) { + // Apparently the order in which we get these results matter for + // the old GVN (see Chandler's comment in GVN.cpp). I'll keep + // the same order here, just in case. + auto &AC = AM.getResult<AssumptionAnalysis>(F); + auto &DT = AM.getResult<DominatorTreeAnalysis>(F); + auto &TLI = AM.getResult<TargetLibraryAnalysis>(F); + auto &AA = AM.getResult<AAManager>(F); + auto &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA(); + bool Changed = + NewGVN(F, &DT, &AC, &TLI, &AA, &MSSA, F.getParent()->getDataLayout()) + .runGVN(); + if (!Changed) + return PreservedAnalyses::all(); + PreservedAnalyses PA; + PA.preserve<DominatorTreeAnalysis>(); + PA.preserve<GlobalsAA>(); + return PA; +} diff --git a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp index 1a7ddc9..1bfecea 100644 --- a/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp @@ -66,7 +66,7 @@ static bool optimizeSQRT(CallInst *Call, Function *CalledFunc, // Add attribute "readnone" so that backend can use a native sqrt instruction // for this call. Insert a FP compare instruction and a conditional branch // at the end of CurrBB. 
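Editor's note: the comment above describes optimizeSQRT's transformation: the original call is marked readnone so the backend can select a native sqrt instruction, and a self-compare plus conditional branch routes NaN results (negative inputs) to a second, unannotated library call. The sketch below expresses the resulting shape as C++ source purely for readability; the pass of course rewrites the IR directly, and both paths are spelled sqrtf() here only because C has no way to distinguish the annotated call from the plain one.

#include <math.h>

// Shape of the code after partial inlining of `r = sqrtf(x)`.
float sqrtfPartiallyInlined(float X) {
  float Fast = sqrtf(X);   // fast path: the readnone call, lowered to a native
                           // hardware sqrt instruction
  if (Fast == Fast)        // the inserted `fcmp oeq %Fast, %Fast`:
    return Fast;           // only NaN compares unequal to itself
  return sqrtf(X);         // slow path: full libm call (errno, etc.) for X < 0
}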
- Call->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone); + Call->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone); CurrBB.getTerminator()->eraseFromParent(); Builder.SetInsertPoint(&CurrBB); Value *FCmp = Builder.CreateFCmpOEQ(Call, Call); @@ -98,14 +98,14 @@ static bool runPartiallyInlineLibCalls(Function &F, TargetLibraryInfo *TLI, // Skip if function either has local linkage or is not a known library // function. - LibFunc::Func LibFunc; + LibFunc LF; if (CalledFunc->hasLocalLinkage() || !CalledFunc->hasName() || - !TLI->getLibFunc(CalledFunc->getName(), LibFunc)) + !TLI->getLibFunc(CalledFunc->getName(), LF)) continue; - switch (LibFunc) { - case LibFunc::sqrtf: - case LibFunc::sqrt: + switch (LF) { + case LibFunc_sqrtf: + case LibFunc_sqrt: if (TTI->haveFastSqrt(Call->getType()) && optimizeSQRT(Call, CalledFunc, *CurrBB, BB)) break; diff --git a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp index 65c814d..e235e5eb 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Reassociate.cpp @@ -35,6 +35,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/ValueHandle.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" @@ -106,11 +107,12 @@ XorOpnd::XorOpnd(Value *V) { I->getOpcode() == Instruction::And)) { Value *V0 = I->getOperand(0); Value *V1 = I->getOperand(1); - if (isa<ConstantInt>(V0)) + const APInt *C; + if (match(V0, PatternMatch::m_APInt(C))) std::swap(V0, V1); - if (ConstantInt *C = dyn_cast<ConstantInt>(V1)) { - ConstPart = C->getValue(); + if (match(V1, PatternMatch::m_APInt(C))) { + ConstPart = *C; SymbolicPart = V0; isOr = (I->getOpcode() == Instruction::Or); return; @@ -119,7 +121,7 @@ XorOpnd::XorOpnd(Value *V) { // view the operand as "V | 0" SymbolicPart = V; - ConstPart = APInt::getNullValue(V->getType()->getIntegerBitWidth()); + ConstPart = APInt::getNullValue(V->getType()->getScalarSizeInBits()); isOr = true; } @@ -955,8 +957,8 @@ static BinaryOperator *ConvertShiftToMul(Instruction *Shl) { /// Scan backwards and forwards among values with the same rank as element i /// to see if X exists. If X does not exist, return i. This is useful when /// scanning for 'x' when we see '-x' because they both get the same rank. -static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i, - Value *X) { +static unsigned FindInOperandList(const SmallVectorImpl<ValueEntry> &Ops, + unsigned i, Value *X) { unsigned XRank = Ops[i].Rank; unsigned e = Ops.size(); for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j) { @@ -982,7 +984,7 @@ static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i, /// Emit a tree of add instructions, summing Ops together /// and returning the result. Insert the tree before I. static Value *EmitAddTreeOfValues(Instruction *I, - SmallVectorImpl<WeakVH> &Ops){ + SmallVectorImpl<WeakTrackingVH> &Ops) { if (Ops.size() == 1) return Ops.back(); Value *V1 = Ops.back(); @@ -1069,8 +1071,7 @@ Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) { /// /// Ops is the top-level list of add operands we're trying to factor. 
static void FindSingleUseMultiplyFactors(Value *V, - SmallVectorImpl<Value*> &Factors, - const SmallVectorImpl<ValueEntry> &Ops) { + SmallVectorImpl<Value*> &Factors) { BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul); if (!BO) { Factors.push_back(V); @@ -1078,8 +1079,8 @@ static void FindSingleUseMultiplyFactors(Value *V, } // Otherwise, add the LHS and RHS to the list of factors. - FindSingleUseMultiplyFactors(BO->getOperand(1), Factors, Ops); - FindSingleUseMultiplyFactors(BO->getOperand(0), Factors, Ops); + FindSingleUseMultiplyFactors(BO->getOperand(1), Factors); + FindSingleUseMultiplyFactors(BO->getOperand(0), Factors); } /// Optimize a series of operands to an 'and', 'or', or 'xor' instruction. @@ -1135,20 +1136,19 @@ static Value *OptimizeAndOrXor(unsigned Opcode, /// instruction. There are two special cases: 1) if the constant operand is 0, /// it will return NULL. 2) if the constant is ~0, the symbolic operand will /// be returned. -static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd, +static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd, const APInt &ConstOpnd) { - if (ConstOpnd != 0) { - if (!ConstOpnd.isAllOnesValue()) { - LLVMContext &Ctx = Opnd->getType()->getContext(); - Instruction *I; - I = BinaryOperator::CreateAnd(Opnd, ConstantInt::get(Ctx, ConstOpnd), - "and.ra", InsertBefore); - I->setDebugLoc(InsertBefore->getDebugLoc()); - return I; - } + if (ConstOpnd.isNullValue()) + return nullptr; + + if (ConstOpnd.isAllOnesValue()) return Opnd; - } - return nullptr; + + Instruction *I = BinaryOperator::CreateAnd( + Opnd, ConstantInt::get(Opnd->getType(), ConstOpnd), "and.ra", + InsertBefore); + I->setDebugLoc(InsertBefore->getDebugLoc()); + return I; } // Helper function of OptimizeXor(). It tries to simplify "Opnd1 ^ ConstOpnd" @@ -1164,24 +1164,24 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, // = ((x | c1) ^ c1) ^ (c1 ^ c2) // = (x & ~c1) ^ (c1 ^ c2) // It is useful only when c1 == c2. - if (Opnd1->isOrExpr() && Opnd1->getConstPart() != 0) { - if (!Opnd1->getValue()->hasOneUse()) - return false; + if (!Opnd1->isOrExpr() || Opnd1->getConstPart().isNullValue()) + return false; - const APInt &C1 = Opnd1->getConstPart(); - if (C1 != ConstOpnd) - return false; + if (!Opnd1->getValue()->hasOneUse()) + return false; - Value *X = Opnd1->getSymbolicPart(); - Res = createAndInstr(I, X, ~C1); - // ConstOpnd was C2, now C1 ^ C2. - ConstOpnd ^= C1; + const APInt &C1 = Opnd1->getConstPart(); + if (C1 != ConstOpnd) + return false; - if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue())) - RedoInsts.insert(T); - return true; - } - return false; + Value *X = Opnd1->getSymbolicPart(); + Res = createAndInstr(I, X, ~C1); + // ConstOpnd was C2, now C1 ^ C2. + ConstOpnd ^= C1; + + if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue())) + RedoInsts.insert(T); + return true; } @@ -1222,8 +1222,8 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, APInt C3((~C1) ^ C2); // Do not increase code size! - if (C3 != 0 && !C3.isAllOnesValue()) { - int NewInstNum = ConstOpnd != 0 ? 1 : 2; + if (!C3.isNullValue() && !C3.isAllOnesValue()) { + int NewInstNum = ConstOpnd.getBoolValue() ? 1 : 2; if (NewInstNum > DeadInstNum) return false; } @@ -1239,8 +1239,8 @@ bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, APInt C3 = C1 ^ C2; // Do not increase code size - if (C3 != 0 && !C3.isAllOnesValue()) { - int NewInstNum = ConstOpnd != 0 ? 
1 : 2; + if (!C3.isNullValue() && !C3.isAllOnesValue()) { + int NewInstNum = ConstOpnd.getBoolValue() ? 1 : 2; if (NewInstNum > DeadInstNum) return false; } @@ -1280,17 +1280,20 @@ Value *ReassociatePass::OptimizeXor(Instruction *I, SmallVector<XorOpnd, 8> Opnds; SmallVector<XorOpnd*, 8> OpndPtrs; Type *Ty = Ops[0].Op->getType(); - APInt ConstOpnd(Ty->getIntegerBitWidth(), 0); + APInt ConstOpnd(Ty->getScalarSizeInBits(), 0); // Step 1: Convert ValueEntry to XorOpnd for (unsigned i = 0, e = Ops.size(); i != e; ++i) { Value *V = Ops[i].Op; - if (!isa<ConstantInt>(V)) { + const APInt *C; + // TODO: Support non-splat vectors. + if (match(V, PatternMatch::m_APInt(C))) { + ConstOpnd ^= *C; + } else { XorOpnd O(V); O.setSymbolicRank(getRank(O.getSymbolicPart())); Opnds.push_back(O); - } else - ConstOpnd ^= cast<ConstantInt>(V)->getValue(); + } } // NOTE: From this point on, do *NOT* add/delete element to/from "Opnds". @@ -1328,7 +1331,8 @@ Value *ReassociatePass::OptimizeXor(Instruction *I, Value *CV; // Step 3.1: Try simplifying "CurrOpnd ^ ConstOpnd" - if (ConstOpnd != 0 && CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) { + if (!ConstOpnd.isNullValue() && + CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) { Changed = true; if (CV) *CurrOpnd = XorOpnd(CV); @@ -1370,17 +1374,17 @@ Value *ReassociatePass::OptimizeXor(Instruction *I, ValueEntry VE(getRank(O.getValue()), O.getValue()); Ops.push_back(VE); } - if (ConstOpnd != 0) { - Value *C = ConstantInt::get(Ty->getContext(), ConstOpnd); + if (!ConstOpnd.isNullValue()) { + Value *C = ConstantInt::get(Ty, ConstOpnd); ValueEntry VE(getRank(C), C); Ops.push_back(VE); } - int Sz = Ops.size(); + unsigned Sz = Ops.size(); if (Sz == 1) return Ops.back().Op; - else if (Sz == 0) { - assert(ConstOpnd == 0); - return ConstantInt::get(Ty->getContext(), ConstOpnd); + if (Sz == 0) { + assert(ConstOpnd.isNullValue()); + return ConstantInt::get(Ty, ConstOpnd); } } @@ -1499,7 +1503,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I, // Compute all of the factors of this added value. SmallVector<Value*, 8> Factors; - FindSingleUseMultiplyFactors(BOp, Factors, Ops); + FindSingleUseMultiplyFactors(BOp, Factors); assert(Factors.size() > 1 && "Bad linearize!"); // Add one to FactorOccurrences for each unique factor in this op. @@ -1560,7 +1564,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I, ? BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal) : BinaryOperator::CreateFAdd(MaxOccVal, MaxOccVal); - SmallVector<WeakVH, 4> NewMulOps; + SmallVector<WeakTrackingVH, 4> NewMulOps; for (unsigned i = 0; i != Ops.size(); ++i) { // Only try to remove factors from expressions we're allowed to. BinaryOperator *BOp = @@ -1583,7 +1587,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I, } // No need for extra uses anymore. - delete DummyInst; + DummyInst->deleteValue(); unsigned NumAddedValues = NewMulOps.size(); Value *V = EmitAddTreeOfValues(I, NewMulOps); @@ -1628,8 +1632,8 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I, /// ((((x*y)*x)*y)*x) -> [(x, 3), (y, 2)] /// /// \returns Whether any factors have a power greater than one. -bool ReassociatePass::collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops, - SmallVectorImpl<Factor> &Factors) { +static bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops, + SmallVectorImpl<Factor> &Factors) { // FIXME: Have Ops be (ValueEntry, Multiplicity) pairs, simplifying this. // Compute the sum of powers of simplifiable factors. 
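Editor's note: the CombineXorOpnd helpers above lean on the bit identity (x | c) ^ c == x & ~c: the OR forces c's set bits to one and the XOR then clears exactly those bits. A tiny self-check of that identity over 64-bit integers (the test values are arbitrary):

#include <cassert>
#include <cstdint>

static uint64_t viaOrThenXor(uint64_t X, uint64_t C) { return (X | C) ^ C; }
static uint64_t viaAndNot(uint64_t X, uint64_t C) { return X & ~C; }

int main() {
  const uint64_t Xs[] = {0u, 0x1234u, 0xdeadbeefcafef00dull, ~uint64_t(0)};
  const uint64_t Cs[] = {0u, 0xff00u, 0x0f0f0f0f0f0f0f0full, ~uint64_t(0)};
  for (uint64_t X : Xs)
    for (uint64_t C : Cs)
      assert(viaOrThenXor(X, C) == viaAndNot(X, C)); // (x|c)^c == x & ~c
}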
unsigned FactorPowerSum = 0; @@ -1890,6 +1894,8 @@ void ReassociatePass::EraseInst(Instruction *I) { Op = Op->user_back(); RedoInsts.insert(Op); } + + MadeChange = true; } // Canonicalize expressions of the following form: @@ -1923,7 +1929,7 @@ Instruction *ReassociatePass::canonicalizeNegConstExpr(Instruction *I) { // User must be a binary operator with one or more uses. Instruction *User = I->user_back(); - if (!isa<BinaryOperator>(User) || !User->hasNUsesOrMore(1)) + if (!isa<BinaryOperator>(User) || User->use_empty()) return nullptr; unsigned UserOpcode = User->getOpcode(); @@ -1935,6 +1941,12 @@ Instruction *ReassociatePass::canonicalizeNegConstExpr(Instruction *I) { if (!User->isCommutative() && User->getOperand(1) != I) return nullptr; + // Don't canonicalize x + (-Constant * y) -> x - (Constant * y), if the + // resulting subtract will be broken up later. This can get us into an + // infinite loop during reassociation. + if (UserOpcode == Instruction::FAdd && ShouldBreakUpSubtract(User)) + return nullptr; + // Change the sign of the constant. APFloat Val = CF->getValueAPF(); Val.changeSign(); @@ -2000,11 +2012,6 @@ void ReassociatePass::OptimizeInst(Instruction *I) { if (I->isCommutative()) canonicalizeOperands(I); - // TODO: We should optimize vector Xor instructions, but they are - // currently unsupported. - if (I->getType()->isVectorTy() && I->getOpcode() == Instruction::Xor) - return; - // Don't optimize floating point instructions that don't have unsafe algebra. if (I->getType()->isFPOrFPVectorTy() && !I->hasUnsafeAlgebra()) return; @@ -2147,7 +2154,7 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) { if (I->getOpcode() == Instruction::Mul && cast<Instruction>(I->user_back())->getOpcode() == Instruction::Add && isa<ConstantInt>(Ops.back().Op) && - cast<ConstantInt>(Ops.back().Op)->isAllOnesValue()) { + cast<ConstantInt>(Ops.back().Op)->isMinusOne()) { ValueEntry Tmp = Ops.pop_back_val(); Ops.insert(Ops.begin(), Tmp); } else if (I->getOpcode() == Instruction::FMul && @@ -2236,8 +2243,8 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) { ValueRankMap.clear(); if (MadeChange) { - // FIXME: This should also 'preserve the CFG'. 
- auto PA = PreservedAnalyses(); + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); PA.preserve<GlobalsAA>(); return PA; } diff --git a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp index 615029d..9629568 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Reg2Mem.cpp @@ -16,7 +16,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" @@ -25,6 +24,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/Local.h" #include <list> using namespace llvm; diff --git a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp index 1de7420..f19d453 100644 --- a/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp @@ -7,20 +7,19 @@ // //===----------------------------------------------------------------------===// // -// Rewrite an existing set of gc.statepoints such that they make potential -// relocations performed by the garbage collector explicit in the IR. +// Rewrite call/invoke instructions so as to make potential relocations +// performed by the garbage collector explicit in the IR. // //===----------------------------------------------------------------------===// -#include "llvm/Pass.h" -#include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/ADT/SetOperations.h" -#include "llvm/ADT/Statistic.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SetVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" -#include "llvm/ADT/MapVector.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Dominators.h" @@ -28,15 +27,16 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" -#include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicInst.h" -#include "llvm/IR/Module.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Module.h" #include "llvm/IR/Statepoint.h" #include "llvm/IR/Value.h" #include "llvm/IR/Verifier.h" -#include "llvm/Support/Debug.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" @@ -89,10 +89,10 @@ struct RewriteStatepointsForGC : public ModulePass { Changed |= runOnFunction(F); if (Changed) { - // stripNonValidAttributes asserts that shouldRewriteStatepointsIn + // stripNonValidAttributesAndMetadata asserts that shouldRewriteStatepointsIn // returns true for at least one function in the module. Since at least // one function changed, we know that the precondition is satisfied. 
- stripNonValidAttributes(M); + stripNonValidAttributesAndMetadata(M); } return Changed; @@ -105,20 +105,24 @@ struct RewriteStatepointsForGC : public ModulePass { AU.addRequired<TargetTransformInfoWrapperPass>(); } - /// The IR fed into RewriteStatepointsForGC may have had attributes implying - /// dereferenceability that are no longer valid/correct after - /// RewriteStatepointsForGC has run. This is because semantically, after + /// The IR fed into RewriteStatepointsForGC may have had attributes and + /// metadata implying dereferenceability that are no longer valid/correct after + /// RewriteStatepointsForGC has run. This is because semantically, after /// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire - /// heap. stripNonValidAttributes (conservatively) restores correctness - /// by erasing all attributes in the module that externally imply - /// dereferenceability. - /// Similar reasoning also applies to the noalias attributes. gc.statepoint - /// can touch the entire heap including noalias objects. - void stripNonValidAttributes(Module &M); - - // Helpers for stripNonValidAttributes - void stripNonValidAttributesFromBody(Function &F); + /// heap. stripNonValidAttributesAndMetadata (conservatively) restores + /// correctness by erasing all attributes in the module that externally imply + /// dereferenceability. Similar reasoning also applies to the noalias + /// attributes and metadata. gc.statepoint can touch the entire heap including + /// noalias objects. + void stripNonValidAttributesAndMetadata(Module &M); + + // Helpers for stripNonValidAttributesAndMetadata + void stripNonValidAttributesAndMetadataFromBody(Function &F); void stripNonValidAttributesFromPrototype(Function &F); + // Certain metadata on instructions are invalid after running RS4GC. + // Optimizations that run after RS4GC can incorrectly use this metadata to + // optimize functions. We drop such metadata on the instruction. + void stripInvalidMetadataFromInstruction(Instruction &I); }; } // namespace @@ -365,6 +369,11 @@ findBaseDefiningValueOfVector(Value *I) { // for particular sufflevector patterns. return BaseDefiningValueResult(I, false); + // The behavior of getelementptr instructions is the same for vector and + // non-vector data types. + if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) + return findBaseDefiningValue(GEP->getPointerOperand()); + // A PHI or Select is a base defining value. The outer findBasePointer // algorithm is responsible for constructing a base value for this BDV. assert((isa<SelectInst>(I) || isa<PHINode>(I)) && @@ -634,7 +643,7 @@ static BDVState meetBDVStateImpl(const BDVState &LHS, const BDVState &RHS) { // Values of type BDVState form a lattice, and this function implements the meet // operation. -static BDVState meetBDVState(BDVState LHS, BDVState RHS) { +static BDVState meetBDVState(const BDVState &LHS, const BDVState &RHS) { BDVState Result = meetBDVStateImpl(LHS, RHS); assert(Result == meetBDVStateImpl(RHS, LHS) && "Math is wrong: meet does not commute!"); @@ -1123,39 +1132,23 @@ normalizeForInvokeSafepoint(BasicBlock *BB, BasicBlock *InvokeParent, // Create new attribute set containing only attributes which can be transferred // from original call to the safepoint. 
-static AttributeSet legalizeCallAttributes(AttributeSet AS) { - AttributeSet Ret; - - for (unsigned Slot = 0; Slot < AS.getNumSlots(); Slot++) { - unsigned Index = AS.getSlotIndex(Slot); - - if (Index == AttributeSet::ReturnIndex || - Index == AttributeSet::FunctionIndex) { - - for (Attribute Attr : make_range(AS.begin(Slot), AS.end(Slot))) { - - // Do not allow certain attributes - just skip them - // Safepoint can not be read only or read none. - if (Attr.hasAttribute(Attribute::ReadNone) || - Attr.hasAttribute(Attribute::ReadOnly)) - continue; - - // These attributes control the generation of the gc.statepoint call / - // invoke itself; and once the gc.statepoint is in place, they're of no - // use. - if (isStatepointDirectiveAttr(Attr)) - continue; - - Ret = Ret.addAttributes( - AS.getContext(), Index, - AttributeSet::get(AS.getContext(), Index, AttrBuilder(Attr))); - } - } - - // Just skip parameter attributes for now - } - - return Ret; +static AttributeList legalizeCallAttributes(AttributeList AL) { + if (AL.isEmpty()) + return AL; + + // Remove the readonly, readnone, and statepoint function attributes. + AttrBuilder FnAttrs = AL.getFnAttributes(); + FnAttrs.removeAttribute(Attribute::ReadNone); + FnAttrs.removeAttribute(Attribute::ReadOnly); + for (Attribute A : AL.getFnAttributes()) { + if (isStatepointDirectiveAttr(A)) + FnAttrs.remove(A); + } + + // Just skip parameter and return attributes for now + LLVMContext &Ctx = AL.getContext(); + return AttributeList::get(Ctx, AttributeList::FunctionIndex, + AttributeSet::get(Ctx, FnAttrs)); } /// Helper function to place all gc relocates necessary for the given @@ -1299,12 +1292,11 @@ static StringRef getDeoptLowering(CallSite CS) { const char *DeoptLowering = "deopt-lowering"; if (CS.hasFnAttr(DeoptLowering)) { // FIXME: CallSite has a *really* confusing interface around attributes - // with values. - const AttributeSet &CSAS = CS.getAttributes(); - if (CSAS.hasAttribute(AttributeSet::FunctionIndex, - DeoptLowering)) - return CSAS.getAttribute(AttributeSet::FunctionIndex, - DeoptLowering).getValueAsString(); + // with values. + const AttributeList &CSAS = CS.getAttributes(); + if (CSAS.hasAttribute(AttributeList::FunctionIndex, DeoptLowering)) + return CSAS.getAttribute(AttributeList::FunctionIndex, DeoptLowering) + .getValueAsString(); Function *F = CS.getCalledFunction(); assert(F && F->hasFnAttribute(DeoptLowering)); return F->getFnAttribute(DeoptLowering).getValueAsString(); @@ -1388,7 +1380,6 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */ // Create the statepoint given all the arguments Instruction *Token = nullptr; - AttributeSet ReturnAttrs; if (CS.isCall()) { CallInst *ToReplace = cast<CallInst>(CS.getInstruction()); CallInst *Call = Builder.CreateGCStatepointCall( @@ -1399,12 +1390,10 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */ Call->setCallingConv(ToReplace->getCallingConv()); // Currently we will fail on parameter attributes and on certain - // function attributes. - AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes()); - // In case if we can handle this set of attributes - set up function attrs - // directly on statepoint and return attrs later for gc_result intrinsic. - Call->setAttributes(NewAttrs.getFnAttributes()); - ReturnAttrs = NewAttrs.getRetAttributes(); + // function attributes. In case if we can handle this set of attributes - + // set up function attrs directly on statepoint and return attrs later for + // gc_result intrinsic. 
+ Call->setAttributes(legalizeCallAttributes(ToReplace->getAttributes())); Token = Call; @@ -1427,12 +1416,10 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */ Invoke->setCallingConv(ToReplace->getCallingConv()); // Currently we will fail on parameter attributes and on certain - // function attributes. - AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes()); - // In case if we can handle this set of attributes - set up function attrs - // directly on statepoint and return attrs later for gc_result intrinsic. - Invoke->setAttributes(NewAttrs.getFnAttributes()); - ReturnAttrs = NewAttrs.getRetAttributes(); + // function attributes. In case if we can handle this set of attributes - + // set up function attrs directly on statepoint and return attrs later for + // gc_result intrinsic. + Invoke->setAttributes(legalizeCallAttributes(ToReplace->getAttributes())); Token = Invoke; @@ -1478,7 +1465,9 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */ StringRef Name = CS.getInstruction()->hasName() ? CS.getInstruction()->getName() : ""; CallInst *GCResult = Builder.CreateGCResult(Token, CS.getType(), Name); - GCResult->setAttributes(CS.getAttributes().getRetAttributes()); + GCResult->setAttributes( + AttributeList::get(GCResult->getContext(), AttributeList::ReturnIndex, + CS.getAttributes().getRetAttributes())); // We cannot RAUW or delete CS.getInstruction() because it could be in the // live set of some other safepoint, in which case that safepoint's @@ -1615,8 +1604,10 @@ static void relocationViaAlloca( // Emit alloca for "LiveValue" and record it in "allocaMap" and // "PromotableAllocas" + const DataLayout &DL = F.getParent()->getDataLayout(); auto emitAllocaFor = [&](Value *LiveValue) { - AllocaInst *Alloca = new AllocaInst(LiveValue->getType(), "", + AllocaInst *Alloca = new AllocaInst(LiveValue->getType(), + DL.getAllocaAddrSpace(), "", F.getEntryBlock().getFirstNonPHI()); AllocaMap[LiveValue] = Alloca; PromotableAllocas.push_back(Alloca); @@ -1873,7 +1864,7 @@ chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain, "non noop cast is found during rematerialization"); Type *SrcTy = CI->getOperand(0)->getType(); - Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy); + Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy, CI); } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) { // Cost of the address calculation @@ -1963,7 +1954,7 @@ static void rematerializeLiveValues(CallSite CS, // to identify the newly generated AlternateRootPhi (.base version of phi) // and RootOfChain (the original phi node itself) are the same, so that we // can rematerialize the gep and casts. This is a workaround for the - // deficieny in the findBasePointer algorithm. + // deficiency in the findBasePointer algorithm. if (!AreEquivalentPhiNodes(*OrigRootPhi, *AlternateRootPhi)) continue; // Now that the phi nodes are proved to be the same, assert that @@ -2003,7 +1994,7 @@ static void rematerializeLiveValues(CallSite CS, Instruction *LastClonedValue = nullptr; Instruction *LastValue = nullptr; for (Instruction *Instr: ChainToBase) { - // Only GEP's and casts are suported as we need to be careful to not + // Only GEP's and casts are supported as we need to be careful to not // introduce any new uses of pointers not in the liveset. // Note that it's fine to introduce new uses of pointers which were // otherwise not used after this statepoint. 
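Editor's note: rematerializeLiveValues above only clones a pointer-derivation chain (no-op casts and GEPs back to the base) when chainToBasePointerCost says it is cheap relative to a threshold. The following is a loose standalone model of that "sum per-step costs and compare with a budget" decision; the step kinds, costs, and budget here are invented for illustration, not the real TTI numbers.

#include <vector>

// Simplified stand-ins for the instruction kinds the pass is willing to clone.
enum class ChainStep { NoopCast, GEPWithConstantIndices, Unsupported };

bool cheapEnoughToRematerialize(const std::vector<ChainStep> &ChainToBase,
                                unsigned Budget) {
  unsigned Cost = 0;
  for (ChainStep S : ChainToBase) {
    switch (S) {
    case ChainStep::NoopCast:
      break;          // modelled as free in this sketch
    case ChainStep::GEPWithConstantIndices:
      Cost += 1;      // one address computation
      break;
    case ChainStep::Unsupported:
      return false;   // only casts and GEPs may be cloned
    }
  }
  return Cost <= Budget;
}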
@@ -2107,9 +2098,9 @@ static bool insertParsePoints(Function &F, DominatorTree &DT, // live in the IR. We'll remove all of these when done. SmallVector<CallInst *, 64> Holders; - // Insert a dummy call with all of the arguments to the vm_state we'll need - // for the actual safepoint insertion. This ensures reference arguments in - // the deopt argument list are considered live through the safepoint (and + // Insert a dummy call with all of the deopt operands we'll need for the + // actual safepoint insertion as arguments. This ensures reference operands + // in the deopt argument list are considered live through the safepoint (and // thus makes sure they get relocated.) for (CallSite CS : ToUpdate) { SmallVector<Value *, 64> DeoptValues; @@ -2299,12 +2290,11 @@ static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH, if (AH.getDereferenceableOrNullBytes(Index)) R.addAttribute(Attribute::get(Ctx, Attribute::DereferenceableOrNull, AH.getDereferenceableOrNullBytes(Index))); - if (AH.doesNotAlias(Index)) + if (AH.getAttributes().hasAttribute(Index, Attribute::NoAlias)) R.addAttribute(Attribute::NoAlias); if (!R.empty()) - AH.setAttributes(AH.getAttributes().removeAttributes( - Ctx, Index, AttributeSet::get(Ctx, Index, R))); + AH.setAttributes(AH.getAttributes().removeAttributes(Ctx, Index, R)); } void @@ -2313,19 +2303,51 @@ RewriteStatepointsForGC::stripNonValidAttributesFromPrototype(Function &F) { for (Argument &A : F.args()) if (isa<PointerType>(A.getType())) - RemoveNonValidAttrAtIndex(Ctx, F, A.getArgNo() + 1); + RemoveNonValidAttrAtIndex(Ctx, F, + A.getArgNo() + AttributeList::FirstArgIndex); if (isa<PointerType>(F.getReturnType())) - RemoveNonValidAttrAtIndex(Ctx, F, AttributeSet::ReturnIndex); + RemoveNonValidAttrAtIndex(Ctx, F, AttributeList::ReturnIndex); +} + +void RewriteStatepointsForGC::stripInvalidMetadataFromInstruction(Instruction &I) { + + if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) + return; + // These are the metadata kinds that are still valid on loads and stores after + // RS4GC. + // The metadata implying dereferenceability and noalias are (conservatively) + // dropped. This is because semantically, after RewriteStatepointsForGC runs, + // all calls to gc.statepoint "free" the entire heap. Also, gc.statepoint can + // touch the entire heap including noalias objects. Note: The reasoning is + // the same as stripping the dereferenceability and noalias attributes that are + // analogous to the metadata counterparts. + // We also drop the invariant.load metadata on the load because that metadata + // implies the address operand to the load points to memory that is never + // changed once it became dereferenceable. This is no longer true after RS4GC. + // Similar reasoning applies to invariant.group metadata, which applies to + // loads within a group. + unsigned ValidMetadataAfterRS4GC[] = {LLVMContext::MD_tbaa, + LLVMContext::MD_range, + LLVMContext::MD_alias_scope, + LLVMContext::MD_nontemporal, + LLVMContext::MD_nonnull, + LLVMContext::MD_align, + LLVMContext::MD_type}; + + // Drops all metadata on the instruction other than ValidMetadataAfterRS4GC.
+ I.dropUnknownNonDebugMetadata(ValidMetadataAfterRS4GC); + } -void RewriteStatepointsForGC::stripNonValidAttributesFromBody(Function &F) { +void RewriteStatepointsForGC::stripNonValidAttributesAndMetadataFromBody(Function &F) { if (F.empty()) return; LLVMContext &Ctx = F.getContext(); MDBuilder Builder(Ctx); + for (Instruction &I : instructions(F)) { if (const MDNode *MD = I.getMetadata(LLVMContext::MD_tbaa)) { assert(MD->getNumOperands() < 5 && "unrecognized metadata shape!"); @@ -2346,12 +2368,14 @@ void RewriteStatepointsForGC::stripNonValidAttributesFromBody(Function &F) { I.setMetadata(LLVMContext::MD_tbaa, MutableTBAA); } + stripInvalidMetadataFromInstruction(I); + if (CallSite CS = CallSite(&I)) { for (int i = 0, e = CS.arg_size(); i != e; i++) if (isa<PointerType>(CS.getArgument(i)->getType())) - RemoveNonValidAttrAtIndex(Ctx, CS, i + 1); + RemoveNonValidAttrAtIndex(Ctx, CS, i + AttributeList::FirstArgIndex); if (isa<PointerType>(CS.getType())) - RemoveNonValidAttrAtIndex(Ctx, CS, AttributeSet::ReturnIndex); + RemoveNonValidAttrAtIndex(Ctx, CS, AttributeList::ReturnIndex); } } } @@ -2370,7 +2394,7 @@ static bool shouldRewriteStatepointsIn(Function &F) { return false; } -void RewriteStatepointsForGC::stripNonValidAttributes(Module &M) { +void RewriteStatepointsForGC::stripNonValidAttributesAndMetadata(Module &M) { #ifndef NDEBUG assert(any_of(M, shouldRewriteStatepointsIn) && "precondition!"); #endif @@ -2379,7 +2403,7 @@ void RewriteStatepointsForGC::stripNonValidAttributes(Module &M) { stripNonValidAttributesFromPrototype(F); for (Function &F : M) - stripNonValidAttributesFromBody(F); + stripNonValidAttributesAndMetadataFromBody(F); } bool RewriteStatepointsForGC::runOnFunction(Function &F) { diff --git a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp index ede381c..4822cf7 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -140,6 +140,14 @@ public: return nullptr; } + /// getBlockAddress - If this is a constant with a BlockAddress value, return + /// it, otherwise return null. + BlockAddress *getBlockAddress() const { + if (isConstant()) + return dyn_cast<BlockAddress>(getConstant()); + return nullptr; + } + void markForcedConstant(Constant *V) { assert(isUnknown() && "Can't force a defined value!"); Val.setInt(forcedconstant); @@ -306,20 +314,14 @@ public: return MRVFunctionsTracked; } - void markOverdefined(Value *V) { - assert(!V->getType()->isStructTy() && - "structs should use markAnythingOverdefined"); - markOverdefined(ValueState[V], V); - } - - /// markAnythingOverdefined - Mark the specified value overdefined. This + /// markOverdefined - Mark the specified value overdefined. This /// works with both scalars and structs. 
- void markAnythingOverdefined(Value *V) { + void markOverdefined(Value *V) { if (auto *STy = dyn_cast<StructType>(V->getType())) for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) markOverdefined(getStructValueState(V, i), V); else - markOverdefined(V); + markOverdefined(ValueState[V], V); } // isStructLatticeConstant - Return true if all the lattice values @@ -513,12 +515,8 @@ private: void visitCmpInst(CmpInst &I); void visitExtractValueInst(ExtractValueInst &EVI); void visitInsertValueInst(InsertValueInst &IVI); - void visitLandingPadInst(LandingPadInst &I) { markAnythingOverdefined(&I); } - void visitFuncletPadInst(FuncletPadInst &FPI) { - markAnythingOverdefined(&FPI); - } void visitCatchSwitchInst(CatchSwitchInst &CPI) { - markAnythingOverdefined(&CPI); + markOverdefined(&CPI); visitTerminatorInst(CPI); } @@ -537,17 +535,11 @@ private: void visitResumeInst (TerminatorInst &I) { /*returns void*/ } void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ } void visitFenceInst (FenceInst &I) { /*returns void*/ } - void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) { - markAnythingOverdefined(&I); - } - void visitAtomicRMWInst (AtomicRMWInst &I) { markOverdefined(&I); } - void visitAllocaInst (Instruction &I) { markOverdefined(&I); } - void visitVAArgInst (Instruction &I) { markAnythingOverdefined(&I); } - void visitInstruction(Instruction &I) { - // If a new instruction is added to LLVM that we don't handle. + // All the instructions we don't do any special handling for just + // go to overdefined. DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n'); - markAnythingOverdefined(&I); // Just in case + markOverdefined(&I); } }; @@ -602,14 +594,36 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI, return; } - Succs[SI->findCaseValue(CI).getSuccessorIndex()] = true; + Succs[SI->findCaseValue(CI)->getSuccessorIndex()] = true; return; } - // TODO: This could be improved if the operand is a [cast of a] BlockAddress. - if (isa<IndirectBrInst>(&TI)) { - // Just mark all destinations executable! - Succs.assign(TI.getNumSuccessors(), true); + // In case of indirect branch and its address is a blockaddress, we mark + // the target as executable. + if (auto *IBR = dyn_cast<IndirectBrInst>(&TI)) { + // Casts are folded by visitCastInst. + LatticeVal IBRValue = getValueState(IBR->getAddress()); + BlockAddress *Addr = IBRValue.getBlockAddress(); + if (!Addr) { // Overdefined or unknown condition? + // All destinations are executable! + if (!IBRValue.isUnknown()) + Succs.assign(TI.getNumSuccessors(), true); + return; + } + + BasicBlock* T = Addr->getBasicBlock(); + assert(Addr->getFunction() == T->getParent() && + "Block address of a different function ?"); + for (unsigned i = 0; i < IBR->getNumSuccessors(); ++i) { + // This is the target. + if (IBR->getDestination(i) == T) { + Succs[i] = true; + return; + } + } + + // If we didn't find our destination in the IBR successor list, then we + // have undefined behavior. Its ok to assume no successor is executable. return; } @@ -659,13 +673,21 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { if (!CI) return !SCValue.isUnknown(); - return SI->findCaseValue(CI).getCaseSuccessor() == To; + return SI->findCaseValue(CI)->getCaseSuccessor() == To; } - // Just mark all destinations executable! - // TODO: This could be improved if the operand is a [cast of a] BlockAddress. 
- if (isa<IndirectBrInst>(TI)) - return true; + // In case of indirect branch and its address is a blockaddress, we mark + // the target as executable. + if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) { + LatticeVal IBRValue = getValueState(IBR->getAddress()); + BlockAddress *Addr = IBRValue.getBlockAddress(); + + if (!Addr) + return !IBRValue.isUnknown(); + + // At this point, the indirectbr is branching on a blockaddress. + return Addr->getBasicBlock() == To; + } DEBUG(dbgs() << "Unknown terminator instruction: " << *TI << '\n'); llvm_unreachable("SCCP: Don't know how to handle this terminator!"); @@ -693,7 +715,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) { // If this PN returns a struct, just mark the result overdefined. // TODO: We could do a lot better than this if code actually uses this. if (PN.getType()->isStructTy()) - return markAnythingOverdefined(&PN); + return markOverdefined(&PN); if (getValueState(&PN).isOverdefined()) return; // Quick exit @@ -803,7 +825,7 @@ void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) { // If this returns a struct, mark all elements over defined, we don't track // structs in structs. if (EVI.getType()->isStructTy()) - return markAnythingOverdefined(&EVI); + return markOverdefined(&EVI); // If this is extracting from more than one level of struct, we don't know. if (EVI.getNumIndices() != 1) @@ -828,7 +850,7 @@ void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) { // If this has more than one index, we can't handle it, drive all results to // undef. if (IVI.getNumIndices() != 1) - return markAnythingOverdefined(&IVI); + return markOverdefined(&IVI); Value *Aggr = IVI.getAggregateOperand(); unsigned Idx = *IVI.idx_begin(); @@ -857,7 +879,7 @@ void SCCPSolver::visitSelectInst(SelectInst &I) { // If this select returns a struct, just mark the result overdefined. // TODO: We could do a lot better than this if code actually uses this. if (I.getType()->isStructTy()) - return markAnythingOverdefined(&I); + return markOverdefined(&I); LatticeVal CondValue = getValueState(I.getCondition()); if (CondValue.isUnknown()) @@ -910,9 +932,16 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) { // Otherwise, one of our operands is overdefined. Try to produce something // better than overdefined with some tricks. - - // If this is an AND or OR with 0 or -1, it doesn't matter that the other - // operand is overdefined. + // If this is 0 / Y, it doesn't matter that the second operand is + // overdefined, and we can replace it with zero. + if (I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv) + if (V1State.isConstant() && V1State.getConstant()->isNullValue()) + return markConstant(IV, &I, V1State.getConstant()); + + // If this is: + // -> AND/MUL with 0 + // -> OR with -1 + // it doesn't matter that the other operand is overdefined. if (I.getOpcode() == Instruction::And || I.getOpcode() == Instruction::Mul || I.getOpcode() == Instruction::Or) { LatticeVal *NonOverdefVal = nullptr; @@ -934,7 +963,7 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) { } else { // X or -1 = -1 if (ConstantInt *CI = NonOverdefVal->getConstantInt()) - if (CI->isAllOnesValue()) + if (CI->isMinusOne()) return markConstant(IV, &I, NonOverdefVal->getConstant()); } } @@ -1021,7 +1050,7 @@ void SCCPSolver::visitStoreInst(StoreInst &SI) { void SCCPSolver::visitLoadInst(LoadInst &I) { // If this load is of a struct, just mark the result overdefined. 
if (I.getType()->isStructTy()) - return markAnythingOverdefined(&I); + return markOverdefined(&I); LatticeVal PtrVal = getValueState(I.getOperand(0)); if (PtrVal.isUnknown()) return; // The pointer is not resolved yet! @@ -1078,7 +1107,7 @@ CallOverdefined: // Otherwise, if we have a single return value case, and if the function is // a declaration, maybe we can constant fold it. if (F && F->isDeclaration() && !I->getType()->isStructTy() && - canConstantFoldCallTo(F)) { + canConstantFoldCallTo(CS, F)) { SmallVector<Constant*, 8> Operands; for (CallSite::arg_iterator AI = CS.arg_begin(), E = CS.arg_end(); @@ -1098,7 +1127,7 @@ CallOverdefined: // If we can constant fold this, mark the result of the call as a // constant. - if (Constant *C = ConstantFoldCall(F, Operands, TLI)) { + if (Constant *C = ConstantFoldCall(CS, F, Operands, TLI)) { // call -> undef. if (isa<UndefValue>(C)) return; @@ -1107,7 +1136,7 @@ CallOverdefined: } // Otherwise, we don't know anything about this call, mark it overdefined. - return markAnythingOverdefined(I); + return markOverdefined(I); } // If this is a local function that doesn't have its address taken, mark its @@ -1483,6 +1512,31 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { return true; } + if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) { + // Indirect branch with no successor ?. Its ok to assume it branches + // to no target. + if (IBR->getNumSuccessors() < 1) + continue; + + if (!getValueState(IBR->getAddress()).isUnknown()) + continue; + + // If the input to SCCP is actually branch on undef, fix the undef to + // the first successor of the indirect branch. + if (isa<UndefValue>(IBR->getAddress())) { + IBR->setAddress(BlockAddress::get(IBR->getSuccessor(0))); + markEdgeExecutable(&BB, IBR->getSuccessor(0)); + return true; + } + + // Otherwise, it is a branch on a symbolic value which is currently + // considered to be undef. Handle this by forcing the input value to the + // branch to the first successor. + markForcedConstant(IBR->getAddress(), + BlockAddress::get(IBR->getSuccessor(0))); + return true; + } + if (auto *SI = dyn_cast<SwitchInst>(TI)) { if (!SI->getNumCases() || !getValueState(SI->getCondition()).isUnknown()) continue; @@ -1490,12 +1544,12 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) { // If the input to SCCP is actually switch on undef, fix the undef to // the first constant. if (isa<UndefValue>(SI->getCondition())) { - SI->setCondition(SI->case_begin().getCaseValue()); - markEdgeExecutable(&BB, SI->case_begin().getCaseSuccessor()); + SI->setCondition(SI->case_begin()->getCaseValue()); + markEdgeExecutable(&BB, SI->case_begin()->getCaseSuccessor()); return true; } - markForcedConstant(SI->getCondition(), SI->case_begin().getCaseValue()); + markForcedConstant(SI->getCondition(), SI->case_begin()->getCaseValue()); return true; } } @@ -1545,7 +1599,7 @@ static bool runSCCP(Function &F, const DataLayout &DL, // Mark all arguments to the function as being overdefined. for (Argument &AI : F.args()) - Solver.markAnythingOverdefined(&AI); + Solver.markOverdefined(&AI); // Solve for constants. bool ResolvedUndefs = true; @@ -1715,8 +1769,9 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, // arguments and return value aggressively, and can assume it is not called // unless we see evidence to the contrary. 
if (F.hasLocalLinkage()) { - if (AddressIsTaken(&F)) + if (F.hasAddressTaken()) { AddressTakenFunctions.insert(&F); + } else { Solver.AddArgumentTrackedFunction(&F); continue; @@ -1728,14 +1783,15 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, // Assume nothing about the incoming arguments. for (Argument &AI : F.args()) - Solver.markAnythingOverdefined(&AI); + Solver.markOverdefined(&AI); } // Loop over global variables. We inform the solver about any internal global // variables that do not have their 'addresses taken'. If they don't have // their addresses taken, we can propagate constants through them. for (GlobalVariable &G : M.globals()) - if (!G.isConstant() && G.hasLocalLinkage() && !AddressIsTaken(&G)) + if (!G.isConstant() && G.hasLocalLinkage() && + G.hasDefinitiveInitializer() && !AddressIsTaken(&G)) Solver.TrackValueOfGlobalVariable(&G); // Solve for constants. @@ -1760,15 +1816,11 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, if (F.isDeclaration()) continue; - if (Solver.isBlockExecutable(&F.front())) { + if (Solver.isBlockExecutable(&F.front())) for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E; - ++AI) { - if (AI->use_empty()) - continue; - if (tryToReplaceWithConstant(Solver, &*AI)) + ++AI) + if (!AI->use_empty() && tryToReplaceWithConstant(Solver, &*AI)) ++IPNumArgsElimed; - } - } for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { if (!Solver.isBlockExecutable(&*BB)) { @@ -1817,32 +1869,9 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, if (!I) continue; bool Folded = ConstantFoldTerminator(I->getParent()); - if (!Folded) { - // The constant folder may not have been able to fold the terminator - // if this is a branch or switch on undef. Fold it manually as a - // branch to the first successor. -#ifndef NDEBUG - if (auto *BI = dyn_cast<BranchInst>(I)) { - assert(BI->isConditional() && isa<UndefValue>(BI->getCondition()) && - "Branch should be foldable!"); - } else if (auto *SI = dyn_cast<SwitchInst>(I)) { - assert(isa<UndefValue>(SI->getCondition()) && "Switch should fold"); - } else { - llvm_unreachable("Didn't fold away reference to block!"); - } -#endif - - // Make this an uncond branch to the first successor. - TerminatorInst *TI = I->getParent()->getTerminator(); - BranchInst::Create(TI->getSuccessor(0), TI); - - // Remove entries in successor phi nodes to remove edges. - for (unsigned i = 1, e = TI->getNumSuccessors(); i != e; ++i) - TI->getSuccessor(i)->removePredecessor(TI->getParent()); - - // Remove the old terminator. - TI->eraseFromParent(); - } + assert(Folded && + "Expect TermInst on constantint or blockaddress to be folded"); + (void) Folded; } // Finally, delete the basic block. diff --git a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp index bfcb155..b9cee5b 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SROA.cpp @@ -25,6 +25,7 @@ #include "llvm/Transforms/Scalar/SROA.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AssumptionCache.h" @@ -325,7 +326,7 @@ private: /// partition. uint64_t BeginOffset, EndOffset; - /// \brief The start end end iterators of this partition. + /// \brief The start and end iterators of this partition. iterator SI, SJ; /// \brief A collection of split slice tails overlapping the partition. 
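A brief aside on the IPSCCP hunk above: globals are now only tracked when their initializer is definitive, i.e. the value seen in the IR is guaranteed to be the one observed at run time. A hedged sketch of that guard, with an illustrative helper name not present in the patch (the pass additionally requires that the global's address is never taken):

#include "llvm/IR/GlobalVariable.h"

// Conditions under which IPSCCP may model an internal global as a lattice value.
static bool mayTrackGlobalValue(const llvm::GlobalVariable &G) {
  return !G.isConstant() && G.hasLocalLinkage() && G.hasDefinitiveInitializer();
}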
@@ -1251,7 +1252,7 @@ static bool isSafeSelectToSpeculate(SelectInst &SI) { if (!LI || !LI->isSimple()) return false; - // Both operands to the select need to be dereferencable, either + // Both operands to the select need to be dereferenceable, either // absolutely (e.g. allocas) or at this point because we can see other // accesses to it. if (!isSafeToLoadUnconditionally(TValue, LI->getAlignment(), DL, LI)) @@ -1636,8 +1637,17 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { return cast<PointerType>(NewTy)->getPointerAddressSpace() == cast<PointerType>(OldTy)->getPointerAddressSpace(); } - if (NewTy->isIntegerTy() || OldTy->isIntegerTy()) - return true; + + // We can convert integers to integral pointers, but not to non-integral + // pointers. + if (OldTy->isIntegerTy()) + return !DL.isNonIntegralPointerType(NewTy); + + // We can convert integral pointers to integers, but non-integral pointers + // need to remain pointers. + if (!DL.isNonIntegralPointerType(OldTy)) + return NewTy->isIntegerTy(); + return false; } @@ -1663,8 +1673,7 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, // See if we need inttoptr for this type pair. A cast involving both scalars // and vectors requires and additional bitcast. - if (OldTy->getScalarType()->isIntegerTy() && - NewTy->getScalarType()->isPointerTy()) { + if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) { // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8* if (OldTy->isVectorTy() && !NewTy->isVectorTy()) return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)), @@ -1680,8 +1689,7 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, // See if we need ptrtoint for this type pair. A cast involving both scalars // and vectors requires and additional bitcast. - if (OldTy->getScalarType()->isPointerTy() && - NewTy->getScalarType()->isIntegerTy()) { + if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) { // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128 if (OldTy->isVectorTy() && !NewTy->isVectorTy()) return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), @@ -1825,6 +1833,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { // Rank the remaining candidate vector types. This is easy because we know // they're all integer vectors. We sort by ascending number of elements. auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) { + (void)DL; assert(DL.getTypeSizeInBits(RHSTy) == DL.getTypeSizeInBits(LHSTy) && "Cannot have vector types of different sizes!"); assert(RHSTy->getElementType()->isIntegerTy() && @@ -2185,8 +2194,8 @@ class llvm::sroa::AllocaSliceRewriter Instruction *OldPtr; // Track post-rewrite users which are PHI nodes and Selects. - SmallPtrSetImpl<PHINode *> &PHIUsers; - SmallPtrSetImpl<SelectInst *> &SelectUsers; + SmallSetVector<PHINode *, 8> &PHIUsers; + SmallSetVector<SelectInst *, 8> &SelectUsers; // Utility IR builder, whose name prefix is setup for each visited use, and // the insertion point is set to point to the user. 
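The canConvertValue() hunk above encodes a rule worth spelling out: a value may only be rewritten between an integer type and a pointer type when the pointer is integral, because non-integral pointers (as declared by the DataLayout) have no defined inttoptr/ptrtoint. A compact sketch of just that check, under an illustrative name not used in the patch:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// Mirror of the new integer<->pointer legality test in canConvertValue().
static bool intPtrConvertible(const DataLayout &DL, Type *OldTy, Type *NewTy) {
  if (OldTy->isIntegerTy() && NewTy->isPointerTy())
    return !DL.isNonIntegralPointerType(NewTy); // int -> only an integral ptr
  if (OldTy->isPointerTy() && NewTy->isIntegerTy())
    return !DL.isNonIntegralPointerType(OldTy); // only an integral ptr -> int
  return false;
}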
@@ -2198,8 +2207,8 @@ public: uint64_t NewAllocaBeginOffset, uint64_t NewAllocaEndOffset, bool IsIntegerPromotable, VectorType *PromotableVecTy, - SmallPtrSetImpl<PHINode *> &PHIUsers, - SmallPtrSetImpl<SelectInst *> &SelectUsers + SmallSetVector<PHINode *, 8> &PHIUsers, + SmallSetVector<SelectInst *, 8> &SelectUsers) : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI), NewAllocaBeginOffset(NewAllocaBeginOffset), NewAllocaEndOffset(NewAllocaEndOffset), @@ -2294,7 +2303,8 @@ private: #endif return getAdjustedPtr(IRB, DL, &NewAI, - APInt(DL.getPointerSizeInBits(), Offset), PointerTy, + APInt(DL.getPointerTypeSizeInBits(PointerTy), Offset), + PointerTy, #ifndef NDEBUG Twine(OldName) + "." #else @@ -2369,6 +2379,8 @@ private: Value *OldOp = LI.getOperand(0); assert(OldOp == OldPtr); + unsigned AS = LI.getPointerAddressSpace(); + Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8) : LI.getType(); const bool IsLoadPastEnd = DL.getTypeStoreSize(TargetTy) > SliceSize; @@ -2386,7 +2398,22 @@ private: LoadInst *NewLI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), LI.isVolatile(), LI.getName()); if (LI.isVolatile()) - NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope()); + NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); + + // Any !nonnull metadata or !range metadata on the old load is also valid + // on the new load. This is true in some cases even when the loads + // are different types, for example by mapping !nonnull metadata to + // !range metadata by modeling the null pointer constant converted to the + // integer type. + // FIXME: Add support for range metadata here. Currently the utilities + // for this don't propagate range metadata in trivial cases from one + // integer load to another, don't handle non-addrspace-0 null pointers + // correctly, and don't have any support for mapping ranges as the + // integer type becomes wider or narrower. + if (MDNode *N = LI.getMetadata(LLVMContext::MD_nonnull)) + copyNonnullMetadata(LI, N, *NewLI); + + // Try to preserve nonnull metadata V = NewLI; // If this is an integer load past the end of the slice (which means the @@ -2401,12 +2428,12 @@ private: "endian_shift"); } } else { - Type *LTy = TargetTy->getPointerTo(); + Type *LTy = TargetTy->getPointerTo(AS); LoadInst *NewLI = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy), getSliceAlign(TargetTy), LI.isVolatile(), LI.getName()); if (LI.isVolatile()) - NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope()); + NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID()); V = NewLI; IsPtrAdjusted = true; @@ -2429,12 +2456,12 @@ private: // the computed value, and then replace the placeholder with LI, leaving // LI only used for this computation.
Value *Placeholder = - new LoadInst(UndefValue::get(LI.getType()->getPointerTo())); + new LoadInst(UndefValue::get(LI.getType()->getPointerTo(AS))); V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset, "insert"); LI.replaceAllUsesWith(V); Placeholder->replaceAllUsesWith(&LI); - delete Placeholder; + Placeholder->deleteValue(); } else { LI.replaceAllUsesWith(V); } @@ -2542,13 +2569,14 @@ private: NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), SI.isVolatile()); } else { - Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo()); + unsigned AS = SI.getPointerAddressSpace(); + Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo(AS)); NewSI = IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(V->getType()), SI.isVolatile()); } NewSI->copyMetadata(SI, LLVMContext::MD_mem_parallel_loop_access); if (SI.isVolatile()) - NewSI->setAtomic(SI.getOrdering(), SI.getSynchScope()); + NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID()); Pass.DeadInsts.insert(&SI); deleteIfTriviallyDead(OldOp); @@ -3561,10 +3589,11 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { int Idx = 0, Size = Offsets.Splits.size(); for (;;) { auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8); - auto *PartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace()); + auto AS = LI->getPointerAddressSpace(); + auto *PartPtrTy = PartTy->getPointerTo(AS); LoadInst *PLoad = IRB.CreateAlignedLoad( getAdjustedPtr(IRB, DL, BasePtr, - APInt(DL.getPointerSizeInBits(), PartOffset), + APInt(DL.getPointerSizeInBits(AS), PartOffset), PartPtrTy, BasePtr->getName() + "."), getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false, LI->getName()); @@ -3616,10 +3645,12 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { auto *PartPtrTy = PLoad->getType()->getPointerTo(SI->getPointerAddressSpace()); + auto AS = SI->getPointerAddressSpace(); StoreInst *PStore = IRB.CreateAlignedStore( - PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr, - APInt(DL.getPointerSizeInBits(), PartOffset), - PartPtrTy, StoreBasePtr->getName() + "."), + PLoad, + getAdjustedPtr(IRB, DL, StoreBasePtr, + APInt(DL.getPointerSizeInBits(AS), PartOffset), + PartPtrTy, StoreBasePtr->getName() + "."), getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false); PStore->copyMetadata(*LI, LLVMContext::MD_mem_parallel_loop_access); DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n"); @@ -3688,7 +3719,8 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { int Idx = 0, Size = Offsets.Splits.size(); for (;;) { auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8); - auto *PartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace()); + auto *LoadPartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace()); + auto *StorePartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace()); // Either lookup a split load or create one. LoadInst *PLoad; @@ -3696,20 +3728,23 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { PLoad = (*SplitLoads)[Idx]; } else { IRB.SetInsertPoint(LI); + auto AS = LI->getPointerAddressSpace(); PLoad = IRB.CreateAlignedLoad( getAdjustedPtr(IRB, DL, LoadBasePtr, - APInt(DL.getPointerSizeInBits(), PartOffset), - PartPtrTy, LoadBasePtr->getName() + "."), + APInt(DL.getPointerSizeInBits(AS), PartOffset), + LoadPartPtrTy, LoadBasePtr->getName() + "."), getAdjustedAlignment(LI, PartOffset, DL), /*IsVolatile*/ false, LI->getName()); } // And store this partition. 
IRB.SetInsertPoint(SI); + auto AS = SI->getPointerAddressSpace(); StoreInst *PStore = IRB.CreateAlignedStore( - PLoad, getAdjustedPtr(IRB, DL, StoreBasePtr, - APInt(DL.getPointerSizeInBits(), PartOffset), - PartPtrTy, StoreBasePtr->getName() + "."), + PLoad, + getAdjustedPtr(IRB, DL, StoreBasePtr, + APInt(DL.getPointerSizeInBits(AS), PartOffset), + StorePartPtrTy, StoreBasePtr->getName() + "."), getAdjustedAlignment(SI, PartOffset, DL), /*IsVolatile*/ false); // Now build a new slice for the alloca. @@ -3857,7 +3892,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, if (Alignment <= DL.getABITypeAlignment(SliceTy)) Alignment = 0; NewAI = new AllocaInst( - SliceTy, nullptr, Alignment, + SliceTy, AI.getType()->getAddressSpace(), nullptr, Alignment, AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI); ++NumNewAllocas; } @@ -3871,8 +3906,8 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, // fact scheduled for promotion. unsigned PPWOldSize = PostPromotionWorklist.size(); unsigned NumUses = 0; - SmallPtrSet<PHINode *, 8> PHIUsers; - SmallPtrSet<SelectInst *, 8> SelectUsers; + SmallSetVector<PHINode *, 8> PHIUsers; + SmallSetVector<SelectInst *, 8> SelectUsers; AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(), P.endOffset(), IsIntegerPromotable, VecTy, @@ -3888,24 +3923,20 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, } NumAllocaPartitionUses += NumUses; - MaxUsesPerAllocaPartition = - std::max<unsigned>(NumUses, MaxUsesPerAllocaPartition); + MaxUsesPerAllocaPartition.updateMax(NumUses); // Now that we've processed all the slices in the new partition, check if any // PHIs or Selects would block promotion. - for (SmallPtrSetImpl<PHINode *>::iterator I = PHIUsers.begin(), - E = PHIUsers.end(); - I != E; ++I) - if (!isSafePHIToSpeculate(**I)) { + for (PHINode *PHI : PHIUsers) + if (!isSafePHIToSpeculate(*PHI)) { Promotable = false; PHIUsers.clear(); SelectUsers.clear(); break; } - for (SmallPtrSetImpl<SelectInst *>::iterator I = SelectUsers.begin(), - E = SelectUsers.end(); - I != E; ++I) - if (!isSafeSelectToSpeculate(**I)) { + + for (SelectInst *Sel : SelectUsers) + if (!isSafeSelectToSpeculate(*Sel)) { Promotable = false; PHIUsers.clear(); SelectUsers.clear(); @@ -4009,8 +4040,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { } NumAllocaPartitions += NumPartitions; - MaxPartitionsPerAlloca = - std::max<unsigned>(NumPartitions, MaxPartitionsPerAlloca); + MaxPartitionsPerAlloca.updateMax(NumPartitions); // Migrate debug information from the old alloca to the new alloca(s) // and the individual partitions. @@ -4184,7 +4214,7 @@ bool SROA::promoteAllocas(Function &F) { NumPromoted += PromotableAllocas.size(); DEBUG(dbgs() << "Promoting allocas with mem2reg...\n"); - PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC); + PromoteMemToReg(PromotableAllocas, *DT, AC); PromotableAllocas.clear(); return true; } @@ -4234,9 +4264,8 @@ PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT, if (!Changed) return PreservedAnalyses::all(); - // FIXME: Even when promoting allocas we should preserve some abstract set of - // CFG-specific analyses. 
PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); PA.preserve<GlobalsAA>(); return PA; } diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp index afe7483..ce6f93e 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -20,11 +20,12 @@ #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/ScopedNoAliasAA.h" #include "llvm/Analysis/TypeBasedAliasAnalysis.h" -#include "llvm/Transforms/Scalar/GVN.h" #include "llvm/IR/DataLayout.h" +#include "llvm/IR/LegacyPassManager.h" #include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" -#include "llvm/IR/LegacyPassManager.h" +#include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" using namespace llvm; @@ -43,13 +44,15 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeDSELegacyPassPass(Registry); initializeGuardWideningLegacyPassPass(Registry); initializeGVNLegacyPassPass(Registry); - initializeNewGVNPass(Registry); + initializeNewGVNLegacyPassPass(Registry); initializeEarlyCSELegacyPassPass(Registry); initializeEarlyCSEMemSSALegacyPassPass(Registry); initializeGVNHoistLegacyPassPass(Registry); + initializeGVNSinkLegacyPassPass(Registry); initializeFlattenCFGPassPass(Registry); initializeInductiveRangeCheckEliminationPass(Registry); initializeIndVarSimplifyLegacyPassPass(Registry); + initializeInferAddressSpacesPass(Registry); initializeJumpThreadingPass(Registry); initializeLegacyLICMPassPass(Registry); initializeLegacyLoopSinkPassPass(Registry); @@ -58,6 +61,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLoopAccessLegacyAnalysisPass(Registry); initializeLoopInstSimplifyLegacyPassPass(Registry); initializeLoopInterchangePass(Registry); + initializeLoopPredicationLegacyPassPass(Registry); initializeLoopRotateLegacyPassPass(Registry); initializeLoopStrengthReducePass(Registry); initializeLoopRerollPass(Registry); @@ -79,13 +83,14 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeIPSCCPLegacyPassPass(Registry); initializeSROALegacyPassPass(Registry); initializeCFGSimplifyPassPass(Registry); + initializeLateCFGSimplifyPassPass(Registry); initializeStructurizeCFGPass(Registry); + initializeSimpleLoopUnswitchLegacyPassPass(Registry); initializeSinkingLegacyPassPass(Registry); initializeTailCallElimPass(Registry); initializeSeparateConstOffsetFromGEPPass(Registry); initializeSpeculativeExecutionLegacyPassPass(Registry); initializeStraightLineStrengthReducePass(Registry); - initializeLoadCombinePass(Registry); initializePlaceBackedgeSafepointsImplPass(Registry); initializePlaceSafepointsPass(Registry); initializeFloat2IntLegacyPassPass(Registry); @@ -115,6 +120,10 @@ void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createCFGSimplificationPass()); } +void LLVMAddLateCFGSimplificationPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLateCFGSimplificationPass()); +} + void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createDeadStoreEliminationPass()); } diff --git a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp index 39969e2..d11855f 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -14,12 +14,12 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" #include 
"llvm/ADT/STLExtras.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/Pass.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" using namespace llvm; @@ -520,12 +520,25 @@ bool Scalarizer::visitGetElementPtrInst(GetElementPtrInst &GEPI) { unsigned NumElems = VT->getNumElements(); unsigned NumIndices = GEPI.getNumIndices(); - Scatterer Base = scatter(&GEPI, GEPI.getOperand(0)); + // The base pointer might be scalar even if it's a vector GEP. In those cases, + // splat the pointer into a vector value, and scatter that vector. + Value *Op0 = GEPI.getOperand(0); + if (!Op0->getType()->isVectorTy()) + Op0 = Builder.CreateVectorSplat(NumElems, Op0); + Scatterer Base = scatter(&GEPI, Op0); SmallVector<Scatterer, 8> Ops; Ops.resize(NumIndices); - for (unsigned I = 0; I < NumIndices; ++I) - Ops[I] = scatter(&GEPI, GEPI.getOperand(I + 1)); + for (unsigned I = 0; I < NumIndices; ++I) { + Value *Op = GEPI.getOperand(I + 1); + + // The indices might be scalars even if it's a vector GEP. In those cases, + // splat the scalar into a vector value, and scatter that vector. + if (!Op->getType()->isVectorTy()) + Op = Builder.CreateVectorSplat(NumElems, Op); + + Ops[I] = scatter(&GEPI, Op); + } ValueVector Res; Res.resize(NumElems); diff --git a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 4d59453..84675f4 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -156,27 +156,27 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" -#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetSubtargetInfo.h" -#include "llvm/IR/IRBuilder.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; using namespace llvm::PatternMatch; @@ -1138,7 +1138,7 @@ bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) { // Add I to DominatingExprs if it's an add/sub that can't sign overflow. 
if (match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS))) || match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)))) { - if (isKnownNotFullPoison(I)) { + if (programUndefinedIfFullPoison(I)) { const SCEV *Key = SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS)); DominatingExprs[Key].push_back(I); diff --git a/contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp new file mode 100644 index 0000000..aaab585 --- /dev/null +++ b/contrib/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -0,0 +1,808 @@ +//===- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Sequence.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/LoopAnalysisManager.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/GenericDomTree.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include <algorithm> +#include <cassert> +#include <iterator> +#include <utility> + +#define DEBUG_TYPE "simple-loop-unswitch" + +using namespace llvm; + +STATISTIC(NumBranches, "Number of branches unswitched"); +STATISTIC(NumSwitches, "Number of switches unswitched"); +STATISTIC(NumTrivial, "Number of unswitches that are trivial"); + +static void replaceLoopUsesWithConstant(Loop &L, Value &LIC, + Constant &Replacement) { + assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?"); + + // Replace uses of LIC in the loop with the given constant. + for (auto UI = LIC.use_begin(), UE = LIC.use_end(); UI != UE;) { + // Grab the use and walk past it so we can clobber it in the use list. + Use *U = &*UI++; + Instruction *UserI = dyn_cast<Instruction>(U->getUser()); + if (!UserI || !L.contains(UserI)) + continue; + + // Replace this use within the loop body. + *U = &Replacement; + } +} + +/// Update the dominator tree after removing one exiting predecessor of a loop +/// exit block. +static void updateLoopExitIDom(BasicBlock *LoopExitBB, Loop &L, + DominatorTree &DT) { + assert(pred_begin(LoopExitBB) != pred_end(LoopExitBB) && + "Cannot have empty predecessors of the loop exit block if we split " + "off a block to unswitch!"); + + BasicBlock *IDom = *pred_begin(LoopExitBB); + // Walk all of the other predecessors finding the nearest common dominator + // until all predecessors are covered or we reach the loop header. 
The loop + // header necessarily dominates all loop exit blocks in loop simplified form + // so we can early-exit the moment we hit that block. + for (auto PI = std::next(pred_begin(LoopExitBB)), PE = pred_end(LoopExitBB); + PI != PE && IDom != L.getHeader(); ++PI) + IDom = DT.findNearestCommonDominator(IDom, *PI); + + DT.changeImmediateDominator(LoopExitBB, IDom); +} + +/// Update the dominator tree after unswitching a particular former exit block. +/// +/// This handles the full update of the dominator tree after hoisting a block +/// that previously was an exit block (or split off of an exit block) up to be +/// reached from the new immediate dominator of the preheader. +/// +/// The common case is simple -- we just move the unswitched block to have an +/// immediate dominator of the old preheader. But in complex cases, there may +/// be other blocks reachable from the unswitched block that are immediately +/// dominated by some node between the unswitched one and the old preheader. +/// All of these also need to be hoisted in the dominator tree. We also want to +/// minimize queries to the dominator tree because each step of this +/// invalidates any DFS numbers that would make queries fast. +static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH, + DominatorTree &DT) { + DomTreeNode *OldPHNode = DT[OldPH]; + DomTreeNode *UnswitchedNode = DT[UnswitchedBB]; + // If the dominator tree has already been updated for this unswitched node, + // we're done. This makes it easier to use this routine if there are multiple + // paths to the same unswitched destination. + if (UnswitchedNode->getIDom() == OldPHNode) + return; + + // First collect the domtree nodes that we are hoisting over. These are the + // set of nodes which may have children that need to be hoisted as well. + SmallPtrSet<DomTreeNode *, 4> DomChain; + for (auto *IDom = UnswitchedNode->getIDom(); IDom != OldPHNode; + IDom = IDom->getIDom()) + DomChain.insert(IDom); + + // The unswitched block ends up immediately dominated by the old preheader -- + // regardless of whether it is the loop exit block or split off of the loop + // exit block. + DT.changeImmediateDominator(UnswitchedNode, OldPHNode); + + // For everything that moves up the dominator tree, we need to examine the + // dominator frontier to see if it additionally should move up the dominator + // tree. This lambda appends the dominator frontier for a node on the + // worklist. + // + // Note that we don't currently use the IDFCalculator here for two reasons: + // 1) It computes dominator tree levels for the entire function on each run + // of 'compute'. While this isn't terrible, given that we expect to update + // relatively small subtrees of the domtree, it isn't necessarily the right + // tradeoff. + // 2) The interface doesn't fit this usage well. It doesn't operate in + // append-only, and builds several sets that we don't need. + // + // FIXME: Neither of these issues are a big deal and could be addressed with + // some amount of refactoring of IDFCalculator. That would allow us to share + // the core logic here (which is solving the same core problem). 
+ SmallSetVector<BasicBlock *, 4> Worklist; + SmallVector<DomTreeNode *, 4> DomNodes; + SmallPtrSet<BasicBlock *, 4> DomSet; + auto AppendDomFrontier = [&](DomTreeNode *Node) { + assert(DomNodes.empty() && "Must start with no dominator nodes."); + assert(DomSet.empty() && "Must start with an empty dominator set."); + + // First flatten this subtree into sequence of nodes by doing a pre-order + // walk. + DomNodes.push_back(Node); + // We intentionally re-evaluate the size as each node can add new children. + // Because this is a tree walk, this cannot add any duplicates. + for (int i = 0; i < (int)DomNodes.size(); ++i) + DomNodes.insert(DomNodes.end(), DomNodes[i]->begin(), DomNodes[i]->end()); + + // Now create a set of the basic blocks so we can quickly test for + // dominated successors. We could in theory use the DFS numbers of the + // dominator tree for this, but we want this to remain predictably fast + // even while we mutate the dominator tree in ways that would invalidate + // the DFS numbering. + for (DomTreeNode *InnerN : DomNodes) + DomSet.insert(InnerN->getBlock()); + + // Now re-walk the nodes, appending every successor of every node that isn't + // in the set. Note that we don't append the node itself, even though if it + // is a successor it does not strictly dominate itself and thus it would be + // part of the dominance frontier. The reason we don't append it is that + // the node passed in came *from* the worklist and so it has already been + // processed. + for (DomTreeNode *InnerN : DomNodes) + for (BasicBlock *SuccBB : successors(InnerN->getBlock())) + if (!DomSet.count(SuccBB)) + Worklist.insert(SuccBB); + + DomNodes.clear(); + DomSet.clear(); + }; + + // Append the initial dom frontier nodes. + AppendDomFrontier(UnswitchedNode); + + // Walk the worklist. We grow the list in the loop and so must recompute size. + for (int i = 0; i < (int)Worklist.size(); ++i) { + auto *BB = Worklist[i]; + + DomTreeNode *Node = DT[BB]; + assert(!DomChain.count(Node) && + "Cannot be dominated by a block you can reach!"); + + // If this block had an immediate dominator somewhere in the chain + // we hoisted over, then its position in the domtree needs to move as it is + // reachable from a node hoisted over this chain. + if (!DomChain.count(Node->getIDom())) + continue; + + DT.changeImmediateDominator(Node, OldPHNode); + + // Now add this node's dominator frontier to the worklist as well. + AppendDomFrontier(Node); + } +} + +/// Check that all the LCSSA PHI nodes in the loop exit block have trivial +/// incoming values along this edge. +static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB, + BasicBlock &ExitBB) { + for (Instruction &I : ExitBB) { + auto *PN = dyn_cast<PHINode>(&I); + if (!PN) + // No more PHIs to check. + return true; + + // If the incoming value for this edge isn't loop invariant the unswitch + // won't be trivial. + if (!L.isLoopInvariant(PN->getIncomingValueForBlock(&ExitingBB))) + return false; + } + llvm_unreachable("Basic blocks should never be empty!"); +} + +/// Rewrite the PHI nodes in an unswitched loop exit basic block. +/// +/// Requires that the loop exit and unswitched basic block are the same, and +/// that the exiting block was a unique predecessor of that block. Rewrites the +/// PHI nodes in that block such that what were LCSSA PHI nodes become trivial +/// PHI nodes from the old preheader that now contains the unswitched +/// terminator. 
+static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB, + BasicBlock &OldExitingBB, + BasicBlock &OldPH) { + for (Instruction &I : UnswitchedBB) { + auto *PN = dyn_cast<PHINode>(&I); + if (!PN) + // No more PHIs to check. + break; + + // When the loop exit is directly unswitched we just need to update the + // incoming basic block. We loop to handle weird cases with repeated + // incoming blocks, but expect to typically only have one operand here. + for (auto i : seq<int>(0, PN->getNumOperands())) { + assert(PN->getIncomingBlock(i) == &OldExitingBB && + "Found incoming block different from unique predecessor!"); + PN->setIncomingBlock(i, &OldPH); + } + } +} + +/// Rewrite the PHI nodes in the loop exit basic block and the split off +/// unswitched block. +/// +/// Because the exit block remains an exit from the loop, this rewrites the +/// LCSSA PHI nodes in it to remove the unswitched edge and introduces PHI +/// nodes into the unswitched basic block to select between the value in the +/// old preheader and the loop exit. +static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB, + BasicBlock &UnswitchedBB, + BasicBlock &OldExitingBB, + BasicBlock &OldPH) { + assert(&ExitBB != &UnswitchedBB && + "Must have different loop exit and unswitched blocks!"); + Instruction *InsertPt = &*UnswitchedBB.begin(); + for (Instruction &I : ExitBB) { + auto *PN = dyn_cast<PHINode>(&I); + if (!PN) + // No more PHIs to check. + break; + + auto *NewPN = PHINode::Create(PN->getType(), /*NumReservedValues*/ 2, + PN->getName() + ".split", InsertPt); + + // Walk backwards over the old PHI node's inputs to minimize the cost of + // removing each one. We have to do this weird loop manually so that we + // create the same number of new incoming edges in the new PHI as we expect + // each case-based edge to be included in the unswitched switch in some + // cases. + // FIXME: This is really, really gross. It would be much cleaner if LLVM + // allowed us to create a single entry for a predecessor block without + // having separate entries for each "edge" even though these edges are + // required to produce identical results. + for (int i = PN->getNumIncomingValues() - 1; i >= 0; --i) { + if (PN->getIncomingBlock(i) != &OldExitingBB) + continue; + + Value *Incoming = PN->removeIncomingValue(i); + NewPN->addIncoming(Incoming, &OldPH); + } + + // Now replace the old PHI with the new one and wire the old one in as an + // input to the new one. + PN->replaceAllUsesWith(NewPN); + NewPN->addIncoming(PN, &ExitBB); + } +} + +/// Unswitch a trivial branch if the condition is loop invariant. +/// +/// This routine should only be called when loop code leading to the branch has +/// been validated as trivial (no side effects). This routine checks if the +/// condition is invariant and one of the successors is a loop exit. This +/// allows us to unswitch without duplicating the loop, making it trivial. +/// +/// If this routine fails to unswitch the branch it returns false. +/// +/// If the branch can be unswitched, this routine splits the preheader and +/// hoists the branch above that split. Preserves loop simplified form +/// (splitting the exit block as necessary). It simplifies the branch within +/// the loop to an unconditional branch but doesn't remove it entirely. Further +/// cleanup can be done with some simplify-cfg like pass. 
+static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, + LoopInfo &LI) { + assert(BI.isConditional() && "Can only unswitch a conditional branch!"); + DEBUG(dbgs() << " Trying to unswitch branch: " << BI << "\n"); + + Value *LoopCond = BI.getCondition(); + + // Need a trivial loop condition to unswitch. + if (!L.isLoopInvariant(LoopCond)) + return false; + + // FIXME: We should compute this once at the start and update it! + SmallVector<BasicBlock *, 16> ExitBlocks; + L.getExitBlocks(ExitBlocks); + SmallPtrSet<BasicBlock *, 16> ExitBlockSet(ExitBlocks.begin(), + ExitBlocks.end()); + + // Check to see if a successor of the branch is guaranteed to + // exit through a unique exit block without having any + // side-effects. If so, determine the value of Cond that causes + // it to do this. + ConstantInt *CondVal = ConstantInt::getTrue(BI.getContext()); + ConstantInt *Replacement = ConstantInt::getFalse(BI.getContext()); + int LoopExitSuccIdx = 0; + auto *LoopExitBB = BI.getSuccessor(0); + if (!ExitBlockSet.count(LoopExitBB)) { + std::swap(CondVal, Replacement); + LoopExitSuccIdx = 1; + LoopExitBB = BI.getSuccessor(1); + if (!ExitBlockSet.count(LoopExitBB)) + return false; + } + auto *ContinueBB = BI.getSuccessor(1 - LoopExitSuccIdx); + assert(L.contains(ContinueBB) && + "Cannot have both successors exit and still be in the loop!"); + + auto *ParentBB = BI.getParent(); + if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, *LoopExitBB)) + return false; + + DEBUG(dbgs() << " unswitching trivial branch when: " << CondVal + << " == " << LoopCond << "\n"); + + // Split the preheader, so that we know that there is a safe place to insert + // the conditional branch. We will change the preheader to have a conditional + // branch on LoopCond. + BasicBlock *OldPH = L.getLoopPreheader(); + BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI); + + // Now that we have a place to insert the conditional branch, create a place + // to branch to: this is the exit block out of the loop that we are + // unswitching. We need to split this if there are other loop predecessors. + // Because the loop is in simplified form, *any* other predecessor is enough. + BasicBlock *UnswitchedBB; + if (BasicBlock *PredBB = LoopExitBB->getUniquePredecessor()) { + (void)PredBB; + assert(PredBB == BI.getParent() && + "A branch's parent isn't a predecessor!"); + UnswitchedBB = LoopExitBB; + } else { + UnswitchedBB = SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI); + } + + // Now splice the branch to gate reaching the new preheader and re-point its + // successors. + OldPH->getInstList().splice(std::prev(OldPH->end()), + BI.getParent()->getInstList(), BI); + OldPH->getTerminator()->eraseFromParent(); + BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB); + BI.setSuccessor(1 - LoopExitSuccIdx, NewPH); + + // Create a new unconditional branch that will continue the loop as a new + // terminator. + BranchInst::Create(ContinueBB, ParentBB); + + // Rewrite the relevant PHI nodes. + if (UnswitchedBB == LoopExitBB) + rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH); + else + rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB, + *ParentBB, *OldPH); + + // Now we need to update the dominator tree. + updateDTAfterUnswitch(UnswitchedBB, OldPH, DT); + // But if we split something off of the loop exit block then we also removed + // one of the predecessors for the loop exit block and may need to update its + // idom. 
+ if (UnswitchedBB != LoopExitBB) + updateLoopExitIDom(LoopExitBB, L, DT); + + // Since this is an i1 condition we can also trivially replace uses of it + // within the loop with a constant. + replaceLoopUsesWithConstant(L, *LoopCond, *Replacement); + + ++NumTrivial; + ++NumBranches; + return true; +} + +/// Unswitch a trivial switch if the condition is loop invariant. +/// +/// This routine should only be called when loop code leading to the switch has +/// been validated as trivial (no side effects). This routine checks if the +/// condition is invariant and that at least one of the successors is a loop +/// exit. This allows us to unswitch without duplicating the loop, making it +/// trivial. +/// +/// If this routine fails to unswitch the switch it returns false. +/// +/// If the switch can be unswitched, this routine splits the preheader and +/// copies the switch above that split. If the default case is one of the +/// exiting cases, it copies the non-exiting cases and points them at the new +/// preheader. If the default case is not exiting, it copies the exiting cases +/// and points the default at the preheader. It preserves loop simplified form +/// (splitting the exit blocks as necessary). It simplifies the switch within +/// the loop by removing now-dead cases. If the default case is one of those +/// unswitched, it replaces its destination with a new basic block containing +/// only unreachable. Such basic blocks, while technically loop exits, are not +/// considered for unswitching so this is a stable transform and the same +/// switch will not be revisited. If after unswitching there is only a single +/// in-loop successor, the switch is further simplified to an unconditional +/// branch. Still more cleanup can be done with some simplify-cfg like pass. +static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, + LoopInfo &LI) { + DEBUG(dbgs() << " Trying to unswitch switch: " << SI << "\n"); + Value *LoopCond = SI.getCondition(); + + // If this isn't switching on an invariant condition, we can't unswitch it. + if (!L.isLoopInvariant(LoopCond)) + return false; + + auto *ParentBB = SI.getParent(); + + // FIXME: We should compute this once at the start and update it! + SmallVector<BasicBlock *, 16> ExitBlocks; + L.getExitBlocks(ExitBlocks); + SmallPtrSet<BasicBlock *, 16> ExitBlockSet(ExitBlocks.begin(), + ExitBlocks.end()); + + SmallVector<int, 4> ExitCaseIndices; + for (auto Case : SI.cases()) { + auto *SuccBB = Case.getCaseSuccessor(); + if (ExitBlockSet.count(SuccBB) && + areLoopExitPHIsLoopInvariant(L, *ParentBB, *SuccBB)) + ExitCaseIndices.push_back(Case.getCaseIndex()); + } + BasicBlock *DefaultExitBB = nullptr; + if (ExitBlockSet.count(SI.getDefaultDest()) && + areLoopExitPHIsLoopInvariant(L, *ParentBB, *SI.getDefaultDest()) && + !isa<UnreachableInst>(SI.getDefaultDest()->getTerminator())) + DefaultExitBB = SI.getDefaultDest(); + else if (ExitCaseIndices.empty()) + return false; + + DEBUG(dbgs() << " unswitching trivial cases...\n"); + + SmallVector<std::pair<ConstantInt *, BasicBlock *>, 4> ExitCases; + ExitCases.reserve(ExitCaseIndices.size()); + // We walk the case indices backwards so that we remove the last case first + // and don't disrupt the earlier indices. + for (unsigned Index : reverse(ExitCaseIndices)) { + auto CaseI = SI.case_begin() + Index; + // Save the value of this case. + ExitCases.push_back({CaseI->getCaseValue(), CaseI->getCaseSuccessor()}); + // Delete the unswitched cases. 
+ SI.removeCase(CaseI); + } + + // Check if after this all of the remaining cases point at the same + // successor. + BasicBlock *CommonSuccBB = nullptr; + if (SI.getNumCases() > 0 && + std::all_of(std::next(SI.case_begin()), SI.case_end(), + [&SI](const SwitchInst::CaseHandle &Case) { + return Case.getCaseSuccessor() == + SI.case_begin()->getCaseSuccessor(); + })) + CommonSuccBB = SI.case_begin()->getCaseSuccessor(); + + if (DefaultExitBB) { + // We can't remove the default edge so replace it with an edge to either + // the single common remaining successor (if we have one) or an unreachable + // block. + if (CommonSuccBB) { + SI.setDefaultDest(CommonSuccBB); + } else { + BasicBlock *UnreachableBB = BasicBlock::Create( + ParentBB->getContext(), + Twine(ParentBB->getName()) + ".unreachable_default", + ParentBB->getParent()); + new UnreachableInst(ParentBB->getContext(), UnreachableBB); + SI.setDefaultDest(UnreachableBB); + DT.addNewBlock(UnreachableBB, ParentBB); + } + } else { + // If we're not unswitching the default, we need it to match any cases to + // have a common successor or if we have no cases it is the common + // successor. + if (SI.getNumCases() == 0) + CommonSuccBB = SI.getDefaultDest(); + else if (SI.getDefaultDest() != CommonSuccBB) + CommonSuccBB = nullptr; + } + + // Split the preheader, so that we know that there is a safe place to insert + // the switch. + BasicBlock *OldPH = L.getLoopPreheader(); + BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI); + OldPH->getTerminator()->eraseFromParent(); + + // Now add the unswitched switch. + auto *NewSI = SwitchInst::Create(LoopCond, NewPH, ExitCases.size(), OldPH); + + // Rewrite the IR for the unswitched basic blocks. This requires two steps. + // First, we split any exit blocks with remaining in-loop predecessors. Then + // we update the PHIs in one of two ways depending on if there was a split. + // We walk in reverse so that we split in the same order as the cases + // appeared. This is purely for convenience of reading the resulting IR, but + // it doesn't cost anything really. + SmallPtrSet<BasicBlock *, 2> UnswitchedExitBBs; + SmallDenseMap<BasicBlock *, BasicBlock *, 2> SplitExitBBMap; + // Handle the default exit if necessary. + // FIXME: It'd be great if we could merge this with the loop below but LLVM's + // ranges aren't quite powerful enough yet. + if (DefaultExitBB) { + if (pred_empty(DefaultExitBB)) { + UnswitchedExitBBs.insert(DefaultExitBB); + rewritePHINodesForUnswitchedExitBlock(*DefaultExitBB, *ParentBB, *OldPH); + } else { + auto *SplitBB = + SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI); + rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB, + *ParentBB, *OldPH); + updateLoopExitIDom(DefaultExitBB, L, DT); + DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB; + } + } + // Note that we must use a reference in the for loop so that we update the + // container. + for (auto &CasePair : reverse(ExitCases)) { + // Grab a reference to the exit block in the pair so that we can update it. + BasicBlock *ExitBB = CasePair.second; + + // If this case is the last edge into the exit block, we can simply reuse it + // as it will no longer be a loop exit. No mapping necessary. + if (pred_empty(ExitBB)) { + // Only rewrite once. 
+ if (UnswitchedExitBBs.insert(ExitBB).second) + rewritePHINodesForUnswitchedExitBlock(*ExitBB, *ParentBB, *OldPH); + continue; + } + + // Otherwise we need to split the exit block so that we retain an exit + // block from the loop and a target for the unswitched condition. + BasicBlock *&SplitExitBB = SplitExitBBMap[ExitBB]; + if (!SplitExitBB) { + // If this is the first time we see this, do the split and remember it. + SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI); + rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB, + *ParentBB, *OldPH); + updateLoopExitIDom(ExitBB, L, DT); + } + // Update the case pair to point to the split block. + CasePair.second = SplitExitBB; + } + + // Now add the unswitched cases. We do this in reverse order as we built them + // in reverse order. + for (auto CasePair : reverse(ExitCases)) { + ConstantInt *CaseVal = CasePair.first; + BasicBlock *UnswitchedBB = CasePair.second; + + NewSI->addCase(CaseVal, UnswitchedBB); + updateDTAfterUnswitch(UnswitchedBB, OldPH, DT); + } + + // If the default was unswitched, re-point it and add explicit cases for + // entering the loop. + if (DefaultExitBB) { + NewSI->setDefaultDest(DefaultExitBB); + updateDTAfterUnswitch(DefaultExitBB, OldPH, DT); + + // We removed all the exit cases, so we just copy the cases to the + // unswitched switch. + for (auto Case : SI.cases()) + NewSI->addCase(Case.getCaseValue(), NewPH); + } + + // If we ended up with a common successor for every path through the switch + // after unswitching, rewrite it to an unconditional branch to make it easy + // to recognize. Otherwise we potentially have to recognize the default case + // pointing at unreachable and other complexity. + if (CommonSuccBB) { + BasicBlock *BB = SI.getParent(); + SI.eraseFromParent(); + BranchInst::Create(CommonSuccBB, BB); + } + + DT.verifyDomTree(); + ++NumTrivial; + ++NumSwitches; + return true; +} + +/// This routine scans the loop to find a branch or switch which occurs before +/// any side effects occur. These can potentially be unswitched without +/// duplicating the loop. If a branch or switch is successfully unswitched the +/// scanning continues to see if subsequent branches or switches have become +/// trivial. Once all trivial candidates have been unswitched, this routine +/// returns. +/// +/// The return value indicates whether anything was unswitched (and therefore +/// changed). +static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT, + LoopInfo &LI) { + bool Changed = false; + + // If loop header has only one reachable successor we should keep looking for + // trivial condition candidates in the successor as well. An alternative is + // to constant fold conditions and merge successors into loop header (then we + // only need to check header's terminator). The reason for not doing this in + // LoopUnswitch pass is that it could potentially break LoopPassManager's + // invariants. Folding dead branches could either eliminate the current loop + // or make other loops unreachable. LCSSA form might also not be preserved + // after deleting branches. The following code keeps traversing loop header's + // successors until it finds the trivial condition candidate (condition that + // is not a constant). Since unswitching generates branches with constant + // conditions, this scenario could be very common in practice. 
+ BasicBlock *CurrentBB = L.getHeader();
+ SmallPtrSet<BasicBlock *, 8> Visited;
+ Visited.insert(CurrentBB);
+ do {
+ // Check if there are any side-effecting instructions (e.g. stores, calls,
+ // volatile loads) in the part of the loop that the code *would* execute
+ // without unswitching.
+ if (llvm::any_of(*CurrentBB,
+ [](Instruction &I) { return I.mayHaveSideEffects(); }))
+ return Changed;
+
+ TerminatorInst *CurrentTerm = CurrentBB->getTerminator();
+
+ if (auto *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
+ // Don't bother trying to unswitch past a switch with a constant
+ // condition. This should be removed prior to running this pass by
+ // simplify-cfg.
+ if (isa<Constant>(SI->getCondition()))
+ return Changed;
+
+ if (!unswitchTrivialSwitch(L, *SI, DT, LI))
+ // Couldn't unswitch this one so we're done.
+ return Changed;
+
+ // Mark that we managed to unswitch something.
+ Changed = true;
+
+ // If unswitching turned the terminator into an unconditional branch then
+ // we can continue. The unswitching logic specifically works to fold any
+ // cases it can into an unconditional branch to make it easier to
+ // recognize here.
+ auto *BI = dyn_cast<BranchInst>(CurrentBB->getTerminator());
+ if (!BI || BI->isConditional())
+ return Changed;
+
+ CurrentBB = BI->getSuccessor(0);
+ continue;
+ }
+
+ auto *BI = dyn_cast<BranchInst>(CurrentTerm);
+ if (!BI)
+ // We do not understand other terminator instructions.
+ return Changed;
+
+ // Don't bother trying to unswitch past an unconditional branch or a branch
+ // with a constant value. These should be removed by simplify-cfg prior to
+ // running this pass.
+ if (!BI->isConditional() || isa<Constant>(BI->getCondition()))
+ return Changed;
+
+ // Found a trivial condition candidate: non-foldable conditional branch. If
+ // we fail to unswitch this, we can't do anything else that is trivial.
+ if (!unswitchTrivialBranch(L, *BI, DT, LI))
+ return Changed;
+
+ // Mark that we managed to unswitch something.
+ Changed = true;
+
+ // We unswitched the branch. This should always leave us with an
+ // unconditional branch that we can follow now.
+ BI = cast<BranchInst>(CurrentBB->getTerminator());
+ assert(!BI->isConditional() &&
+ "Cannot form a conditional branch by unswitching!");
+ CurrentBB = BI->getSuccessor(0);
+
+ // When continuing, if we exit the loop or reach a previously visited block,
+ // then we cannot reach any trivial condition candidates (unfoldable
+ // branch instructions or switch instructions) and no unswitch can happen.
+ } while (L.contains(CurrentBB) && Visited.insert(CurrentBB).second);
+
+ return Changed;
+}
+
+/// Unswitch control flow predicated on loop invariant conditions.
+///
+/// This first hoists all branches or switches which are trivial (i.e., do not
+/// require duplicating any part of the loop) out of the loop body. It then
+/// looks at other loop invariant control flows and tries to unswitch those as
+/// well by cloning the loop if the result is small enough.
+static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
+ AssumptionCache &AC) {
+ assert(L.isLCSSAForm(DT) &&
+ "Loops must be in LCSSA form before unswitching.");
+ bool Changed = false;
+
+ // Must be in loop simplified form: we need a preheader and dedicated exits.
+ if (!L.isLoopSimplifyForm())
+ return false;
+
+ // Try trivial unswitch first before looping over other basic blocks in the loop.
+ Changed |= unswitchAllTrivialConditions(L, DT, LI);
+
+ // FIXME: Add support for non-trivial unswitching by cloning the loop.
+ + return Changed; +} + +PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM, + LoopStandardAnalysisResults &AR, + LPMUpdater &U) { + Function &F = *L.getHeader()->getParent(); + (void)F; + + DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L << "\n"); + + if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC)) + return PreservedAnalyses::all(); + +#ifndef NDEBUG + // Historically this pass has had issues with the dominator tree so verify it + // in asserts builds. + AR.DT.verifyDomTree(); +#endif + return getLoopPassPreservedAnalyses(); +} + +namespace { + +class SimpleLoopUnswitchLegacyPass : public LoopPass { +public: + static char ID; // Pass ID, replacement for typeid + + explicit SimpleLoopUnswitchLegacyPass() : LoopPass(ID) { + initializeSimpleLoopUnswitchLegacyPassPass( + *PassRegistry::getPassRegistry()); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<AssumptionCacheTracker>(); + getLoopAnalysisUsage(AU); + } +}; + +} // end anonymous namespace + +bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) { + if (skipLoop(L)) + return false; + + Function &F = *L->getHeader()->getParent(); + + DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << *L << "\n"); + + auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); + + bool Changed = unswitchLoop(*L, DT, LI, AC); + +#ifndef NDEBUG + // Historically this pass has had issues with the dominator tree so verify it + // in asserts builds. + DT.verifyDomTree(); +#endif + return Changed; +} + +char SimpleLoopUnswitchLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch", + "Simple unswitch loops", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(LoopPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_END(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch", + "Simple unswitch loops", false, false) + +Pass *llvm::createSimpleLoopUnswitchLegacyPass() { + return new SimpleLoopUnswitchLegacyPass(); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index f2723bd..8754c71 100644 --- a/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -130,7 +130,8 @@ static bool mergeEmptyReturnBlocks(Function &F) { /// iterating until no more changes are made. static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, AssumptionCache *AC, - unsigned BonusInstThreshold) { + unsigned BonusInstThreshold, + bool LateSimplifyCFG) { bool Changed = false; bool LocalChange = true; @@ -145,7 +146,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, // Loop over all of the basic blocks and remove them if they are unneeded. 
for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) { - if (SimplifyCFG(&*BBIt++, TTI, BonusInstThreshold, AC, &LoopHeaders)) { + if (SimplifyCFG(&*BBIt++, TTI, BonusInstThreshold, AC, &LoopHeaders, LateSimplifyCFG)) { LocalChange = true; ++NumSimpl; } @@ -156,10 +157,12 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, } static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI, - AssumptionCache *AC, int BonusInstThreshold) { + AssumptionCache *AC, int BonusInstThreshold, + bool LateSimplifyCFG) { bool EverChanged = removeUnreachableBlocks(F); EverChanged |= mergeEmptyReturnBlocks(F); - EverChanged |= iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold); + EverChanged |= iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold, + LateSimplifyCFG); // If neither pass changed anything, we're done. if (!EverChanged) return false; @@ -173,7 +176,8 @@ static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI, return true; do { - EverChanged = iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold); + EverChanged = iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold, + LateSimplifyCFG); EverChanged |= removeUnreachableBlocks(F); } while (EverChanged); @@ -181,17 +185,19 @@ static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI, } SimplifyCFGPass::SimplifyCFGPass() - : BonusInstThreshold(UserBonusInstThreshold) {} + : BonusInstThreshold(UserBonusInstThreshold), + LateSimplifyCFG(true) {} -SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold) - : BonusInstThreshold(BonusInstThreshold) {} +SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold, bool LateSimplifyCFG) + : BonusInstThreshold(BonusInstThreshold), + LateSimplifyCFG(LateSimplifyCFG) {} PreservedAnalyses SimplifyCFGPass::run(Function &F, FunctionAnalysisManager &AM) { auto &TTI = AM.getResult<TargetIRAnalysis>(F); auto &AC = AM.getResult<AssumptionAnalysis>(F); - if (!simplifyFunctionCFG(F, TTI, &AC, BonusInstThreshold)) + if (!simplifyFunctionCFG(F, TTI, &AC, BonusInstThreshold, LateSimplifyCFG)) return PreservedAnalyses::all(); PreservedAnalyses PA; PA.preserve<GlobalsAA>(); @@ -199,16 +205,17 @@ PreservedAnalyses SimplifyCFGPass::run(Function &F, } namespace { -struct CFGSimplifyPass : public FunctionPass { - static char ID; // Pass identification, replacement for typeid +struct BaseCFGSimplifyPass : public FunctionPass { unsigned BonusInstThreshold; std::function<bool(const Function &)> PredicateFtor; + bool LateSimplifyCFG; - CFGSimplifyPass(int T = -1, - std::function<bool(const Function &)> Ftor = nullptr) - : FunctionPass(ID), PredicateFtor(std::move(Ftor)) { + BaseCFGSimplifyPass(int T, bool LateSimplifyCFG, + std::function<bool(const Function &)> Ftor, + char &ID) + : FunctionPass(ID), PredicateFtor(std::move(Ftor)), + LateSimplifyCFG(LateSimplifyCFG) { BonusInstThreshold = (T == -1) ? 
UserBonusInstThreshold : unsigned(T); - initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry()); } bool runOnFunction(Function &F) override { if (skipFunction(F) || (PredicateFtor && !PredicateFtor(F))) @@ -218,7 +225,7 @@ struct CFGSimplifyPass : public FunctionPass { &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F); const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); - return simplifyFunctionCFG(F, TTI, AC, BonusInstThreshold); + return simplifyFunctionCFG(F, TTI, AC, BonusInstThreshold, LateSimplifyCFG); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -227,6 +234,26 @@ struct CFGSimplifyPass : public FunctionPass { AU.addPreserved<GlobalsAAWrapperPass>(); } }; + +struct CFGSimplifyPass : public BaseCFGSimplifyPass { + static char ID; // Pass identification, replacement for typeid + + CFGSimplifyPass(int T = -1, + std::function<bool(const Function &)> Ftor = nullptr) + : BaseCFGSimplifyPass(T, false, Ftor, ID) { + initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry()); + } +}; + +struct LateCFGSimplifyPass : public BaseCFGSimplifyPass { + static char ID; // Pass identification, replacement for typeid + + LateCFGSimplifyPass(int T = -1, + std::function<bool(const Function &)> Ftor = nullptr) + : BaseCFGSimplifyPass(T, true, Ftor, ID) { + initializeLateCFGSimplifyPassPass(*PassRegistry::getPassRegistry()); + } +}; } char CFGSimplifyPass::ID = 0; @@ -237,9 +264,24 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false, false) +char LateCFGSimplifyPass::ID = 0; +INITIALIZE_PASS_BEGIN(LateCFGSimplifyPass, "latesimplifycfg", + "Simplify the CFG more aggressively", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_END(LateCFGSimplifyPass, "latesimplifycfg", + "Simplify the CFG more aggressively", false, false) + // Public interface to the CFGSimplification pass FunctionPass * llvm::createCFGSimplificationPass(int Threshold, - std::function<bool(const Function &)> Ftor) { + std::function<bool(const Function &)> Ftor) { return new CFGSimplifyPass(Threshold, std::move(Ftor)); } + +// Public interface to the LateCFGSimplification pass +FunctionPass * +llvm::createLateCFGSimplificationPass(int Threshold, + std::function<bool(const Function &)> Ftor) { + return new LateCFGSimplifyPass(Threshold, std::move(Ftor)); +} diff --git a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp index c3f14a0..5210f16 100644 --- a/contrib/llvm/lib/Transforms/Scalar/Sink.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/Sink.cpp @@ -114,7 +114,7 @@ static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo, if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) { // We cannot sink a load across a critical edge - there may be stores in // other code paths. - if (!isSafeToSpeculativelyExecute(Inst)) + if (isa<LoadInst>(Inst)) return false; // We don't want to sink across a critical edge if we don't dominate the @@ -164,13 +164,14 @@ static bool SinkInstruction(Instruction *Inst, // Instructions can only be sunk if all their uses are in blocks // dominated by one of the successors. - // Look at all the postdominators and see if we can sink it in one. + // Look at all the dominated blocks and see if we can sink it in one. 
DomTreeNode *DTN = DT.getNode(Inst->getParent()); for (DomTreeNode::iterator I = DTN->begin(), E = DTN->end(); I != E && SuccToSinkTo == nullptr; ++I) { BasicBlock *Candidate = (*I)->getBlock(); - if ((*I)->getIDom()->getBlock() == Inst->getParent() && - IsAcceptableTarget(Inst, Candidate, DT, LI)) + // A node always immediate-dominates its children on the dominator + // tree. + if (IsAcceptableTarget(Inst, Candidate, DT, LI)) SuccToSinkTo = Candidate; } @@ -262,9 +263,8 @@ PreservedAnalyses SinkingPass::run(Function &F, FunctionAnalysisManager &AM) { if (!iterativelySinkInstructions(F, DT, LI, AA)) return PreservedAnalyses::all(); - auto PA = PreservedAnalyses(); - PA.preserve<DominatorTreeAnalysis>(); - PA.preserve<LoopAnalysis>(); + PreservedAnalyses PA; + PA.preserveSet<CFGAnalyses>(); return PA; } diff --git a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp index 2be3f5c..8b8d659 100644 --- a/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp @@ -693,7 +693,7 @@ bool StraightLineStrengthReduce::runOnFunction(Function &F) { UnlinkedInst->setOperand(I, nullptr); RecursivelyDeleteTriviallyDeadInstructions(Op); } - delete UnlinkedInst; + UnlinkedInst->deleteValue(); } bool Ret = !UnlinkedInstructions.empty(); UnlinkedInstructions.clear(); diff --git a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp index 49ce026..0cccb41 100644 --- a/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -7,7 +7,6 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Scalar.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SCCIterator.h" @@ -20,6 +19,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; @@ -329,7 +329,7 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) { Loops[Exit] = N->getEntry(); } else { - // Test for sucessors as back edge + // Test for successors as back edge BasicBlock *BB = N->getNodeAs<BasicBlock>(); BranchInst *Term = cast<BranchInst>(BB->getTerminator()); diff --git a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp index a6b9fee..90c5c24 100644 --- a/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/contrib/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -51,13 +51,12 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Scalar/TailRecursionElimination.h" -#include "llvm/Transforms/Scalar.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/Loads.h" @@ -69,6 +68,7 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/Function.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include 
"llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" @@ -76,6 +76,7 @@ #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; @@ -90,16 +91,10 @@ STATISTIC(NumAccumAdded, "Number of accumulators introduced"); /// If it contains any dynamic allocas, returns false. static bool canTRE(Function &F) { // Because of PR962, we don't TRE dynamic allocas. - for (auto &BB : F) { - for (auto &I : BB) { - if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) { - if (!AI->isStaticAlloca()) - return false; - } - } - } - - return true; + return llvm::all_of(instructions(F), [](Instruction &I) { + auto *AI = dyn_cast<AllocaInst>(&I); + return !AI || AI->isStaticAlloca(); + }); } namespace { @@ -321,7 +316,7 @@ static bool markTails(Function &F, bool &AllCallsAreTailCalls) { /// instruction from after the call to before the call, assuming that all /// instructions between the call and this instruction are movable. /// -static bool canMoveAboveCall(Instruction *I, CallInst *CI) { +static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) { // FIXME: We can move load/store/call/free instructions above the call if the // call does not mod/ref the memory location being processed. if (I->mayHaveSideEffects()) // This also handles volatile loads. @@ -332,10 +327,10 @@ static bool canMoveAboveCall(Instruction *I, CallInst *CI) { if (CI->mayHaveSideEffects()) { // Non-volatile loads may be moved above a call with side effects if it // does not write to memory and the load provably won't trap. - // FIXME: Writes to memory only matter if they may alias the pointer + // Writes to memory only matter if they may alias the pointer // being loaded from. const DataLayout &DL = L->getModule()->getDataLayout(); - if (CI->mayWriteToMemory() || + if ((AA->getModRefInfo(CI, MemoryLocation::get(L)) & MRI_Mod) || !isSafeToLoadUnconditionally(L->getPointerOperand(), L->getAlignment(), DL, L)) return false; @@ -496,7 +491,7 @@ static bool eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail, SmallVectorImpl<PHINode *> &ArgumentPHIs, - bool CannotTailCallElimCallsMarkedTail) { + AliasAnalysis *AA) { // If we are introducing accumulator recursion to eliminate operations after // the call instruction that are both associative and commutative, the initial // value for the accumulator is placed in this variable. If this value is set @@ -516,7 +511,8 @@ static bool eliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, // Check that this is the case now. BasicBlock::iterator BBI(CI); for (++BBI; &*BBI != Ret; ++BBI) { - if (canMoveAboveCall(&*BBI, CI)) continue; + if (canMoveAboveCall(&*BBI, CI, AA)) + continue; // If we can't move the instruction above the call, it might be because it // is an associative and commutative operation that could be transformed @@ -675,12 +671,17 @@ static bool foldReturnAndProcessPred(BasicBlock *BB, ReturnInst *Ret, bool &TailCallsAreMarkedTail, SmallVectorImpl<PHINode *> &ArgumentPHIs, bool CannotTailCallElimCallsMarkedTail, - const TargetTransformInfo *TTI) { + const TargetTransformInfo *TTI, + AliasAnalysis *AA) { bool Change = false; + // Make sure this block is a trivial return block. 
+ assert(BB->getFirstNonPHIOrDbg() == Ret && + "Trying to fold non-trivial return block"); + // If the return block contains nothing but the return and PHI's, // there might be an opportunity to duplicate the return in its - // predecessors and perform TRC there. Look for predecessors that end + // predecessors and perform TRE there. Look for predecessors that end // in unconditional branch and recursive call(s). SmallVector<BranchInst*, 8> UncondBranchPreds; for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { @@ -707,8 +708,7 @@ static bool foldReturnAndProcessPred(BasicBlock *BB, ReturnInst *Ret, BB->eraseFromParent(); eliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail, - ArgumentPHIs, - CannotTailCallElimCallsMarkedTail); + ArgumentPHIs, AA); ++NumRetDuped; Change = true; } @@ -721,17 +721,18 @@ static bool processReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail, SmallVectorImpl<PHINode *> &ArgumentPHIs, bool CannotTailCallElimCallsMarkedTail, - const TargetTransformInfo *TTI) { + const TargetTransformInfo *TTI, + AliasAnalysis *AA) { CallInst *CI = findTRECandidate(Ret, CannotTailCallElimCallsMarkedTail, TTI); if (!CI) return false; return eliminateRecursiveTailCall(CI, Ret, OldEntry, TailCallsAreMarkedTail, - ArgumentPHIs, - CannotTailCallElimCallsMarkedTail); + ArgumentPHIs, AA); } -static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI) { +static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI, + AliasAnalysis *AA) { if (F.getFnAttribute("disable-tail-calls").getValueAsString() == "true") return false; @@ -766,11 +767,11 @@ static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI) if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) { bool Change = processReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail, - ArgumentPHIs, !CanTRETailMarkedCall, TTI); + ArgumentPHIs, !CanTRETailMarkedCall, TTI, AA); if (!Change && BB->getFirstNonPHIOrDbg() == Ret) - Change = - foldReturnAndProcessPred(BB, Ret, OldEntry, TailCallsAreMarkedTail, - ArgumentPHIs, !CanTRETailMarkedCall, TTI); + Change = foldReturnAndProcessPred(BB, Ret, OldEntry, + TailCallsAreMarkedTail, ArgumentPHIs, + !CanTRETailMarkedCall, TTI, AA); MadeChange |= Change; } } @@ -800,6 +801,7 @@ struct TailCallElim : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addRequired<AAResultsWrapperPass>(); AU.addPreserved<GlobalsAAWrapperPass>(); } @@ -808,7 +810,8 @@ struct TailCallElim : public FunctionPass { return false; return eliminateTailRecursion( - F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F)); + F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F), + &getAnalysis<AAResultsWrapperPass>().getAAResults()); } }; } @@ -829,8 +832,9 @@ PreservedAnalyses TailCallElimPass::run(Function &F, FunctionAnalysisManager &AM) { TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F); + AliasAnalysis &AA = AM.getResult<AAManager>(F); - bool Changed = eliminateTailRecursion(F, &TTI); + bool Changed = eliminateTailRecursion(F, &TTI, &AA); if (!Changed) return PreservedAnalyses::all(); |
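For reference, the substantive change to canMoveAboveCall() in the TailRecursionElimination hunks above is the replacement of the conservative CI->mayWriteToMemory() test with an alias-analysis mod/ref query against the load's memory location. The following is a minimal sketch of that query pattern only; it is not part of the patch, and it assumes the same LLVM headers and the MRI_Mod-era ModRefInfo enum that this import uses.

  #include "llvm/Analysis/AliasAnalysis.h"
  #include "llvm/Analysis/MemoryLocation.h"
  #include "llvm/IR/Instructions.h"

  // Returns true if alias analysis cannot rule out that the call modifies the
  // memory read by the load; canMoveAboveCall() refuses to hoist the load in
  // that case.
  static bool callMayClobberLoad(llvm::AliasAnalysis &AA, llvm::CallInst *CI,
                                 llvm::LoadInst *L) {
    // MemoryLocation::get(L) describes the address and size the load accesses;
    // MRI_Mod is set when the call may write that location.
    return AA.getModRefInfo(CI, llvm::MemoryLocation::get(L)) & llvm::MRI_Mod;
  }

Compared with the old mayWriteToMemory() check, this lets tail-recursion elimination move loads above recursive calls that write only unrelated memory, at the cost of the AAResultsWrapperPass dependency the patch adds to the legacy pass.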