| author | dim <dim@FreeBSD.org> | 2011-02-20 12:57:14 +0000 |
|---|---|---|
| committer | dim <dim@FreeBSD.org> | 2011-02-20 12:57:14 +0000 |
| commit | cbb70ce070d220642b038ea101d9c0f9fbf860d6 (patch) | |
| tree | d2b61ce94e654cb01a254d2195259db5f9cc3f3c /lib/Transforms/Scalar | |
| parent | 4ace901e87dac5bbbac78ed325e75462e48e386e (diff) | |
| download | FreeBSD-src-cbb70ce070d220642b038ea101d9c0f9fbf860d6.zip, FreeBSD-src-cbb70ce070d220642b038ea101d9c0f9fbf860d6.tar.gz | |
Vendor import of llvm trunk r126079:
http://llvm.org/svn/llvm-project/llvm/trunk@126079
Diffstat (limited to 'lib/Transforms/Scalar')
35 files changed, 5121 insertions, 4458 deletions
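The recurring change across these files is LLVM's reworked pass registration: each pass constructor now calls its generated initializeXPass() hook against the global PassRegistry, the trailing semicolons after INITIALIZE_PASS(...) are dropped, and passes with analysis dependencies move to the INITIALIZE_PASS_BEGIN / INITIALIZE_PASS_DEPENDENCY / INITIALIZE_PASS_END triple. Below is a minimal sketch of that idiom, assuming LLVM headers of roughly this vintage (trunk r126079); "MyPass", its "my-pass" name, and the explicit forward declaration of initializeMyPassPass() are hypothetical stand-ins, not part of this commit.

```cpp
// Sketch of the pass-registration idiom used throughout this import
// (assumes LLVM ~r126079 headers; MyPass is a hypothetical example pass).
#include "llvm/Pass.h"
#include "llvm/PassRegistry.h"
#include "llvm/InitializePasses.h"   // declares initialize hooks for in-tree passes such as DominatorTree
#include "llvm/Function.h"
#include "llvm/Analysis/Dominators.h"

namespace llvm {
  // Defined by the INITIALIZE_PASS_* macros below; a declaration must be
  // visible before the constructor can call it.
  void initializeMyPassPass(PassRegistry &);
}
using namespace llvm;

namespace {
  struct MyPass : public FunctionPass {
    static char ID; // Pass identification, replacement for typeid
    MyPass() : FunctionPass(ID) {
      // New in this import: passes register themselves (and, transitively,
      // their dependencies) explicitly instead of relying on static
      // initializers.
      initializeMyPassPass(*PassRegistry::getPassRegistry());
    }

    virtual bool runOnFunction(Function &F) { return false; }

    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
      AU.addRequired<DominatorTree>();
    }
  };
}

char MyPass::ID = 0;

// Note the dropped trailing semicolons and the BEGIN/DEPENDENCY/END triple
// used when a pass declares analysis dependencies.
INITIALIZE_PASS_BEGIN(MyPass, "my-pass", "Example pass", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTree)
INITIALIZE_PASS_END(MyPass, "my-pass", "Example pass", false, false)
```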
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp index ada086e..a5adb5e 100644 --- a/lib/Transforms/Scalar/ADCE.cpp +++ b/lib/Transforms/Scalar/ADCE.cpp @@ -33,7 +33,9 @@ STATISTIC(NumRemoved, "Number of instructions removed"); namespace { struct ADCE : public FunctionPass { static char ID; // Pass identification, replacement for typeid - ADCE() : FunctionPass(ID) {} + ADCE() : FunctionPass(ID) { + initializeADCEPass(*PassRegistry::getPassRegistry()); + } virtual bool runOnFunction(Function& F); @@ -45,7 +47,7 @@ namespace { } char ADCE::ID = 0; -INITIALIZE_PASS(ADCE, "adce", "Aggressive Dead Code Elimination", false, false); +INITIALIZE_PASS(ADCE, "adce", "Aggressive Dead Code Elimination", false, false) bool ADCE::runOnFunction(Function& F) { SmallPtrSet<Instruction*, 128> alive; diff --git a/lib/Transforms/Scalar/BasicBlockPlacement.cpp b/lib/Transforms/Scalar/BasicBlockPlacement.cpp index b144678..cee5502 100644 --- a/lib/Transforms/Scalar/BasicBlockPlacement.cpp +++ b/lib/Transforms/Scalar/BasicBlockPlacement.cpp @@ -41,7 +41,9 @@ STATISTIC(NumMoved, "Number of basic blocks moved"); namespace { struct BlockPlacement : public FunctionPass { static char ID; // Pass identification, replacement for typeid - BlockPlacement() : FunctionPass(ID) {} + BlockPlacement() : FunctionPass(ID) { + initializeBlockPlacementPass(*PassRegistry::getPassRegistry()); + } virtual bool runOnFunction(Function &F); @@ -74,8 +76,11 @@ namespace { } char BlockPlacement::ID = 0; -INITIALIZE_PASS(BlockPlacement, "block-placement", - "Profile Guided Basic Block Placement", false, false); +INITIALIZE_PASS_BEGIN(BlockPlacement, "block-placement", + "Profile Guided Basic Block Placement", false, false) +INITIALIZE_AG_DEPENDENCY(ProfileInfo) +INITIALIZE_PASS_END(BlockPlacement, "block-placement", + "Profile Guided Basic Block Placement", false, false) FunctionPass *llvm::createBlockPlacementPass() { return new BlockPlacement(); } diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index b7598ea..106fb8f 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -6,13 +6,15 @@ add_llvm_library(LLVMScalarOpts CorrelatedValuePropagation.cpp DCE.cpp DeadStoreElimination.cpp + EarlyCSE.cpp GEPSplitter.cpp GVN.cpp IndVarSimplify.cpp JumpThreading.cpp LICM.cpp LoopDeletion.cpp - LoopIndexSplit.cpp + LoopIdiomRecognize.cpp + LoopInstSimplify.cpp LoopRotation.cpp LoopStrengthReduce.cpp LoopUnrollPass.cpp @@ -31,5 +33,3 @@ add_llvm_library(LLVMScalarOpts TailDuplication.cpp TailRecursionElimination.cpp ) - -target_link_libraries (LLVMScalarOpts LLVMTransformUtils) diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp index e07b761..9536939 100644 --- a/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -22,6 +22,8 @@ #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/Pass.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ProfileInfo.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLowering.h" @@ -31,6 +33,7 @@ #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Assembly/Writer.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/CommandLine.h" @@ -39,31 +42,59 @@ #include "llvm/Support/PatternMatch.h" #include 
"llvm/Support/raw_ostream.h" #include "llvm/Support/IRBuilder.h" +#include "llvm/Support/ValueHandle.h" using namespace llvm; using namespace llvm::PatternMatch; +STATISTIC(NumBlocksElim, "Number of blocks eliminated"); +STATISTIC(NumPHIsElim, "Number of trivial PHIs eliminated"); +STATISTIC(NumGEPsElim, "Number of GEPs converted to casts"); +STATISTIC(NumCmpUses, "Number of uses of Cmp expressions replaced with uses of " + "sunken Cmps"); +STATISTIC(NumCastUses, "Number of uses of Cast expressions replaced with uses " + "of sunken Casts"); +STATISTIC(NumMemoryInsts, "Number of memory instructions whose address " + "computations were sunk"); +STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads"); +STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized"); + static cl::opt<bool> CriticalEdgeSplit("cgp-critical-edge-splitting", cl::desc("Split critical edges during codegen prepare"), - cl::init(true), cl::Hidden); + cl::init(false), cl::Hidden); namespace { class CodeGenPrepare : public FunctionPass { /// TLI - Keep a pointer of a TargetLowering to consult for determining /// transformation profitability. const TargetLowering *TLI; + DominatorTree *DT; ProfileInfo *PFI; + + /// CurInstIterator - As we scan instructions optimizing them, this is the + /// next instruction to optimize. Xforms that can invalidate this should + /// update it. + BasicBlock::iterator CurInstIterator; /// BackEdges - Keep a set of all the loop back edges. /// SmallSet<std::pair<const BasicBlock*, const BasicBlock*>, 8> BackEdges; + + // Keeps track of non-local addresses that have been sunk into a block. This + // allows us to avoid inserting duplicate code for blocks with multiple + // load/stores of the same address. + DenseMap<Value*, Value*> SunkAddrs; + public: static char ID; // Pass identification, replacement for typeid explicit CodeGenPrepare(const TargetLowering *tli = 0) - : FunctionPass(ID), TLI(tli) {} + : FunctionPass(ID), TLI(tli) { + initializeCodeGenPreparePass(*PassRegistry::getPassRegistry()); + } bool runOnFunction(Function &F); virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved<DominatorTree>(); AU.addPreserved<ProfileInfo>(); } @@ -76,10 +107,9 @@ namespace { bool CanMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const; void EliminateMostlyEmptyBlock(BasicBlock *BB); bool OptimizeBlock(BasicBlock &BB); - bool OptimizeMemoryInst(Instruction *I, Value *Addr, const Type *AccessTy, - DenseMap<Value*,Value*> &SunkAddrs); - bool OptimizeInlineAsmInst(Instruction *I, CallSite CS, - DenseMap<Value*,Value*> &SunkAddrs); + bool OptimizeInst(Instruction *I); + bool OptimizeMemoryInst(Instruction *I, Value *Addr, const Type *AccessTy); + bool OptimizeInlineAsmInst(CallInst *CS); bool OptimizeCallInst(CallInst *CI); bool MoveExtToFormExtLoad(Instruction *I); bool OptimizeExtUses(Instruction *I); @@ -89,7 +119,7 @@ namespace { char CodeGenPrepare::ID = 0; INITIALIZE_PASS(CodeGenPrepare, "codegenprepare", - "Optimize for code generation", false, false); + "Optimize for code generation", false, false) FunctionPass *llvm::createCodeGenPreparePass(const TargetLowering *TLI) { return new CodeGenPrepare(TLI); @@ -108,13 +138,16 @@ void CodeGenPrepare::findLoopBackEdges(const Function &F) { bool CodeGenPrepare::runOnFunction(Function &F) { bool EverMadeChange = false; + DT = getAnalysisIfAvailable<DominatorTree>(); PFI = getAnalysisIfAvailable<ProfileInfo>(); // First pass, eliminate blocks that contain only PHI nodes and an // 
unconditional branch. EverMadeChange |= EliminateMostlyEmptyBlocks(F); - // Now find loop back edges. - findLoopBackEdges(F); + // Now find loop back edges, but only if they are being used to decide which + // critical edges to split. + if (CriticalEdgeSplit) + findLoopBackEdges(F); bool MadeChange = true; while (MadeChange) { @@ -123,6 +156,9 @@ bool CodeGenPrepare::runOnFunction(Function &F) { MadeChange |= OptimizeBlock(*BB); EverMadeChange |= MadeChange; } + + SunkAddrs.clear(); + return EverMadeChange; } @@ -297,11 +333,19 @@ void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) { // The PHIs are now updated, change everything that refers to BB to use // DestBB and remove BB. BB->replaceAllUsesWith(DestBB); + if (DT) { + BasicBlock *BBIDom = DT->getNode(BB)->getIDom()->getBlock(); + BasicBlock *DestBBIDom = DT->getNode(DestBB)->getIDom()->getBlock(); + BasicBlock *NewIDom = DT->findNearestCommonDominator(BBIDom, DestBBIDom); + DT->changeImmediateDominator(DestBB, NewIDom); + DT->eraseNode(BB); + } if (PFI) { PFI->replaceAllUses(BB, DestBB); PFI->removeEdge(ProfileInfo::getEdge(BB, DestBB)); } BB->eraseFromParent(); + ++NumBlocksElim; DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n"); } @@ -480,6 +524,7 @@ static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){ // Replace a use of the cast with a use of the new cast. TheUse = InsertedCast; + ++NumCastUses; } // If we removed all uses, nuke the cast. @@ -537,6 +582,7 @@ static bool OptimizeCmpExpression(CmpInst *CI) { // Replace a use of the cmp with a use of the new cmp. TheUse = InsertedCmp; + ++NumCmpUses; } // If we removed all uses, nuke the cmp. @@ -563,14 +609,45 @@ protected: } // end anonymous namespace bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { + BasicBlock *BB = CI->getParent(); + + // Lower inline assembly if we can. + // If we found an inline asm expession, and if the target knows how to + // lower it to normal LLVM code, do so now. + if (TLI && isa<InlineAsm>(CI->getCalledValue())) { + if (TLI->ExpandInlineAsm(CI)) { + // Avoid invalidating the iterator. + CurInstIterator = BB->begin(); + // Avoid processing instructions out of order, which could cause + // reuse before a value is defined. + SunkAddrs.clear(); + return true; + } + // Sink address computing for memory operands into the block. + if (OptimizeInlineAsmInst(CI)) + return true; + } + // Lower all uses of llvm.objectsize.* IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI); if (II && II->getIntrinsicID() == Intrinsic::objectsize) { bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1); const Type *ReturnTy = CI->getType(); Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL); - CI->replaceAllUsesWith(RetVal); - CI->eraseFromParent(); + + // Substituting this can cause recursive simplifications, which can + // invalidate our iterator. Use a WeakVH to hold onto it in case this + // happens. + WeakVH IterHandle(CurInstIterator); + + ReplaceAndSimplifyAllUses(CI, RetVal, TLI ? TLI->getTargetData() : 0, DT); + + // If the iterator instruction was recursively deleted, start over at the + // start of the block. 
+ if (IterHandle != CurInstIterator) { + CurInstIterator = BB->begin(); + SunkAddrs.clear(); + } return true; } @@ -588,6 +665,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { CodeGenPrepareFortifiedLibCalls Simplifier; return Simplifier.fold(CI, TD); } + //===----------------------------------------------------------------------===// // Memory Optimization //===----------------------------------------------------------------------===// @@ -610,13 +688,69 @@ static bool IsNonLocalValue(Value *V, BasicBlock *BB) { /// This method is used to optimize both load/store and inline asms with memory /// operands. bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, - const Type *AccessTy, - DenseMap<Value*,Value*> &SunkAddrs) { - // Figure out what addressing mode will be built up for this operation. + const Type *AccessTy) { + Value *Repl = Addr; + + // Try to collapse single-value PHI nodes. This is necessary to undo + // unprofitable PRE transformations. + SmallVector<Value*, 8> worklist; + SmallPtrSet<Value*, 16> Visited; + worklist.push_back(Addr); + + // Use a worklist to iteratively look through PHI nodes, and ensure that + // the addressing mode obtained from the non-PHI roots of the graph + // are equivalent. + Value *Consensus = 0; + unsigned NumUses = 0; SmallVector<Instruction*, 16> AddrModeInsts; - ExtAddrMode AddrMode = AddressingModeMatcher::Match(Addr, AccessTy,MemoryInst, - AddrModeInsts, *TLI); - + ExtAddrMode AddrMode; + while (!worklist.empty()) { + Value *V = worklist.back(); + worklist.pop_back(); + + // Break use-def graph loops. + if (Visited.count(V)) { + Consensus = 0; + break; + } + + Visited.insert(V); + + // For a PHI node, push all of its incoming values. + if (PHINode *P = dyn_cast<PHINode>(V)) { + for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i) + worklist.push_back(P->getIncomingValue(i)); + continue; + } + + // For non-PHIs, determine the addressing mode being computed. + SmallVector<Instruction*, 16> NewAddrModeInsts; + ExtAddrMode NewAddrMode = + AddressingModeMatcher::Match(V, AccessTy,MemoryInst, + NewAddrModeInsts, *TLI); + + // Ensure that the obtained addressing mode is equivalent to that obtained + // for all other roots of the PHI traversal. Also, when choosing one + // such root as representative, select the one with the most uses in order + // to keep the cost modeling heuristics in AddressingModeMatcher applicable. + if (!Consensus || NewAddrMode == AddrMode) { + if (V->getNumUses() > NumUses) { + Consensus = V; + NumUses = V->getNumUses(); + AddrMode = NewAddrMode; + AddrModeInsts = NewAddrModeInsts; + } + continue; + } + + Consensus = 0; + break; + } + + // If the addressing mode couldn't be determined, or if multiple different + // ones were determined, bail out now. + if (!Consensus) return false; + // Check to see if any of the instructions supersumed by this addr mode are // non-local to I's BB. bool AnyNonLocal = false; @@ -719,60 +853,39 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, SunkAddr = new IntToPtrInst(Result, Addr->getType(), "sunkaddr",InsertPt); } - MemoryInst->replaceUsesOfWith(Addr, SunkAddr); + MemoryInst->replaceUsesOfWith(Repl, SunkAddr); - if (Addr->use_empty()) { - RecursivelyDeleteTriviallyDeadInstructions(Addr); + if (Repl->use_empty()) { + RecursivelyDeleteTriviallyDeadInstructions(Repl); // This address is now available for reassignment, so erase the table entry; // we don't want to match some completely different instruction. 
SunkAddrs[Addr] = 0; } + ++NumMemoryInsts; return true; } /// OptimizeInlineAsmInst - If there are any memory operands, use /// OptimizeMemoryInst to sink their address computing into the block when /// possible / profitable. -bool CodeGenPrepare::OptimizeInlineAsmInst(Instruction *I, CallSite CS, - DenseMap<Value*,Value*> &SunkAddrs) { +bool CodeGenPrepare::OptimizeInlineAsmInst(CallInst *CS) { bool MadeChange = false; - InlineAsm *IA = cast<InlineAsm>(CS.getCalledValue()); - - // Do a prepass over the constraints, canonicalizing them, and building up the - // ConstraintOperands list. - std::vector<InlineAsm::ConstraintInfo> - ConstraintInfos = IA->ParseConstraints(); - - /// ConstraintOperands - Information about all of the constraints. - std::vector<TargetLowering::AsmOperandInfo> ConstraintOperands; - unsigned ArgNo = 0; // ArgNo - The argument of the CallInst. - for (unsigned i = 0, e = ConstraintInfos.size(); i != e; ++i) { - ConstraintOperands. - push_back(TargetLowering::AsmOperandInfo(ConstraintInfos[i])); - TargetLowering::AsmOperandInfo &OpInfo = ConstraintOperands.back(); - - // Compute the value type for each operand. - switch (OpInfo.Type) { - case InlineAsm::isOutput: - if (OpInfo.isIndirect) - OpInfo.CallOperandVal = CS.getArgument(ArgNo++); - break; - case InlineAsm::isInput: - OpInfo.CallOperandVal = CS.getArgument(ArgNo++); - break; - case InlineAsm::isClobber: - // Nothing to do. - break; - } + TargetLowering::AsmOperandInfoVector + TargetConstraints = TLI->ParseConstraints(CS); + unsigned ArgNo = 0; + for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { + TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; + // Compute the constraint code and ConstraintType to use. TLI->ComputeConstraintToUse(OpInfo, SDValue()); if (OpInfo.ConstraintType == TargetLowering::C_Memory && OpInfo.isIndirect) { - Value *OpVal = OpInfo.CallOperandVal; - MadeChange |= OptimizeMemoryInst(I, OpVal, OpVal->getType(), SunkAddrs); - } + Value *OpVal = CS->getArgOperand(ArgNo++); + MadeChange |= OptimizeMemoryInst(CS, OpVal, OpVal->getType()); + } else if (OpInfo.Type == InlineAsm::isInput) + ArgNo++; } return MadeChange; @@ -794,7 +907,9 @@ bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *I) { // If the load has other users and the truncate is not free, this probably // isn't worthwhile. if (!LI->hasOneUse() && - TLI && !TLI->isTruncateFree(I->getType(), LI->getType())) + TLI && (TLI->isTypeLegal(TLI->getValueType(LI->getType())) || + !TLI->isTypeLegal(TLI->getValueType(I->getType()))) && + !TLI->isTruncateFree(I->getType(), LI->getType())) return false; // Check whether the target supports casts folded into loads. @@ -812,13 +927,14 @@ bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *I) { // can fold it. I->removeFromParent(); I->insertAfter(LI); + ++NumExtsMoved; return true; } bool CodeGenPrepare::OptimizeExtUses(Instruction *I) { BasicBlock *DefBB = I->getParent(); - // If both result of the {s|z}xt and its source are live out, rewrite all + // If the result of a {s|z}ext and its source are both live out, rewrite all // other uses of the source with result of extension. Value *Src = I->getOperand(0); if (Src->hasOneUse()) @@ -883,13 +999,83 @@ bool CodeGenPrepare::OptimizeExtUses(Instruction *I) { // Replace a use of the {s|z}ext source with a use of the result. 
TheUse = InsertedTrunc; - + ++NumExtUses; MadeChange = true; } return MadeChange; } +bool CodeGenPrepare::OptimizeInst(Instruction *I) { + if (PHINode *P = dyn_cast<PHINode>(I)) { + // It is possible for very late stage optimizations (such as SimplifyCFG) + // to introduce PHI nodes too late to be cleaned up. If we detect such a + // trivial PHI, go ahead and zap it here. + if (Value *V = SimplifyInstruction(P)) { + P->replaceAllUsesWith(V); + P->eraseFromParent(); + ++NumPHIsElim; + return true; + } + return false; + } + + if (CastInst *CI = dyn_cast<CastInst>(I)) { + // If the source of the cast is a constant, then this should have + // already been constant folded. The only reason NOT to constant fold + // it is if something (e.g. LSR) was careful to place the constant + // evaluation in a block other than then one that uses it (e.g. to hoist + // the address of globals out of a loop). If this is the case, we don't + // want to forward-subst the cast. + if (isa<Constant>(CI->getOperand(0))) + return false; + + if (TLI && OptimizeNoopCopyExpression(CI, *TLI)) + return true; + + if (isa<ZExtInst>(I) || isa<SExtInst>(I)) { + bool MadeChange = MoveExtToFormExtLoad(I); + return MadeChange | OptimizeExtUses(I); + } + return false; + } + + if (CmpInst *CI = dyn_cast<CmpInst>(I)) + return OptimizeCmpExpression(CI); + + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + if (TLI) + return OptimizeMemoryInst(I, I->getOperand(0), LI->getType()); + return false; + } + + if (StoreInst *SI = dyn_cast<StoreInst>(I)) { + if (TLI) + return OptimizeMemoryInst(I, SI->getOperand(1), + SI->getOperand(0)->getType()); + return false; + } + + if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) { + if (GEPI->hasAllZeroIndices()) { + /// The GEP operand must be a pointer, so must its result -> BitCast + Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(), + GEPI->getName(), GEPI); + GEPI->replaceAllUsesWith(NC); + GEPI->eraseFromParent(); + ++NumGEPsElim; + OptimizeInst(NC); + return true; + } + return false; + } + + if (CallInst *CI = dyn_cast<CallInst>(I)) + return OptimizeCallInst(CI); + + return false; +} + // In this pass we look for GEP and cast instructions that are used // across basic blocks and rewrite them to improve basic-block-at-a-time // selection. @@ -908,74 +1094,11 @@ bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) { } } - // Keep track of non-local addresses that have been sunk into this block. - // This allows us to avoid inserting duplicate code for blocks with multiple - // load/stores of the same address. - DenseMap<Value*, Value*> SunkAddrs; - - for (BasicBlock::iterator BBI = BB.begin(), E = BB.end(); BBI != E; ) { - Instruction *I = BBI++; + SunkAddrs.clear(); - if (CastInst *CI = dyn_cast<CastInst>(I)) { - // If the source of the cast is a constant, then this should have - // already been constant folded. The only reason NOT to constant fold - // it is if something (e.g. LSR) was careful to place the constant - // evaluation in a block other than then one that uses it (e.g. to hoist - // the address of globals out of a loop). If this is the case, we don't - // want to forward-subst the cast. 
- if (isa<Constant>(CI->getOperand(0))) - continue; - - bool Change = false; - if (TLI) { - Change = OptimizeNoopCopyExpression(CI, *TLI); - MadeChange |= Change; - } - - if (!Change && (isa<ZExtInst>(I) || isa<SExtInst>(I))) { - MadeChange |= MoveExtToFormExtLoad(I); - MadeChange |= OptimizeExtUses(I); - } - } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) { - MadeChange |= OptimizeCmpExpression(CI); - } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) { - if (TLI) - MadeChange |= OptimizeMemoryInst(I, I->getOperand(0), LI->getType(), - SunkAddrs); - } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) { - if (TLI) - MadeChange |= OptimizeMemoryInst(I, SI->getOperand(1), - SI->getOperand(0)->getType(), - SunkAddrs); - } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) { - if (GEPI->hasAllZeroIndices()) { - /// The GEP operand must be a pointer, so must its result -> BitCast - Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(), - GEPI->getName(), GEPI); - GEPI->replaceAllUsesWith(NC); - GEPI->eraseFromParent(); - MadeChange = true; - BBI = NC; - } - } else if (CallInst *CI = dyn_cast<CallInst>(I)) { - // If we found an inline asm expession, and if the target knows how to - // lower it to normal LLVM code, do so now. - if (TLI && isa<InlineAsm>(CI->getCalledValue())) { - if (TLI->ExpandInlineAsm(CI)) { - BBI = BB.begin(); - // Avoid processing instructions out of order, which could cause - // reuse before a value is defined. - SunkAddrs.clear(); - } else - // Sink address computing for memory operands into the block. - MadeChange |= OptimizeInlineAsmInst(I, &(*CI), SunkAddrs); - } else { - // Other CallInst optimizations that don't need to muck with the - // enclosing iterator here. - MadeChange |= OptimizeCallInst(CI); - } - } - } + CurInstIterator = BB.begin(); + for (BasicBlock::iterator E = BB.end(); CurInstIterator != E; ) + MadeChange |= OptimizeInst(CurInstIterator++); return MadeChange; } diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp index a0ea369..664c3f6 100644 --- a/lib/Transforms/Scalar/ConstantProp.cpp +++ b/lib/Transforms/Scalar/ConstantProp.cpp @@ -34,7 +34,9 @@ STATISTIC(NumInstKilled, "Number of instructions killed"); namespace { struct ConstantPropagation : public FunctionPass { static char ID; // Pass identification, replacement for typeid - ConstantPropagation() : FunctionPass(ID) {} + ConstantPropagation() : FunctionPass(ID) { + initializeConstantPropagationPass(*PassRegistry::getPassRegistry()); + } bool runOnFunction(Function &F); @@ -46,7 +48,7 @@ namespace { char ConstantPropagation::ID = 0; INITIALIZE_PASS(ConstantPropagation, "constprop", - "Simple constant propagation", false, false); + "Simple constant propagation", false, false) FunctionPass *llvm::createConstantPropagationPass() { return new ConstantPropagation(); diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 0d4e45d..be12973 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -16,6 +16,7 @@ #include "llvm/Function.h" #include "llvm/Instructions.h" #include "llvm/Pass.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LazyValueInfo.h" #include "llvm/Support/CFG.h" #include "llvm/Transforms/Utils/Local.h" @@ -30,18 +31,20 @@ STATISTIC(NumCmps, "Number of comparisons propagated"); namespace { class CorrelatedValuePropagation : public FunctionPass { 
LazyValueInfo *LVI; - + bool processSelect(SelectInst *SI); bool processPHI(PHINode *P); bool processMemAccess(Instruction *I); bool processCmp(CmpInst *C); - + public: static char ID; - CorrelatedValuePropagation(): FunctionPass(ID) { } - + CorrelatedValuePropagation(): FunctionPass(ID) { + initializeCorrelatedValuePropagationPass(*PassRegistry::getPassRegistry()); + } + bool runOnFunction(Function &F); - + virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<LazyValueInfo>(); } @@ -49,8 +52,11 @@ namespace { } char CorrelatedValuePropagation::ID = 0; -INITIALIZE_PASS(CorrelatedValuePropagation, "correlated-propagation", - "Value Propagation", false, false); +INITIALIZE_PASS_BEGIN(CorrelatedValuePropagation, "correlated-propagation", + "Value Propagation", false, false) +INITIALIZE_PASS_DEPENDENCY(LazyValueInfo) +INITIALIZE_PASS_END(CorrelatedValuePropagation, "correlated-propagation", + "Value Propagation", false, false) // Public interface to the Value Propagation pass Pass *llvm::createCorrelatedValuePropagationPass() { @@ -60,46 +66,51 @@ Pass *llvm::createCorrelatedValuePropagationPass() { bool CorrelatedValuePropagation::processSelect(SelectInst *S) { if (S->getType()->isVectorTy()) return false; if (isa<Constant>(S->getOperand(0))) return false; - + Constant *C = LVI->getConstant(S->getOperand(0), S->getParent()); if (!C) return false; - + ConstantInt *CI = dyn_cast<ConstantInt>(C); if (!CI) return false; - - S->replaceAllUsesWith(S->getOperand(CI->isOne() ? 1 : 2)); + + Value *ReplaceWith = S->getOperand(1); + Value *Other = S->getOperand(2); + if (!CI->isOne()) std::swap(ReplaceWith, Other); + if (ReplaceWith == S) ReplaceWith = UndefValue::get(S->getType()); + + S->replaceAllUsesWith(ReplaceWith); S->eraseFromParent(); ++NumSelects; - + return true; } bool CorrelatedValuePropagation::processPHI(PHINode *P) { bool Changed = false; - + BasicBlock *BB = P->getParent(); for (unsigned i = 0, e = P->getNumIncomingValues(); i < e; ++i) { Value *Incoming = P->getIncomingValue(i); if (isa<Constant>(Incoming)) continue; - + Constant *C = LVI->getConstantOnEdge(P->getIncomingValue(i), P->getIncomingBlock(i), BB); if (!C) continue; - + P->setIncomingValue(i, C); Changed = true; } - - if (Value *ConstVal = P->hasConstantValue()) { - P->replaceAllUsesWith(ConstVal); + + if (Value *V = SimplifyInstruction(P)) { + P->replaceAllUsesWith(V); P->eraseFromParent(); Changed = true; } - + ++NumPhis; - + return Changed; } @@ -109,12 +120,12 @@ bool CorrelatedValuePropagation::processMemAccess(Instruction *I) { Pointer = L->getPointerOperand(); else Pointer = cast<StoreInst>(I)->getPointerOperand(); - + if (isa<Constant>(Pointer)) return false; - + Constant *C = LVI->getConstant(Pointer, I->getParent()); if (!C) return false; - + ++NumMemAccess; I->replaceUsesOfWith(Pointer, C); return true; @@ -130,32 +141,32 @@ bool CorrelatedValuePropagation::processCmp(CmpInst *C) { if (isa<Instruction>(Op0) && cast<Instruction>(Op0)->getParent() == C->getParent()) return false; - + Constant *Op1 = dyn_cast<Constant>(C->getOperand(1)); if (!Op1) return false; - + pred_iterator PI = pred_begin(C->getParent()), PE = pred_end(C->getParent()); if (PI == PE) return false; - - LazyValueInfo::Tristate Result = LVI->getPredicateOnEdge(C->getPredicate(), + + LazyValueInfo::Tristate Result = LVI->getPredicateOnEdge(C->getPredicate(), C->getOperand(0), Op1, *PI, C->getParent()); if (Result == LazyValueInfo::Unknown) return false; ++PI; while (PI != PE) { - LazyValueInfo::Tristate Res = 
LVI->getPredicateOnEdge(C->getPredicate(), + LazyValueInfo::Tristate Res = LVI->getPredicateOnEdge(C->getPredicate(), C->getOperand(0), Op1, *PI, C->getParent()); if (Res != Result) return false; ++PI; } - + ++NumCmps; - + if (Result == LazyValueInfo::True) C->replaceAllUsesWith(ConstantInt::getTrue(C->getContext())); else C->replaceAllUsesWith(ConstantInt::getFalse(C->getContext())); - + C->eraseFromParent(); return true; @@ -163,9 +174,9 @@ bool CorrelatedValuePropagation::processCmp(CmpInst *C) { bool CorrelatedValuePropagation::runOnFunction(Function &F) { LVI = &getAnalysis<LazyValueInfo>(); - + bool FnChanged = false; - + for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { bool BBChanged = false; for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ) { @@ -187,14 +198,9 @@ bool CorrelatedValuePropagation::runOnFunction(Function &F) { break; } } - - // Propagating correlated values might leave cruft around. - // Try to clean it up before we continue. - if (BBChanged) - SimplifyInstructionsInBlock(FI); - + FnChanged |= BBChanged; } - + return FnChanged; } diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp index 87ea803..dbb68f3 100644 --- a/lib/Transforms/Scalar/DCE.cpp +++ b/lib/Transforms/Scalar/DCE.cpp @@ -35,7 +35,9 @@ namespace { // struct DeadInstElimination : public BasicBlockPass { static char ID; // Pass identification, replacement for typeid - DeadInstElimination() : BasicBlockPass(ID) {} + DeadInstElimination() : BasicBlockPass(ID) { + initializeDeadInstEliminationPass(*PassRegistry::getPassRegistry()); + } virtual bool runOnBasicBlock(BasicBlock &BB) { bool Changed = false; for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); ) { @@ -57,7 +59,7 @@ namespace { char DeadInstElimination::ID = 0; INITIALIZE_PASS(DeadInstElimination, "die", - "Dead Instruction Elimination", false, false); + "Dead Instruction Elimination", false, false) Pass *llvm::createDeadInstEliminationPass() { return new DeadInstElimination(); @@ -70,7 +72,9 @@ namespace { // struct DCE : public FunctionPass { static char ID; // Pass identification, replacement for typeid - DCE() : FunctionPass(ID) {} + DCE() : FunctionPass(ID) { + initializeDCEPass(*PassRegistry::getPassRegistry()); + } virtual bool runOnFunction(Function &F); @@ -81,7 +85,7 @@ namespace { } char DCE::ID = 0; -INITIALIZE_PASS(DCE, "dce", "Dead Code Elimination", false, false); +INITIALIZE_PASS(DCE, "dce", "Dead Code Elimination", false, false) bool DCE::runOnFunction(Function &F) { // Start out with all of the instructions in the worklist... 
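The processSelect() change in CorrelatedValuePropagation.cpp above replaces a select whose condition LazyValueInfo has proven constant, now picking the surviving arm by hand and guarding against the degenerate case where the select uses itself. A minimal standalone sketch of that guard follows, assuming LLVM headers of this vintage; foldSelectWithKnownCondition() is a hypothetical helper that takes the proven-constant condition as an argument instead of querying LazyValueInfo.

```cpp
// Sketch of the select-collapsing guard shown in the hunk above.
#include "llvm/Instructions.h"
#include "llvm/Constants.h"
#include <algorithm>

using namespace llvm;

static bool foldSelectWithKnownCondition(SelectInst *S, ConstantInt *Cond) {
  // Pick the arm the (now known) condition selects.
  Value *ReplaceWith = S->getOperand(1);
  Value *Other = S->getOperand(2);
  if (!Cond->isOne())
    std::swap(ReplaceWith, Other);

  // The select may use itself as the chosen arm; replaceAllUsesWith() does
  // not allow replacing a value with itself, so substitute undef instead.
  if (ReplaceWith == S)
    ReplaceWith = UndefValue::get(S->getType());

  S->replaceAllUsesWith(ReplaceWith);
  S->eraseFromParent();
  return true;
}
```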
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index c8fd9d9..867a06a 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -19,17 +19,20 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Constants.h" #include "llvm/Function.h" +#include "llvm/GlobalVariable.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/Pass.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Target/TargetData.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Support/Debug.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" using namespace llvm; STATISTIC(NumFastStores, "Number of stores deleted"); @@ -37,58 +40,107 @@ STATISTIC(NumFastOther , "Number of other instrs removed"); namespace { struct DSE : public FunctionPass { - TargetData *TD; + AliasAnalysis *AA; + MemoryDependenceAnalysis *MD; static char ID; // Pass identification, replacement for typeid - DSE() : FunctionPass(ID) {} + DSE() : FunctionPass(ID), AA(0), MD(0) { + initializeDSEPass(*PassRegistry::getPassRegistry()); + } virtual bool runOnFunction(Function &F) { - bool Changed = false; - + AA = &getAnalysis<AliasAnalysis>(); + MD = &getAnalysis<MemoryDependenceAnalysis>(); DominatorTree &DT = getAnalysis<DominatorTree>(); + bool Changed = false; for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) // Only check non-dead blocks. Dead blocks may have strange pointer // cycles that will confuse alias analysis. if (DT.isReachableFromEntry(I)) Changed |= runOnBasicBlock(*I); + + AA = 0; MD = 0; return Changed; } bool runOnBasicBlock(BasicBlock &BB); - bool handleFreeWithNonTrivialDependency(const CallInst *F, - MemDepResult Dep); + bool HandleFree(CallInst *F); bool handleEndBlock(BasicBlock &BB); - bool RemoveUndeadPointers(Value *Ptr, uint64_t killPointerSize, - BasicBlock::iterator &BBI, - SmallPtrSet<Value*, 64> &deadPointers); - void DeleteDeadInstruction(Instruction *I, - SmallPtrSet<Value*, 64> *deadPointers = 0); - + void RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, + SmallPtrSet<Value*, 16> &DeadStackObjects); - // getAnalysisUsage - We require post dominance frontiers (aka Control - // Dependence Graph) virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); AU.addRequired<DominatorTree>(); AU.addRequired<AliasAnalysis>(); AU.addRequired<MemoryDependenceAnalysis>(); + AU.addPreserved<AliasAnalysis>(); AU.addPreserved<DominatorTree>(); AU.addPreserved<MemoryDependenceAnalysis>(); } - - unsigned getPointerSize(Value *V) const; }; } char DSE::ID = 0; -INITIALIZE_PASS(DSE, "dse", "Dead Store Elimination", false, false); +INITIALIZE_PASS_BEGIN(DSE, "dse", "Dead Store Elimination", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_END(DSE, "dse", "Dead Store Elimination", false, false) FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); } -/// doesClobberMemory - Does this instruction clobber (write without reading) -/// some memory? 
-static bool doesClobberMemory(Instruction *I) { +//===----------------------------------------------------------------------===// +// Helper functions +//===----------------------------------------------------------------------===// + +/// DeleteDeadInstruction - Delete this instruction. Before we do, go through +/// and zero out all the operands of this instruction. If any of them become +/// dead, delete them and the computation tree that feeds them. +/// +/// If ValueSet is non-null, remove any deleted instructions from it as well. +/// +static void DeleteDeadInstruction(Instruction *I, + MemoryDependenceAnalysis &MD, + SmallPtrSet<Value*, 16> *ValueSet = 0) { + SmallVector<Instruction*, 32> NowDeadInsts; + + NowDeadInsts.push_back(I); + --NumFastOther; + + // Before we touch this instruction, remove it from memdep! + do { + Instruction *DeadInst = NowDeadInsts.pop_back_val(); + ++NumFastOther; + + // This instruction is dead, zap it, in stages. Start by removing it from + // MemDep, which needs to know the operands and needs it to be in the + // function. + MD.removeInstruction(DeadInst); + + for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) { + Value *Op = DeadInst->getOperand(op); + DeadInst->setOperand(op, 0); + + // If this operand just became dead, add it to the NowDeadInsts list. + if (!Op->use_empty()) continue; + + if (Instruction *OpI = dyn_cast<Instruction>(Op)) + if (isInstructionTriviallyDead(OpI)) + NowDeadInsts.push_back(OpI); + } + + DeadInst->eraseFromParent(); + + if (ValueSet) ValueSet->erase(DeadInst); + } while (!NowDeadInsts.empty()); +} + + +/// hasMemoryWrite - Does this instruction write some memory? This only returns +/// true for things that we can analyze with other helpers below. +static bool hasMemoryWrite(Instruction *I) { if (isa<StoreInst>(I)) return true; if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { @@ -106,146 +158,296 @@ static bool doesClobberMemory(Instruction *I) { return false; } -/// isElidable - If the value of this instruction and the memory it writes to is -/// unused, may we delete this instrtction? -static bool isElidable(Instruction *I) { - assert(doesClobberMemory(I)); - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) - return II->getIntrinsicID() != Intrinsic::lifetime_end; +/// getLocForWrite - Return a Location stored to by the specified instruction. +static AliasAnalysis::Location +getLocForWrite(Instruction *Inst, AliasAnalysis &AA) { + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) + return AA.getLocation(SI); + + if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(Inst)) { + // memcpy/memmove/memset. + AliasAnalysis::Location Loc = AA.getLocationForDest(MI); + // If we don't have target data around, an unknown size in Location means + // that we should use the size of the pointee type. This isn't valid for + // memset/memcpy, which writes more than an i8. + if (Loc.Size == AliasAnalysis::UnknownSize && AA.getTargetData() == 0) + return AliasAnalysis::Location(); + return Loc; + } + + IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst); + if (II == 0) return AliasAnalysis::Location(); + + switch (II->getIntrinsicID()) { + default: return AliasAnalysis::Location(); // Unhandled intrinsic. + case Intrinsic::init_trampoline: + // If we don't have target data around, an unknown size in Location means + // that we should use the size of the pointee type. This isn't valid for + // init.trampoline, which writes more than an i8. 
+ if (AA.getTargetData() == 0) return AliasAnalysis::Location(); + + // FIXME: We don't know the size of the trampoline, so we can't really + // handle it here. + return AliasAnalysis::Location(II->getArgOperand(0)); + case Intrinsic::lifetime_end: { + uint64_t Len = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue(); + return AliasAnalysis::Location(II->getArgOperand(1), Len); + } + } +} + +/// getLocForRead - Return the location read by the specified "hasMemoryWrite" +/// instruction if any. +static AliasAnalysis::Location +getLocForRead(Instruction *Inst, AliasAnalysis &AA) { + assert(hasMemoryWrite(Inst) && "Unknown instruction case"); + + // The only instructions that both read and write are the mem transfer + // instructions (memcpy/memmove). + if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst)) + return AA.getLocationForSource(MTI); + return AliasAnalysis::Location(); +} + + +/// isRemovable - If the value of this instruction and the memory it writes to +/// is unused, may we delete this instruction? +static bool isRemovable(Instruction *I) { + // Don't remove volatile stores. if (StoreInst *SI = dyn_cast<StoreInst>(I)) return !SI->isVolatile(); - return true; + + IntrinsicInst *II = cast<IntrinsicInst>(I); + switch (II->getIntrinsicID()) { + default: assert(0 && "doesn't pass 'hasMemoryWrite' predicate"); + case Intrinsic::lifetime_end: + // Never remove dead lifetime_end's, e.g. because it is followed by a + // free. + return false; + case Intrinsic::init_trampoline: + // Always safe to remove init_trampoline. + return true; + + case Intrinsic::memset: + case Intrinsic::memmove: + case Intrinsic::memcpy: + // Don't remove volatile memory intrinsics. + return !cast<MemIntrinsic>(II)->isVolatile(); + } } -/// getPointerOperand - Return the pointer that is being clobbered. -static Value *getPointerOperand(Instruction *I) { - assert(doesClobberMemory(I)); +/// getStoredPointerOperand - Return the pointer that is being written to. +static Value *getStoredPointerOperand(Instruction *I) { if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand(); if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) - return MI->getArgOperand(0); + return MI->getDest(); IntrinsicInst *II = cast<IntrinsicInst>(I); switch (II->getIntrinsicID()) { default: assert(false && "Unexpected intrinsic!"); case Intrinsic::init_trampoline: return II->getArgOperand(0); - case Intrinsic::lifetime_end: - return II->getArgOperand(1); } } -/// getStoreSize - Return the length in bytes of the write by the clobbering -/// instruction. If variable or unknown, returns -1. 
-static unsigned getStoreSize(Instruction *I, const TargetData *TD) { - assert(doesClobberMemory(I)); - if (StoreInst *SI = dyn_cast<StoreInst>(I)) { - if (!TD) return -1u; - return TD->getTypeStoreSize(SI->getOperand(0)->getType()); +static uint64_t getPointerSize(Value *V, AliasAnalysis &AA) { + const TargetData *TD = AA.getTargetData(); + if (TD == 0) + return AliasAnalysis::UnknownSize; + + if (AllocaInst *A = dyn_cast<AllocaInst>(V)) { + // Get size information for the alloca + if (ConstantInt *C = dyn_cast<ConstantInt>(A->getArraySize())) + return C->getZExtValue() * TD->getTypeAllocSize(A->getAllocatedType()); + return AliasAnalysis::UnknownSize; } + + assert(isa<Argument>(V) && "Expected AllocaInst or Argument!"); + const PointerType *PT = cast<PointerType>(V->getType()); + return TD->getTypeAllocSize(PT->getElementType()); +} - Value *Len; - if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) { - Len = MI->getLength(); - } else { - IntrinsicInst *II = cast<IntrinsicInst>(I); - switch (II->getIntrinsicID()) { - default: assert(false && "Unexpected intrinsic!"); - case Intrinsic::init_trampoline: - return -1u; - case Intrinsic::lifetime_end: - Len = II->getArgOperand(0); - break; +/// isObjectPointerWithTrustworthySize - Return true if the specified Value* is +/// pointing to an object with a pointer size we can trust. +static bool isObjectPointerWithTrustworthySize(const Value *V) { + if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) + return !AI->isArrayAllocation(); + if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) + return !GV->mayBeOverridden(); + if (const Argument *A = dyn_cast<Argument>(V)) + return A->hasByValAttr(); + return false; +} + +/// isCompleteOverwrite - Return true if a store to the 'Later' location +/// completely overwrites a store to the 'Earlier' location. +static bool isCompleteOverwrite(const AliasAnalysis::Location &Later, + const AliasAnalysis::Location &Earlier, + AliasAnalysis &AA) { + const Value *P1 = Earlier.Ptr->stripPointerCasts(); + const Value *P2 = Later.Ptr->stripPointerCasts(); + + // If the start pointers are the same, we just have to compare sizes to see if + // the later store was larger than the earlier store. + if (P1 == P2) { + // If we don't know the sizes of either access, then we can't do a + // comparison. + if (Later.Size == AliasAnalysis::UnknownSize || + Earlier.Size == AliasAnalysis::UnknownSize) { + // If we have no TargetData information around, then the size of the store + // is inferrable from the pointee type. If they are the same type, then + // we know that the store is safe. + if (AA.getTargetData() == 0) + return Later.Ptr->getType() == Earlier.Ptr->getType(); + return false; } + + // Make sure that the Later size is >= the Earlier size. + if (Later.Size < Earlier.Size) + return false; + return true; } - if (ConstantInt *LenCI = dyn_cast<ConstantInt>(Len)) - if (!LenCI->isAllOnesValue()) - return LenCI->getZExtValue(); - return -1u; + + // Otherwise, we have to have size information, and the later store has to be + // larger than the earlier one. + if (Later.Size == AliasAnalysis::UnknownSize || + Earlier.Size == AliasAnalysis::UnknownSize || + Later.Size <= Earlier.Size || AA.getTargetData() == 0) + return false; + + // Check to see if the later store is to the entire object (either a global, + // an alloca, or a byval argument). If so, then it clearly overwrites any + // other store to the same object. 
+ const TargetData &TD = *AA.getTargetData(); + + const Value *UO1 = GetUnderlyingObject(P1, &TD), + *UO2 = GetUnderlyingObject(P2, &TD); + + // If we can't resolve the same pointers to the same object, then we can't + // analyze them at all. + if (UO1 != UO2) + return false; + + // If the "Later" store is to a recognizable object, get its size. + if (isObjectPointerWithTrustworthySize(UO2)) { + uint64_t ObjectSize = + TD.getTypeAllocSize(cast<PointerType>(UO2->getType())->getElementType()); + if (ObjectSize == Later.Size) + return true; + } + + // Okay, we have stores to two completely different pointers. Try to + // decompose the pointer into a "base + constant_offset" form. If the base + // pointers are equal, then we can reason about the two stores. + int64_t Off1 = 0, Off2 = 0; + const Value *BP1 = GetPointerBaseWithConstantOffset(P1, Off1, TD); + const Value *BP2 = GetPointerBaseWithConstantOffset(P2, Off2, TD); + + // If the base pointers still differ, we have two completely different stores. + if (BP1 != BP2) + return false; + + // Otherwise, we might have a situation like: + // store i16 -> P + 1 Byte + // store i32 -> P + // In this case, we see if the later store completely overlaps all bytes + // stored by the previous store. + if (Off1 < Off2 || // Earlier starts before Later. + Off1+Earlier.Size > Off2+Later.Size) // Earlier goes beyond Later. + return false; + // Otherwise, we have complete overlap. + return true; } -/// isStoreAtLeastAsWideAs - Return true if the size of the store in I1 is -/// greater than or equal to the store in I2. This returns false if we don't -/// know. +/// isPossibleSelfRead - If 'Inst' might be a self read (i.e. a noop copy of a +/// memory region into an identical pointer) then it doesn't actually make its +/// input dead in the traditional sense. Consider this case: +/// +/// memcpy(A <- B) +/// memcpy(A <- A) +/// +/// In this case, the second store to A does not make the first store to A dead. +/// The usual situation isn't an explicit A<-A store like this (which can be +/// trivially removed) but a case where two pointers may alias. /// -static bool isStoreAtLeastAsWideAs(Instruction *I1, Instruction *I2, - const TargetData *TD) { - const Type *I1Ty = getPointerOperand(I1)->getType(); - const Type *I2Ty = getPointerOperand(I2)->getType(); +/// This function detects when it is unsafe to remove a dependent instruction +/// because the DSE inducing instruction may be a self-read. +static bool isPossibleSelfRead(Instruction *Inst, + const AliasAnalysis::Location &InstStoreLoc, + Instruction *DepWrite, AliasAnalysis &AA) { + // Self reads can only happen for instructions that read memory. Get the + // location read. + AliasAnalysis::Location InstReadLoc = getLocForRead(Inst, AA); + if (InstReadLoc.Ptr == 0) return false; // Not a reading instruction. - // Exactly the same type, must have exactly the same size. - if (I1Ty == I2Ty) return true; + // If the read and written loc obviously don't alias, it isn't a read. + if (AA.isNoAlias(InstReadLoc, InstStoreLoc)) return false; - int I1Size = getStoreSize(I1, TD); - int I2Size = getStoreSize(I2, TD); + // Okay, 'Inst' may copy over itself. However, we can still remove a the + // DepWrite instruction if we can prove that it reads from the same location + // as Inst. 
This handles useful cases like: + // memcpy(A <- B) + // memcpy(A <- B) + // Here we don't know if A/B may alias, but we do know that B/B are must + // aliases, so removing the first memcpy is safe (assuming it writes <= # + // bytes as the second one. + AliasAnalysis::Location DepReadLoc = getLocForRead(DepWrite, AA); - return I1Size != -1 && I2Size != -1 && I1Size >= I2Size; + if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr)) + return false; + + // If DepWrite doesn't read memory or if we can't prove it is a must alias, + // then it can't be considered dead. + return true; } -bool DSE::runOnBasicBlock(BasicBlock &BB) { - MemoryDependenceAnalysis &MD = getAnalysis<MemoryDependenceAnalysis>(); - TD = getAnalysisIfAvailable<TargetData>(); +//===----------------------------------------------------------------------===// +// DSE Pass +//===----------------------------------------------------------------------===// + +bool DSE::runOnBasicBlock(BasicBlock &BB) { bool MadeChange = false; // Do a top-down walk on the BB. for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) { Instruction *Inst = BBI++; - // If we find a store or a free, get its memory dependence. - if (!doesClobberMemory(Inst) && !isFreeCall(Inst)) - continue; - - MemDepResult InstDep = MD.getDependency(Inst); - - // Ignore non-local stores. - // FIXME: cross-block DSE would be fun. :) - if (InstDep.isNonLocal()) continue; - - // Handle frees whose dependencies are non-trivial. - if (const CallInst *F = isFreeCall(Inst)) { - MadeChange |= handleFreeWithNonTrivialDependency(F, InstDep); + // Handle 'free' calls specially. + if (CallInst *F = isFreeCall(Inst)) { + MadeChange |= HandleFree(F); continue; } - // If not a definite must-alias dependency, ignore it. - if (!InstDep.isDef()) + // If we find something that writes memory, get its memory dependence. + if (!hasMemoryWrite(Inst)) continue; - - // If this is a store-store dependence, then the previous store is dead so - // long as this store is at least as big as it. - if (doesClobberMemory(InstDep.getInst())) { - Instruction *DepStore = InstDep.getInst(); - if (isStoreAtLeastAsWideAs(Inst, DepStore, TD) && - isElidable(DepStore)) { - // Delete the store and now-dead instructions that feed it. - DeleteDeadInstruction(DepStore); - ++NumFastStores; - MadeChange = true; - // DeleteDeadInstruction can delete the current instruction in loop - // cases, reset BBI. - BBI = Inst; - if (BBI != BB.begin()) - --BBI; - continue; - } - } + MemDepResult InstDep = MD->getDependency(Inst); - if (!isElidable(Inst)) + // Ignore non-local store liveness. + // FIXME: cross-block DSE would be fun. :) + if (InstDep.isNonLocal() || + // Ignore self dependence, which happens in the entry block of the + // function. + InstDep.getInst() == Inst) continue; - + // If we're storing the same value back to a pointer that we just // loaded from, then the store can be removed. if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { if (LoadInst *DepLoad = dyn_cast<LoadInst>(InstDep.getInst())) { if (SI->getPointerOperand() == DepLoad->getPointerOperand() && - SI->getOperand(0) == DepLoad) { + SI->getOperand(0) == DepLoad && !SI->isVolatile()) { + DEBUG(dbgs() << "DSE: Remove Store Of Load from same pointer:\n " + << "LOAD: " << *DepLoad << "\n STORE: " << *SI << '\n'); + // DeleteDeadInstruction can delete the current instruction. Save BBI // in case we need it. 
WeakVH NextInst(BBI); - DeleteDeadInstruction(SI); + DeleteDeadInstruction(SI, *MD); if (NextInst == 0) // Next instruction deleted. BBI = BB.begin(); @@ -258,24 +460,63 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { } } - // If this is a lifetime end marker, we can throw away the store. - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(InstDep.getInst())) { - if (II->getIntrinsicID() == Intrinsic::lifetime_end) { - // Delete the store and now-dead instructions that feed it. - // DeleteDeadInstruction can delete the current instruction. Save BBI - // in case we need it. - WeakVH NextInst(BBI); - - DeleteDeadInstruction(Inst); + // Figure out what location is being stored to. + AliasAnalysis::Location Loc = getLocForWrite(Inst, *AA); + + // If we didn't get a useful location, fail. + if (Loc.Ptr == 0) + continue; + + while (!InstDep.isNonLocal()) { + // Get the memory clobbered by the instruction we depend on. MemDep will + // skip any instructions that 'Loc' clearly doesn't interact with. If we + // end up depending on a may- or must-aliased load, then we can't optimize + // away the store and we bail out. However, if we depend on on something + // that overwrites the memory location we *can* potentially optimize it. + // + // Find out what memory location the dependant instruction stores. + Instruction *DepWrite = InstDep.getInst(); + AliasAnalysis::Location DepLoc = getLocForWrite(DepWrite, *AA); + // If we didn't get a useful location, or if it isn't a size, bail out. + if (DepLoc.Ptr == 0) + break; + + // If we find a write that is a) removable (i.e., non-volatile), b) is + // completely obliterated by the store to 'Loc', and c) which we know that + // 'Inst' doesn't load from, then we can remove it. + if (isRemovable(DepWrite) && isCompleteOverwrite(Loc, DepLoc, *AA) && + !isPossibleSelfRead(Inst, Loc, DepWrite, *AA)) { + DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " + << *DepWrite << "\n KILLER: " << *Inst << '\n'); - if (NextInst == 0) // Next instruction deleted. - BBI = BB.begin(); - else if (BBI != BB.begin()) // Revisit this instruction if possible. - --BBI; + // Delete the store and now-dead instructions that feed it. + DeleteDeadInstruction(DepWrite, *MD); ++NumFastStores; MadeChange = true; - continue; + + // DeleteDeadInstruction can delete the current instruction in loop + // cases, reset BBI. + BBI = Inst; + if (BBI != BB.begin()) + --BBI; + break; } + + // If this is a may-aliased store that is clobbering the store value, we + // can keep searching past it for another must-aliased pointer that stores + // to the same location. For example, in: + // store -> P + // store -> Q + // store -> P + // we can remove the first store to P even though we don't know if P and Q + // alias. + if (DepWrite == &BB.front()) break; + + // Can't look past this instruction if it might read 'Loc'. + if (AA->getModRefInfo(DepWrite, Loc) & AliasAnalysis::Ref) + break; + + InstDep = MD->getPointerDependencyFrom(Loc, false, DepWrite, &BB); } } @@ -287,26 +528,36 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { return MadeChange; } -/// handleFreeWithNonTrivialDependency - Handle frees of entire structures whose -/// dependency is a store to a field of that structure. 
-bool DSE::handleFreeWithNonTrivialDependency(const CallInst *F, - MemDepResult Dep) { - AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); - - Instruction *Dependency = Dep.getInst(); - if (!Dependency || !doesClobberMemory(Dependency) || !isElidable(Dependency)) - return false; +/// HandleFree - Handle frees of entire structures whose dependency is a store +/// to a field of that structure. +bool DSE::HandleFree(CallInst *F) { + MemDepResult Dep = MD->getDependency(F); + do { + if (Dep.isNonLocal()) return false; + + Instruction *Dependency = Dep.getInst(); + if (!hasMemoryWrite(Dependency) || !isRemovable(Dependency)) + return false; - Value *DepPointer = getPointerOperand(Dependency)->getUnderlyingObject(); + Value *DepPointer = + GetUnderlyingObject(getStoredPointerOperand(Dependency)); - // Check for aliasing. - if (AA.alias(F->getArgOperand(0), 1, DepPointer, 1) != - AliasAnalysis::MustAlias) - return false; + // Check for aliasing. + if (!AA->isMustAlias(F->getArgOperand(0), DepPointer)) + return false; + + // DCE instructions only used to calculate that store + DeleteDeadInstruction(Dependency, *MD); + ++NumFastStores; + + // Inst's old Dependency is now deleted. Compute the next dependency, + // which may also be dead, as in + // s[0] = 0; + // s[1] = 0; // This has just been deleted. + // free(s); + Dep = MD->getDependency(F); + } while (!Dep.isNonLocal()); - // DCE instructions only used to calculate that store - DeleteDeadInstruction(Dependency); - ++NumFastStores; return true; } @@ -317,259 +568,163 @@ bool DSE::handleFreeWithNonTrivialDependency(const CallInst *F, /// store i32 1, i32* %A /// ret void bool DSE::handleEndBlock(BasicBlock &BB) { - AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); - bool MadeChange = false; - // Pointers alloca'd in this function are dead in the end block - SmallPtrSet<Value*, 64> deadPointers; + // Keep track of all of the stack objects that are dead at the end of the + // function. + SmallPtrSet<Value*, 16> DeadStackObjects; // Find all of the alloca'd pointers in the entry block. BasicBlock *Entry = BB.getParent()->begin(); for (BasicBlock::iterator I = Entry->begin(), E = Entry->end(); I != E; ++I) if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) - deadPointers.insert(AI); + DeadStackObjects.insert(AI); // Treat byval arguments the same, stores to them are dead at the end of the // function. for (Function::arg_iterator AI = BB.getParent()->arg_begin(), AE = BB.getParent()->arg_end(); AI != AE; ++AI) if (AI->hasByValAttr()) - deadPointers.insert(AI); + DeadStackObjects.insert(AI); // Scan the basic block backwards for (BasicBlock::iterator BBI = BB.end(); BBI != BB.begin(); ){ --BBI; - // If we find a store whose pointer is dead. - if (doesClobberMemory(BBI)) { - if (isElidable(BBI)) { - // See through pointer-to-pointer bitcasts - Value *pointerOperand = getPointerOperand(BBI)->getUnderlyingObject(); - - // Alloca'd pointers or byval arguments (which are functionally like - // alloca's) are valid candidates for removal. - if (deadPointers.count(pointerOperand)) { - // DCE instructions only used to calculate that store. - Instruction *Dead = BBI; - ++BBI; - DeleteDeadInstruction(Dead, &deadPointers); - ++NumFastStores; - MadeChange = true; - continue; - } - } - - // Because a memcpy or memmove is also a load, we can't skip it if we - // didn't remove it. - if (!isa<MemTransferInst>(BBI)) + // If we find a store, check to see if it points into a dead stack value. 
+ if (hasMemoryWrite(BBI) && isRemovable(BBI)) { + // See through pointer-to-pointer bitcasts + Value *Pointer = GetUnderlyingObject(getStoredPointerOperand(BBI)); + + // Stores to stack values are valid candidates for removal. + if (DeadStackObjects.count(Pointer)) { + Instruction *Dead = BBI++; + + DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: " + << *Dead << "\n Object: " << *Pointer << '\n'); + + // DCE instructions only used to calculate that store. + DeleteDeadInstruction(Dead, *MD, &DeadStackObjects); + ++NumFastStores; + MadeChange = true; continue; + } } - Value *killPointer = 0; - uint64_t killPointerSize = ~0UL; + // Remove any dead non-memory-mutating instructions. + if (isInstructionTriviallyDead(BBI)) { + Instruction *Inst = BBI++; + DeleteDeadInstruction(Inst, *MD, &DeadStackObjects); + ++NumFastOther; + MadeChange = true; + continue; + } - // If we encounter a use of the pointer, it is no longer considered dead - if (LoadInst *L = dyn_cast<LoadInst>(BBI)) { - // However, if this load is unused and not volatile, we can go ahead and - // remove it, and not have to worry about it making our pointer undead! - if (L->use_empty() && !L->isVolatile()) { - ++BBI; - DeleteDeadInstruction(L, &deadPointers); - ++NumFastOther; - MadeChange = true; - continue; - } - - killPointer = L->getPointerOperand(); - } else if (VAArgInst *V = dyn_cast<VAArgInst>(BBI)) { - killPointer = V->getOperand(0); - } else if (isa<MemTransferInst>(BBI) && - isa<ConstantInt>(cast<MemTransferInst>(BBI)->getLength())) { - killPointer = cast<MemTransferInst>(BBI)->getSource(); - killPointerSize = cast<ConstantInt>( - cast<MemTransferInst>(BBI)->getLength())->getZExtValue(); - } else if (AllocaInst *A = dyn_cast<AllocaInst>(BBI)) { - deadPointers.erase(A); - - // Dead alloca's can be DCE'd when we reach them - if (A->use_empty()) { - ++BBI; - DeleteDeadInstruction(A, &deadPointers); - ++NumFastOther; - MadeChange = true; - } - + if (AllocaInst *A = dyn_cast<AllocaInst>(BBI)) { + DeadStackObjects.erase(A); continue; - } else if (CallSite CS = cast<Value>(BBI)) { - // If this call does not access memory, it can't - // be undeadifying any of our pointers. - if (AA.doesNotAccessMemory(CS)) + } + + if (CallSite CS = cast<Value>(BBI)) { + // If this call does not access memory, it can't be loading any of our + // pointers. + if (AA->doesNotAccessMemory(CS)) continue; - unsigned modRef = 0; - unsigned other = 0; + unsigned NumModRef = 0, NumOther = 0; - // Remove any pointers made undead by the call from the dead set - std::vector<Value*> dead; - for (SmallPtrSet<Value*, 64>::iterator I = deadPointers.begin(), - E = deadPointers.end(); I != E; ++I) { - // HACK: if we detect that our AA is imprecise, it's not - // worth it to scan the rest of the deadPointers set. Just - // assume that the AA will return ModRef for everything, and - // go ahead and bail. - if (modRef >= 16 && other == 0) { - deadPointers.clear(); + // If the call might load from any of our allocas, then any store above + // the call is live. + SmallVector<Value*, 8> LiveAllocas; + for (SmallPtrSet<Value*, 16>::iterator I = DeadStackObjects.begin(), + E = DeadStackObjects.end(); I != E; ++I) { + // If we detect that our AA is imprecise, it's not worth it to scan the + // rest of the DeadPointers set. Just assume that the AA will return + // ModRef for everything, and go ahead and bail out. 
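handleEndBlock above seeds DeadStackObjects with every alloca and byval argument, then scans the block bottom-up: a store into a still-dead object is deleted, while loads and calls that might read an object pull it out of the set. The following toy standalone version of that scan uses string names in place of Values and an explicit "objects this call may read" list instead of real alias analysis; both are assumptions for illustration only.

#include <iostream>
#include <set>
#include <string>
#include <vector>

struct Inst {
  enum Kind { Store, Load, Call } K;
  std::string Obj;                 // stack object touched (empty for Call)
  std::vector<std::string> Reads;  // objects a call might read
};

int main() {
  std::set<std::string> DeadStackObjects = {"a", "b"};  // allocas in the entry block
  std::vector<Inst> Block = {
      {Inst::Store, "a", {}},       // removable: 'a' is never read afterwards
      {Inst::Call,  "",  {"b"}},    // might read 'b' -> 'b' becomes live
      {Inst::Store, "b", {}},       // NOT removable
      {Inst::Load,  "b", {}},
  };

  // Scan backwards, exactly like the loop above.
  for (size_t i = Block.size(); i-- > 0;) {
    Inst &I = Block[i];
    if (I.K == Inst::Store && DeadStackObjects.count(I.Obj)) {
      std::cout << "remove dead store to " << I.Obj << '\n';
      continue;
    }
    if (I.K == Inst::Load)
      DeadStackObjects.erase(I.Obj);          // the store above it stays live
    else if (I.K == Inst::Call)
      for (const std::string &R : I.Reads)    // conservatively revive anything
        DeadStackObjects.erase(R);            // the call might load from
  }
}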
+ if (NumModRef >= 16 && NumOther == 0) return MadeChange; - } - - // See if the call site touches it - AliasAnalysis::ModRefResult A = AA.getModRefInfo(CS, *I, - getPointerSize(*I)); + + // See if the call site touches it. + AliasAnalysis::ModRefResult A = + AA->getModRefInfo(CS, *I, getPointerSize(*I, *AA)); if (A == AliasAnalysis::ModRef) - ++modRef; + ++NumModRef; else - ++other; + ++NumOther; if (A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref) - dead.push_back(*I); + LiveAllocas.push_back(*I); } - - for (std::vector<Value*>::iterator I = dead.begin(), E = dead.end(); - I != E; ++I) - deadPointers.erase(*I); - continue; - } else if (isInstructionTriviallyDead(BBI)) { - // For any non-memory-affecting non-terminators, DCE them as we reach them - Instruction *Inst = BBI; - ++BBI; - DeleteDeadInstruction(Inst, &deadPointers); - ++NumFastOther; - MadeChange = true; + for (SmallVector<Value*, 8>::iterator I = LiveAllocas.begin(), + E = LiveAllocas.end(); I != E; ++I) + DeadStackObjects.erase(*I); + + // If all of the allocas were clobbered by the call then we're not going + // to find anything else to process. + if (DeadStackObjects.empty()) + return MadeChange; + continue; } - if (!killPointer) + AliasAnalysis::Location LoadedLoc; + + // If we encounter a use of the pointer, it is no longer considered dead + if (LoadInst *L = dyn_cast<LoadInst>(BBI)) { + LoadedLoc = AA->getLocation(L); + } else if (VAArgInst *V = dyn_cast<VAArgInst>(BBI)) { + LoadedLoc = AA->getLocation(V); + } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(BBI)) { + LoadedLoc = AA->getLocationForSource(MTI); + } else { + // Not a loading instruction. continue; + } - killPointer = killPointer->getUnderlyingObject(); + // Remove any allocas from the DeadPointer set that are loaded, as this + // makes any stores above the access live. + RemoveAccessedObjects(LoadedLoc, DeadStackObjects); - // Deal with undead pointers - MadeChange |= RemoveUndeadPointers(killPointer, killPointerSize, BBI, - deadPointers); + // If all of the allocas were clobbered by the access then we're not going + // to find anything else to process. + if (DeadStackObjects.empty()) + break; } return MadeChange; } -/// RemoveUndeadPointers - check for uses of a pointer that make it -/// undead when scanning for dead stores to alloca's. -bool DSE::RemoveUndeadPointers(Value *killPointer, uint64_t killPointerSize, - BasicBlock::iterator &BBI, - SmallPtrSet<Value*, 64> &deadPointers) { - AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); - - // If the kill pointer can be easily reduced to an alloca, - // don't bother doing extraneous AA queries. - if (deadPointers.count(killPointer)) { - deadPointers.erase(killPointer); - return false; - } - - // A global can't be in the dead pointer set. - if (isa<GlobalValue>(killPointer)) - return false; - - bool MadeChange = false; +/// RemoveAccessedObjects - Check to see if the specified location may alias any +/// of the stack objects in the DeadStackObjects set. If so, they become live +/// because the location is being loaded. +void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, + SmallPtrSet<Value*, 16> &DeadStackObjects) { + const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr); + + // A constant can't be in the dead pointer set. + if (isa<Constant>(UnderlyingPointer)) + return; - SmallVector<Value*, 16> undead; + // If the kill pointer can be easily reduced to an alloca, don't bother doing + // extraneous AA queries. 
+ if (isa<AllocaInst>(UnderlyingPointer) || isa<Argument>(UnderlyingPointer)) { + DeadStackObjects.erase(const_cast<Value*>(UnderlyingPointer)); + return; + } - for (SmallPtrSet<Value*, 64>::iterator I = deadPointers.begin(), - E = deadPointers.end(); I != E; ++I) { - // See if this pointer could alias it - AliasAnalysis::AliasResult A = AA.alias(*I, getPointerSize(*I), - killPointer, killPointerSize); - - // If it must-alias and a store, we can delete it - if (isa<StoreInst>(BBI) && A == AliasAnalysis::MustAlias) { - StoreInst *S = cast<StoreInst>(BBI); - - // Remove it! - ++BBI; - DeleteDeadInstruction(S, &deadPointers); - ++NumFastStores; - MadeChange = true; - - continue; - - // Otherwise, it is undead - } else if (A != AliasAnalysis::NoAlias) - undead.push_back(*I); + SmallVector<Value*, 16> NowLive; + for (SmallPtrSet<Value*, 16>::iterator I = DeadStackObjects.begin(), + E = DeadStackObjects.end(); I != E; ++I) { + // See if the loaded location could alias the stack location. + AliasAnalysis::Location StackLoc(*I, getPointerSize(*I, *AA)); + if (!AA->isNoAlias(StackLoc, LoadedLoc)) + NowLive.push_back(*I); } - for (SmallVector<Value*, 16>::iterator I = undead.begin(), E = undead.end(); + for (SmallVector<Value*, 16>::iterator I = NowLive.begin(), E = NowLive.end(); I != E; ++I) - deadPointers.erase(*I); - - return MadeChange; + DeadStackObjects.erase(*I); } -/// DeleteDeadInstruction - Delete this instruction. Before we do, go through -/// and zero out all the operands of this instruction. If any of them become -/// dead, delete them and the computation tree that feeds them. -/// -/// If ValueSet is non-null, remove any deleted instructions from it as well. -/// -void DSE::DeleteDeadInstruction(Instruction *I, - SmallPtrSet<Value*, 64> *ValueSet) { - SmallVector<Instruction*, 32> NowDeadInsts; - - NowDeadInsts.push_back(I); - --NumFastOther; - - // Before we touch this instruction, remove it from memdep! - MemoryDependenceAnalysis &MDA = getAnalysis<MemoryDependenceAnalysis>(); - do { - Instruction *DeadInst = NowDeadInsts.pop_back_val(); - - ++NumFastOther; - - // This instruction is dead, zap it, in stages. Start by removing it from - // MemDep, which needs to know the operands and needs it to be in the - // function. - MDA.removeInstruction(DeadInst); - - for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) { - Value *Op = DeadInst->getOperand(op); - DeadInst->setOperand(op, 0); - - // If this operand just became dead, add it to the NowDeadInsts list. 
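DeleteDeadInstruction (shown here in its old form; the rewritten version also threads MemoryDependenceAnalysis through as *MD) works off a worklist: deleting one instruction drops a use from each of its operands, and any operand that thereby becomes trivially dead is queued for deletion in turn. A self-contained sketch with a toy use-count map follows; the Node record and string-named values are illustrative assumptions.

#include <iostream>
#include <map>
#include <string>
#include <vector>

struct Node {
  std::vector<std::string> Operands;
  unsigned Uses;
  bool SideEffects;
};

int main() {
  //   %a = ...            (only used by %b)
  //   %b = add %a, %a     (only used by the store)
  //   store %b, %p        <- the instruction DSE decided to delete
  std::map<std::string, Node> IR = {
      {"a",     {{},         2, false}},
      {"b",     {{"a", "a"}, 1, false}},
      {"store", {{"b"},      0, true }},
  };

  std::vector<std::string> NowDead = {"store"};
  while (!NowDead.empty()) {
    std::string Name = NowDead.back();
    NowDead.pop_back();
    std::cout << "deleting " << Name << '\n';

    // Drop this instruction's uses of its operands; operands that just became
    // dead (no remaining uses, no side effects) join the worklist.
    for (const std::string &Op : IR[Name].Operands)
      if (--IR[Op].Uses == 0 && !IR[Op].SideEffects)
        NowDead.push_back(Op);

    IR.erase(Name);
  }
}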
- if (!Op->use_empty()) continue; - - if (Instruction *OpI = dyn_cast<Instruction>(Op)) - if (isInstructionTriviallyDead(OpI)) - NowDeadInsts.push_back(OpI); - } - - DeadInst->eraseFromParent(); - - if (ValueSet) ValueSet->erase(DeadInst); - } while (!NowDeadInsts.empty()); -} - -unsigned DSE::getPointerSize(Value *V) const { - if (TD) { - if (AllocaInst *A = dyn_cast<AllocaInst>(V)) { - // Get size information for the alloca - if (ConstantInt *C = dyn_cast<ConstantInt>(A->getArraySize())) - return C->getZExtValue() * TD->getTypeAllocSize(A->getAllocatedType()); - } else { - assert(isa<Argument>(V) && "Expected AllocaInst or Argument!"); - const PointerType *PT = cast<PointerType>(V->getType()); - return TD->getTypeAllocSize(PT->getElementType()); - } - } - return ~0U; -} diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp new file mode 100644 index 0000000..3d3f17b --- /dev/null +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -0,0 +1,470 @@ +//===- EarlyCSE.cpp - Simple and fast CSE pass ----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass performs a simple dominator tree walk that eliminates trivially +// redundant instructions. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "early-cse" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Instructions.h" +#include "llvm/Pass.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/RecyclingAllocator.h" +#include "llvm/ADT/ScopedHashTable.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumSimplify, "Number of instructions simplified or DCE'd"); +STATISTIC(NumCSE, "Number of instructions CSE'd"); +STATISTIC(NumCSELoad, "Number of load instructions CSE'd"); +STATISTIC(NumCSECall, "Number of call instructions CSE'd"); +STATISTIC(NumDSE, "Number of trivial dead stores removed"); + +static unsigned getHash(const void *V) { + return DenseMapInfo<const void*>::getHashValue(V); +} + +//===----------------------------------------------------------------------===// +// SimpleValue +//===----------------------------------------------------------------------===// + +namespace { + /// SimpleValue - Instances of this struct represent available values in the + /// scoped hash table. + struct SimpleValue { + Instruction *Inst; + + SimpleValue(Instruction *I) : Inst(I) { + assert((isSentinel() || canHandle(I)) && "Inst can't be handled!"); + } + + bool isSentinel() const { + return Inst == DenseMapInfo<Instruction*>::getEmptyKey() || + Inst == DenseMapInfo<Instruction*>::getTombstoneKey(); + } + + static bool canHandle(Instruction *Inst) { + // This can only handle non-void readnone functions. 
+ if (CallInst *CI = dyn_cast<CallInst>(Inst)) + return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy(); + return isa<CastInst>(Inst) || isa<BinaryOperator>(Inst) || + isa<GetElementPtrInst>(Inst) || isa<CmpInst>(Inst) || + isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) || + isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst) || + isa<ExtractValueInst>(Inst) || isa<InsertValueInst>(Inst); + } + }; +} + +namespace llvm { +// SimpleValue is POD. +template<> struct isPodLike<SimpleValue> { + static const bool value = true; +}; + +template<> struct DenseMapInfo<SimpleValue> { + static inline SimpleValue getEmptyKey() { + return DenseMapInfo<Instruction*>::getEmptyKey(); + } + static inline SimpleValue getTombstoneKey() { + return DenseMapInfo<Instruction*>::getTombstoneKey(); + } + static unsigned getHashValue(SimpleValue Val); + static bool isEqual(SimpleValue LHS, SimpleValue RHS); +}; +} + +unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { + Instruction *Inst = Val.Inst; + + // Hash in all of the operands as pointers. + unsigned Res = 0; + for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) + Res ^= getHash(Inst->getOperand(i)) << i; + + if (CastInst *CI = dyn_cast<CastInst>(Inst)) + Res ^= getHash(CI->getType()); + else if (CmpInst *CI = dyn_cast<CmpInst>(Inst)) + Res ^= CI->getPredicate(); + else if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Inst)) { + for (ExtractValueInst::idx_iterator I = EVI->idx_begin(), + E = EVI->idx_end(); I != E; ++I) + Res ^= *I; + } else if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(Inst)) { + for (InsertValueInst::idx_iterator I = IVI->idx_begin(), + E = IVI->idx_end(); I != E; ++I) + Res ^= *I; + } else { + // nothing extra to hash in. + assert((isa<CallInst>(Inst) || + isa<BinaryOperator>(Inst) || isa<GetElementPtrInst>(Inst) || + isa<SelectInst>(Inst) || isa<ExtractElementInst>(Inst) || + isa<InsertElementInst>(Inst) || isa<ShuffleVectorInst>(Inst)) && + "Invalid/unknown instruction"); + } + + // Mix in the opcode. + return (Res << 1) ^ Inst->getOpcode(); +} + +bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { + Instruction *LHSI = LHS.Inst, *RHSI = RHS.Inst; + + if (LHS.isSentinel() || RHS.isSentinel()) + return LHSI == RHSI; + + if (LHSI->getOpcode() != RHSI->getOpcode()) return false; + return LHSI->isIdenticalTo(RHSI); +} + +//===----------------------------------------------------------------------===// +// CallValue +//===----------------------------------------------------------------------===// + +namespace { + /// CallValue - Instances of this struct represent available call values in + /// the scoped hash table. + struct CallValue { + Instruction *Inst; + + CallValue(Instruction *I) : Inst(I) { + assert((isSentinel() || canHandle(I)) && "Inst can't be handled!"); + } + + bool isSentinel() const { + return Inst == DenseMapInfo<Instruction*>::getEmptyKey() || + Inst == DenseMapInfo<Instruction*>::getTombstoneKey(); + } + + static bool canHandle(Instruction *Inst) { + // Don't value number anything that returns void. + if (Inst->getType()->isVoidTy()) + return false; + + CallInst *CI = dyn_cast<CallInst>(Inst); + if (CI == 0 || !CI->onlyReadsMemory()) + return false; + return true; + } + }; +} + +namespace llvm { + // CallValue is POD. 
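The SimpleValue hash above folds the operands in by identity, shifted by operand position, and mixes the opcode in last; equal keys are then confirmed with isIdenticalTo, so the hash only needs to be cheap and stable. Below is a toy reimplementation of that scheme over a made-up Instr type (the opcode constant and operand pointers are invented for the example).

#include <functional>
#include <iostream>
#include <vector>

struct Instr {
  unsigned Opcode;
  std::vector<const void *> Operands;  // operands hashed by identity
};

static unsigned hashValue(const Instr &I) {
  unsigned Res = 0;
  for (unsigned i = 0, e = I.Operands.size(); i != e; ++i)
    Res ^= (unsigned)std::hash<const void *>()(I.Operands[i]) << i;
  return (Res << 1) ^ I.Opcode;  // mix in the opcode
}

int main() {
  int X, Y;                          // stand-ins for two SSA values
  Instr A{/*add*/ 13, {&X, &Y}};
  Instr B{/*add*/ 13, {&X, &Y}};
  Instr C{/*add*/ 13, {&Y, &X}};     // operands swapped: different key
  std::cout << std::boolalpha
            << (hashValue(A) == hashValue(B)) << '\n'   // true
            << (hashValue(A) == hashValue(C)) << '\n';  // usually false
}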
+ template<> struct isPodLike<CallValue> { + static const bool value = true; + }; + + template<> struct DenseMapInfo<CallValue> { + static inline CallValue getEmptyKey() { + return DenseMapInfo<Instruction*>::getEmptyKey(); + } + static inline CallValue getTombstoneKey() { + return DenseMapInfo<Instruction*>::getTombstoneKey(); + } + static unsigned getHashValue(CallValue Val); + static bool isEqual(CallValue LHS, CallValue RHS); + }; +} +unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) { + Instruction *Inst = Val.Inst; + // Hash in all of the operands as pointers. + unsigned Res = 0; + for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) { + assert(!Inst->getOperand(i)->getType()->isMetadataTy() && + "Cannot value number calls with metadata operands"); + Res ^= getHash(Inst->getOperand(i)) << i; + } + + // Mix in the opcode. + return (Res << 1) ^ Inst->getOpcode(); +} + +bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) { + Instruction *LHSI = LHS.Inst, *RHSI = RHS.Inst; + if (LHS.isSentinel() || RHS.isSentinel()) + return LHSI == RHSI; + return LHSI->isIdenticalTo(RHSI); +} + + +//===----------------------------------------------------------------------===// +// EarlyCSE pass. +//===----------------------------------------------------------------------===// + +namespace { + +/// EarlyCSE - This pass does a simple depth-first walk over the dominator +/// tree, eliminating trivially redundant instructions and using instsimplify +/// to canonicalize things as it goes. It is intended to be fast and catch +/// obvious cases so that instcombine and other passes are more effective. It +/// is expected that a later pass of GVN will catch the interesting/hard +/// cases. +class EarlyCSE : public FunctionPass { +public: + const TargetData *TD; + DominatorTree *DT; + typedef RecyclingAllocator<BumpPtrAllocator, + ScopedHashTableVal<SimpleValue, Value*> > AllocatorTy; + typedef ScopedHashTable<SimpleValue, Value*, DenseMapInfo<SimpleValue>, + AllocatorTy> ScopedHTType; + + /// AvailableValues - This scoped hash table contains the current values of + /// all of our simple scalar expressions. As we walk down the domtree, we + /// look to see if instructions are in this: if so, we replace them with what + /// we find, otherwise we insert them so that dominated values can succeed in + /// their lookup. + ScopedHTType *AvailableValues; + + /// AvailableLoads - This scoped hash table contains the current values + /// of loads. This allows us to get efficient access to dominating loads when + /// we have a fully redundant load. In addition to the most recent load, we + /// keep track of a generation count of the read, which is compared against + /// the current generation count. The current generation count is + /// incremented after every possibly writing memory operation, which ensures + /// that we only CSE loads with other loads that have no intervening store. + typedef RecyclingAllocator<BumpPtrAllocator, + ScopedHashTableVal<Value*, std::pair<Value*, unsigned> > > LoadMapAllocator; + typedef ScopedHashTable<Value*, std::pair<Value*, unsigned>, + DenseMapInfo<Value*>, LoadMapAllocator> LoadHTType; + LoadHTType *AvailableLoads; + + /// AvailableCalls - This scoped hash table contains the current values + /// of read-only call values. It uses the same generation count as loads. + typedef ScopedHashTable<CallValue, std::pair<Value*, unsigned> > CallHTType; + CallHTType *AvailableCalls; + + /// CurrentGeneration - This is the current generation of the memory value. 
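AvailableValues, AvailableLoads and AvailableCalls are ScopedHashTables: each dominator-tree node opens a scope, and everything inserted inside that scope is forgotten when the walk returns to the parent node. The minimal toy equivalent below (a hand-rolled ScopedMap, not llvm/ADT/ScopedHashTable.h) shows just that push/pop behaviour; the expression strings are placeholders.

#include <iostream>
#include <map>
#include <string>
#include <vector>

class ScopedMap {
  std::map<std::string, int> Table;
public:
  class Scope {
    ScopedMap &M;
    std::vector<std::pair<std::string, bool>> Inserted;  // key, had-old-value
    std::vector<int> OldValues;
  public:
    explicit Scope(ScopedMap &M) : M(M) {}
    ~Scope() {  // pop: undo exactly this scope's insertions
      for (size_t i = Inserted.size(); i-- > 0;) {
        if (Inserted[i].second)
          M.Table[Inserted[i].first] = OldValues[i];
        else
          M.Table.erase(Inserted[i].first);
      }
    }
    void insert(const std::string &K, int V) {
      auto It = M.Table.find(K);
      Inserted.push_back({K, It != M.Table.end()});
      OldValues.push_back(It != M.Table.end() ? It->second : 0);
      M.Table[K] = V;
    }
  };
  const int *lookup(const std::string &K) const {
    auto It = Table.find(K);
    return It == Table.end() ? nullptr : &It->second;
  }
};

int main() {
  ScopedMap AvailableValues;
  ScopedMap::Scope Outer(AvailableValues);
  Outer.insert("x+y", 1);
  {
    ScopedMap::Scope Inner(AvailableValues);  // entering a child domtree node
    Inner.insert("a*b", 2);
    std::cout << (AvailableValues.lookup("x+y") != nullptr) << '\n';  // 1: dominating value visible
  }                                           // leaving the child: "a*b" forgotten
  std::cout << (AvailableValues.lookup("a*b") != nullptr) << '\n';    // 0
}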
+ unsigned CurrentGeneration; + + static char ID; + explicit EarlyCSE() : FunctionPass(ID) { + initializeEarlyCSEPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F); + +private: + + bool processNode(DomTreeNode *Node); + + // This transformation requires dominator postdominator info + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<DominatorTree>(); + AU.setPreservesCFG(); + } +}; +} + +char EarlyCSE::ID = 0; + +// createEarlyCSEPass - The public interface to this file. +FunctionPass *llvm::createEarlyCSEPass() { + return new EarlyCSE(); +} + +INITIALIZE_PASS_BEGIN(EarlyCSE, "early-cse", "Early CSE", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_END(EarlyCSE, "early-cse", "Early CSE", false, false) + +bool EarlyCSE::processNode(DomTreeNode *Node) { + // Define a scope in the scoped hash table. When we are done processing this + // domtree node and recurse back up to our parent domtree node, this will pop + // off all the values we install. + ScopedHTType::ScopeTy Scope(*AvailableValues); + + // Define a scope for the load values so that anything we add will get + // popped when we recurse back up to our parent domtree node. + LoadHTType::ScopeTy LoadScope(*AvailableLoads); + + // Define a scope for the call values so that anything we add will get + // popped when we recurse back up to our parent domtree node. + CallHTType::ScopeTy CallScope(*AvailableCalls); + + BasicBlock *BB = Node->getBlock(); + + // If this block has a single predecessor, then the predecessor is the parent + // of the domtree node and all of the live out memory values are still current + // in this block. If this block has multiple predecessors, then they could + // have invalidated the live-out memory values of our parent value. For now, + // just be conservative and invalidate memory if this block has multiple + // predecessors. + if (BB->getSinglePredecessor() == 0) + ++CurrentGeneration; + + /// LastStore - Keep track of the last non-volatile store that we saw... for + /// as long as there in no instruction that reads memory. If we see a store + /// to the same location, we delete the dead store. This zaps trivial dead + /// stores which can occur in bitfield code among other things. + StoreInst *LastStore = 0; + + bool Changed = false; + + // See if any instructions in the block can be eliminated. If so, do it. If + // not, add them to AvailableValues. + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { + Instruction *Inst = I++; + + // Dead instructions should just be removed. + if (isInstructionTriviallyDead(Inst)) { + DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n'); + Inst->eraseFromParent(); + Changed = true; + ++NumSimplify; + continue; + } + + // If the instruction can be simplified (e.g. X+0 = X) then replace it with + // its simpler value. + if (Value *V = SimplifyInstruction(Inst, TD, DT)) { + DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V << '\n'); + Inst->replaceAllUsesWith(V); + Inst->eraseFromParent(); + Changed = true; + ++NumSimplify; + continue; + } + + // If this is a simple instruction that we can value number, process it. + if (SimpleValue::canHandle(Inst)) { + // See if the instruction has an available value. If so, use it. 
+ if (Value *V = AvailableValues->lookup(Inst)) { + DEBUG(dbgs() << "EarlyCSE CSE: " << *Inst << " to: " << *V << '\n'); + Inst->replaceAllUsesWith(V); + Inst->eraseFromParent(); + Changed = true; + ++NumCSE; + continue; + } + + // Otherwise, just remember that this value is available. + AvailableValues->insert(Inst, Inst); + continue; + } + + // If this is a non-volatile load, process it. + if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + // Ignore volatile loads. + if (LI->isVolatile()) { + LastStore = 0; + continue; + } + + // If we have an available version of this load, and if it is the right + // generation, replace this instruction. + std::pair<Value*, unsigned> InVal = + AvailableLoads->lookup(Inst->getOperand(0)); + if (InVal.first != 0 && InVal.second == CurrentGeneration) { + DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst << " to: " + << *InVal.first << '\n'); + if (!Inst->use_empty()) Inst->replaceAllUsesWith(InVal.first); + Inst->eraseFromParent(); + Changed = true; + ++NumCSELoad; + continue; + } + + // Otherwise, remember that we have this instruction. + AvailableLoads->insert(Inst->getOperand(0), + std::pair<Value*, unsigned>(Inst, CurrentGeneration)); + LastStore = 0; + continue; + } + + // If this instruction may read from memory, forget LastStore. + if (Inst->mayReadFromMemory()) + LastStore = 0; + + // If this is a read-only call, process it. + if (CallValue::canHandle(Inst)) { + // If we have an available version of this call, and if it is the right + // generation, replace this instruction. + std::pair<Value*, unsigned> InVal = AvailableCalls->lookup(Inst); + if (InVal.first != 0 && InVal.second == CurrentGeneration) { + DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst << " to: " + << *InVal.first << '\n'); + if (!Inst->use_empty()) Inst->replaceAllUsesWith(InVal.first); + Inst->eraseFromParent(); + Changed = true; + ++NumCSECall; + continue; + } + + // Otherwise, remember that we have this instruction. + AvailableCalls->insert(Inst, + std::pair<Value*, unsigned>(Inst, CurrentGeneration)); + continue; + } + + // Okay, this isn't something we can CSE at all. Check to see if it is + // something that could modify memory. If so, our available memory values + // cannot be used so bump the generation count. + if (Inst->mayWriteToMemory()) { + ++CurrentGeneration; + + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + // We do a trivial form of DSE if there are two stores to the same + // location with no intervening loads. Delete the earlier store. + if (LastStore && + LastStore->getPointerOperand() == SI->getPointerOperand()) { + DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore << " due to: " + << *Inst << '\n'); + LastStore->eraseFromParent(); + Changed = true; + ++NumDSE; + LastStore = 0; + continue; + } + + // Okay, we just invalidated anything we knew about loaded values. Try + // to salvage *something* by remembering that the stored value is a live + // version of the pointer. It is safe to forward from volatile stores + // to non-volatile loads, so we don't have to check for volatility of + // the store. + AvailableLoads->insert(SI->getPointerOperand(), + std::pair<Value*, unsigned>(SI->getValueOperand(), CurrentGeneration)); + + // Remember that this was the last store we saw for DSE. 
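The load table and LastStore above are both guarded by CurrentGeneration: a remembered load (or forwarded store value) is reused only while no possibly-writing instruction has intervened, and back-to-back stores to the same pointer with no intervening read give the trivial DSE case. The toy model below captures that bookkeeping; the Inst record and symbolic pointers are assumptions for illustration, not LLVM IR.

#include <iostream>
#include <map>
#include <string>
#include <vector>

struct Inst {
  enum Kind { Load, Store, Call } K;
  std::string Ptr;   // pointer operand (empty for Call)
};

int main() {
  std::vector<Inst> Block = {
      {Inst::Load,  "p"},   // recorded
      {Inst::Load,  "p"},   // CSE'd: same generation
      {Inst::Store, "p"},   // killed by the next store (trivial DSE)
      {Inst::Store, "p"},
      {Inst::Load,  "p"},   // forwarded from the second store
  };

  unsigned CurrentGeneration = 0;
  std::map<std::string, std::pair<size_t, unsigned>> AvailableLoads; // ptr -> (inst, gen)
  long LastStore = -1;

  for (size_t i = 0; i != Block.size(); ++i) {
    Inst &I = Block[i];
    if (I.K == Inst::Load) {
      auto It = AvailableLoads.find(I.Ptr);
      if (It != AvailableLoads.end() && It->second.second == CurrentGeneration)
        std::cout << "load #" << i << " reuses value from #" << It->second.first << '\n';
      else
        AvailableLoads[I.Ptr] = {i, CurrentGeneration};
      LastStore = -1;                      // a read blocks store-to-store DSE
      continue;
    }
    if (I.K == Inst::Store) {
      ++CurrentGeneration;                 // older loads are no longer valid
      if (LastStore >= 0 && Block[LastStore].Ptr == I.Ptr)
        std::cout << "store #" << LastStore << " is dead (killed by #" << i << ")\n";
      AvailableLoads[I.Ptr] = {i, CurrentGeneration};  // forward the stored value
      LastStore = (long)i;
      continue;
    }
    ++CurrentGeneration;                   // a call may write anything
    LastStore = -1;
  }
}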
+ if (!SI->isVolatile()) + LastStore = SI; + } + } + } + + unsigned LiveOutGeneration = CurrentGeneration; + for (DomTreeNode::iterator I = Node->begin(), E = Node->end(); I != E; ++I) { + Changed |= processNode(*I); + // Pop any generation changes off the stack from the recursive walk. + CurrentGeneration = LiveOutGeneration; + } + return Changed; +} + + +bool EarlyCSE::runOnFunction(Function &F) { + TD = getAnalysisIfAvailable<TargetData>(); + DT = &getAnalysis<DominatorTree>(); + + // Tables that the pass uses when walking the domtree. + ScopedHTType AVTable; + AvailableValues = &AVTable; + LoadHTType LoadTable; + AvailableLoads = &LoadTable; + CallHTType CallTable; + AvailableCalls = &CallTable; + + CurrentGeneration = 0; + return processNode(DT->getRootNode()); +} diff --git a/lib/Transforms/Scalar/GEPSplitter.cpp b/lib/Transforms/Scalar/GEPSplitter.cpp index 53dd06d..4c3d188 100644 --- a/lib/Transforms/Scalar/GEPSplitter.cpp +++ b/lib/Transforms/Scalar/GEPSplitter.cpp @@ -27,13 +27,15 @@ namespace { virtual void getAnalysisUsage(AnalysisUsage &AU) const; public: static char ID; // Pass identification, replacement for typeid - explicit GEPSplitter() : FunctionPass(ID) {} + explicit GEPSplitter() : FunctionPass(ID) { + initializeGEPSplitterPass(*PassRegistry::getPassRegistry()); + } }; } char GEPSplitter::ID = 0; INITIALIZE_PASS(GEPSplitter, "split-geps", - "split complex GEPs into simple GEPs", false, false); + "split complex GEPs into simple GEPs", false, false) FunctionPass *llvm::createGEPSplitterPass() { return new GEPSplitter(); diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index c62ce1f..a0123f5 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -17,39 +17,30 @@ #define DEBUG_TYPE "gvn" #include "llvm/Transforms/Scalar.h" -#include "llvm/BasicBlock.h" -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" #include "llvm/GlobalVariable.h" -#include "llvm/Function.h" #include "llvm/IntrinsicInst.h" #include "llvm/LLVMContext.h" -#include "llvm/Operator.h" -#include "llvm/Value.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/PHITransAddr.h" -#include "llvm/Support/CFG.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/GetElementPtrTypeIterator.h" -#include "llvm/Support/IRBuilder.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Assembly/Writer.h" #include "llvm/Target/TargetData.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Support/Allocator.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/IRBuilder.h" using namespace llvm; STATISTIC(NumGVNInstr, "Number of instructions deleted"); @@ -61,7 +52,6 
@@ STATISTIC(NumPRELoad, "Number of loads PRE'd"); static cl::opt<bool> EnablePRE("enable-pre", cl::init(true), cl::Hidden); static cl::opt<bool> EnableLoadPRE("enable-load-pre", cl::init(true)); -static cl::opt<bool> EnableFullLoadPRE("enable-full-load-pre", cl::init(false)); //===----------------------------------------------------------------------===// // ValueTable Class @@ -72,76 +62,23 @@ static cl::opt<bool> EnableFullLoadPRE("enable-full-load-pre", cl::init(false)); /// two values. namespace { struct Expression { - enum ExpressionOpcode { - ADD = Instruction::Add, - FADD = Instruction::FAdd, - SUB = Instruction::Sub, - FSUB = Instruction::FSub, - MUL = Instruction::Mul, - FMUL = Instruction::FMul, - UDIV = Instruction::UDiv, - SDIV = Instruction::SDiv, - FDIV = Instruction::FDiv, - UREM = Instruction::URem, - SREM = Instruction::SRem, - FREM = Instruction::FRem, - SHL = Instruction::Shl, - LSHR = Instruction::LShr, - ASHR = Instruction::AShr, - AND = Instruction::And, - OR = Instruction::Or, - XOR = Instruction::Xor, - TRUNC = Instruction::Trunc, - ZEXT = Instruction::ZExt, - SEXT = Instruction::SExt, - FPTOUI = Instruction::FPToUI, - FPTOSI = Instruction::FPToSI, - UITOFP = Instruction::UIToFP, - SITOFP = Instruction::SIToFP, - FPTRUNC = Instruction::FPTrunc, - FPEXT = Instruction::FPExt, - PTRTOINT = Instruction::PtrToInt, - INTTOPTR = Instruction::IntToPtr, - BITCAST = Instruction::BitCast, - ICMPEQ, ICMPNE, ICMPUGT, ICMPUGE, ICMPULT, ICMPULE, - ICMPSGT, ICMPSGE, ICMPSLT, ICMPSLE, FCMPOEQ, - FCMPOGT, FCMPOGE, FCMPOLT, FCMPOLE, FCMPONE, - FCMPORD, FCMPUNO, FCMPUEQ, FCMPUGT, FCMPUGE, - FCMPULT, FCMPULE, FCMPUNE, EXTRACT, INSERT, - SHUFFLE, SELECT, GEP, CALL, CONSTANT, - INSERTVALUE, EXTRACTVALUE, EMPTY, TOMBSTONE }; - - ExpressionOpcode opcode; + uint32_t opcode; const Type* type; SmallVector<uint32_t, 4> varargs; - Value *function; Expression() { } - Expression(ExpressionOpcode o) : opcode(o) { } + Expression(uint32_t o) : opcode(o) { } bool operator==(const Expression &other) const { if (opcode != other.opcode) return false; - else if (opcode == EMPTY || opcode == TOMBSTONE) + else if (opcode == ~0U || opcode == ~1U) return true; else if (type != other.type) return false; - else if (function != other.function) + else if (varargs != other.varargs) return false; - else { - if (varargs.size() != other.varargs.size()) - return false; - - for (size_t i = 0; i < varargs.size(); ++i) - if (varargs[i] != other.varargs[i]) - return false; - - return true; - } - } - - bool operator!=(const Expression &other) const { - return !(*this == other); + return true; } }; @@ -155,19 +92,7 @@ namespace { uint32_t nextValueNumber; - Expression::ExpressionOpcode getOpcode(CmpInst* C); - Expression create_expression(BinaryOperator* BO); - Expression create_expression(CmpInst* C); - Expression create_expression(ShuffleVectorInst* V); - Expression create_expression(ExtractElementInst* C); - Expression create_expression(InsertElementInst* V); - Expression create_expression(SelectInst* V); - Expression create_expression(CastInst* C); - Expression create_expression(GetElementPtrInst* G); - Expression create_expression(CallInst* C); - Expression create_expression(ExtractValueInst* C); - Expression create_expression(InsertValueInst* C); - + Expression create_expression(Instruction* I); uint32_t lookup_or_add_call(CallInst* C); public: ValueTable() : nextValueNumber(1) { } @@ -176,7 +101,6 @@ namespace { void add(Value *V, uint32_t num); void clear(); void erase(Value *v); - unsigned size(); void 
setAliasAnalysis(AliasAnalysis* A) { AA = A; } AliasAnalysis *getAliasAnalysis() const { return AA; } void setMemDep(MemoryDependenceAnalysis* M) { MD = M; } @@ -189,11 +113,11 @@ namespace { namespace llvm { template <> struct DenseMapInfo<Expression> { static inline Expression getEmptyKey() { - return Expression(Expression::EMPTY); + return ~0U; } static inline Expression getTombstoneKey() { - return Expression(Expression::TOMBSTONE); + return ~1U; } static unsigned getHashValue(const Expression e) { @@ -205,20 +129,13 @@ template <> struct DenseMapInfo<Expression> { for (SmallVector<uint32_t, 4>::const_iterator I = e.varargs.begin(), E = e.varargs.end(); I != E; ++I) hash = *I + hash * 37; - - hash = ((unsigned)((uintptr_t)e.function >> 4) ^ - (unsigned)((uintptr_t)e.function >> 9)) + - hash * 37; - + return hash; } static bool isEqual(const Expression &LHS, const Expression &RHS) { return LHS == RHS; } }; - -template <> -struct isPodLike<Expression> { static const bool value = true; }; } @@ -226,185 +143,27 @@ struct isPodLike<Expression> { static const bool value = true; }; // ValueTable Internal Functions //===----------------------------------------------------------------------===// -Expression::ExpressionOpcode ValueTable::getOpcode(CmpInst* C) { - if (isa<ICmpInst>(C)) { - switch (C->getPredicate()) { - default: // THIS SHOULD NEVER HAPPEN - llvm_unreachable("Comparison with unknown predicate?"); - case ICmpInst::ICMP_EQ: return Expression::ICMPEQ; - case ICmpInst::ICMP_NE: return Expression::ICMPNE; - case ICmpInst::ICMP_UGT: return Expression::ICMPUGT; - case ICmpInst::ICMP_UGE: return Expression::ICMPUGE; - case ICmpInst::ICMP_ULT: return Expression::ICMPULT; - case ICmpInst::ICMP_ULE: return Expression::ICMPULE; - case ICmpInst::ICMP_SGT: return Expression::ICMPSGT; - case ICmpInst::ICMP_SGE: return Expression::ICMPSGE; - case ICmpInst::ICMP_SLT: return Expression::ICMPSLT; - case ICmpInst::ICMP_SLE: return Expression::ICMPSLE; - } - } else { - switch (C->getPredicate()) { - default: // THIS SHOULD NEVER HAPPEN - llvm_unreachable("Comparison with unknown predicate?"); - case FCmpInst::FCMP_OEQ: return Expression::FCMPOEQ; - case FCmpInst::FCMP_OGT: return Expression::FCMPOGT; - case FCmpInst::FCMP_OGE: return Expression::FCMPOGE; - case FCmpInst::FCMP_OLT: return Expression::FCMPOLT; - case FCmpInst::FCMP_OLE: return Expression::FCMPOLE; - case FCmpInst::FCMP_ONE: return Expression::FCMPONE; - case FCmpInst::FCMP_ORD: return Expression::FCMPORD; - case FCmpInst::FCMP_UNO: return Expression::FCMPUNO; - case FCmpInst::FCMP_UEQ: return Expression::FCMPUEQ; - case FCmpInst::FCMP_UGT: return Expression::FCMPUGT; - case FCmpInst::FCMP_UGE: return Expression::FCMPUGE; - case FCmpInst::FCMP_ULT: return Expression::FCMPULT; - case FCmpInst::FCMP_ULE: return Expression::FCMPULE; - case FCmpInst::FCMP_UNE: return Expression::FCMPUNE; - } - } -} - -Expression ValueTable::create_expression(CallInst* C) { - Expression e; - - e.type = C->getType(); - e.function = C->getCalledFunction(); - e.opcode = Expression::CALL; - - CallSite CS(C); - for (CallInst::op_iterator I = CS.arg_begin(), E = CS.arg_end(); - I != E; ++I) - e.varargs.push_back(lookup_or_add(*I)); - - return e; -} - -Expression ValueTable::create_expression(BinaryOperator* BO) { - Expression e; - e.varargs.push_back(lookup_or_add(BO->getOperand(0))); - e.varargs.push_back(lookup_or_add(BO->getOperand(1))); - e.function = 0; - e.type = BO->getType(); - e.opcode = static_cast<Expression::ExpressionOpcode>(BO->getOpcode()); - - 
return e; -} - -Expression ValueTable::create_expression(CmpInst* C) { - Expression e; - - e.varargs.push_back(lookup_or_add(C->getOperand(0))); - e.varargs.push_back(lookup_or_add(C->getOperand(1))); - e.function = 0; - e.type = C->getType(); - e.opcode = getOpcode(C); - - return e; -} - -Expression ValueTable::create_expression(CastInst* C) { - Expression e; - - e.varargs.push_back(lookup_or_add(C->getOperand(0))); - e.function = 0; - e.type = C->getType(); - e.opcode = static_cast<Expression::ExpressionOpcode>(C->getOpcode()); - - return e; -} - -Expression ValueTable::create_expression(ShuffleVectorInst* S) { - Expression e; - - e.varargs.push_back(lookup_or_add(S->getOperand(0))); - e.varargs.push_back(lookup_or_add(S->getOperand(1))); - e.varargs.push_back(lookup_or_add(S->getOperand(2))); - e.function = 0; - e.type = S->getType(); - e.opcode = Expression::SHUFFLE; - - return e; -} -Expression ValueTable::create_expression(ExtractElementInst* E) { +Expression ValueTable::create_expression(Instruction *I) { Expression e; - - e.varargs.push_back(lookup_or_add(E->getOperand(0))); - e.varargs.push_back(lookup_or_add(E->getOperand(1))); - e.function = 0; - e.type = E->getType(); - e.opcode = Expression::EXTRACT; - - return e; -} - -Expression ValueTable::create_expression(InsertElementInst* I) { - Expression e; - - e.varargs.push_back(lookup_or_add(I->getOperand(0))); - e.varargs.push_back(lookup_or_add(I->getOperand(1))); - e.varargs.push_back(lookup_or_add(I->getOperand(2))); - e.function = 0; e.type = I->getType(); - e.opcode = Expression::INSERT; - - return e; -} - -Expression ValueTable::create_expression(SelectInst* I) { - Expression e; - - e.varargs.push_back(lookup_or_add(I->getCondition())); - e.varargs.push_back(lookup_or_add(I->getTrueValue())); - e.varargs.push_back(lookup_or_add(I->getFalseValue())); - e.function = 0; - e.type = I->getType(); - e.opcode = Expression::SELECT; - - return e; -} - -Expression ValueTable::create_expression(GetElementPtrInst* G) { - Expression e; - - e.varargs.push_back(lookup_or_add(G->getPointerOperand())); - e.function = 0; - e.type = G->getType(); - e.opcode = Expression::GEP; - - for (GetElementPtrInst::op_iterator I = G->idx_begin(), E = G->idx_end(); - I != E; ++I) - e.varargs.push_back(lookup_or_add(*I)); - - return e; -} - -Expression ValueTable::create_expression(ExtractValueInst* E) { - Expression e; - - e.varargs.push_back(lookup_or_add(E->getAggregateOperand())); - for (ExtractValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end(); - II != IE; ++II) - e.varargs.push_back(*II); - e.function = 0; - e.type = E->getType(); - e.opcode = Expression::EXTRACTVALUE; - - return e; -} - -Expression ValueTable::create_expression(InsertValueInst* E) { - Expression e; - - e.varargs.push_back(lookup_or_add(E->getAggregateOperand())); - e.varargs.push_back(lookup_or_add(E->getInsertedValueOperand())); - for (InsertValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end(); - II != IE; ++II) - e.varargs.push_back(*II); - e.function = 0; - e.type = E->getType(); - e.opcode = Expression::INSERTVALUE; - + e.opcode = I->getOpcode(); + for (Instruction::op_iterator OI = I->op_begin(), OE = I->op_end(); + OI != OE; ++OI) + e.varargs.push_back(lookup_or_add(*OI)); + + if (CmpInst *C = dyn_cast<CmpInst>(I)) + e.opcode = (C->getOpcode() << 8) | C->getPredicate(); + else if (ExtractValueInst *E = dyn_cast<ExtractValueInst>(I)) { + for (ExtractValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end(); + II != IE; ++II) + e.varargs.push_back(*II); 
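All of the per-instruction create_expression overloads collapse into the single routine above: an expression key is just the opcode, the result type and the operand value numbers, with a compare's predicate folded into the opcode field and insertvalue/extractvalue indices appended to the operand list. The toy illustration below shows how such keys deduplicate; the opcode and predicate constants and the use of std::map instead of a DenseMap are made up for the example.

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <tuple>
#include <vector>

struct Expression {
  uint32_t Opcode;
  std::string Type;
  std::vector<uint32_t> VarArgs;  // value numbers of the operands
  bool operator<(const Expression &O) const {
    return std::tie(Opcode, Type, VarArgs) < std::tie(O.Opcode, O.Type, O.VarArgs);
  }
};

int main() {
  std::map<Expression, uint32_t> ExpressionNumbering;
  uint32_t NextValueNumber = 3;   // 1 and 2 are already taken by %a and %b

  auto lookupOrAdd = [&](Expression E) {
    auto It = ExpressionNumbering.find(E);
    if (It != ExpressionNumbering.end()) return It->second;
    return ExpressionNumbering[E] = NextValueNumber++;
  };

  const uint32_t AddOp = 11, ICmpOp = 51, Pred_SLT = 40;   // made-up encodings

  uint32_t Add1 = lookupOrAdd({AddOp, "i32", {1, 2}});
  uint32_t Add2 = lookupOrAdd({AddOp, "i32", {1, 2}});     // same key -> same number
  uint32_t Cmp  = lookupOrAdd({(ICmpOp << 8) | Pred_SLT,   // predicate folded in
                               "i1", {1, 2}});

  std::cout << Add1 << ' ' << Add2 << ' ' << Cmp << '\n';  // "3 3 4"
}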
+ } else if (InsertValueInst *E = dyn_cast<InsertValueInst>(I)) { + for (InsertValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end(); + II != IE; ++II) + e.varargs.push_back(*II); + } + return e; } @@ -563,12 +322,8 @@ uint32_t ValueTable::lookup_or_add(Value *V) { case Instruction::And: case Instruction::Or : case Instruction::Xor: - exp = create_expression(cast<BinaryOperator>(I)); - break; case Instruction::ICmp: case Instruction::FCmp: - exp = create_expression(cast<CmpInst>(I)); - break; case Instruction::Trunc: case Instruction::ZExt: case Instruction::SExt: @@ -581,28 +336,14 @@ uint32_t ValueTable::lookup_or_add(Value *V) { case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::BitCast: - exp = create_expression(cast<CastInst>(I)); - break; case Instruction::Select: - exp = create_expression(cast<SelectInst>(I)); - break; case Instruction::ExtractElement: - exp = create_expression(cast<ExtractElementInst>(I)); - break; case Instruction::InsertElement: - exp = create_expression(cast<InsertElementInst>(I)); - break; case Instruction::ShuffleVector: - exp = create_expression(cast<ShuffleVectorInst>(I)); - break; case Instruction::ExtractValue: - exp = create_expression(cast<ExtractValueInst>(I)); - break; case Instruction::InsertValue: - exp = create_expression(cast<InsertValueInst>(I)); - break; case Instruction::GetElementPtr: - exp = create_expression(cast<GetElementPtrInst>(I)); + exp = create_expression(I); break; default: valueNumbering[V] = nextValueNumber; @@ -649,30 +390,76 @@ void ValueTable::verifyRemoved(const Value *V) const { //===----------------------------------------------------------------------===// namespace { - struct ValueNumberScope { - ValueNumberScope* parent; - DenseMap<uint32_t, Value*> table; - - ValueNumberScope(ValueNumberScope* p) : parent(p) { } - }; -} - -namespace { class GVN : public FunctionPass { bool runOnFunction(Function &F); public: static char ID; // Pass identification, replacement for typeid explicit GVN(bool noloads = false) - : FunctionPass(ID), NoLoads(noloads), MD(0) { } + : FunctionPass(ID), NoLoads(noloads), MD(0) { + initializeGVNPass(*PassRegistry::getPassRegistry()); + } private: bool NoLoads; MemoryDependenceAnalysis *MD; DominatorTree *DT; + const TargetData* TD; ValueTable VN; - DenseMap<BasicBlock*, ValueNumberScope*> localAvail; + + /// LeaderTable - A mapping from value numbers to lists of Value*'s that + /// have that value number. Use findLeader to query it. + struct LeaderTableEntry { + Value *Val; + BasicBlock *BB; + LeaderTableEntry *Next; + }; + DenseMap<uint32_t, LeaderTableEntry> LeaderTable; + BumpPtrAllocator TableAllocator; + + /// addToLeaderTable - Push a new Value to the LeaderTable onto the list for + /// its value number. + void addToLeaderTable(uint32_t N, Value *V, BasicBlock *BB) { + LeaderTableEntry& Curr = LeaderTable[N]; + if (!Curr.Val) { + Curr.Val = V; + Curr.BB = BB; + return; + } + + LeaderTableEntry* Node = TableAllocator.Allocate<LeaderTableEntry>(); + Node->Val = V; + Node->BB = BB; + Node->Next = Curr.Next; + Curr.Next = Node; + } + + /// removeFromLeaderTable - Scan the list of values corresponding to a given + /// value number, and remove the given value if encountered. 
+ void removeFromLeaderTable(uint32_t N, Value *V, BasicBlock *BB) { + LeaderTableEntry* Prev = 0; + LeaderTableEntry* Curr = &LeaderTable[N]; + + while (Curr->Val != V || Curr->BB != BB) { + Prev = Curr; + Curr = Curr->Next; + } + + if (Prev) { + Prev->Next = Curr->Next; + } else { + if (!Curr->Next) { + Curr->Val = 0; + Curr->BB = 0; + } else { + LeaderTableEntry* Next = Curr->Next; + Curr->Val = Next->Val; + Curr->BB = Next->BB; + Curr->Next = Next->Next; + } + } + } // List of critical edges to be split between iterations. SmallVector<std::pair<TerminatorInst*, unsigned>, 4> toSplit; @@ -699,9 +486,8 @@ namespace { bool processBlock(BasicBlock *BB); void dump(DenseMap<uint32_t, Value*>& d); bool iterateOnFunction(Function &F); - Value *CollapsePhi(PHINode* p); bool performPRE(Function& F); - Value *lookupNumber(BasicBlock *BB, uint32_t num); + Value *findLeader(BasicBlock *BB, uint32_t num); void cleanupGlobalSets(); void verifyRemoved(const Instruction *I) const; bool splitCriticalEdges(); @@ -715,7 +501,11 @@ FunctionPass *llvm::createGVNPass(bool NoLoads) { return new GVN(NoLoads); } -INITIALIZE_PASS(GVN, "gvn", "Global Value Numbering", false, false); +INITIALIZE_PASS_BEGIN(GVN, "gvn", "Global Value Numbering", false, false) +INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false) void GVN::dump(DenseMap<uint32_t, Value*>& d) { errs() << "{\n"; @@ -727,33 +517,6 @@ void GVN::dump(DenseMap<uint32_t, Value*>& d) { errs() << "}\n"; } -static bool isSafeReplacement(PHINode* p, Instruction *inst) { - if (!isa<PHINode>(inst)) - return true; - - for (Instruction::use_iterator UI = p->use_begin(), E = p->use_end(); - UI != E; ++UI) - if (PHINode* use_phi = dyn_cast<PHINode>(*UI)) - if (use_phi->getParent() == inst->getParent()) - return false; - - return true; -} - -Value *GVN::CollapsePhi(PHINode *PN) { - Value *ConstVal = PN->hasConstantValue(DT); - if (!ConstVal) return 0; - - Instruction *Inst = dyn_cast<Instruction>(ConstVal); - if (!Inst) - return ConstVal; - - if (DT->dominates(Inst, PN)) - if (isSafeReplacement(PN, Inst)) - return Inst; - return 0; -} - /// IsValueFullyAvailableInBlock - Return true if we can prove that the value /// we're analyzing is fully available in the specified block. As we go, keep /// track of which blocks we know are fully alive in FullyAvailableBlocks. This @@ -937,47 +700,6 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, return new BitCastInst(StoredVal, LoadedTy, "bitcast", InsertPt); } -/// GetBaseWithConstantOffset - Analyze the specified pointer to see if it can -/// be expressed as a base pointer plus a constant offset. Return the base and -/// offset to the caller. -static Value *GetBaseWithConstantOffset(Value *Ptr, int64_t &Offset, - const TargetData &TD) { - Operator *PtrOp = dyn_cast<Operator>(Ptr); - if (PtrOp == 0) return Ptr; - - // Just look through bitcasts. - if (PtrOp->getOpcode() == Instruction::BitCast) - return GetBaseWithConstantOffset(PtrOp->getOperand(0), Offset, TD); - - // If this is a GEP with constant indices, we can look through it. 
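The LeaderTable helpers added above replace the old per-block ValueNumberScope maps: each value number maps to a small list of (value, defining block) pairs, and the findLeader query further down returns an entry whose block dominates the block asking the question. Below is a toy version with a hard-coded dominance relation; the block and value names are invented for the example.

#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Toy dominator relation for the CFG  entry -> { left, right }.
static bool dominates(const std::string &A, const std::string &B) {
  return A == B || A == "entry";
}

struct LeaderTableEntry {
  std::string Val;
  std::string BB;
};

int main() {
  std::map<uint32_t, std::vector<LeaderTableEntry>> LeaderTable;

  // addToLeaderTable: %x in entry and %y in left both got value number 4.
  LeaderTable[4].push_back({"%x", "entry"});
  LeaderTable[4].push_back({"%y", "left"});

  // findLeader: scan the list for an entry whose block dominates the
  // querying block.
  auto findLeader = [&](const std::string &BB, uint32_t Num) -> std::string {
    for (const LeaderTableEntry &E : LeaderTable[Num])
      if (dominates(E.BB, BB))
        return E.Val;
    return "";
  };

  std::cout << findLeader("right", 4) << '\n';  // %x (entry dominates right)
  std::cout << findLeader("left", 4)  << '\n';  // %x, the first dominating entry
}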
- GEPOperator *GEP = dyn_cast<GEPOperator>(PtrOp); - if (GEP == 0 || !GEP->hasAllConstantIndices()) return Ptr; - - gep_type_iterator GTI = gep_type_begin(GEP); - for (User::op_iterator I = GEP->idx_begin(), E = GEP->idx_end(); I != E; - ++I, ++GTI) { - ConstantInt *OpC = cast<ConstantInt>(*I); - if (OpC->isZero()) continue; - - // Handle a struct and array indices which add their offset to the pointer. - if (const StructType *STy = dyn_cast<StructType>(*GTI)) { - Offset += TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue()); - } else { - uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType()); - Offset += OpC->getSExtValue()*Size; - } - } - - // Re-sign extend from the pointer size if needed to get overflow edge cases - // right. - unsigned PtrSize = TD.getPointerSizeInBits(); - if (PtrSize < 64) - Offset = (Offset << (64-PtrSize)) >> (64-PtrSize); - - return GetBaseWithConstantOffset(GEP->getPointerOperand(), Offset, TD); -} - - /// AnalyzeLoadFromClobberingWrite - This function is called when we have a /// memdep query of a load that ends up being a clobbering memory write (store, /// memset, memcpy, memmove). This means that the write *may* provide bits used @@ -996,9 +718,8 @@ static int AnalyzeLoadFromClobberingWrite(const Type *LoadTy, Value *LoadPtr, return -1; int64_t StoreOffset = 0, LoadOffset = 0; - Value *StoreBase = GetBaseWithConstantOffset(WritePtr, StoreOffset, TD); - Value *LoadBase = - GetBaseWithConstantOffset(LoadPtr, LoadOffset, TD); + Value *StoreBase = GetPointerBaseWithConstantOffset(WritePtr, StoreOffset,TD); + Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, TD); if (StoreBase != LoadBase) return -1; @@ -1020,8 +741,6 @@ static int AnalyzeLoadFromClobberingWrite(const Type *LoadTy, Value *LoadPtr, // If the load and store don't overlap at all, the store doesn't provide // anything to the load. In this case, they really don't alias at all, AA // must have gotten confused. - // FIXME: Investigate cases where this bails out, e.g. rdar://7238614. Then - // remove this check, as it is duplicated with what we have below. uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy); if ((WriteSizeInBits & 7) | (LoadSize & 7)) @@ -1067,12 +786,12 @@ static int AnalyzeLoadFromClobberingStore(const Type *LoadTy, Value *LoadPtr, StoreInst *DepSI, const TargetData &TD) { // Cannot handle reading from store of first-class aggregate yet. - if (DepSI->getOperand(0)->getType()->isStructTy() || - DepSI->getOperand(0)->getType()->isArrayTy()) + if (DepSI->getValueOperand()->getType()->isStructTy() || + DepSI->getValueOperand()->getType()->isArrayTy()) return -1; Value *StorePtr = DepSI->getPointerOperand(); - uint64_t StoreSize = TD.getTypeSizeInBits(DepSI->getOperand(0)->getType()); + uint64_t StoreSize =TD.getTypeSizeInBits(DepSI->getValueOperand()->getType()); return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, StorePtr, StoreSize, TD); } @@ -1099,7 +818,7 @@ static int AnalyzeLoadFromClobberingMemInst(const Type *LoadTy, Value *LoadPtr, Constant *Src = dyn_cast<Constant>(MTI->getSource()); if (Src == 0) return -1; - GlobalVariable *GV = dyn_cast<GlobalVariable>(Src->getUnderlyingObject()); + GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, &TD)); if (GV == 0 || !GV->isConstant()) return -1; // See if the access is within the bounds of the transfer. 
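AnalyzeLoadFromClobberingWrite above first reduces both pointers to a common base plus a constant byte offset (now via GetPointerBaseWithConstantOffset) and only then asks whether the stored bytes fully cover the loaded bytes; the result is the load's offset into the stored value, or -1 on failure. Stripped of the LLVM types and the bit-width handling, the covering check looks roughly like the sketch below, which is an approximation rather than the exact function.

#include <cstdint>
#include <iostream>

static int analyzeLoadFromClobberingWrite(int64_t StoreOffset, uint64_t StoreSize,
                                          int64_t LoadOffset, uint64_t LoadSize) {
  // The store must begin at or before the load...
  if (StoreOffset > LoadOffset)
    return -1;
  // ...and must end at or after the load ends.
  if (StoreOffset + (int64_t)StoreSize < LoadOffset + (int64_t)LoadSize)
    return -1;
  return (int)(LoadOffset - StoreOffset);  // byte offset of the load in the store
}

int main() {
  // store 8 bytes at base+0, load 4 bytes at base+4: fully covered, offset 4.
  std::cout << analyzeLoadFromClobberingWrite(0, 8, 4, 4) << '\n';   // 4
  // store 4 bytes at base+0, load 4 bytes at base+2: only partly covered.
  std::cout << analyzeLoadFromClobberingWrite(0, 4, 2, 4) << '\n';   // -1
}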
@@ -1331,6 +1050,15 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, if (V->getType()->isPointerTy()) for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) AA->copyValue(LI, NewPHIs[i]); + + // Now that we've copied information to the new PHIs, scan through + // them again and inform alias analysis that we've added potentially + // escaping uses to any values that are operands to these PHIs. + for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) { + PHINode *P = NewPHIs[i]; + for (unsigned ii = 0, ee = P->getNumIncomingValues(); ii != ee; ++ii) + AA->addEscapingUse(P->getOperandUse(2*ii)); + } return V; } @@ -1347,8 +1075,8 @@ bool GVN::processNonLocalLoad(LoadInst *LI, SmallVectorImpl<Instruction*> &toErase) { // Find the non-local dependencies of the load. SmallVector<NonLocalDepResult, 64> Deps; - MD->getNonLocalPointerDependency(LI->getOperand(0), true, LI->getParent(), - Deps); + AliasAnalysis::Location Loc = VN.getAliasAnalysis()->getLocation(LI); + MD->getNonLocalPointerDependency(Loc, true, LI->getParent(), Deps); //DEBUG(dbgs() << "INVESTIGATING NONLOCAL LOAD: " // << Deps.size() << *LI << '\n'); @@ -1376,8 +1104,6 @@ bool GVN::processNonLocalLoad(LoadInst *LI, SmallVector<AvailableValueInBlock, 16> ValuesPerBlock; SmallVector<BasicBlock*, 16> UnavailableBlocks; - const TargetData *TD = 0; - for (unsigned i = 0, e = Deps.size(); i != e; ++i) { BasicBlock *DepBB = Deps[i].getBB(); MemDepResult DepInfo = Deps[i].getResult(); @@ -1392,14 +1118,12 @@ bool GVN::processNonLocalLoad(LoadInst *LI, // read by the load, we can extract the bits we need for the load from the // stored value. if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInfo.getInst())) { - if (TD == 0) - TD = getAnalysisIfAvailable<TargetData>(); if (TD && Address) { int Offset = AnalyzeLoadFromClobberingStore(LI->getType(), Address, DepSI, *TD); if (Offset != -1) { ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB, - DepSI->getOperand(0), + DepSI->getValueOperand(), Offset)); continue; } @@ -1409,8 +1133,6 @@ bool GVN::processNonLocalLoad(LoadInst *LI, // If the clobbering value is a memset/memcpy/memmove, see if we can // forward a value on from it. if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInfo.getInst())) { - if (TD == 0) - TD = getAnalysisIfAvailable<TargetData>(); if (TD && Address) { int Offset = AnalyzeLoadFromClobberingMemInst(LI->getType(), Address, DepMI, *TD); @@ -1440,13 +1162,10 @@ bool GVN::processNonLocalLoad(LoadInst *LI, if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) { // Reject loads and stores that are to the same address but are of // different types if we have to. - if (S->getOperand(0)->getType() != LI->getType()) { - if (TD == 0) - TD = getAnalysisIfAvailable<TargetData>(); - + if (S->getValueOperand()->getType() != LI->getType()) { // If the stored value is larger or equal to the loaded value, we can // reuse it. - if (TD == 0 || !CanCoerceMustAliasedValueToLoad(S->getOperand(0), + if (TD == 0 || !CanCoerceMustAliasedValueToLoad(S->getValueOperand(), LI->getType(), *TD)) { UnavailableBlocks.push_back(DepBB); continue; @@ -1454,16 +1173,13 @@ bool GVN::processNonLocalLoad(LoadInst *LI, } ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB, - S->getOperand(0))); + S->getValueOperand())); continue; } if (LoadInst *LD = dyn_cast<LoadInst>(DepInst)) { // If the types mismatch and we can't handle it, reject reuse of the load. 
if (LD->getType() != LI->getType()) { - if (TD == 0) - TD = getAnalysisIfAvailable<TargetData>(); - // If the stored value is larger or equal to the loaded value, we can // reuse it. if (TD == 0 || !CanCoerceMustAliasedValueToLoad(LD, LI->getType(),*TD)){ @@ -1533,26 +1249,19 @@ bool GVN::processNonLocalLoad(LoadInst *LI, return false; if (Blockers.count(TmpBB)) return false; + + // If any of these blocks has more than one successor (i.e. if the edge we + // just traversed was critical), then there are other paths through this + // block along which the load may not be anticipated. Hoisting the load + // above this block would be adding the load to execution paths along + // which it was not previously executed. if (TmpBB->getTerminator()->getNumSuccessors() != 1) - allSingleSucc = false; + return false; } assert(TmpBB); LoadBB = TmpBB; - // If we have a repl set with LI itself in it, this means we have a loop where - // at least one of the values is LI. Since this means that we won't be able - // to eliminate LI even if we insert uses in the other predecessors, we will - // end up increasing code size. Reject this by scanning for LI. - for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) { - if (ValuesPerBlock[i].isSimpleValue() && - ValuesPerBlock[i].getSimpleValue() == LI) { - // Skip cases where LI is the only definition, even for EnableFullLoadPRE. - if (!EnableFullLoadPRE || e == 1) - return false; - } - } - // FIXME: It is extremely unclear what this loop is doing, other than // artificially restricting loadpre. if (isSinglePred) { @@ -1612,14 +1321,13 @@ bool GVN::processNonLocalLoad(LoadInst *LI, unsigned NumUnavailablePreds = PredLoads.size(); assert(NumUnavailablePreds != 0 && "Fully available value should be eliminated above!"); - if (!EnableFullLoadPRE) { - // If this load is unavailable in multiple predecessors, reject it. - // FIXME: If we could restructure the CFG, we could make a common pred with - // all the preds that don't have an available LI and insert a new load into - // that one block. - if (NumUnavailablePreds != 1) + + // If this load is unavailable in multiple predecessors, reject it. + // FIXME: If we could restructure the CFG, we could make a common pred with + // all the preds that don't have an available LI and insert a new load into + // that one block. + if (NumUnavailablePreds != 1) return false; - } // Check if the load can safely be moved to all the unavailable predecessors. bool CanDoPRE = true; @@ -1634,7 +1342,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI, // If all preds have a single successor, then we know it is safe to insert // the load on the pred (?!?), so we can insert code to materialize the // pointer if it is not available. - PHITransAddr Address(LI->getOperand(0), TD); + PHITransAddr Address(LI->getPointerOperand(), TD); Value *LoadPtr = 0; if (allSingleSucc) { LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, @@ -1648,7 +1356,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI, // we fail PRE. if (LoadPtr == 0) { DEBUG(dbgs() << "COULDN'T INSERT PHI TRANSLATED VALUE OF: " - << *LI->getOperand(0) << "\n"); + << *LI->getPointerOperand() << "\n"); CanDoPRE = false; break; } @@ -1657,8 +1365,8 @@ bool GVN::processNonLocalLoad(LoadInst *LI, // @1 = getelementptr (i8* p, ... // test p and branch if == 0 // load @1 - // It is valid to have the getelementptr before the test, even if p can be 0, - // as getelementptr only does address arithmetic. 
+ // It is valid to have the getelementptr before the test, even if p can + // be 0, as getelementptr only does address arithmetic. // If we are not pushing the value through any multiple-successor blocks // we do not have this case. Otherwise, check that the load is safe to // put anywhere; this can be improved, but should be conservatively safe. @@ -1675,8 +1383,11 @@ bool GVN::processNonLocalLoad(LoadInst *LI, } if (!CanDoPRE) { - while (!NewInsts.empty()) - NewInsts.pop_back_val()->eraseFromParent(); + while (!NewInsts.empty()) { + Instruction *I = NewInsts.pop_back_val(); + if (MD) MD->removeInstruction(I); + I->eraseFromParent(); + } return false; } @@ -1702,9 +1413,13 @@ bool GVN::processNonLocalLoad(LoadInst *LI, BasicBlock *UnavailablePred = I->first; Value *LoadPtr = I->second; - Value *NewLoad = new LoadInst(LoadPtr, LI->getName()+".pre", false, - LI->getAlignment(), - UnavailablePred->getTerminator()); + Instruction *NewLoad = new LoadInst(LoadPtr, LI->getName()+".pre", false, + LI->getAlignment(), + UnavailablePred->getTerminator()); + + // Transfer the old load's TBAA tag to the new load. + if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) + NewLoad->setMetadata(LLVMContext::MD_tbaa, Tag); // Add the newly created load. ValuesPerBlock.push_back(AvailableValueInBlock::get(UnavailablePred, @@ -1753,19 +1468,19 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) { // access code. Value *AvailVal = 0; if (StoreInst *DepSI = dyn_cast<StoreInst>(Dep.getInst())) - if (const TargetData *TD = getAnalysisIfAvailable<TargetData>()) { + if (TD) { int Offset = AnalyzeLoadFromClobberingStore(L->getType(), L->getPointerOperand(), DepSI, *TD); if (Offset != -1) - AvailVal = GetStoreValueForLoad(DepSI->getOperand(0), Offset, + AvailVal = GetStoreValueForLoad(DepSI->getValueOperand(), Offset, L->getType(), L, *TD); } // If the clobbering value is a memset/memcpy/memmove, see if we can forward // a value on from it. if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(Dep.getInst())) { - if (const TargetData *TD = getAnalysisIfAvailable<TargetData>()) { + if (TD) { int Offset = AnalyzeLoadFromClobberingMemInst(L->getType(), L->getPointerOperand(), DepMI, *TD); @@ -1804,14 +1519,13 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) { Instruction *DepInst = Dep.getInst(); if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) { - Value *StoredVal = DepSI->getOperand(0); + Value *StoredVal = DepSI->getValueOperand(); // The store and load are to a must-aliased pointer, but they may not // actually have the same type. See if we know how to reuse the stored // value (depending on its type). - const TargetData *TD = 0; if (StoredVal->getType() != L->getType()) { - if ((TD = getAnalysisIfAvailable<TargetData>())) { + if (TD) { StoredVal = CoerceAvailableValueToLoadType(StoredVal, L->getType(), L, *TD); if (StoredVal == 0) @@ -1840,9 +1554,8 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) { // The loads are of a must-aliased pointer, but they may not actually have // the same type. See if we know how to reuse the previously loaded value // (depending on its type). 
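The PRE path above now copies the original load's TBAA tag onto the load it materializes in the unavailable predecessor, so type-based alias information survives the transformation. A minimal sketch of that metadata-preserving insertion; the helper name is illustrative:

// Sketch: create the ".pre" load in a predecessor and carry the TBAA tag over.
static Instruction *insertPreLoad(LoadInst *LI, Value *LoadPtr,
                                  BasicBlock *UnavailablePred) {
  Instruction *NewLoad = new LoadInst(LoadPtr, LI->getName()+".pre", false,
                                      LI->getAlignment(),
                                      UnavailablePred->getTerminator());
  if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa))
    NewLoad->setMetadata(LLVMContext::MD_tbaa, Tag);
  return NewLoad;
}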
- const TargetData *TD = 0; if (DepLI->getType() != L->getType()) { - if ((TD = getAnalysisIfAvailable<TargetData>())) { + if (TD) { AvailableVal = CoerceAvailableValueToLoadType(DepLI, L->getType(), L,*TD); if (AvailableVal == 0) return false; @@ -1890,20 +1603,32 @@ bool GVN::processLoad(LoadInst *L, SmallVectorImpl<Instruction*> &toErase) { return false; } -Value *GVN::lookupNumber(BasicBlock *BB, uint32_t num) { - DenseMap<BasicBlock*, ValueNumberScope*>::iterator I = localAvail.find(BB); - if (I == localAvail.end()) - return 0; - - ValueNumberScope *Locals = I->second; - while (Locals) { - DenseMap<uint32_t, Value*>::iterator I = Locals->table.find(num); - if (I != Locals->table.end()) - return I->second; - Locals = Locals->parent; +// findLeader - In order to find a leader for a given value number at a +// specific basic block, we first obtain the list of all Values for that number, +// and then scan the list to find one whose block dominates the block in +// question. This is fast because dominator tree queries consist of only +// a few comparisons of DFS numbers. +Value *GVN::findLeader(BasicBlock *BB, uint32_t num) { + LeaderTableEntry Vals = LeaderTable[num]; + if (!Vals.Val) return 0; + + Value *Val = 0; + if (DT->dominates(Vals.BB, BB)) { + Val = Vals.Val; + if (isa<Constant>(Val)) return Val; + } + + LeaderTableEntry* Next = Vals.Next; + while (Next) { + if (DT->dominates(Next->BB, BB)) { + if (isa<Constant>(Next->Val)) return Next->Val; + if (!Val) Val = Next->Val; + } + + Next = Next->Next; } - return 0; + return Val; } @@ -1915,85 +1640,92 @@ bool GVN::processInstruction(Instruction *I, if (isa<DbgInfoIntrinsic>(I)) return false; + // If the instruction can be easily simplified then do so now in preference + // to value numbering it. Value numbering often exposes redundancies, for + // example if it determines that %y is equal to %x then the instruction + // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify. + if (Value *V = SimplifyInstruction(I, TD, DT)) { + I->replaceAllUsesWith(V); + if (MD && V->getType()->isPointerTy()) + MD->invalidateCachedPointerInfo(V); + VN.erase(I); + toErase.push_back(I); + return true; + } + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { bool Changed = processLoad(LI, toErase); if (!Changed) { unsigned Num = VN.lookup_or_add(LI); - localAvail[I->getParent()]->table.insert(std::make_pair(Num, LI)); + addToLeaderTable(Num, LI, LI->getParent()); } return Changed; } - uint32_t NextNum = VN.getNextUnusedValueNumber(); - unsigned Num = VN.lookup_or_add(I); - + // For conditions branches, we can perform simple conditional propagation on + // the condition value itself. 
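findLeader above walks a per-value-number chain of (Value, BasicBlock) entries and returns one whose block dominates the query block, preferring constants. Its counterpart addToLeaderTable is used throughout the patch but its body is not shown in these hunks; a plausible sketch, assuming LeaderTable maps a value number to an inline head entry and TableAllocator is a bump allocator for overflow nodes:

// Sketch, not the patch's exact code: record V as a leader for value number N
// in block BB by prepending behind the head entry.
void GVN::addToLeaderTable(uint32_t N, Value *V, BasicBlock *BB) {
  LeaderTableEntry &Curr = LeaderTable[N];
  if (!Curr.Val) {          // first entry for this value number
    Curr.Val = V;
    Curr.BB = BB;
    return;
  }
  LeaderTableEntry *Node = TableAllocator.Allocate<LeaderTableEntry>();
  Node->Val = V;
  Node->BB = BB;
  Node->Next = Curr.Next;   // chain additional entries behind the head
  Curr.Next = Node;
}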
if (BranchInst *BI = dyn_cast<BranchInst>(I)) { - localAvail[I->getParent()]->table.insert(std::make_pair(Num, I)); - if (!BI->isConditional() || isa<Constant>(BI->getCondition())) return false; - + Value *BranchCond = BI->getCondition(); uint32_t CondVN = VN.lookup_or_add(BranchCond); - + BasicBlock *TrueSucc = BI->getSuccessor(0); BasicBlock *FalseSucc = BI->getSuccessor(1); - + if (TrueSucc->getSinglePredecessor()) - localAvail[TrueSucc]->table[CondVN] = - ConstantInt::getTrue(TrueSucc->getContext()); + addToLeaderTable(CondVN, + ConstantInt::getTrue(TrueSucc->getContext()), + TrueSucc); if (FalseSucc->getSinglePredecessor()) - localAvail[FalseSucc]->table[CondVN] = - ConstantInt::getFalse(TrueSucc->getContext()); - + addToLeaderTable(CondVN, + ConstantInt::getFalse(TrueSucc->getContext()), + FalseSucc); + return false; + } + + // Instructions with void type don't return a value, so there's + // no point in trying to find redudancies in them. + if (I->getType()->isVoidTy()) return false; + + uint32_t NextNum = VN.getNextUnusedValueNumber(); + unsigned Num = VN.lookup_or_add(I); // Allocations are always uniquely numbered, so we can save time and memory // by fast failing them. - } else if (isa<AllocaInst>(I) || isa<TerminatorInst>(I)) { - localAvail[I->getParent()]->table.insert(std::make_pair(Num, I)); + if (isa<AllocaInst>(I) || isa<TerminatorInst>(I) || isa<PHINode>(I)) { + addToLeaderTable(Num, I, I->getParent()); return false; } - // Collapse PHI nodes - if (PHINode* p = dyn_cast<PHINode>(I)) { - Value *constVal = CollapsePhi(p); - - if (constVal) { - p->replaceAllUsesWith(constVal); - if (MD && constVal->getType()->isPointerTy()) - MD->invalidateCachedPointerInfo(constVal); - VN.erase(p); - - toErase.push_back(p); - } else { - localAvail[I->getParent()]->table.insert(std::make_pair(Num, I)); - } - // If the number we were assigned was a brand new VN, then we don't // need to do a lookup to see if the number already exists // somewhere in the domtree: it can't! - } else if (Num == NextNum) { - localAvail[I->getParent()]->table.insert(std::make_pair(Num, I)); - + if (Num == NextNum) { + addToLeaderTable(Num, I, I->getParent()); + return false; + } + // Perform fast-path value-number based elimination of values inherited from // dominators. - } else if (Value *repl = lookupNumber(I->getParent(), Num)) { - // Remove it! - VN.erase(I); - I->replaceAllUsesWith(repl); - if (MD && repl->getType()->isPointerTy()) - MD->invalidateCachedPointerInfo(repl); - toErase.push_back(I); - return true; - - } else { - localAvail[I->getParent()]->table.insert(std::make_pair(Num, I)); + Value *repl = findLeader(I->getParent(), Num); + if (repl == 0) { + // Failure, just remember this instance for future use. + addToLeaderTable(Num, I, I->getParent()); + return false; } - - return false; + + // Remove it! + VN.erase(I); + I->replaceAllUsesWith(repl); + if (MD && repl->getType()->isPointerTy()) + MD->invalidateCachedPointerInfo(repl); + toErase.push_back(I); + return true; } /// runOnFunction - This is the main transformation entry point for a function. @@ -2001,6 +1733,7 @@ bool GVN::runOnFunction(Function& F) { if (!NoLoads) MD = &getAnalysis<MemoryDependenceAnalysis>(); DT = &getAnalysis<DominatorTree>(); + TD = getAnalysisIfAvailable<TargetData>(); VN.setAliasAnalysis(&getAnalysis<AliasAnalysis>()); VN.setMemDep(MD); VN.setDomTree(DT); @@ -2011,8 +1744,8 @@ bool GVN::runOnFunction(Function& F) { // Merge unconditional branches, allowing PRE to catch more // optimization opportunities. 
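processInstruction also records the simple facts implied by a conditional branch: in a successor whose only predecessor is the branching block, the condition's value number is known to be true (or false). A small illustration, with the IR shown in comments; the calls mirror the hunk above:

// Sketch:
//   entry:
//     %c = icmp eq i32 %x, 0
//     br i1 %c, label %then, label %else   ; %then and %else each have one pred
//
// In %then the leader for %c's value number is the constant true, so a later
// instruction there that value-numbers the same as %c folds to true via
// findLeader (and likewise to false in %else).
BasicBlock *TrueSucc = BI->getSuccessor(0);
if (TrueSucc->getSinglePredecessor())
  addToLeaderTable(CondVN, ConstantInt::getTrue(TrueSucc->getContext()),
                   TrueSucc);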
for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) { - BasicBlock *BB = FI; - ++FI; + BasicBlock *BB = FI++; + bool removedBlock = MergeBlockIntoPredecessor(BB, this); if (removedBlock) ++NumGVNBlocks; @@ -2020,7 +1753,6 @@ bool GVN::runOnFunction(Function& F) { } unsigned Iteration = 0; - while (ShouldContinue) { DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n"); ShouldContinue = iterateOnFunction(F); @@ -2138,20 +1870,19 @@ bool GVN::performPRE(Function &F) { if (P == CurrentBlock) { NumWithout = 2; break; - } else if (!localAvail.count(P)) { + } else if (!DT->dominates(&F.getEntryBlock(), P)) { NumWithout = 2; break; } - DenseMap<uint32_t, Value*>::iterator predV = - localAvail[P]->table.find(ValNo); - if (predV == localAvail[P]->table.end()) { + Value* predV = findLeader(P, ValNo); + if (predV == 0) { PREPred = P; ++NumWithout; - } else if (predV->second == CurInst) { + } else if (predV == CurInst) { NumWithout = 2; } else { - predMap[P] = predV->second; + predMap[P] = predV; ++NumWith; } } @@ -2186,7 +1917,7 @@ bool GVN::performPRE(Function &F) { if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op)) continue; - if (Value *V = lookupNumber(PREPred, VN.lookup(Op))) { + if (Value *V = findLeader(PREPred, VN.lookup(Op))) { PREInstr->setOperand(i, V); } else { success = false; @@ -2210,7 +1941,7 @@ bool GVN::performPRE(Function &F) { ++NumGVNPRE; // Update the availability map to include the new instruction. - localAvail[PREPred]->table.insert(std::make_pair(ValNo, PREInstr)); + addToLeaderTable(ValNo, PREInstr, PREPred); // Create a PHI to make the value available in this block. PHINode* Phi = PHINode::Create(CurInst->getType(), @@ -2223,12 +1954,21 @@ bool GVN::performPRE(Function &F) { } VN.add(Phi, ValNo); - localAvail[CurrentBlock]->table[ValNo] = Phi; + addToLeaderTable(ValNo, Phi, CurrentBlock); CurInst->replaceAllUsesWith(Phi); - if (MD && Phi->getType()->isPointerTy()) - MD->invalidateCachedPointerInfo(Phi); + if (Phi->getType()->isPointerTy()) { + // Because we have added a PHI-use of the pointer value, it has now + // "escaped" from alias analysis' perspective. We need to inform + // AA of this. 
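performPRE now classifies each predecessor with findLeader instead of the old per-block scope map: a predecessor either already carries a leader for the value number, or it becomes the single candidate block where the expression will be re-materialized. A condensed sketch of that classification; the names follow the hunk above and the fragment assumes its surrounding loop over predecessors:

// Sketch of the per-predecessor classification in performPRE.
Value *predV = findLeader(P, ValNo);
if (predV == 0) {
  PREPred = P;           // value not available here; candidate for insertion
  ++NumWithout;
} else if (predV == CurInst) {
  NumWithout = 2;        // the "available" value is the instruction itself; give up
} else {
  predMap[P] = predV;    // value already available in this predecessor
  ++NumWith;
}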
+ for (unsigned ii = 0, ee = Phi->getNumIncomingValues(); ii != ee; ++ii) + VN.getAliasAnalysis()->addEscapingUse(Phi->getOperandUse(2*ii)); + + if (MD) + MD->invalidateCachedPointerInfo(Phi); + } VN.erase(CurInst); + removeFromLeaderTable(ValNo, CurInst, CurrentBlock); DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n'); if (MD) MD->removeInstruction(CurInst); @@ -2260,16 +2000,7 @@ bool GVN::splitCriticalEdges() { /// iterateOnFunction - Executes one iteration of GVN bool GVN::iterateOnFunction(Function &F) { cleanupGlobalSets(); - - for (df_iterator<DomTreeNode*> DI = df_begin(DT->getRootNode()), - DE = df_end(DT->getRootNode()); DI != DE; ++DI) { - if (DI->getIDom()) - localAvail[DI->getBlock()] = - new ValueNumberScope(localAvail[DI->getIDom()->getBlock()]); - else - localAvail[DI->getBlock()] = new ValueNumberScope(0); - } - + // Top-down walk of the dominator tree bool Changed = false; #if 0 @@ -2289,11 +2020,8 @@ bool GVN::iterateOnFunction(Function &F) { void GVN::cleanupGlobalSets() { VN.clear(); - - for (DenseMap<BasicBlock*, ValueNumberScope*>::iterator - I = localAvail.begin(), E = localAvail.end(); I != E; ++I) - delete I->second; - localAvail.clear(); + LeaderTable.clear(); + TableAllocator.Reset(); } /// verifyRemoved - Verify that the specified instruction does not occur in our @@ -2303,17 +2031,14 @@ void GVN::verifyRemoved(const Instruction *Inst) const { // Walk through the value number scope to make sure the instruction isn't // ferreted away in it. - for (DenseMap<BasicBlock*, ValueNumberScope*>::const_iterator - I = localAvail.begin(), E = localAvail.end(); I != E; ++I) { - const ValueNumberScope *VNS = I->second; - - while (VNS) { - for (DenseMap<uint32_t, Value*>::const_iterator - II = VNS->table.begin(), IE = VNS->table.end(); II != IE; ++II) { - assert(II->second != Inst && "Inst still in value numbering scope!"); - } - - VNS = VNS->parent; + for (DenseMap<uint32_t, LeaderTableEntry>::const_iterator + I = LeaderTable.begin(), E = LeaderTable.end(); I != E; ++I) { + const LeaderTableEntry *Node = &I->second; + assert(Node->Val != Inst && "Inst still in value numbering scope!"); + + while (Node->Next) { + Node = Node->Next; + assert(Node->Val != Inst && "Inst still in value numbering scope!"); } } } diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index af2eafc..0fb6798 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -77,7 +77,9 @@ namespace { public: static char ID; // Pass identification, replacement for typeid - IndVarSimplify() : LoopPass(ID) {} + IndVarSimplify() : LoopPass(ID) { + initializeIndVarSimplifyPass(*PassRegistry::getPassRegistry()); + } virtual bool runOnLoop(Loop *L, LPPassManager &LPM); @@ -117,8 +119,16 @@ namespace { } char IndVarSimplify::ID = 0; -INITIALIZE_PASS(IndVarSimplify, "indvars", - "Canonicalize Induction Variables", false, false); +INITIALIZE_PASS_BEGIN(IndVarSimplify, "indvars", + "Canonicalize Induction Variables", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(IVUsers) +INITIALIZE_PASS_END(IndVarSimplify, "indvars", + "Canonicalize Induction Variables", false, false) Pass *llvm::createIndVarSimplifyPass() { return new IndVarSimplify(); @@ -190,7 +200,7 @@ ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L, } // Expand 
the code for the iteration count. - assert(RHS->isLoopInvariant(L) && + assert(SE->isLoopInvariant(RHS, L) && "Computed iteration count is not loop invariant!"); Value *ExitCnt = Rewriter.expandCodeFor(RHS, IndVar->getType(), BI); @@ -233,8 +243,7 @@ ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L, /// happen later, except that it's more powerful in some cases, because it's /// able to brute-force evaluate arbitrary instructions as long as they have /// constant operands at the beginning of the loop. -void IndVarSimplify::RewriteLoopExitValues(Loop *L, - SCEVExpander &Rewriter) { +void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { // Verify the input to the pass in already in LCSSA form. assert(L->isLCSSAForm(*DT)); @@ -292,7 +301,7 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, // and varies predictably *inside* the loop. Evaluate the value it // contains when the loop exits, if possible. const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop()); - if (!ExitValue->isLoopInvariant(L)) + if (!SE->isLoopInvariant(ExitValue, L)) continue; Changed = true; @@ -338,7 +347,7 @@ void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) { // If there are, change them into integer recurrences, permitting analysis by // the SCEV routines. // - BasicBlock *Header = L->getHeader(); + BasicBlock *Header = L->getHeader(); SmallVector<WeakVH, 8> PHIs; for (BasicBlock::iterator I = Header->begin(); @@ -346,7 +355,7 @@ void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) { PHIs.push_back(PN); for (unsigned i = 0, e = PHIs.size(); i != e; ++i) - if (PHINode *PN = dyn_cast_or_null<PHINode>(PHIs[i])) + if (PHINode *PN = dyn_cast_or_null<PHINode>(&*PHIs[i])) HandleFloatingPointIV(L, PN); // If the loop previously had floating-point IV, ScalarEvolution @@ -395,7 +404,7 @@ void IndVarSimplify::EliminateIVComparisons() { // which are now dead. while (!DeadInsts.empty()) if (Instruction *Inst = - dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val())) + dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) RecursivelyDeleteTriviallyDeadInstructions(Inst); } @@ -462,7 +471,7 @@ void IndVarSimplify::EliminateIVRemainders() { // which are now dead. while (!DeadInsts.empty()) if (Instruction *Inst = - dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val())) + dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) RecursivelyDeleteTriviallyDeadInstructions(Inst); } @@ -607,9 +616,9 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { // currently can only reduce affine polynomials. For now just disable // indvar subst on anything more complex than an affine addrec, unless // it can be expanded to a trivial value. -static bool isSafe(const SCEV *S, const Loop *L) { +static bool isSafe(const SCEV *S, const Loop *L, ScalarEvolution *SE) { // Loop-invariant values are safe. - if (S->isLoopInvariant(L)) return true; + if (SE->isLoopInvariant(S, L)) return true; // Affine addrecs are safe. Non-affine are not, because LSR doesn't know how // to transform them into efficient code. @@ -620,18 +629,18 @@ static bool isSafe(const SCEV *S, const Loop *L) { if (const SCEVCommutativeExpr *Commutative = dyn_cast<SCEVCommutativeExpr>(S)) { for (SCEVCommutativeExpr::op_iterator I = Commutative->op_begin(), E = Commutative->op_end(); I != E; ++I) - if (!isSafe(*I, L)) return false; + if (!isSafe(*I, L, SE)) return false; return true; } // A cast is safe if its operand is. 
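These IndVarSimplify hunks track an interface change in ScalarEvolution: loop invariance is no longer a predicate on the SCEV node itself but a query on the analysis. A short before/after sketch; the fragment assumes the surrounding loop over exit values from the hunk above:

// Loop invariance used to be asked of the SCEV node:
//   if (ExitValue->isLoopInvariant(L)) ...
// and is now asked of the analysis:
//   if (SE->isLoopInvariant(ExitValue, L)) ...
const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop());
if (!SE->isLoopInvariant(ExitValue, L))
  continue;   // value still varies inside the loop; cannot rewrite it outside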
if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S)) - return isSafe(C->getOperand(), L); + return isSafe(C->getOperand(), L, SE); // A udiv is safe if its operands are. if (const SCEVUDivExpr *UD = dyn_cast<SCEVUDivExpr>(S)) - return isSafe(UD->getLHS(), L) && - isSafe(UD->getRHS(), L); + return isSafe(UD->getLHS(), L, SE) && + isSafe(UD->getRHS(), L, SE); // SCEVUnknown is always safe. if (isa<SCEVUnknown>(S)) @@ -662,7 +671,7 @@ void IndVarSimplify::RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter) { // Evaluate the expression out of the loop, if possible. if (!L->contains(UI->getUser())) { const SCEV *ExitVal = SE->getSCEVAtScope(AR, L->getParentLoop()); - if (ExitVal->isLoopInvariant(L)) + if (SE->isLoopInvariant(ExitVal, L)) AR = ExitVal; } @@ -672,7 +681,7 @@ void IndVarSimplify::RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter) { // currently can only reduce affine polynomials. For now just disable // indvar subst on anything more complex than an affine addrec, unless // it can be expanded to a trivial value. - if (!isSafe(AR, L)) + if (!isSafe(AR, L, SE)) continue; // Determine the insertion point for this user. By default, insert @@ -725,7 +734,7 @@ void IndVarSimplify::RewriteIVExpressions(Loop *L, SCEVExpander &Rewriter) { // which are now dead. while (!DeadInsts.empty()) if (Instruction *Inst = - dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val())) + dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) RecursivelyDeleteTriviallyDeadInstructions(Inst); } diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index 104d5ae..90094a8 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -40,20 +40,22 @@ STATISTIC(NumFolds, "Number of terminators folded"); STATISTIC(NumDupes, "Number of branch blocks duplicated to eliminate phi"); static cl::opt<unsigned> -Threshold("jump-threading-threshold", +Threshold("jump-threading-threshold", cl::desc("Max block size to duplicate for jump threading"), cl::init(6), cl::Hidden); -// Turn on use of LazyValueInfo. -static cl::opt<bool> -EnableLVI("enable-jump-threading-lvi", - cl::desc("Use LVI for jump threading"), - cl::init(true), - cl::ReallyHidden); - - - namespace { + // These are at global scope so static functions can use them too. + typedef SmallVectorImpl<std::pair<Constant*, BasicBlock*> > PredValueInfo; + typedef SmallVector<std::pair<Constant*, BasicBlock*>, 8> PredValueInfoTy; + + // This is used to keep track of what kind of constant we're currently hoping + // to find. + enum ConstantPreference { + WantInteger, + WantBlockAddress + }; + /// This pass performs 'jump threading', which looks at blocks that have /// multiple predecessors and multiple successors. If one or more of the /// predecessors of the block can be proven to always jump to one of the @@ -79,61 +81,59 @@ namespace { SmallSet<AssertingVH<BasicBlock>, 16> LoopHeaders; #endif DenseSet<std::pair<Value*, BasicBlock*> > RecursionSet; - + // RAII helper for updating the recursion stack. 
struct RecursionSetRemover { DenseSet<std::pair<Value*, BasicBlock*> > &TheSet; std::pair<Value*, BasicBlock*> ThePair; - + RecursionSetRemover(DenseSet<std::pair<Value*, BasicBlock*> > &S, std::pair<Value*, BasicBlock*> P) : TheSet(S), ThePair(P) { } - + ~RecursionSetRemover() { TheSet.erase(ThePair); } }; public: static char ID; // Pass identification - JumpThreading() : FunctionPass(ID) {} + JumpThreading() : FunctionPass(ID) { + initializeJumpThreadingPass(*PassRegistry::getPassRegistry()); + } bool runOnFunction(Function &F); - + virtual void getAnalysisUsage(AnalysisUsage &AU) const { - if (EnableLVI) { - AU.addRequired<LazyValueInfo>(); - AU.addPreserved<LazyValueInfo>(); - } + AU.addRequired<LazyValueInfo>(); + AU.addPreserved<LazyValueInfo>(); } - + void FindLoopHeaders(Function &F); bool ProcessBlock(BasicBlock *BB); bool ThreadEdge(BasicBlock *BB, const SmallVectorImpl<BasicBlock*> &PredBBs, BasicBlock *SuccBB); bool DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, const SmallVectorImpl<BasicBlock *> &PredBBs); - - typedef SmallVectorImpl<std::pair<ConstantInt*, - BasicBlock*> > PredValueInfo; - + bool ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, - PredValueInfo &Result); - bool ProcessThreadableEdges(Value *Cond, BasicBlock *BB); - - - bool ProcessBranchOnDuplicateCond(BasicBlock *PredBB, BasicBlock *DestBB); - bool ProcessSwitchOnDuplicateCond(BasicBlock *PredBB, BasicBlock *DestBB); + PredValueInfo &Result, + ConstantPreference Preference); + bool ProcessThreadableEdges(Value *Cond, BasicBlock *BB, + ConstantPreference Preference); bool ProcessBranchOnPHI(PHINode *PN); bool ProcessBranchOnXOR(BinaryOperator *BO); - + bool SimplifyPartiallyRedundantLoad(LoadInst *LI); }; } char JumpThreading::ID = 0; -INITIALIZE_PASS(JumpThreading, "jump-threading", - "Jump Threading", false, false); +INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading", + "Jump Threading", false, false) +INITIALIZE_PASS_DEPENDENCY(LazyValueInfo) +INITIALIZE_PASS_END(JumpThreading, "jump-threading", + "Jump Threading", false, false) // Public interface to the Jump Threading pass FunctionPass *llvm::createJumpThreadingPass() { return new JumpThreading(); } @@ -143,21 +143,21 @@ FunctionPass *llvm::createJumpThreadingPass() { return new JumpThreading(); } bool JumpThreading::runOnFunction(Function &F) { DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n"); TD = getAnalysisIfAvailable<TargetData>(); - LVI = EnableLVI ? &getAnalysis<LazyValueInfo>() : 0; - + LVI = &getAnalysis<LazyValueInfo>(); + FindLoopHeaders(F); - + bool Changed, EverChanged = false; do { Changed = false; for (Function::iterator I = F.begin(), E = F.end(); I != E;) { BasicBlock *BB = I; - // Thread all of the branches we can over this block. + // Thread all of the branches we can over this block. while (ProcessBlock(BB)) Changed = true; - + ++I; - + // If the block is trivially dead, zap it. This eliminates the successor // edges which simplifies the CFG. 
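A pattern repeated across this import: passes now register themselves with the PassRegistry in their constructor and spell out their analysis dependencies with INITIALIZE_PASS_BEGIN / INITIALIZE_PASS_DEPENDENCY / INITIALIZE_PASS_END, as JumpThreading does above with LazyValueInfo. A generic sketch for a hypothetical pass; the name FooPass and its generated initializer are illustrative only:

namespace {
  struct FooPass : public FunctionPass {
    static char ID;
    FooPass() : FunctionPass(ID) {
      // Make sure this pass's initializer runs before the pass is used.
      initializeFooPassPass(*PassRegistry::getPassRegistry());
    }
    virtual bool runOnFunction(Function &F);
    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
      AU.addRequired<LazyValueInfo>();
      AU.addPreserved<LazyValueInfo>();
    }
  };
}

char FooPass::ID = 0;
INITIALIZE_PASS_BEGIN(FooPass, "foo-pass", "Example Pass", false, false)
INITIALIZE_PASS_DEPENDENCY(LazyValueInfo)
INITIALIZE_PASS_END(FooPass, "foo-pass", "Example Pass", false, false)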
if (pred_begin(BB) == pred_end(BB) && @@ -165,48 +165,46 @@ bool JumpThreading::runOnFunction(Function &F) { DEBUG(dbgs() << " JT: Deleting dead block '" << BB->getName() << "' with terminator: " << *BB->getTerminator() << '\n'); LoopHeaders.erase(BB); - if (LVI) LVI->eraseBlock(BB); + LVI->eraseBlock(BB); DeleteDeadBlock(BB); Changed = true; - } else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) { - // Can't thread an unconditional jump, but if the block is "almost - // empty", we can replace uses of it with uses of the successor and make - // this dead. - if (BI->isUnconditional() && - BB != &BB->getParent()->getEntryBlock()) { - BasicBlock::iterator BBI = BB->getFirstNonPHI(); - // Ignore dbg intrinsics. - while (isa<DbgInfoIntrinsic>(BBI)) - ++BBI; + continue; + } + + BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()); + + // Can't thread an unconditional jump, but if the block is "almost + // empty", we can replace uses of it with uses of the successor and make + // this dead. + if (BI && BI->isUnconditional() && + BB != &BB->getParent()->getEntryBlock() && // If the terminator is the only non-phi instruction, try to nuke it. - if (BBI->isTerminator()) { - // Since TryToSimplifyUncondBranchFromEmptyBlock may delete the - // block, we have to make sure it isn't in the LoopHeaders set. We - // reinsert afterward if needed. - bool ErasedFromLoopHeaders = LoopHeaders.erase(BB); - BasicBlock *Succ = BI->getSuccessor(0); - - // FIXME: It is always conservatively correct to drop the info - // for a block even if it doesn't get erased. This isn't totally - // awesome, but it allows us to use AssertingVH to prevent nasty - // dangling pointer issues within LazyValueInfo. - if (LVI) LVI->eraseBlock(BB); - if (TryToSimplifyUncondBranchFromEmptyBlock(BB)) { - Changed = true; - // If we deleted BB and BB was the header of a loop, then the - // successor is now the header of the loop. - BB = Succ; - } - - if (ErasedFromLoopHeaders) - LoopHeaders.insert(BB); - } + BB->getFirstNonPHIOrDbg()->isTerminator()) { + // Since TryToSimplifyUncondBranchFromEmptyBlock may delete the + // block, we have to make sure it isn't in the LoopHeaders set. We + // reinsert afterward if needed. + bool ErasedFromLoopHeaders = LoopHeaders.erase(BB); + BasicBlock *Succ = BI->getSuccessor(0); + + // FIXME: It is always conservatively correct to drop the info + // for a block even if it doesn't get erased. This isn't totally + // awesome, but it allows us to use AssertingVH to prevent nasty + // dangling pointer issues within LazyValueInfo. + LVI->eraseBlock(BB); + if (TryToSimplifyUncondBranchFromEmptyBlock(BB)) { + Changed = true; + // If we deleted BB and BB was the header of a loop, then the + // successor is now the header of the loop. + BB = Succ; } + + if (ErasedFromLoopHeaders) + LoopHeaders.insert(BB); } } EverChanged |= Changed; } while (Changed); - + LoopHeaders.clear(); return EverChanged; } @@ -216,25 +214,25 @@ bool JumpThreading::runOnFunction(Function &F) { static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB) { /// Ignore PHI nodes, these will be flattened when duplication happens. BasicBlock::const_iterator I = BB->getFirstNonPHI(); - + // FIXME: THREADING will delete values that are just used to compute the // branch, so they shouldn't count against the duplication cost. - - + + // Sum up the cost of each instruction until we get to the terminator. Don't // include the terminator because the copy won't include it. 
unsigned Size = 0; for (; !isa<TerminatorInst>(I); ++I) { // Debugger intrinsics don't incur code size. if (isa<DbgInfoIntrinsic>(I)) continue; - + // If this is a pointer->pointer bitcast, it is free. if (isa<BitCastInst>(I) && I->getType()->isPointerTy()) continue; - + // All other instructions count for at least one unit. ++Size; - + // Calls are more expensive. If they are non-intrinsic calls, we model them // as having cost of 4. If they are a non-vector intrinsic, we model them // as having cost of 2 total, and if they are a vector intrinsic, we model @@ -246,12 +244,16 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB) { Size += 1; } } - + // Threading through a switch statement is particularly profitable. If this // block ends in a switch, decrease its cost to make it more likely to happen. if (isa<SwitchInst>(I)) Size = Size > 6 ? Size-6 : 0; - + + // The same holds for indirect branches, but slightly more so. + if (isa<IndirectBrInst>(I)) + Size = Size > 8 ? Size-8 : 0; + return Size; } @@ -273,57 +275,64 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB) { void JumpThreading::FindLoopHeaders(Function &F) { SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> Edges; FindFunctionBackedges(F, Edges); - + for (unsigned i = 0, e = Edges.size(); i != e; ++i) LoopHeaders.insert(const_cast<BasicBlock*>(Edges[i].second)); } -// Helper method for ComputeValueKnownInPredecessors. If Value is a -// ConstantInt, push it. If it's an undef, push 0. Otherwise, do nothing. -static void PushConstantIntOrUndef(SmallVectorImpl<std::pair<ConstantInt*, - BasicBlock*> > &Result, - Constant *Value, BasicBlock* BB){ - if (ConstantInt *FoldedCInt = dyn_cast<ConstantInt>(Value)) - Result.push_back(std::make_pair(FoldedCInt, BB)); - else if (isa<UndefValue>(Value)) - Result.push_back(std::make_pair((ConstantInt*)0, BB)); +/// getKnownConstant - Helper method to determine if we can thread over a +/// terminator with the given value as its condition, and if so what value to +/// use for that. What kind of value this is depends on whether we want an +/// integer or a block address, but an undef is always accepted. +/// Returns null if Val is null or not an appropriate constant. +static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) { + if (!Val) + return 0; + + // Undef is "known" enough. + if (UndefValue *U = dyn_cast<UndefValue>(Val)) + return U; + + if (Preference == WantBlockAddress) + return dyn_cast<BlockAddress>(Val->stripPointerCasts()); + + return dyn_cast<ConstantInt>(Val); } /// ComputeValueKnownInPredecessors - Given a basic block BB and a value V, see -/// if we can infer that the value is a known ConstantInt in any of our -/// predecessors. If so, return the known list of value and pred BB in the -/// result vector. If a value is known to be undef, it is returned as null. +/// if we can infer that the value is a known ConstantInt/BlockAddress or undef +/// in any of our predecessors. If so, return the known list of value and pred +/// BB in the result vector. /// /// This returns true if there were any known values. /// bool JumpThreading:: -ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,PredValueInfo &Result){ +ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB, PredValueInfo &Result, + ConstantPreference Preference) { // This method walks up use-def chains recursively. Because of this, we could // get into an infinite loop going around loops in the use-def chain. 
To // prevent this, keep track of what (value, block) pairs we've already visited // and terminate the search if we loop back to them if (!RecursionSet.insert(std::make_pair(V, BB)).second) return false; - + // An RAII help to remove this pair from the recursion set once the recursion // stack pops back out again. RecursionSetRemover remover(RecursionSet, std::make_pair(V, BB)); - - // If V is a constantint, then it is known in all predecessors. - if (isa<ConstantInt>(V) || isa<UndefValue>(V)) { - ConstantInt *CI = dyn_cast<ConstantInt>(V); - + + // If V is a constant, then it is known in all predecessors. + if (Constant *KC = getKnownConstant(V, Preference)) { for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) - Result.push_back(std::make_pair(CI, *PI)); - + Result.push_back(std::make_pair(KC, *PI)); + return true; } - + // If V is a non-instruction value, or an instruction in a different block, // then it can't be derived from a PHI. Instruction *I = dyn_cast<Instruction>(V); if (I == 0 || I->getParent() != BB) { - + // Okay, if this is a live-in value, see if it has a known value at the end // of any of our predecessors. // @@ -331,82 +340,78 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,PredValueInfo &Result){ /// TODO: Per PR2563, we could infer value range information about a /// predecessor based on its terminator. // - if (LVI) { - // FIXME: change this to use the more-rich 'getPredicateOnEdge' method if - // "I" is a non-local compare-with-a-constant instruction. This would be - // able to handle value inequalities better, for example if the compare is - // "X < 4" and "X < 3" is known true but "X < 4" itself is not available. - // Perhaps getConstantOnEdge should be smart enough to do this? - - for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { - BasicBlock *P = *PI; - // If the value is known by LazyValueInfo to be a constant in a - // predecessor, use that information to try to thread this block. - Constant *PredCst = LVI->getConstantOnEdge(V, P, BB); - if (PredCst == 0 || - (!isa<ConstantInt>(PredCst) && !isa<UndefValue>(PredCst))) - continue; - - Result.push_back(std::make_pair(dyn_cast<ConstantInt>(PredCst), P)); - } - - return !Result.empty(); + // FIXME: change this to use the more-rich 'getPredicateOnEdge' method if + // "I" is a non-local compare-with-a-constant instruction. This would be + // able to handle value inequalities better, for example if the compare is + // "X < 4" and "X < 3" is known true but "X < 4" itself is not available. + // Perhaps getConstantOnEdge should be smart enough to do this? + + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { + BasicBlock *P = *PI; + // If the value is known by LazyValueInfo to be a constant in a + // predecessor, use that information to try to thread this block. + Constant *PredCst = LVI->getConstantOnEdge(V, P, BB); + if (Constant *KC = getKnownConstant(PredCst, Preference)) + Result.push_back(std::make_pair(KC, P)); } - - return false; + + return !Result.empty(); } - + /// If I is a PHI node, then we know the incoming values for any constants. 
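For a live-in value, the code above asks LazyValueInfo for a constant on each incoming edge and keeps only results that match the caller's ConstantPreference via getKnownConstant. A compact sketch of that per-edge query, with names taken from the hunk above:

// Sketch of the per-predecessor LVI query for a live-in value V of block BB.
for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
  BasicBlock *P = *PI;
  // Ask LVI what V is known to be when control enters BB from P.
  Constant *PredCst = LVI->getConstantOnEdge(V, P, BB);
  if (Constant *KC = getKnownConstant(PredCst, Preference))
    Result.push_back(std::make_pair(KC, P));   // (constant, predecessor) pair
}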
if (PHINode *PN = dyn_cast<PHINode>(I)) { for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { Value *InVal = PN->getIncomingValue(i); - if (isa<ConstantInt>(InVal) || isa<UndefValue>(InVal)) { - ConstantInt *CI = dyn_cast<ConstantInt>(InVal); - Result.push_back(std::make_pair(CI, PN->getIncomingBlock(i))); - } else if (LVI) { + if (Constant *KC = getKnownConstant(InVal, Preference)) { + Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i))); + } else { Constant *CI = LVI->getConstantOnEdge(InVal, PN->getIncomingBlock(i), BB); - // LVI returns null is no value could be determined. - if (!CI) continue; - PushConstantIntOrUndef(Result, CI, PN->getIncomingBlock(i)); + if (Constant *KC = getKnownConstant(CI, Preference)) + Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i))); } } - + return !Result.empty(); } - - SmallVector<std::pair<ConstantInt*, BasicBlock*>, 8> LHSVals, RHSVals; + + PredValueInfoTy LHSVals, RHSVals; // Handle some boolean conditions. - if (I->getType()->getPrimitiveSizeInBits() == 1) { + if (I->getType()->getPrimitiveSizeInBits() == 1) { + assert(Preference == WantInteger && "One-bit non-integer type?"); // X | true -> true // X & false -> false if (I->getOpcode() == Instruction::Or || I->getOpcode() == Instruction::And) { - ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals); - ComputeValueKnownInPredecessors(I->getOperand(1), BB, RHSVals); - + ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals, + WantInteger); + ComputeValueKnownInPredecessors(I->getOperand(1), BB, RHSVals, + WantInteger); + if (LHSVals.empty() && RHSVals.empty()) return false; - + ConstantInt *InterestingVal; if (I->getOpcode() == Instruction::Or) InterestingVal = ConstantInt::getTrue(I->getContext()); else InterestingVal = ConstantInt::getFalse(I->getContext()); - + SmallPtrSet<BasicBlock*, 4> LHSKnownBBs; - + // Scan for the sentinel. If we find an undef, force it to the // interesting value: x|undef -> true and x&undef -> false. for (unsigned i = 0, e = LHSVals.size(); i != e; ++i) - if (LHSVals[i].first == InterestingVal || LHSVals[i].first == 0) { + if (LHSVals[i].first == InterestingVal || + isa<UndefValue>(LHSVals[i].first)) { Result.push_back(LHSVals[i]); Result.back().first = InterestingVal; LHSKnownBBs.insert(LHSVals[i].second); } for (unsigned i = 0, e = RHSVals.size(); i != e; ++i) - if (RHSVals[i].first == InterestingVal || RHSVals[i].first == 0) { + if (RHSVals[i].first == InterestingVal || + isa<UndefValue>(RHSVals[i].first)) { // If we already inferred a value for this block on the LHS, don't // re-add it. if (!LHSKnownBBs.count(RHSVals[i].second)) { @@ -414,48 +419,51 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,PredValueInfo &Result){ Result.back().first = InterestingVal; } } - + return !Result.empty(); } - + // Handle the NOT form of XOR. if (I->getOpcode() == Instruction::Xor && isa<ConstantInt>(I->getOperand(1)) && cast<ConstantInt>(I->getOperand(1))->isOne()) { - ComputeValueKnownInPredecessors(I->getOperand(0), BB, Result); + ComputeValueKnownInPredecessors(I->getOperand(0), BB, Result, + WantInteger); if (Result.empty()) return false; // Invert the known values. for (unsigned i = 0, e = Result.size(); i != e; ++i) - if (Result[i].first) - Result[i].first = - cast<ConstantInt>(ConstantExpr::getNot(Result[i].first)); - + Result[i].first = ConstantExpr::getNot(Result[i].first); + return true; } - + // Try to simplify some other binary operator values. 
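For one-bit OR/AND conditions, the code above treats undef as whichever value makes the edge threadable: x | undef may as well be true and x & undef may as well be false, so those predecessors are recorded with the "interesting" constant. A short sketch of the scan, with names taken from the hunk above:

// Sketch: e.g. for  %c = or i1 %x, %u  an undef %u is forced to true, making
// %c true on that predecessor edge and the branch on %c threadable.
ConstantInt *InterestingVal = (I->getOpcode() == Instruction::Or)
                                ? ConstantInt::getTrue(I->getContext())
                                : ConstantInt::getFalse(I->getContext());
for (unsigned i = 0, e = LHSVals.size(); i != e; ++i)
  if (LHSVals[i].first == InterestingVal || isa<UndefValue>(LHSVals[i].first)) {
    Result.push_back(LHSVals[i]);
    Result.back().first = InterestingVal;   // force undef to the useful value
  }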
} else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) { + assert(Preference != WantBlockAddress + && "A binary operator creating a block address?"); if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) { - SmallVector<std::pair<ConstantInt*, BasicBlock*>, 8> LHSVals; - ComputeValueKnownInPredecessors(BO->getOperand(0), BB, LHSVals); - + PredValueInfoTy LHSVals; + ComputeValueKnownInPredecessors(BO->getOperand(0), BB, LHSVals, + WantInteger); + // Try to use constant folding to simplify the binary operator. for (unsigned i = 0, e = LHSVals.size(); i != e; ++i) { - Constant *V = LHSVals[i].first ? LHSVals[i].first : - cast<Constant>(UndefValue::get(BO->getType())); + Constant *V = LHSVals[i].first; Constant *Folded = ConstantExpr::get(BO->getOpcode(), V, CI); - - PushConstantIntOrUndef(Result, Folded, LHSVals[i].second); + + if (Constant *KC = getKnownConstant(Folded, WantInteger)) + Result.push_back(std::make_pair(KC, LHSVals[i].second)); } } - + return !Result.empty(); } - + // Handle compare with phi operand, where the PHI is defined in this block. if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) { + assert(Preference == WantInteger && "Compares only produce integers"); PHINode *PN = dyn_cast<PHINode>(Cmp->getOperand(0)); if (PN && PN->getParent() == BB) { // We can do this simplification if any comparisons fold to true or false. @@ -464,32 +472,31 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,PredValueInfo &Result){ BasicBlock *PredBB = PN->getIncomingBlock(i); Value *LHS = PN->getIncomingValue(i); Value *RHS = Cmp->getOperand(1)->DoPHITranslation(BB, PredBB); - + Value *Res = SimplifyCmpInst(Cmp->getPredicate(), LHS, RHS, TD); if (Res == 0) { - if (!LVI || !isa<Constant>(RHS)) + if (!isa<Constant>(RHS)) continue; - - LazyValueInfo::Tristate + + LazyValueInfo::Tristate ResT = LVI->getPredicateOnEdge(Cmp->getPredicate(), LHS, cast<Constant>(RHS), PredBB, BB); if (ResT == LazyValueInfo::Unknown) continue; Res = ConstantInt::get(Type::getInt1Ty(LHS->getContext()), ResT); } - - if (Constant *ConstRes = dyn_cast<Constant>(Res)) - PushConstantIntOrUndef(Result, ConstRes, PredBB); + + if (Constant *KC = getKnownConstant(Res, WantInteger)) + Result.push_back(std::make_pair(KC, PredBB)); } - + return !Result.empty(); } - - + + // If comparing a live-in value against a constant, see if we know the // live-in value on any predecessors. - if (LVI && isa<Constant>(Cmp->getOperand(1)) && - Cmp->getType()->isIntegerTy()) { + if (isa<Constant>(Cmp->getOperand(1)) && Cmp->getType()->isIntegerTy()) { if (!isa<Instruction>(Cmp->getOperand(0)) || cast<Instruction>(Cmp->getOperand(0))->getParent() != BB) { Constant *RHSCst = cast<Constant>(Cmp->getOperand(1)); @@ -505,44 +512,74 @@ ComputeValueKnownInPredecessors(Value *V, BasicBlock *BB,PredValueInfo &Result){ continue; Constant *ResC = ConstantInt::get(Cmp->getType(), Res); - Result.push_back(std::make_pair(cast<ConstantInt>(ResC), P)); + Result.push_back(std::make_pair(ResC, P)); } return !Result.empty(); } - + // Try to find a constant value for the LHS of a comparison, // and evaluate it statically if we can. if (Constant *CmpConst = dyn_cast<Constant>(Cmp->getOperand(1))) { - SmallVector<std::pair<ConstantInt*, BasicBlock*>, 8> LHSVals; - ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals); - + PredValueInfoTy LHSVals; + ComputeValueKnownInPredecessors(I->getOperand(0), BB, LHSVals, + WantInteger); + for (unsigned i = 0, e = LHSVals.size(); i != e; ++i) { - Constant *V = LHSVals[i].first ? 
LHSVals[i].first : - cast<Constant>(UndefValue::get(CmpConst->getType())); + Constant *V = LHSVals[i].first; Constant *Folded = ConstantExpr::getCompare(Cmp->getPredicate(), V, CmpConst); - PushConstantIntOrUndef(Result, Folded, LHSVals[i].second); + if (Constant *KC = getKnownConstant(Folded, WantInteger)) + Result.push_back(std::make_pair(KC, LHSVals[i].second)); } - + return !Result.empty(); } } } - - if (LVI) { - // If all else fails, see if LVI can figure out a constant value for us. - Constant *CI = LVI->getConstant(V, BB); - ConstantInt *CInt = dyn_cast_or_null<ConstantInt>(CI); - if (CInt) { - for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) - Result.push_back(std::make_pair(CInt, *PI)); + + if (SelectInst *SI = dyn_cast<SelectInst>(I)) { + // Handle select instructions where at least one operand is a known constant + // and we can figure out the condition value for any predecessor block. + Constant *TrueVal = getKnownConstant(SI->getTrueValue(), Preference); + Constant *FalseVal = getKnownConstant(SI->getFalseValue(), Preference); + PredValueInfoTy Conds; + if ((TrueVal || FalseVal) && + ComputeValueKnownInPredecessors(SI->getCondition(), BB, Conds, + WantInteger)) { + for (unsigned i = 0, e = Conds.size(); i != e; ++i) { + Constant *Cond = Conds[i].first; + + // Figure out what value to use for the condition. + bool KnownCond; + if (ConstantInt *CI = dyn_cast<ConstantInt>(Cond)) { + // A known boolean. + KnownCond = CI->isOne(); + } else { + assert(isa<UndefValue>(Cond) && "Unexpected condition value"); + // Either operand will do, so be sure to pick the one that's a known + // constant. + // FIXME: Do this more cleverly if both values are known constants? + KnownCond = (TrueVal != 0); + } + + // See if the select has a known constant value for this predecessor. + if (Constant *Val = KnownCond ? TrueVal : FalseVal) + Result.push_back(std::make_pair(Val, Conds[i].second)); + } + + return !Result.empty(); } - - return !Result.empty(); } - - return false; + + // If all else fails, see if LVI can figure out a constant value for us. + Constant *CI = LVI->getConstant(V, BB); + if (Constant *KC = getKnownConstant(CI, Preference)) { + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) + Result.push_back(std::make_pair(KC, *PI)); + } + + return !Result.empty(); } @@ -565,10 +602,20 @@ static unsigned GetBestDestForJumpOnUndef(BasicBlock *BB) { if (NumPreds < MinNumPreds) MinSucc = i; } - + return MinSucc; } +static bool hasAddressTakenAndUsed(BasicBlock *BB) { + if (!BB->hasAddressTaken()) return false; + + // If the block has its address taken, it may be a tree of dead constants + // hanging off of it. These shouldn't keep the block alive. + BlockAddress *BA = BlockAddress::get(BB); + BA->removeDeadConstantUsers(); + return !BA->use_empty(); +} + /// ProcessBlock - If there are any predecessors whose control can be threaded /// through to a successor, transform them now. bool JumpThreading::ProcessBlock(BasicBlock *BB) { @@ -577,167 +624,122 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { if (pred_begin(BB) == pred_end(BB) && BB != &BB->getParent()->getEntryBlock()) return false; - + // If this block has a single predecessor, and if that pred has a single // successor, merge the blocks. This encourages recursive jump threading // because now the condition in this block can be threaded through // predecessors of our predecessor block. 
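The new SelectInst case lets jump threading look through a select whose condition is known per predecessor: when the condition is a known i1 (or undef), the select collapses to whichever arm is a known constant on that edge. A small illustration, with the IR in comments; the variable names follow the hunk above:

// Sketch:
//   %v = select i1 %c, i8* blockaddress(@f, %A), i8* %p
//   indirectbr i8* %v, [label %A, label %B]
// If %c is known true on an edge, %v is blockaddress(@f, %A) there and the
// indirect branch from that predecessor can be threaded straight to %A.
bool KnownCond;
if (ConstantInt *CI = dyn_cast<ConstantInt>(Cond))
  KnownCond = CI->isOne();        // a real boolean constant
else
  KnownCond = (TrueVal != 0);     // undef: pick the arm that is a constant
if (Constant *Val = KnownCond ? TrueVal : FalseVal)
  Result.push_back(std::make_pair(Val, Conds[i].second));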
if (BasicBlock *SinglePred = BB->getSinglePredecessor()) { if (SinglePred->getTerminator()->getNumSuccessors() == 1 && - SinglePred != BB) { + SinglePred != BB && !hasAddressTakenAndUsed(BB)) { // If SinglePred was a loop header, BB becomes one. if (LoopHeaders.erase(SinglePred)) LoopHeaders.insert(BB); - + // Remember if SinglePred was the entry block of the function. If so, we // will need to move BB back to the entry position. bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock(); - if (LVI) LVI->eraseBlock(SinglePred); + LVI->eraseBlock(SinglePred); MergeBasicBlockIntoOnlyPred(BB); - + if (isEntry && BB != &BB->getParent()->getEntryBlock()) BB->moveBefore(&BB->getParent()->getEntryBlock()); return true; } } - // Look to see if the terminator is a branch of switch, if not we can't thread - // it. + // What kind of constant we're looking for. + ConstantPreference Preference = WantInteger; + + // Look to see if the terminator is a conditional branch, switch or indirect + // branch, if not we can't thread it. Value *Condition; - if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) { + Instruction *Terminator = BB->getTerminator(); + if (BranchInst *BI = dyn_cast<BranchInst>(Terminator)) { // Can't thread an unconditional jump. if (BI->isUnconditional()) return false; Condition = BI->getCondition(); - } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) + } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Terminator)) { Condition = SI->getCondition(); - else + } else if (IndirectBrInst *IB = dyn_cast<IndirectBrInst>(Terminator)) { + Condition = IB->getAddress()->stripPointerCasts(); + Preference = WantBlockAddress; + } else { return false; // Must be an invoke. - - // If the terminator of this block is branching on a constant, simplify the - // terminator to an unconditional branch. This can occur due to threading in - // other blocks. - if (isa<ConstantInt>(Condition)) { - DEBUG(dbgs() << " In block '" << BB->getName() - << "' folding terminator: " << *BB->getTerminator() << '\n'); - ++NumFolds; - ConstantFoldTerminator(BB); - return true; } - + // If the terminator is branching on an undef, we can pick any of the // successors to branch to. Let GetBestDestForJumpOnUndef decide. if (isa<UndefValue>(Condition)) { unsigned BestSucc = GetBestDestForJumpOnUndef(BB); - + // Fold the branch/switch. TerminatorInst *BBTerm = BB->getTerminator(); for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) { if (i == BestSucc) continue; - RemovePredecessorAndSimplify(BBTerm->getSuccessor(i), BB, TD); + BBTerm->getSuccessor(i)->removePredecessor(BB, true); } - + DEBUG(dbgs() << " In block '" << BB->getName() << "' folding undef terminator: " << *BBTerm << '\n'); BranchInst::Create(BBTerm->getSuccessor(BestSucc), BBTerm); BBTerm->eraseFromParent(); return true; } - - Instruction *CondInst = dyn_cast<Instruction>(Condition); - // If the condition is an instruction defined in another block, see if a - // predecessor has the same condition: - // br COND, BBX, BBY - // BBX: - // br COND, BBZ, BBW - if (!LVI && - !Condition->hasOneUse() && // Multiple uses. - (CondInst == 0 || CondInst->getParent() != BB)) { // Non-local definition. 
- pred_iterator PI = pred_begin(BB), E = pred_end(BB); - if (isa<BranchInst>(BB->getTerminator())) { - for (; PI != E; ++PI) { - BasicBlock *P = *PI; - if (BranchInst *PBI = dyn_cast<BranchInst>(P->getTerminator())) - if (PBI->isConditional() && PBI->getCondition() == Condition && - ProcessBranchOnDuplicateCond(P, BB)) - return true; - } - } else { - assert(isa<SwitchInst>(BB->getTerminator()) && "Unknown jump terminator"); - for (; PI != E; ++PI) { - BasicBlock *P = *PI; - if (SwitchInst *PSI = dyn_cast<SwitchInst>(P->getTerminator())) - if (PSI->getCondition() == Condition && - ProcessSwitchOnDuplicateCond(P, BB)) - return true; - } - } + // If the terminator of this block is branching on a constant, simplify the + // terminator to an unconditional branch. This can occur due to threading in + // other blocks. + if (getKnownConstant(Condition, Preference)) { + DEBUG(dbgs() << " In block '" << BB->getName() + << "' folding terminator: " << *BB->getTerminator() << '\n'); + ++NumFolds; + ConstantFoldTerminator(BB); + return true; } + Instruction *CondInst = dyn_cast<Instruction>(Condition); + // All the rest of our checks depend on the condition being an instruction. if (CondInst == 0) { // FIXME: Unify this with code below. - if (LVI && ProcessThreadableEdges(Condition, BB)) + if (ProcessThreadableEdges(Condition, BB, Preference)) return true; return false; - } - - + } + + if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst)) { - if (!LVI && - (!isa<PHINode>(CondCmp->getOperand(0)) || - cast<PHINode>(CondCmp->getOperand(0))->getParent() != BB)) { - // If we have a comparison, loop over the predecessors to see if there is - // a condition with a lexically identical value. - pred_iterator PI = pred_begin(BB), E = pred_end(BB); - for (; PI != E; ++PI) { - BasicBlock *P = *PI; - if (BranchInst *PBI = dyn_cast<BranchInst>(P->getTerminator())) - if (PBI->isConditional() && P != BB) { - if (CmpInst *CI = dyn_cast<CmpInst>(PBI->getCondition())) { - if (CI->getOperand(0) == CondCmp->getOperand(0) && - CI->getOperand(1) == CondCmp->getOperand(1) && - CI->getPredicate() == CondCmp->getPredicate()) { - // TODO: Could handle things like (x != 4) --> (x == 17) - if (ProcessBranchOnDuplicateCond(P, BB)) - return true; - } - } - } - } - } - // For a comparison where the LHS is outside this block, it's possible // that we've branched on it before. Used LVI to see if we can simplify // the branch based on that. BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator()); Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1)); pred_iterator PI = pred_begin(BB), PE = pred_end(BB); - if (LVI && CondBr && CondConst && CondBr->isConditional() && PI != PE && + if (CondBr && CondConst && CondBr->isConditional() && PI != PE && (!isa<Instruction>(CondCmp->getOperand(0)) || cast<Instruction>(CondCmp->getOperand(0))->getParent() != BB)) { // For predecessor edge, determine if the comparison is true or false // on that edge. If they're all true or all false, we can simplify the // branch. // FIXME: We could handle mixed true/false by duplicating code. - LazyValueInfo::Tristate Baseline = + LazyValueInfo::Tristate Baseline = LVI->getPredicateOnEdge(CondCmp->getPredicate(), CondCmp->getOperand(0), CondConst, *PI, BB); if (Baseline != LazyValueInfo::Unknown) { // Check that all remaining incoming values match the first one. 
while (++PI != PE) { - LazyValueInfo::Tristate Ret = LVI->getPredicateOnEdge( - CondCmp->getPredicate(), - CondCmp->getOperand(0), - CondConst, *PI, BB); + LazyValueInfo::Tristate Ret = + LVI->getPredicateOnEdge(CondCmp->getPredicate(), + CondCmp->getOperand(0), CondConst, *PI, BB); if (Ret != Baseline) break; } - + // If we terminated early, then one of the values didn't match. if (PI == PE) { unsigned ToRemove = Baseline == LazyValueInfo::True ? 1 : 0; unsigned ToKeep = Baseline == LazyValueInfo::True ? 0 : 1; - RemovePredecessorAndSimplify(CondBr->getSuccessor(ToRemove), BB, TD); + CondBr->getSuccessor(ToRemove)->removePredecessor(BB, true); BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr); CondBr->eraseFromParent(); return true; @@ -755,174 +757,37 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { if (CmpInst *CondCmp = dyn_cast<CmpInst>(SimplifyValue)) if (isa<Constant>(CondCmp->getOperand(1))) SimplifyValue = CondCmp->getOperand(0); - + // TODO: There are other places where load PRE would be profitable, such as // more complex comparisons. if (LoadInst *LI = dyn_cast<LoadInst>(SimplifyValue)) if (SimplifyPartiallyRedundantLoad(LI)) return true; - - + + // Handle a variety of cases where we are branching on something derived from // a PHI node in the current block. If we can prove that any predecessors // compute a predictable value based on a PHI node, thread those predecessors. // - if (ProcessThreadableEdges(CondInst, BB)) + if (ProcessThreadableEdges(CondInst, BB, Preference)) return true; - + // If this is an otherwise-unfoldable branch on a phi node in the current // block, see if we can simplify. if (PHINode *PN = dyn_cast<PHINode>(CondInst)) if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator())) return ProcessBranchOnPHI(PN); - - + + // If this is an otherwise-unfoldable branch on a XOR, see if we can simplify. if (CondInst->getOpcode() == Instruction::Xor && CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator())) return ProcessBranchOnXOR(cast<BinaryOperator>(CondInst)); - - - // TODO: If we have: "br (X > 0)" and we have a predecessor where we know - // "(X == 4)", thread through this block. - - return false; -} -/// ProcessBranchOnDuplicateCond - We found a block and a predecessor of that -/// block that jump on exactly the same condition. This means that we almost -/// always know the direction of the edge in the DESTBB: -/// PREDBB: -/// br COND, DESTBB, BBY -/// DESTBB: -/// br COND, BBZ, BBW -/// -/// If DESTBB has multiple predecessors, we can't just constant fold the branch -/// in DESTBB, we have to thread over it. -bool JumpThreading::ProcessBranchOnDuplicateCond(BasicBlock *PredBB, - BasicBlock *BB) { - BranchInst *PredBI = cast<BranchInst>(PredBB->getTerminator()); - - // If both successors of PredBB go to DESTBB, we don't know anything. We can - // fold the branch to an unconditional one, which allows other recursive - // simplifications. - bool BranchDir; - if (PredBI->getSuccessor(1) != BB) - BranchDir = true; - else if (PredBI->getSuccessor(0) != BB) - BranchDir = false; - else { - DEBUG(dbgs() << " In block '" << PredBB->getName() - << "' folding terminator: " << *PredBB->getTerminator() << '\n'); - ++NumFolds; - ConstantFoldTerminator(PredBB); - return true; - } - - BranchInst *DestBI = cast<BranchInst>(BB->getTerminator()); - // If the dest block has one predecessor, just fix the branch condition to a - // constant and fold it. 
- if (BB->getSinglePredecessor()) { - DEBUG(dbgs() << " In block '" << BB->getName() - << "' folding condition to '" << BranchDir << "': " - << *BB->getTerminator() << '\n'); - ++NumFolds; - Value *OldCond = DestBI->getCondition(); - DestBI->setCondition(ConstantInt::get(Type::getInt1Ty(BB->getContext()), - BranchDir)); - // Delete dead instructions before we fold the branch. Folding the branch - // can eliminate edges from the CFG which can end up deleting OldCond. - RecursivelyDeleteTriviallyDeadInstructions(OldCond); - ConstantFoldTerminator(BB); - return true; - } - - - // Next, figure out which successor we are threading to. - BasicBlock *SuccBB = DestBI->getSuccessor(!BranchDir); - - SmallVector<BasicBlock*, 2> Preds; - Preds.push_back(PredBB); - - // Ok, try to thread it! - return ThreadEdge(BB, Preds, SuccBB); -} - -/// ProcessSwitchOnDuplicateCond - We found a block and a predecessor of that -/// block that switch on exactly the same condition. This means that we almost -/// always know the direction of the edge in the DESTBB: -/// PREDBB: -/// switch COND [... DESTBB, BBY ... ] -/// DESTBB: -/// switch COND [... BBZ, BBW ] -/// -/// Optimizing switches like this is very important, because simplifycfg builds -/// switches out of repeated 'if' conditions. -bool JumpThreading::ProcessSwitchOnDuplicateCond(BasicBlock *PredBB, - BasicBlock *DestBB) { - // Can't thread edge to self. - if (PredBB == DestBB) - return false; - - SwitchInst *PredSI = cast<SwitchInst>(PredBB->getTerminator()); - SwitchInst *DestSI = cast<SwitchInst>(DestBB->getTerminator()); - - // There are a variety of optimizations that we can potentially do on these - // blocks: we order them from most to least preferable. - - // If DESTBB *just* contains the switch, then we can forward edges from PREDBB - // directly to their destination. This does not introduce *any* code size - // growth. Skip debug info first. - BasicBlock::iterator BBI = DestBB->begin(); - while (isa<DbgInfoIntrinsic>(BBI)) - BBI++; - - // FIXME: Thread if it just contains a PHI. - if (isa<SwitchInst>(BBI)) { - bool MadeChange = false; - // Ignore the default edge for now. - for (unsigned i = 1, e = DestSI->getNumSuccessors(); i != e; ++i) { - ConstantInt *DestVal = DestSI->getCaseValue(i); - BasicBlock *DestSucc = DestSI->getSuccessor(i); - - // Okay, DestSI has a case for 'DestVal' that goes to 'DestSucc'. See if - // PredSI has an explicit case for it. If so, forward. If it is covered - // by the default case, we can't update PredSI. - unsigned PredCase = PredSI->findCaseValue(DestVal); - if (PredCase == 0) continue; - - // If PredSI doesn't go to DestBB on this value, then it won't reach the - // case on this condition. - if (PredSI->getSuccessor(PredCase) != DestBB && - DestSI->getSuccessor(i) != DestBB) - continue; - - // Do not forward this if it already goes to this destination, this would - // be an infinite loop. - if (PredSI->getSuccessor(PredCase) == DestSucc) - continue; - - // Otherwise, we're safe to make the change. Make sure that the edge from - // DestSI to DestSucc is not critical and has no PHI nodes. - DEBUG(dbgs() << "FORWARDING EDGE " << *DestVal << " FROM: " << *PredSI); - DEBUG(dbgs() << "THROUGH: " << *DestSI); + // TODO: If we have: "br (X > 0)" and we have a predecessor where we know + // "(X == 4)", thread through this block. - // If the destination has PHI nodes, just split the edge for updating - // simplicity. 
- if (isa<PHINode>(DestSucc->begin()) && !DestSucc->getSinglePredecessor()){ - SplitCriticalEdge(DestSI, i, this); - DestSucc = DestSI->getSuccessor(i); - } - FoldSingleEntryPHINodes(DestSucc); - PredSI->setSuccessor(PredCase, DestSucc); - MadeChange = true; - } - - if (MadeChange) - return true; - } - return false; } @@ -934,13 +799,13 @@ bool JumpThreading::ProcessSwitchOnDuplicateCond(BasicBlock *PredBB, bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // Don't hack volatile loads. if (LI->isVolatile()) return false; - + // If the load is defined in a block with exactly one predecessor, it can't be // partially redundant. BasicBlock *LoadBB = LI->getParent(); if (LoadBB->getSinglePredecessor()) return false; - + Value *LoadedPtr = LI->getOperand(0); // If the loaded operand is defined in the LoadBB, it can't be available. @@ -948,17 +813,17 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { if (Instruction *PtrOp = dyn_cast<Instruction>(LoadedPtr)) if (PtrOp->getParent() == LoadBB) return false; - + // Scan a few instructions up from the load, to see if it is obviously live at // the entry to its block. BasicBlock::iterator BBIt = LI; - if (Value *AvailableVal = + if (Value *AvailableVal = FindAvailableLoadedValue(LoadedPtr, LoadBB, BBIt, 6)) { // If the value if the load is locally available within the block, just use // it. This frequently occurs for reg2mem'd allocas. //cerr << "LOAD ELIMINATED:\n" << *BBIt << *LI << "\n"; - + // If the returned value is the load itself, replace with an undef. This can // only happen in dead loops. if (AvailableVal == LI) AvailableVal = UndefValue::get(LI->getType()); @@ -972,13 +837,13 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // might clobber its value. if (BBIt != LoadBB->begin()) return false; - - + + SmallPtrSet<BasicBlock*, 8> PredsScanned; typedef SmallVector<std::pair<BasicBlock*, Value*>, 8> AvailablePredsTy; AvailablePredsTy AvailablePreds; BasicBlock *OneUnavailablePred = 0; - + // If we got here, the loaded value is transparent through to the start of the // block. Check to see if it is available in any of the predecessor blocks. for (pred_iterator PI = pred_begin(LoadBB), PE = pred_end(LoadBB); @@ -996,23 +861,23 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { OneUnavailablePred = PredBB; continue; } - + // If so, this load is partially redundant. Remember this info so that we // can create a PHI node. AvailablePreds.push_back(std::make_pair(PredBB, PredAvailable)); } - + // If the loaded value isn't available in any predecessor, it isn't partially // redundant. if (AvailablePreds.empty()) return false; - + // Okay, the loaded value is available in at least one (and maybe all!) // predecessors. If the value is unavailable in more than one unique // predecessor, we want to insert a merge block for those common predecessors. // This ensures that we only have to insert one reload, thus not increasing // code size. BasicBlock *UnavailablePred = 0; - + // If there is exactly one predecessor where the value is unavailable, the // already computed 'OneUnavailablePred' block is it. If it ends in an // unconditional branch, we know that it isn't a critical edge. @@ -1035,17 +900,17 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // If the predecessor is an indirect goto, we can't split the edge. 
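// Stepping back: the partial redundancy that SimplifyPartiallyRedundantLoad
// removes, shown at the C level (standalone illustration, hypothetical name):
static int partiallyRedundantLoad(bool c, int *p) {
  int t = 0;
  if (c)
    t = *p;        // *p is already available on this incoming path
  return t + *p;   // the reload is only truly needed when the '!c' path runs
}
// After the transformation, the missing load is inserted only in the
// predecessor(s) where the value is unavailable, and the incoming values are
// merged with a PHI node at the head of the load's block, as the code around
// here does.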
if (isa<IndirectBrInst>(P->getTerminator())) return false; - + if (!AvailablePredSet.count(P)) PredsToSplit.push_back(P); } - + // Split them out to their own block. UnavailablePred = SplitBlockPredecessors(LoadBB, &PredsToSplit[0], PredsToSplit.size(), "thread-pre-split", this); } - + // If the value isn't available in all predecessors, then there will be // exactly one where it isn't available. Insert a load on that edge and add // it to the AvailablePreds list. @@ -1057,35 +922,35 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { UnavailablePred->getTerminator()); AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal)); } - + // Now we know that each predecessor of this block has a value in // AvailablePreds, sort them for efficient access as we're walking the preds. array_pod_sort(AvailablePreds.begin(), AvailablePreds.end()); - + // Create a PHI node at the start of the block for the PRE'd load value. PHINode *PN = PHINode::Create(LI->getType(), "", LoadBB->begin()); PN->takeName(LI); - + // Insert new entries into the PHI for each predecessor. A single block may // have multiple entries here. for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB); PI != E; ++PI) { BasicBlock *P = *PI; - AvailablePredsTy::iterator I = + AvailablePredsTy::iterator I = std::lower_bound(AvailablePreds.begin(), AvailablePreds.end(), std::make_pair(P, (Value*)0)); - + assert(I != AvailablePreds.end() && I->first == P && "Didn't find entry for predecessor!"); - + PN->addIncoming(I->second, I->first); } - + //cerr << "PRE: " << *LI << *PN << "\n"; - + LI->replaceAllUsesWith(PN); LI->eraseFromParent(); - + return true; } @@ -1097,7 +962,7 @@ FindMostPopularDest(BasicBlock *BB, const SmallVectorImpl<std::pair<BasicBlock*, BasicBlock*> > &PredToDestList) { assert(!PredToDestList.empty()); - + // Determine popularity. If there are multiple possible destinations, we // explicitly choose to ignore 'undef' destinations. We prefer to thread // blocks with known and real destinations to threading undef. We'll handle @@ -1106,13 +971,13 @@ FindMostPopularDest(BasicBlock *BB, for (unsigned i = 0, e = PredToDestList.size(); i != e; ++i) if (PredToDestList[i].second) DestPopularity[PredToDestList[i].second]++; - + // Find the most popular dest. DenseMap<BasicBlock*, unsigned>::iterator DPI = DestPopularity.begin(); BasicBlock *MostPopularDest = DPI->first; unsigned Popularity = DPI->second; SmallVector<BasicBlock*, 4> SamePopularity; - + for (++DPI; DPI != DestPopularity.end(); ++DPI) { // If the popularity of this entry isn't higher than the popularity we've // seen so far, ignore it. @@ -1126,10 +991,10 @@ FindMostPopularDest(BasicBlock *BB, SamePopularity.clear(); MostPopularDest = DPI->first; Popularity = DPI->second; - } + } } - - // Okay, now we know the most popular destination. If there is more than + + // Okay, now we know the most popular destination. If there is more than one // destination, we need to determine one. This is arbitrary, but we need // to make a deterministic decision. Pick the first one that appears in the // successor list. 
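// The "most popular destination" selection above, as a standalone sketch
// using STL containers in place of DenseMap/SmallVector (hypothetical helper;
// assumes every candidate destination also appears in SuccOrder, i.e. is a
// real successor of BB):
#include <algorithm>
#include <map>
#include <vector>
static int mostPopularDest(const std::vector<int> &PredDests,
                           const std::vector<int> &SuccOrder) {
  std::map<int, unsigned> Popularity;
  for (int D : PredDests)
    ++Popularity[D];                       // count votes per destination
  unsigned Best = 0;
  for (const auto &P : Popularity)
    Best = std::max(Best, P.second);
  for (int S : SuccOrder)                  // deterministic tie-break: first
    if (Popularity.count(S) &&             // tied candidate in successor order
        Popularity[S] == Best)
      return S;
  return -1;                               // unreachable for sane inputs
}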
@@ -1138,105 +1003,105 @@ FindMostPopularDest(BasicBlock *BB, TerminatorInst *TI = BB->getTerminator(); for (unsigned i = 0; ; ++i) { assert(i != TI->getNumSuccessors() && "Didn't find any successor!"); - + if (std::find(SamePopularity.begin(), SamePopularity.end(), TI->getSuccessor(i)) == SamePopularity.end()) continue; - + MostPopularDest = TI->getSuccessor(i); break; } } - + // Okay, we have finally picked the most popular destination. return MostPopularDest; } -bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB) { +bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, + ConstantPreference Preference) { // If threading this would thread across a loop header, don't even try to // thread the edge. if (LoopHeaders.count(BB)) return false; - - SmallVector<std::pair<ConstantInt*, BasicBlock*>, 8> PredValues; - if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues)) + + PredValueInfoTy PredValues; + if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues, Preference)) return false; - + assert(!PredValues.empty() && "ComputeValueKnownInPredecessors returned true with no values"); DEBUG(dbgs() << "IN BB: " << *BB; for (unsigned i = 0, e = PredValues.size(); i != e; ++i) { - dbgs() << " BB '" << BB->getName() << "': FOUND condition = "; - if (PredValues[i].first) - dbgs() << *PredValues[i].first; - else - dbgs() << "UNDEF"; - dbgs() << " for pred '" << PredValues[i].second->getName() - << "'.\n"; + dbgs() << " BB '" << BB->getName() << "': FOUND condition = " + << *PredValues[i].first + << " for pred '" << PredValues[i].second->getName() << "'.\n"; }); - + // Decide what we want to thread through. Convert our list of known values to // a list of known destinations for each pred. This also discards duplicate // predecessors and keeps track of the undefined inputs (which are represented // as a null dest in the PredToDestList). SmallPtrSet<BasicBlock*, 16> SeenPreds; SmallVector<std::pair<BasicBlock*, BasicBlock*>, 16> PredToDestList; - + BasicBlock *OnlyDest = 0; BasicBlock *MultipleDestSentinel = (BasicBlock*)(intptr_t)~0ULL; - + for (unsigned i = 0, e = PredValues.size(); i != e; ++i) { BasicBlock *Pred = PredValues[i].second; if (!SeenPreds.insert(Pred)) continue; // Duplicate predecessor entry. - + // If the predecessor ends with an indirect goto, we can't change its // destination. if (isa<IndirectBrInst>(Pred->getTerminator())) continue; - - ConstantInt *Val = PredValues[i].first; - + + Constant *Val = PredValues[i].first; + BasicBlock *DestBB; - if (Val == 0) // Undef. + if (isa<UndefValue>(Val)) DestBB = 0; else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) - DestBB = BI->getSuccessor(Val->isZero()); + DestBB = BI->getSuccessor(cast<ConstantInt>(Val)->isZero()); + else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) + DestBB = SI->getSuccessor(SI->findCaseValue(cast<ConstantInt>(Val))); else { - SwitchInst *SI = cast<SwitchInst>(BB->getTerminator()); - DestBB = SI->getSuccessor(SI->findCaseValue(Val)); + assert(isa<IndirectBrInst>(BB->getTerminator()) + && "Unexpected terminator"); + DestBB = cast<BlockAddress>(Val)->getBasicBlock(); } // If we have exactly one destination, remember it for efficiency below. - if (i == 0) + if (PredToDestList.empty()) OnlyDest = DestBB; else if (OnlyDest != DestBB) OnlyDest = MultipleDestSentinel; - + PredToDestList.push_back(std::make_pair(Pred, DestBB)); } - + // If all edges were unthreadable, we fail. 
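// How a value known to be constant in a predecessor selects the threaded
// destination, reduced to a toy model (hypothetical types, mirroring the
// branch and switch cases handled above):
#include <map>
struct ToyTerminator {
  int TrueSucc, FalseSucc;           // conditional branch: successor 0 / 1
  std::map<long, int> Cases;         // switch: case value -> successor
  int DefaultSucc;
};
static int destForKnownValue(const ToyTerminator &T, bool IsBranch, long Val) {
  if (IsBranch)                      // getSuccessor(Val->isZero()): true -> 0
    return Val ? T.TrueSucc : T.FalseSucc;
  std::map<long, int>::const_iterator It = T.Cases.find(Val);
  return It != T.Cases.end() ? It->second : T.DefaultSucc;
}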
if (PredToDestList.empty()) return false; - + // Determine which is the most common successor. If we have many inputs and // this block is a switch, we want to start by threading the batch that goes // to the most popular destination first. If we only know about one // threadable destination (the common case) we can avoid this. BasicBlock *MostPopularDest = OnlyDest; - + if (MostPopularDest == MultipleDestSentinel) MostPopularDest = FindMostPopularDest(BB, PredToDestList); - + // Now that we know what the most popular destination is, factor all // predecessors that will jump to it into a single predecessor. SmallVector<BasicBlock*, 16> PredsToFactor; for (unsigned i = 0, e = PredToDestList.size(); i != e; ++i) if (PredToDestList[i].second == MostPopularDest) { BasicBlock *Pred = PredToDestList[i].first; - + // This predecessor may be a switch or something else that has multiple // edges to the block. Factor each of these edges by listing them // according to # occurrences in PredsToFactor. @@ -1251,7 +1116,7 @@ bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB) { if (MostPopularDest == 0) MostPopularDest = BB->getTerminator()-> getSuccessor(GetBestDestForJumpOnUndef(BB)); - + // Ok, try to thread it! return ThreadEdge(BB, PredsToFactor, MostPopularDest); } @@ -1259,15 +1124,15 @@ bool JumpThreading::ProcessThreadableEdges(Value *Cond, BasicBlock *BB) { /// ProcessBranchOnPHI - We have an otherwise unthreadable conditional branch on /// a PHI node in the current block. See if there are any simplifications we /// can do based on inputs to the phi node. -/// +/// bool JumpThreading::ProcessBranchOnPHI(PHINode *PN) { BasicBlock *BB = PN->getParent(); - + // TODO: We could make use of this to do it once for blocks with common PHI // values. SmallVector<BasicBlock*, 1> PredBBs; PredBBs.resize(1); - + // If any of the predecessor blocks end in an unconditional branch, we can // *duplicate* the conditional branch into that block in order to further // encourage jump threading and to eliminate cases where we have branch on a @@ -1289,21 +1154,21 @@ bool JumpThreading::ProcessBranchOnPHI(PHINode *PN) { /// ProcessBranchOnXOR - We have an otherwise unthreadable conditional branch on /// a xor instruction in the current block. See if there are any /// simplifications we can do based on inputs to the xor. -/// +/// bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) { BasicBlock *BB = BO->getParent(); - + // If either the LHS or RHS of the xor is a constant, don't do this // optimization. if (isa<ConstantInt>(BO->getOperand(0)) || isa<ConstantInt>(BO->getOperand(1))) return false; - + // If the first instruction in BB isn't a phi, we won't be able to infer // anything special about any particular predecessor. if (!isa<PHINode>(BB->front())) return false; - + // If we have a xor as the branch input to this block, and we know that the // LHS or RHS of the xor in any predecessor is true/false, then we can clone // the condition into the predecessor and fix that value to true, saving some @@ -1322,15 +1187,17 @@ bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) { // %Y = icmp ne i32 %A, %B // br i1 %Z, ... 
- SmallVector<std::pair<ConstantInt*, BasicBlock*>, 8> XorOpValues; + PredValueInfoTy XorOpValues; bool isLHS = true; - if (!ComputeValueKnownInPredecessors(BO->getOperand(0), BB, XorOpValues)) { + if (!ComputeValueKnownInPredecessors(BO->getOperand(0), BB, XorOpValues, + WantInteger)) { assert(XorOpValues.empty()); - if (!ComputeValueKnownInPredecessors(BO->getOperand(1), BB, XorOpValues)) + if (!ComputeValueKnownInPredecessors(BO->getOperand(1), BB, XorOpValues, + WantInteger)) return false; isLHS = false; } - + assert(!XorOpValues.empty() && "ComputeValueKnownInPredecessors returned true with no values"); @@ -1338,29 +1205,33 @@ bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) { // predecessors can be of the set true, false, or undef. unsigned NumTrue = 0, NumFalse = 0; for (unsigned i = 0, e = XorOpValues.size(); i != e; ++i) { - if (!XorOpValues[i].first) continue; // Ignore undefs for the count. - if (XorOpValues[i].first->isZero()) + if (isa<UndefValue>(XorOpValues[i].first)) + // Ignore undefs for the count. + continue; + if (cast<ConstantInt>(XorOpValues[i].first)->isZero()) ++NumFalse; else ++NumTrue; } - + // Determine which value to split on, true, false, or undef if neither. ConstantInt *SplitVal = 0; if (NumTrue > NumFalse) SplitVal = ConstantInt::getTrue(BB->getContext()); else if (NumTrue != 0 || NumFalse != 0) SplitVal = ConstantInt::getFalse(BB->getContext()); - + // Collect all of the blocks that this can be folded into so that we can // factor this once and clone it once. SmallVector<BasicBlock*, 8> BlocksToFoldInto; for (unsigned i = 0, e = XorOpValues.size(); i != e; ++i) { - if (XorOpValues[i].first != SplitVal && XorOpValues[i].first != 0) continue; + if (XorOpValues[i].first != SplitVal && + !isa<UndefValue>(XorOpValues[i].first)) + continue; BlocksToFoldInto.push_back(XorOpValues[i].second); } - + // If we inferred a value for all of the predecessors, then duplication won't // help us. However, we can just replace the LHS or RHS with the constant. if (BlocksToFoldInto.size() == @@ -1377,10 +1248,10 @@ bool JumpThreading::ProcessBranchOnXOR(BinaryOperator *BO) { // If all preds provide 1, set the computed value to 1. BO->setOperand(!isLHS, SplitVal); } - + return true; } - + // Try to duplicate BB into PredBB. return DuplicateCondBranchOnPHIIntoPred(BB, BlocksToFoldInto); } @@ -1398,14 +1269,14 @@ static void AddPHINodeEntriesForMappedBlock(BasicBlock *PHIBB, // Ok, we have a PHI node. Figure out what the incoming value was for the // DestBlock. Value *IV = PN->getIncomingValueForBlock(OldPred); - + // Remap the value if necessary. if (Instruction *Inst = dyn_cast<Instruction>(IV)) { DenseMap<Instruction*, Value*>::iterator I = ValueMap.find(Inst); if (I != ValueMap.end()) IV = I->second; } - + PN->addIncoming(IV, NewPred); } } @@ -1413,8 +1284,8 @@ static void AddPHINodeEntriesForMappedBlock(BasicBlock *PHIBB, /// ThreadEdge - We have decided that it is safe and profitable to factor the /// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB /// across BB. Transform the IR to reflect this change. -bool JumpThreading::ThreadEdge(BasicBlock *BB, - const SmallVectorImpl<BasicBlock*> &PredBBs, +bool JumpThreading::ThreadEdge(BasicBlock *BB, + const SmallVectorImpl<BasicBlock*> &PredBBs, BasicBlock *SuccBB) { // If threading to the same block as we come from, we would infinite loop. 
if (SuccBB == BB) { @@ -1422,7 +1293,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, << "' - would thread to self!\n"); return false; } - + // If threading this would thread across a loop header, don't thread the edge. // See the comments above FindLoopHeaders for justifications and caveats. if (LoopHeaders.count(BB)) { @@ -1438,7 +1309,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, << "' - Cost is too high: " << JumpThreadCost << "\n"); return false; } - + // And finally, do it! Start by factoring the predecessors is needed. BasicBlock *PredBB; if (PredBBs.size() == 1) @@ -1449,30 +1320,29 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, PredBB = SplitBlockPredecessors(BB, &PredBBs[0], PredBBs.size(), ".thr_comm", this); } - + // And finally, do it! DEBUG(dbgs() << " Threading edge from '" << PredBB->getName() << "' to '" << SuccBB->getName() << "' with cost: " << JumpThreadCost << ", across block:\n " << *BB << "\n"); - - if (LVI) - LVI->threadEdge(PredBB, BB, SuccBB); - + + LVI->threadEdge(PredBB, BB, SuccBB); + // We are going to have to map operands from the original BB block to the new // copy of the block 'NewBB'. If there are PHI nodes in BB, evaluate them to // account for entry from PredBB. DenseMap<Instruction*, Value*> ValueMapping; - - BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), - BB->getName()+".thread", + + BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), + BB->getName()+".thread", BB->getParent(), BB); NewBB->moveAfter(PredBB); - + BasicBlock::iterator BI = BB->begin(); for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB); - + // Clone the non-phi instructions of BB into NewBB, keeping track of the // mapping and using it to remap operands in the cloned instructions. for (; !isa<TerminatorInst>(BI); ++BI) { @@ -1480,7 +1350,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, New->setName(BI->getName()); NewBB->getInstList().push_back(New); ValueMapping[BI] = New; - + // Remap operands to patch up intra-block references. for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i) if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) { @@ -1489,15 +1359,15 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, New->setOperand(i, I->second); } } - + // We didn't copy the terminator from BB over to NewBB, because there is now // an unconditional jump to SuccBB. Insert the unconditional jump. BranchInst::Create(SuccBB, NewBB); - + // Check to see if SuccBB has PHI nodes. If so, we need to add entries to the // PHI nodes for NewBB now. AddPHINodeEntriesForMappedBlock(SuccBB, BB, NewBB, ValueMapping); - + // If there were values defined in BB that are used outside the block, then we // now have to update all uses of the value to use either the original value, // the cloned value, or some PHI derived value. This can require arbitrary @@ -1515,14 +1385,14 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, continue; } else if (User->getParent() == BB) continue; - + UsesToRename.push_back(&UI.getUse()); } - + // If there are no uses outside the block, we're done with this instruction. if (UsesToRename.empty()) continue; - + DEBUG(dbgs() << "JT: Renaming non-local uses of: " << *I << "\n"); // We found a use of I outside of BB. 
Rename all uses of I that are outside @@ -1531,28 +1401,28 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB, SSAUpdate.Initialize(I->getType(), I->getName()); SSAUpdate.AddAvailableValue(BB, I); SSAUpdate.AddAvailableValue(NewBB, ValueMapping[I]); - + while (!UsesToRename.empty()) SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); DEBUG(dbgs() << "\n"); } - - + + // Ok, NewBB is good to go. Update the terminator of PredBB to jump to // NewBB instead of BB. This eliminates predecessors from BB, which requires // us to simplify any PHI nodes in BB. TerminatorInst *PredTerm = PredBB->getTerminator(); for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i) if (PredTerm->getSuccessor(i) == BB) { - RemovePredecessorAndSimplify(BB, PredBB, TD); + BB->removePredecessor(PredBB, true); PredTerm->setSuccessor(i, NewBB); } - + // At this point, the IR is fully up to date and consistent. Do a quick scan // over the new instructions and zap any that are constants or dead. This // frequently happens because of phi translation. SimplifyInstructionsInBlock(NewBB, TD); - + // Threaded an edge! ++NumThreads; return true; @@ -1576,14 +1446,14 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, << "' - it might create an irreducible loop!\n"); return false; } - + unsigned DuplicationCost = getJumpThreadDuplicationCost(BB); if (DuplicationCost > Threshold) { DEBUG(dbgs() << " Not duplicating BB '" << BB->getName() << "' - Cost is too high: " << DuplicationCost << "\n"); return false; } - + // And finally, do it! Start by factoring the predecessors is needed. BasicBlock *PredBB; if (PredBBs.size() == 1) @@ -1594,35 +1464,35 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, PredBB = SplitBlockPredecessors(BB, &PredBBs[0], PredBBs.size(), ".thr_comm", this); } - + // Okay, we decided to do this! Clone all the instructions in BB onto the end // of PredBB. DEBUG(dbgs() << " Duplicating block '" << BB->getName() << "' into end of '" << PredBB->getName() << "' to eliminate branch on phi. Cost: " << DuplicationCost << " block is:" << *BB << "\n"); - + // Unless PredBB ends with an unconditional branch, split the edge so that we // can just clone the bits from BB into the end of the new PredBB. BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator()); - + if (OldPredBranch == 0 || !OldPredBranch->isUnconditional()) { PredBB = SplitEdge(PredBB, BB, this); OldPredBranch = cast<BranchInst>(PredBB->getTerminator()); } - + // We are going to have to map operands from the original BB block into the // PredBB block. Evaluate PHI nodes in BB. DenseMap<Instruction*, Value*> ValueMapping; - + BasicBlock::iterator BI = BB->begin(); for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB); - + // Clone the non-phi instructions of BB into PredBB, keeping track of the // mapping and using it to remap operands in the cloned instructions. for (; BI != BB->end(); ++BI) { Instruction *New = BI->clone(); - + // Remap operands to patch up intra-block references. for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i) if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) { @@ -1644,7 +1514,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, ValueMapping[BI] = New; } } - + // Check to see if the targets of the branch had PHI nodes. If so, we need to // add entries to the PHI nodes for branch from PredBB now. 
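// The clone-and-remap step just above, reduced to a toy form: operands that
// refer to instructions already cloned are redirected to their clones through
// the value map (hypothetical ToyInst type, not LLVM IR; leaks its clones for
// brevity):
#include <map>
#include <vector>
struct ToyInst { std::vector<ToyInst*> Ops; };
static void cloneBlockWithRemap(const std::vector<ToyInst*> &Block,
                                std::vector<ToyInst*> &Out,
                                std::map<ToyInst*, ToyInst*> &ValueMap) {
  for (ToyInst *Old : Block) {
    ToyInst *New = new ToyInst(*Old);          // like BI->clone()
    for (ToyInst *&Op : New->Ops) {            // patch intra-block references
      std::map<ToyInst*, ToyInst*>::iterator I = ValueMap.find(Op);
      if (I != ValueMap.end())
        Op = I->second;
    }
    ValueMap[Old] = New;
    Out.push_back(New);
  }
}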
BranchInst *BBBranch = cast<BranchInst>(BB->getTerminator()); @@ -1652,7 +1522,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, ValueMapping); AddPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(1), BB, PredBB, ValueMapping); - + // If there were values defined in BB that are used outside the block, then we // now have to update all uses of the value to use either the original value, // the cloned value, or some PHI derived value. This can require arbitrary @@ -1670,35 +1540,35 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB, continue; } else if (User->getParent() == BB) continue; - + UsesToRename.push_back(&UI.getUse()); } - + // If there are no uses outside the block, we're done with this instruction. if (UsesToRename.empty()) continue; - + DEBUG(dbgs() << "JT: Renaming non-local uses of: " << *I << "\n"); - + // We found a use of I outside of BB. Rename all uses of I that are outside // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks // with the two values we know. SSAUpdate.Initialize(I->getType(), I->getName()); SSAUpdate.AddAvailableValue(BB, I); SSAUpdate.AddAvailableValue(PredBB, ValueMapping[I]); - + while (!UsesToRename.empty()) SSAUpdate.RewriteUse(*UsesToRename.pop_back_val()); DEBUG(dbgs() << "\n"); } - + // PredBB no longer jumps to BB, remove entries in the PHI node for the edge // that we nuked. - RemovePredecessorAndSimplify(BB, PredBB, TD); - + BB->removePredecessor(PredBB, true); + // Remove the unconditional branch at the end of the PredBB block. OldPredBranch->eraseFromParent(); - + ++NumDupes; return true; } diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 2ef8544..0786793 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -36,13 +36,13 @@ #include "llvm/DerivedTypes.h" #include "llvm/IntrinsicInst.h" #include "llvm/Instructions.h" +#include "llvm/LLVMContext.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/Dominators.h" -#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Support/CFG.h" @@ -66,7 +66,9 @@ DisablePromotion("disable-licm-promotion", cl::Hidden, namespace { struct LICM : public LoopPass { static char ID; // Pass identification, replacement for typeid - LICM() : LoopPass(ID) {} + LICM() : LoopPass(ID) { + initializeLICMPass(*PassRegistry::getPassRegistry()); + } virtual bool runOnLoop(Loop *L, LPPassManager &LPM); @@ -80,7 +82,7 @@ namespace { AU.addRequiredID(LoopSimplifyID); AU.addRequired<AliasAnalysis>(); AU.addPreserved<AliasAnalysis>(); - AU.addPreserved<ScalarEvolution>(); + AU.addPreserved("scalar-evolution"); AU.addPreservedID(LoopSimplifyID); } @@ -129,42 +131,7 @@ namespace { /// bool inSubLoop(BasicBlock *BB) { assert(CurLoop->contains(BB) && "Only valid if BB is IN the loop"); - for (Loop::iterator I = CurLoop->begin(), E = CurLoop->end(); I != E; ++I) - if ((*I)->contains(BB)) - return true; // A subloop actually contains this block! - return false; - } - - /// isExitBlockDominatedByBlockInLoop - This method checks to see if the - /// specified exit block of the loop is dominated by the specified block - /// that is in the body of the loop. 
We use these constraints to - /// dramatically limit the amount of the dominator tree that needs to be - /// searched. - bool isExitBlockDominatedByBlockInLoop(BasicBlock *ExitBlock, - BasicBlock *BlockInLoop) const { - // If the block in the loop is the loop header, it must be dominated! - BasicBlock *LoopHeader = CurLoop->getHeader(); - if (BlockInLoop == LoopHeader) - return true; - - DomTreeNode *BlockInLoopNode = DT->getNode(BlockInLoop); - DomTreeNode *IDom = DT->getNode(ExitBlock); - - // Because the exit block is not in the loop, we know we have to get _at - // least_ its immediate dominator. - IDom = IDom->getIDom(); - - while (IDom && IDom != BlockInLoopNode) { - // If we have got to the header of the loop, then the instructions block - // did not dominate the exit node, so we can't hoist it. - if (IDom->getBlock() == LoopHeader) - return false; - - // Get next Immediate Dominator. - IDom = IDom->getIDom(); - }; - - return true; + return LI->getLoopFor(BB) != CurLoop; } /// sink - When an instruction is found to only be used outside of the loop, @@ -187,13 +154,13 @@ namespace { /// pointerInvalidatedByLoop - Return true if the body of this loop may /// store into the memory location pointed to by V. /// - bool pointerInvalidatedByLoop(Value *V, unsigned Size) { + bool pointerInvalidatedByLoop(Value *V, uint64_t Size, + const MDNode *TBAAInfo) { // Check to see if any of the basic blocks in CurLoop invalidate *V. - return CurAST->getAliasSetForPointer(V, Size).isMod(); + return CurAST->getAliasSetForPointer(V, Size, TBAAInfo).isMod(); } bool canSinkOrHoistInst(Instruction &I); - bool isLoopInvariantInst(Instruction &I); bool isNotUsedInLoop(Instruction &I); void PromoteAliasSet(AliasSet &AS); @@ -201,7 +168,12 @@ namespace { } char LICM::ID = 0; -INITIALIZE_PASS(LICM, "licm", "Loop Invariant Code Motion", false, false); +INITIALIZE_PASS_BEGIN(LICM, "licm", "Loop Invariant Code Motion", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_END(LICM, "licm", "Loop Invariant Code Motion", false, false) Pass *llvm::createLICMPass() { return new LICM(); } @@ -369,7 +341,7 @@ void LICM::HoistRegion(DomTreeNode *N) { // if all of the operands of the instruction are loop invariant and if it // is safe to hoist the instruction. // - if (isLoopInvariantInst(I) && canSinkOrHoistInst(I) && + if (CurLoop->hasLoopInvariantOperands(&I) && canSinkOrHoistInst(I) && isSafeToExecuteUnconditionally(I)) hoist(I); } @@ -394,16 +366,17 @@ bool LICM::canSinkOrHoistInst(Instruction &I) { return true; // Don't hoist loads which have may-aliased stores in loop. - unsigned Size = 0; + uint64_t Size = 0; if (LI->getType()->isSized()) Size = AA->getTypeStoreSize(LI->getType()); - return !pointerInvalidatedByLoop(LI->getOperand(0), Size); + return !pointerInvalidatedByLoop(LI->getOperand(0), Size, + LI->getMetadata(LLVMContext::MD_tbaa)); } else if (CallInst *CI = dyn_cast<CallInst>(&I)) { // Handle obvious cases efficiently. AliasAnalysis::ModRefBehavior Behavior = AA->getModRefBehavior(CI); if (Behavior == AliasAnalysis::DoesNotAccessMemory) return true; - else if (Behavior == AliasAnalysis::OnlyReadsMemory) { + if (AliasAnalysis::onlyReadsMemory(Behavior)) { // If this call only reads from memory and there are no writes to memory // in the loop, we can hoist or sink the call as appropriate. 
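// The load case handled a few lines above, at the C level: a load can be
// hoisted when nothing in the loop may store to its pointer (illustrative
// function; assumes A and p never alias):
static int dotWithScalar(const int *A, const int *p, int n) {
  int s = 0;
  for (int i = 0; i != n; ++i)
    s += A[i] * *p;    // *p is loop-invariant and never stored to in the loop,
  return s;            // so the load of *p can be hoisted above the loop
}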
bool FoundMod = false; @@ -452,20 +425,6 @@ bool LICM::isNotUsedInLoop(Instruction &I) { } -/// isLoopInvariantInst - Return true if all operands of this instruction are -/// loop invariant. We also filter out non-hoistable instructions here just for -/// efficiency. -/// -bool LICM::isLoopInvariantInst(Instruction &I) { - // The instruction is loop invariant if all of its operands are loop-invariant - for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) - if (!CurLoop->isLoopInvariant(I.getOperand(i))) - return false; - - // If we got this far, the instruction is loop invariant! - return true; -} - /// sink - When an instruction is found to only be used outside of the loop, /// this function moves it to the exit blocks and patches up SSA form as needed. /// This method is guaranteed to remove the original instruction from its @@ -486,7 +445,7 @@ void LICM::sink(Instruction &I) { // enough that we handle it as a special (more efficient) case. It is more // efficient to handle because there are no PHI nodes that need to be placed. if (ExitBlocks.size() == 1) { - if (!isExitBlockDominatedByBlockInLoop(ExitBlocks[0], I.getParent())) { + if (!DT->dominates(I.getParent(), ExitBlocks[0])) { // Instruction is not used, just delete it. CurAST->deleteValue(&I); // If I has users in unreachable blocks, eliminate. @@ -537,7 +496,7 @@ void LICM::sink(Instruction &I) { for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { BasicBlock *ExitBlock = ExitBlocks[i]; - if (!isExitBlockDominatedByBlockInLoop(ExitBlock, InstOrigBB)) + if (!DT->dominates(InstOrigBB, ExitBlock)) continue; // Insert the code after the last PHI node. @@ -628,15 +587,61 @@ bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) { SmallVector<BasicBlock*, 8> ExitBlocks; CurLoop->getExitBlocks(ExitBlocks); - // For each exit block, get the DT node and walk up the DT until the - // instruction's basic block is found or we exit the loop. + // Verify that the block dominates each of the exit blocks of the loop. for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) - if (!isExitBlockDominatedByBlockInLoop(ExitBlocks[i], Inst.getParent())) + if (!DT->dominates(Inst.getParent(), ExitBlocks[i])) return false; return true; } +namespace { + class LoopPromoter : public LoadAndStorePromoter { + Value *SomePtr; // Designated pointer to store to. + SmallPtrSet<Value*, 4> &PointerMustAliases; + SmallVectorImpl<BasicBlock*> &LoopExitBlocks; + AliasSetTracker &AST; + public: + LoopPromoter(Value *SP, + const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, + SmallPtrSet<Value*, 4> &PMA, + SmallVectorImpl<BasicBlock*> &LEB, AliasSetTracker &ast) + : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA), + LoopExitBlocks(LEB), AST(ast) {} + + virtual bool isInstInList(Instruction *I, + const SmallVectorImpl<Instruction*> &) const { + Value *Ptr; + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + Ptr = LI->getOperand(0); + else + Ptr = cast<StoreInst>(I)->getPointerOperand(); + return PointerMustAliases.count(Ptr); + } + + virtual void doExtraRewritesBeforeFinalDeletion() const { + // Insert stores after in the loop exit blocks. Each exit block gets a + // store of the live-out values that feed them. Since we've already told + // the SSA updater about the defs in the loop and the preheader + // definition, it is all set and we can start using it. 
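// The net effect of this promotion, at the C level (illustrative; assumes
// nothing else in the loop may alias *P):
static void beforePromotion(int *P, const int *A, int n) {
  for (int i = 0; i != n; ++i)
    *P += A[i];            // load + store of *P on every iteration
}
static void afterPromotion(int *P, const int *A, int n) {
  int t = *P;              // single load inserted in the preheader
  for (int i = 0; i != n; ++i)
    t += A[i];             // *P lives in a register inside the loop
  *P = t;                  // store of the live-out value at the loop exit(s)
}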
+ for (unsigned i = 0, e = LoopExitBlocks.size(); i != e; ++i) { + BasicBlock *ExitBlock = LoopExitBlocks[i]; + Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock); + Instruction *InsertPos = ExitBlock->getFirstNonPHI(); + new StoreInst(LiveInValue, SomePtr, InsertPos); + } + } + + virtual void replaceLoadWithValue(LoadInst *LI, Value *V) const { + // Update alias analysis. + AST.copyValue(LI, V); + } + virtual void instructionDeleted(Instruction *I) const { + AST.deleteValue(I); + } + }; +} // end anon namespace + /// PromoteAliasSet - Try to promote memory values to scalars by sinking /// stores out of the loop and moving loads to before the loop. We do this by /// looping over the stores in the loop, looking for stores to Must pointers @@ -697,8 +702,11 @@ void LICM::PromoteAliasSet(AliasSet &AS) { if (isa<LoadInst>(Use)) assert(!cast<LoadInst>(Use)->isVolatile() && "AST broken"); else if (isa<StoreInst>(Use)) { + // Stores *of* the pointer are not interesting, only stores *to* the + // pointer. + if (Use->getOperand(1) != ASIV) + continue; assert(!cast<StoreInst>(Use)->isVolatile() && "AST broken"); - if (Use->getOperand(0) == ASIV) return; } else return; // Not a load or store. @@ -718,179 +726,43 @@ void LICM::PromoteAliasSet(AliasSet &AS) { Changed = true; ++NumPromoted; + SmallVector<BasicBlock*, 8> ExitBlocks; + CurLoop->getUniqueExitBlocks(ExitBlocks); + // We use the SSAUpdater interface to insert phi nodes as required. SmallVector<PHINode*, 16> NewPHIs; SSAUpdater SSA(&NewPHIs); + LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks, + *CurAST); - // It wants to know some value of the same type as what we'll be inserting. - Value *SomeValue; - if (isa<LoadInst>(LoopUses[0])) - SomeValue = LoopUses[0]; - else - SomeValue = cast<StoreInst>(LoopUses[0])->getOperand(0); - SSA.Initialize(SomeValue->getType(), SomeValue->getName()); - - // First step: bucket up uses of the pointers by the block they occur in. - // This is important because we have to handle multiple defs/uses in a block - // ourselves: SSAUpdater is purely for cross-block references. - // FIXME: Want a TinyVector<Instruction*> since there is usually 0/1 element. - DenseMap<BasicBlock*, std::vector<Instruction*> > UsesByBlock; - for (unsigned i = 0, e = LoopUses.size(); i != e; ++i) { - Instruction *User = LoopUses[i]; - UsesByBlock[User->getParent()].push_back(User); - } - - // Okay, now we can iterate over all the blocks in the loop with uses, - // processing them. Keep track of which loads are loading a live-in value. - SmallVector<LoadInst*, 32> LiveInLoads; - DenseMap<Value*, Value*> ReplacedLoads; - - for (unsigned LoopUse = 0, e = LoopUses.size(); LoopUse != e; ++LoopUse) { - Instruction *User = LoopUses[LoopUse]; - std::vector<Instruction*> &BlockUses = UsesByBlock[User->getParent()]; - - // If this block has already been processed, ignore this repeat use. - if (BlockUses.empty()) continue; - - // Okay, this is the first use in the block. If this block just has a - // single user in it, we can rewrite it trivially. - if (BlockUses.size() == 1) { - // If it is a store, it is a trivial def of the value in the block. - if (isa<StoreInst>(User)) { - SSA.AddAvailableValue(User->getParent(), - cast<StoreInst>(User)->getOperand(0)); - } else { - // Otherwise it is a load, queue it to rewrite as a live-in load. - LiveInLoads.push_back(cast<LoadInst>(User)); - } - BlockUses.clear(); - continue; - } - - // Otherwise, check to see if this block is all loads. 
If so, we can queue - // them all as live in loads. - bool HasStore = false; - for (unsigned i = 0, e = BlockUses.size(); i != e; ++i) { - if (isa<StoreInst>(BlockUses[i])) { - HasStore = true; - break; - } - } - - if (!HasStore) { - for (unsigned i = 0, e = BlockUses.size(); i != e; ++i) - LiveInLoads.push_back(cast<LoadInst>(BlockUses[i])); - BlockUses.clear(); - continue; - } - - // Otherwise, we have mixed loads and stores (or just a bunch of stores). - // Since SSAUpdater is purely for cross-block values, we need to determine - // the order of these instructions in the block. If the first use in the - // block is a load, then it uses the live in value. The last store defines - // the live out value. We handle this by doing a linear scan of the block. - BasicBlock *BB = User->getParent(); - Value *StoredValue = 0; - for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ++II) { - if (LoadInst *L = dyn_cast<LoadInst>(II)) { - // If this is a load from an unrelated pointer, ignore it. - if (!PointerMustAliases.count(L->getOperand(0))) continue; - - // If we haven't seen a store yet, this is a live in use, otherwise - // use the stored value. - if (StoredValue) { - L->replaceAllUsesWith(StoredValue); - ReplacedLoads[L] = StoredValue; - } else { - LiveInLoads.push_back(L); - } - continue; - } - - if (StoreInst *S = dyn_cast<StoreInst>(II)) { - // If this is a store to an unrelated pointer, ignore it. - if (!PointerMustAliases.count(S->getOperand(1))) continue; - - // Remember that this is the active value in the block. - StoredValue = S->getOperand(0); - } - } - - // The last stored value that happened is the live-out for the block. - assert(StoredValue && "Already checked that there is a store in block"); - SSA.AddAvailableValue(BB, StoredValue); - BlockUses.clear(); - } - - // Now that all the intra-loop values are classified, set up the preheader. - // It gets a load of the pointer we're promoting, and it is the live-out value - // from the preheader. - LoadInst *PreheaderLoad = new LoadInst(SomePtr,SomePtr->getName()+".promoted", - Preheader->getTerminator()); + // Set up the preheader to have a definition of the value. It is the live-out + // value from the preheader that uses in the loop will use. + LoadInst *PreheaderLoad = + new LoadInst(SomePtr, SomePtr->getName()+".promoted", + Preheader->getTerminator()); SSA.AddAvailableValue(Preheader, PreheaderLoad); - // Now that the preheader is good to go, set up the exit blocks. Each exit - // block gets a store of the live-out values that feed them. Since we've - // already told the SSA updater about the defs in the loop and the preheader - // definition, it is all set and we can start using it. - SmallVector<BasicBlock*, 8> ExitBlocks; - CurLoop->getUniqueExitBlocks(ExitBlocks); - for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) { - BasicBlock *ExitBlock = ExitBlocks[i]; - Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock); - Instruction *InsertPos = ExitBlock->getFirstNonPHI(); - new StoreInst(LiveInValue, SomePtr, InsertPos); + // Copy any value stored to or loaded from a must-alias of the pointer. + if (PreheaderLoad->getType()->isPointerTy()) { + Value *SomeValue; + if (LoadInst *LI = dyn_cast<LoadInst>(LoopUses[0])) + SomeValue = LI; + else + SomeValue = cast<StoreInst>(LoopUses[0])->getValueOperand(); + + CurAST->copyValue(SomeValue, PreheaderLoad); } - // Okay, now we rewrite all loads that use live-in values in the loop, - // inserting PHI nodes as necessary. 
- for (unsigned i = 0, e = LiveInLoads.size(); i != e; ++i) { - LoadInst *ALoad = LiveInLoads[i]; - Value *NewVal = SSA.GetValueInMiddleOfBlock(ALoad->getParent()); - ALoad->replaceAllUsesWith(NewVal); - CurAST->copyValue(ALoad, NewVal); - ReplacedLoads[ALoad] = NewVal; - } + // Rewrite all the loads in the loop and remember all the definitions from + // stores in the loop. + Promoter.run(LoopUses); // If the preheader load is itself a pointer, we need to tell alias analysis // about the new pointer we created in the preheader block and about any PHI // nodes that just got inserted. if (PreheaderLoad->getType()->isPointerTy()) { - // Copy any value stored to or loaded from a must-alias of the pointer. - CurAST->copyValue(SomeValue, PreheaderLoad); - for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) - CurAST->copyValue(SomeValue, NewPHIs[i]); - } - - // Now that everything is rewritten, delete the old instructions from the body - // of the loop. They should all be dead now. - for (unsigned i = 0, e = LoopUses.size(); i != e; ++i) { - Instruction *User = LoopUses[i]; - - // If this is a load that still has uses, then the load must have been added - // as a live value in the SSAUpdate data structure for a block (e.g. because - // the loaded value was stored later). In this case, we need to recursively - // propagate the updates until we get to the real value. - if (!User->use_empty()) { - Value *NewVal = ReplacedLoads[User]; - assert(NewVal && "not a replaced load?"); - - // Propagate down to the ultimate replacee. The intermediately loads - // could theoretically already have been deleted, so we don't want to - // dereference the Value*'s. - DenseMap<Value*, Value*>::iterator RLI = ReplacedLoads.find(NewVal); - while (RLI != ReplacedLoads.end()) { - NewVal = RLI->second; - RLI = ReplacedLoads.find(NewVal); - } - - User->replaceAllUsesWith(NewVal); - CurAST->copyValue(User, NewVal); - } - - CurAST->deleteValue(User); - User->eraseFromParent(); + CurAST->copyValue(PreheaderLoad, NewPHIs[i]); } // fwew, we're done! diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp index 543dfc1..6d1d344 100644 --- a/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/lib/Transforms/Scalar/LoopDeletion.cpp @@ -17,6 +17,7 @@ #define DEBUG_TYPE "loop-delete" #include "llvm/Transforms/Scalar.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/SmallVector.h" @@ -28,7 +29,9 @@ namespace { class LoopDeletion : public LoopPass { public: static char ID; // Pass ID, replacement for typeid - LoopDeletion() : LoopPass(ID) {} + LoopDeletion() : LoopPass(ID) { + initializeLoopDeletionPass(*PassRegistry::getPassRegistry()); + } // Possibly eliminate loop L if it is dead. 
bool runOnLoop(Loop* L, LPPassManager& LPM); @@ -49,14 +52,20 @@ namespace { AU.addPreserved<LoopInfo>(); AU.addPreservedID(LoopSimplifyID); AU.addPreservedID(LCSSAID); - AU.addPreserved<DominanceFrontier>(); } }; } char LoopDeletion::ID = 0; -INITIALIZE_PASS(LoopDeletion, "loop-deletion", - "Delete dead loops", false, false); +INITIALIZE_PASS_BEGIN(LoopDeletion, "loop-deletion", + "Delete dead loops", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_END(LoopDeletion, "loop-deletion", + "Delete dead loops", false, false) Pass* llvm::createLoopDeletionPass() { return new LoopDeletion(); @@ -183,22 +192,19 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { // Update the dominator tree and remove the instructions and blocks that will // be deleted from the reference counting scheme. DominatorTree& DT = getAnalysis<DominatorTree>(); - DominanceFrontier* DF = getAnalysisIfAvailable<DominanceFrontier>(); - SmallPtrSet<DomTreeNode*, 8> ChildNodes; + SmallVector<DomTreeNode*, 8> ChildNodes; for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end(); LI != LE; ++LI) { // Move all of the block's children to be children of the preheader, which // allows us to remove the domtree entry for the block. - ChildNodes.insert(DT[*LI]->begin(), DT[*LI]->end()); - for (SmallPtrSet<DomTreeNode*, 8>::iterator DI = ChildNodes.begin(), + ChildNodes.insert(ChildNodes.begin(), DT[*LI]->begin(), DT[*LI]->end()); + for (SmallVector<DomTreeNode*, 8>::iterator DI = ChildNodes.begin(), DE = ChildNodes.end(); DI != DE; ++DI) { DT.changeImmediateDominator(*DI, DT[preheader]); - if (DF) DF->changeImmediateDominator((*DI)->getBlock(), preheader, &DT); } ChildNodes.clear(); DT.eraseNode(*LI); - if (DF) DF->removeBlock(*LI); // Remove the block from the reference counting scheme, so that we can // delete it freely later. diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp new file mode 100644 index 0000000..d7fa149 --- /dev/null +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -0,0 +1,594 @@ +//===-- LoopIdiomRecognize.cpp - Loop idiom recognition -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass implements an idiom recognizer that transforms simple loops into a +// non-loop form. In cases that this kicks in, it can be a significant +// performance win. +// +//===----------------------------------------------------------------------===// +// +// TODO List: +// +// Future loop memory idioms to recognize: +// memcmp, memmove, strlen, etc. +// Future floating point idioms to recognize in -ffast-math mode: +// fpowi +// Future integer operation idioms to recognize: +// ctpop, ctlz, cttz +// +// Beware that isel's default lowering for ctpop is highly inefficient for +// i64 and larger types when i64 is legal and the value has few bits set. It +// would be good to enhance isel to emit a loop for ctpop in this case. +// +// We should enhance the memset/memcpy recognition to handle multiple stores in +// the loop. 
This would handle things like: +// void foo(_Complex float *P) +// for (i) { __real__(*P) = 0; __imag__(*P) = 0; } +// +// This could recognize common matrix multiplies and dot product idioms and +// replace them with calls to BLAS (if linked in??). +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "loop-idiom" +#include "llvm/Transforms/Scalar.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Module.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/IRBuilder.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumMemSet, "Number of memset's formed from loop stores"); +STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores"); + +namespace { + class LoopIdiomRecognize : public LoopPass { + Loop *CurLoop; + const TargetData *TD; + DominatorTree *DT; + ScalarEvolution *SE; + TargetLibraryInfo *TLI; + public: + static char ID; + explicit LoopIdiomRecognize() : LoopPass(ID) { + initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry()); + } + + bool runOnLoop(Loop *L, LPPassManager &LPM); + bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl<BasicBlock*> &ExitBlocks); + + bool processLoopStore(StoreInst *SI, const SCEV *BECount); + bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount); + + bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize, + unsigned StoreAlignment, + Value *SplatValue, Instruction *TheStore, + const SCEVAddRecExpr *Ev, + const SCEV *BECount); + bool processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, + const SCEVAddRecExpr *StoreEv, + const SCEVAddRecExpr *LoadEv, + const SCEV *BECount); + + /// This transformation requires natural loop information & requires that + /// loop preheaders be inserted into the CFG. + /// + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<LoopInfo>(); + AU.addPreserved<LoopInfo>(); + AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addPreservedID(LCSSAID); + AU.addRequired<AliasAnalysis>(); + AU.addPreserved<AliasAnalysis>(); + AU.addRequired<ScalarEvolution>(); + AU.addPreserved<ScalarEvolution>(); + AU.addPreserved<DominatorTree>(); + AU.addRequired<DominatorTree>(); + AU.addRequired<TargetLibraryInfo>(); + } + }; +} + +char LoopIdiomRecognize::ID = 0; +INITIALIZE_PASS_BEGIN(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", + false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_END(LoopIdiomRecognize, "loop-idiom", "Recognize loop idioms", + false, false) + +Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognize(); } + +/// DeleteDeadInstruction - Delete this instruction. Before we do, go through +/// and zero out all the operands of this instruction. If any of them become +/// dead, delete them and the computation tree that feeds them. 
+/// +static void DeleteDeadInstruction(Instruction *I, ScalarEvolution &SE) { + SmallVector<Instruction*, 32> NowDeadInsts; + + NowDeadInsts.push_back(I); + + // Before we touch this instruction, remove it from SE! + do { + Instruction *DeadInst = NowDeadInsts.pop_back_val(); + + // This instruction is dead, zap it, in stages. Start by removing it from + // SCEV. + SE.forgetValue(DeadInst); + + for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) { + Value *Op = DeadInst->getOperand(op); + DeadInst->setOperand(op, 0); + + // If this operand just became dead, add it to the NowDeadInsts list. + if (!Op->use_empty()) continue; + + if (Instruction *OpI = dyn_cast<Instruction>(Op)) + if (isInstructionTriviallyDead(OpI)) + NowDeadInsts.push_back(OpI); + } + + DeadInst->eraseFromParent(); + + } while (!NowDeadInsts.empty()); +} + +bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { + CurLoop = L; + + // The trip count of the loop must be analyzable. + SE = &getAnalysis<ScalarEvolution>(); + if (!SE->hasLoopInvariantBackedgeTakenCount(L)) + return false; + const SCEV *BECount = SE->getBackedgeTakenCount(L); + if (isa<SCEVCouldNotCompute>(BECount)) return false; + + // If this loop executes exactly one time, then it should be peeled, not + // optimized by this pass. + if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount)) + if (BECst->getValue()->getValue() == 0) + return false; + + // We require target data for now. + TD = getAnalysisIfAvailable<TargetData>(); + if (TD == 0) return false; + + DT = &getAnalysis<DominatorTree>(); + LoopInfo &LI = getAnalysis<LoopInfo>(); + TLI = &getAnalysis<TargetLibraryInfo>(); + + SmallVector<BasicBlock*, 8> ExitBlocks; + CurLoop->getUniqueExitBlocks(ExitBlocks); + + DEBUG(dbgs() << "loop-idiom Scanning: F[" + << L->getHeader()->getParent()->getName() + << "] Loop %" << L->getHeader()->getName() << "\n"); + + bool MadeChange = false; + // Scan all the blocks in the loop that are not in subloops. + for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E; + ++BI) { + // Ignore blocks in subloops. + if (LI.getLoopFor(*BI) != CurLoop) + continue; + + MadeChange |= runOnLoopBlock(*BI, BECount, ExitBlocks); + } + return MadeChange; +} + +/// runOnLoopBlock - Process the specified block, which lives in a counted loop +/// with the specified backedge count. This block is known to be in the current +/// loop and not in any subloops. +bool LoopIdiomRecognize::runOnLoopBlock(BasicBlock *BB, const SCEV *BECount, + SmallVectorImpl<BasicBlock*> &ExitBlocks) { + // We can only promote stores in this block if they are unconditionally + // executed in the loop. For a block to be unconditionally executed, it has + // to dominate all the exit blocks of the loop. Verify this now. + for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) + if (!DT->dominates(BB, ExitBlocks[i])) + return false; + + bool MadeChange = false; + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { + Instruction *Inst = I++; + // Look for store instructions, which may be optimized to memset/memcpy. + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + WeakVH InstPtr(I); + if (!processLoopStore(SI, BECount)) continue; + MadeChange = true; + + // If processing the store invalidated our iterator, start over from the + // top of the block. + if (InstPtr == 0) + I = BB->begin(); + continue; + } + + // Look for memset instructions, which may be optimized to a larger memset. 
+ if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) { + WeakVH InstPtr(I); + if (!processLoopMemSet(MSI, BECount)) continue; + MadeChange = true; + + // If processing the memset invalidated our iterator, start over from the + // top of the block. + if (InstPtr == 0) + I = BB->begin(); + continue; + } + } + + return MadeChange; +} + + +/// processLoopStore - See if this store can be promoted to a memset or memcpy. +bool LoopIdiomRecognize::processLoopStore(StoreInst *SI, const SCEV *BECount) { + if (SI->isVolatile()) return false; + + Value *StoredVal = SI->getValueOperand(); + Value *StorePtr = SI->getPointerOperand(); + + // Reject stores that are so large that they overflow an unsigned. + uint64_t SizeInBits = TD->getTypeSizeInBits(StoredVal->getType()); + if ((SizeInBits & 7) || (SizeInBits >> 32) != 0) + return false; + + // See if the pointer expression is an AddRec like {base,+,1} on the current + // loop, which indicates a strided store. If we have something else, it's a + // random store we can't handle. + const SCEVAddRecExpr *StoreEv = + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr)); + if (StoreEv == 0 || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine()) + return false; + + // Check to see if the stride matches the size of the store. If so, then we + // know that every byte is touched in the loop. + unsigned StoreSize = (unsigned)SizeInBits >> 3; + const SCEVConstant *Stride = dyn_cast<SCEVConstant>(StoreEv->getOperand(1)); + + // TODO: Could also handle negative stride here someday, that will require the + // validity check in mayLoopAccessLocation to be updated though. + if (Stride == 0 || StoreSize != Stride->getValue()->getValue()) + return false; + + // See if we can optimize just this store in isolation. + if (processLoopStridedStore(StorePtr, StoreSize, SI->getAlignment(), + StoredVal, SI, StoreEv, BECount)) + return true; + + // If the stored value is a strided load in the same loop with the same stride + // this this may be transformable into a memcpy. This kicks in for stuff like + // for (i) A[i] = B[i]; + if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) { + const SCEVAddRecExpr *LoadEv = + dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getOperand(0))); + if (LoadEv && LoadEv->getLoop() == CurLoop && LoadEv->isAffine() && + StoreEv->getOperand(1) == LoadEv->getOperand(1) && !LI->isVolatile()) + if (processLoopStoreOfLoopLoad(SI, StoreSize, StoreEv, LoadEv, BECount)) + return true; + } + //errs() << "UNHANDLED strided store: " << *StoreEv << " - " << *SI << "\n"; + + return false; +} + +/// processLoopMemSet - See if this memset can be promoted to a large memset. +bool LoopIdiomRecognize:: +processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) { + // We can only handle non-volatile memsets with a constant size. + if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength())) return false; + + // If we're not allowed to hack on memset, we fail. + if (!TLI->has(LibFunc::memset)) + return false; + + Value *Pointer = MSI->getDest(); + + // See if the pointer expression is an AddRec like {base,+,1} on the current + // loop, which indicates a strided store. If we have something else, it's a + // random store we can't handle. + const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer)); + if (Ev == 0 || Ev->getLoop() != CurLoop || !Ev->isAffine()) + return false; + + // Reject memsets that are so large that they overflow an unsigned. 
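// Why the stride must match the store size (checked above for stores and
// again below for memsets): only then is every byte of the range touched, so
// the whole loop can become a single memset/memcpy. Toy check, sizes in bytes:
static bool coversEveryByte(long StrideBytes, long StoreSizeBytes) {
  // A[i]   = v with 4-byte elements: AddRec {A,+,4}, 4 == 4 -> contiguous
  // A[2*i] = v with 4-byte elements: AddRec {A,+,8}, 8 != 4 -> gaps, rejected
  return StrideBytes == StoreSizeBytes;
}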
+ uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue(); + if ((SizeInBytes >> 32) != 0) + return false; + + // Check to see if the stride matches the size of the memset. If so, then we + // know that every byte is touched in the loop. + const SCEVConstant *Stride = dyn_cast<SCEVConstant>(Ev->getOperand(1)); + + // TODO: Could also handle negative stride here someday, that will require the + // validity check in mayLoopAccessLocation to be updated though. + if (Stride == 0 || MSI->getLength() != Stride->getValue()) + return false; + + return processLoopStridedStore(Pointer, (unsigned)SizeInBytes, + MSI->getAlignment(), MSI->getValue(), + MSI, Ev, BECount); +} + + +/// mayLoopAccessLocation - Return true if the specified loop might access the +/// specified pointer location, which is a loop-strided access. The 'Access' +/// argument specifies what the verboten forms of access are (read or write). +static bool mayLoopAccessLocation(Value *Ptr,AliasAnalysis::ModRefResult Access, + Loop *L, const SCEV *BECount, + unsigned StoreSize, AliasAnalysis &AA, + Instruction *IgnoredStore) { + // Get the location that may be stored across the loop. Since the access is + // strided positively through memory, we say that the modified location starts + // at the pointer and has infinite size. + uint64_t AccessSize = AliasAnalysis::UnknownSize; + + // If the loop iterates a fixed number of times, we can refine the access size + // to be exactly the size of the memset, which is (BECount+1)*StoreSize + if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount)) + AccessSize = (BECst->getValue()->getZExtValue()+1)*StoreSize; + + // TODO: For this to be really effective, we have to dive into the pointer + // operand in the store. Store to &A[i] of 100 will always return may alias + // with store of &A[100], we need to StoreLoc to be "A" with size of 100, + // which will then no-alias a store to &A[100]. + AliasAnalysis::Location StoreLoc(Ptr, AccessSize); + + for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E; + ++BI) + for (BasicBlock::iterator I = (*BI)->begin(), E = (*BI)->end(); I != E; ++I) + if (&*I != IgnoredStore && + (AA.getModRefInfo(I, StoreLoc) & Access)) + return true; + + return false; +} + +/// getMemSetPatternValue - If a strided store of the specified value is safe to +/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should +/// be passed in. Otherwise, return null. +/// +/// Note that we don't ever attempt to use memset_pattern8 or 4, because these +/// just replicate their input array and then pass on to memset_pattern16. +static Constant *getMemSetPatternValue(Value *V, const TargetData &TD) { + // If the value isn't a constant, we can't promote it to being in a constant + // array. We could theoretically do a store to an alloca or something, but + // that doesn't seem worthwhile. + Constant *C = dyn_cast<Constant>(V); + if (C == 0) return 0; + + // Only handle simple values that are a power of two bytes in size. + uint64_t Size = TD.getTypeSizeInBits(V->getType()); + if (Size == 0 || (Size & 7) || (Size & (Size-1))) + return 0; + + // Don't care enough about darwin/ppc to implement this. + if (TD.isBigEndian()) + return 0; + + // Convert to size in bytes. + Size /= 8; + + // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see + // if the top and bottom are the same (e.g. for vectors and large integers). + if (Size > 16) return 0; + + // If the constant is exactly 16 bytes, just use it. 
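As a hedged illustration of what getMemSetPatternValue (begun above) builds, with an invented helper name and plain byte buffers standing in for LLVM constants: a constant smaller than 16 bytes is replicated 16/Size times so the resulting 16-byte block can be handed to memset_pattern16.

#include <cstring>

// e.g. a 4-byte element is copied 16 / 4 = 4 times into the pattern buffer.
static void replicate_pattern(const void *elt, unsigned eltSize,
                              unsigned char out[16]) {
  for (unsigned i = 0; i != 16 / eltSize; ++i)   // eltSize divides 16 here
    std::memcpy(out + i * eltSize, elt, eltSize);
}

Constants that are already exactly 16 bytes are used directly, as the code continuing below shows.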
+ if (Size == 16) return C; + + // Otherwise, we'll use an array of the constants. + unsigned ArraySize = 16/Size; + ArrayType *AT = ArrayType::get(V->getType(), ArraySize); + return ConstantArray::get(AT, std::vector<Constant*>(ArraySize, C)); +} + + +/// processLoopStridedStore - We see a strided store of some value. If we can +/// transform this into a memset or memset_pattern in the loop preheader, do so. +bool LoopIdiomRecognize:: +processLoopStridedStore(Value *DestPtr, unsigned StoreSize, + unsigned StoreAlignment, Value *StoredVal, + Instruction *TheStore, const SCEVAddRecExpr *Ev, + const SCEV *BECount) { + + // If the stored value is a byte-wise value (like i32 -1), then it may be + // turned into a memset of i8 -1, assuming that all the consecutive bytes + // are stored. A store of i32 0x01020304 can never be turned into a memset, + // but it can be turned into memset_pattern if the target supports it. + Value *SplatValue = isBytewiseValue(StoredVal); + Constant *PatternValue = 0; + + // If we're allowed to form a memset, and the stored value would be acceptable + // for memset, use it. + if (SplatValue && TLI->has(LibFunc::memset) && + // Verify that the stored value is loop invariant. If not, we can't + // promote the memset. + CurLoop->isLoopInvariant(SplatValue)) { + // Keep and use SplatValue. + PatternValue = 0; + } else if (TLI->has(LibFunc::memset_pattern16) && + (PatternValue = getMemSetPatternValue(StoredVal, *TD))) { + // It looks like we can use PatternValue! + SplatValue = 0; + } else { + // Otherwise, this isn't an idiom we can transform. For example, we can't + // do anything with a 3-byte store, for example. + return false; + } + + + // Okay, we have a strided store "p[i]" of a splattable value. We can turn + // this into a memset in the loop preheader now if we want. However, this + // would be unsafe to do if there is anything else in the loop that may read + // or write to the aliased location. Check for an alias. + if (mayLoopAccessLocation(DestPtr, AliasAnalysis::ModRef, + CurLoop, BECount, + StoreSize, getAnalysis<AliasAnalysis>(), TheStore)) + return false; + + // Okay, everything looks good, insert the memset. + BasicBlock *Preheader = CurLoop->getLoopPreheader(); + + IRBuilder<> Builder(Preheader->getTerminator()); + + // The trip count of the loop and the base pointer of the addrec SCEV is + // guaranteed to be loop invariant, which means that it should dominate the + // header. Just insert code for it in the preheader. + SCEVExpander Expander(*SE); + + unsigned AddrSpace = cast<PointerType>(DestPtr->getType())->getAddressSpace(); + Value *BasePtr = + Expander.expandCodeFor(Ev->getStart(), Builder.getInt8PtrTy(AddrSpace), + Preheader->getTerminator()); + + // The # stored bytes is (BECount+1)*Size. Expand the trip count out to + // pointer size if it isn't already. 
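A worked instance of the size expression named in the comment above (values hypothetical): for a loop of the form "for (i = 0; i != n; ++i) a[i] = v;" over 4-byte elements, the backedge-taken count is n - 1, so the expanded byte count is ((n - 1) + 1) * 4 == n * 4.

#include <cstdint>

// Sketch of the arithmetic only; the pass builds the same product as a SCEV
// expression and expands it in the loop preheader.
static uint64_t num_stored_bytes(uint64_t backedgeTakenCount,
                                 unsigned storeSize) {
  return (backedgeTakenCount + 1) * storeSize;
}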
+ const Type *IntPtr = TD->getIntPtrType(DestPtr->getContext()); + BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); + + const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), + true /*no unsigned overflow*/); + if (StoreSize != 1) + NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize), + true /*no unsigned overflow*/); + + Value *NumBytes = + Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); + + Value *NewCall; + if (SplatValue) + NewCall = Builder.CreateMemSet(BasePtr, SplatValue,NumBytes,StoreAlignment); + else { + Module *M = TheStore->getParent()->getParent()->getParent(); + Value *MSP = M->getOrInsertFunction("memset_pattern16", + Builder.getVoidTy(), + Builder.getInt8PtrTy(), + Builder.getInt8PtrTy(), IntPtr, + (void*)0); + + // Otherwise we should form a memset_pattern16. PatternValue is known to be + // an constant array of 16-bytes. Plop the value into a mergable global. + GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true, + GlobalValue::InternalLinkage, + PatternValue, ".memset_pattern"); + GV->setUnnamedAddr(true); // Ok to merge these. + GV->setAlignment(16); + Value *PatternPtr = ConstantExpr::getBitCast(GV, Builder.getInt8PtrTy()); + NewCall = Builder.CreateCall3(MSP, BasePtr, PatternPtr, NumBytes); + } + + DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n" + << " from store to: " << *Ev << " at: " << *TheStore << "\n"); + (void)NewCall; + + // Okay, the memset has been formed. Zap the original store and anything that + // feeds into it. + DeleteDeadInstruction(TheStore, *SE); + ++NumMemSet; + return true; +} + +/// processLoopStoreOfLoopLoad - We see a strided store whose value is a +/// same-strided load. +bool LoopIdiomRecognize:: +processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, + const SCEVAddRecExpr *StoreEv, + const SCEVAddRecExpr *LoadEv, + const SCEV *BECount) { + // If we're not allowed to form memcpy, we fail. + if (!TLI->has(LibFunc::memcpy)) + return false; + + LoadInst *LI = cast<LoadInst>(SI->getValueOperand()); + + // Okay, we have a strided store "p[i]" of a loaded value. We can turn + // this into a memcpy in the loop preheader now if we want. However, this + // would be unsafe to do if there is anything else in the loop that may read + // or write to the stored location (including the load feeding the stores). + // Check for an alias. + if (mayLoopAccessLocation(SI->getPointerOperand(), AliasAnalysis::ModRef, + CurLoop, BECount, StoreSize, + getAnalysis<AliasAnalysis>(), SI)) + return false; + + // For a memcpy, we have to make sure that the input array is not being + // mutated by the loop. + if (mayLoopAccessLocation(LI->getPointerOperand(), AliasAnalysis::Mod, + CurLoop, BECount, StoreSize, + getAnalysis<AliasAnalysis>(), SI)) + return false; + + // Okay, everything looks good, insert the memcpy. + BasicBlock *Preheader = CurLoop->getLoopPreheader(); + + IRBuilder<> Builder(Preheader->getTerminator()); + + // The trip count of the loop and the base pointer of the addrec SCEV is + // guaranteed to be loop invariant, which means that it should dominate the + // header. Just insert code for it in the preheader. 
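The second alias query above is what keeps overlapping copies out of the memcpy path; a hypothetical example of the kind of loop it rejects (sketch only, assuming the array has at least n + 1 elements):

// The loop writes the same array it reads, so the load location is modified
// across the loop and a memcpy would not preserve the original semantics.
void shift_left(int *a, unsigned n) {
  for (unsigned i = 0; i != n; ++i)
    a[i] = a[i + 1];
}

Once both queries pass, the code below materializes the two base pointers and the byte count in the preheader.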
+ SCEVExpander Expander(*SE); + + Value *LoadBasePtr = + Expander.expandCodeFor(LoadEv->getStart(), + Builder.getInt8PtrTy(LI->getPointerAddressSpace()), + Preheader->getTerminator()); + Value *StoreBasePtr = + Expander.expandCodeFor(StoreEv->getStart(), + Builder.getInt8PtrTy(SI->getPointerAddressSpace()), + Preheader->getTerminator()); + + // The # stored bytes is (BECount+1)*Size. Expand the trip count out to + // pointer size if it isn't already. + const Type *IntPtr = TD->getIntPtrType(SI->getContext()); + BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); + + const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), + true /*no unsigned overflow*/); + if (StoreSize != 1) + NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize), + true /*no unsigned overflow*/); + + Value *NumBytes = + Expander.expandCodeFor(NumBytesS, IntPtr, Preheader->getTerminator()); + + Value *NewCall = + Builder.CreateMemCpy(StoreBasePtr, LoadBasePtr, NumBytes, + std::min(SI->getAlignment(), LI->getAlignment())); + + DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n" + << " from load ptr=" << *LoadEv << " at: " << *LI << "\n" + << " from store ptr=" << *StoreEv << " at: " << *SI << "\n"); + (void)NewCall; + + // Okay, the memset has been formed. Zap the original store and anything that + // feeds into it. + DeleteDeadInstruction(SI, *SE); + ++NumMemCpy; + return true; +} diff --git a/lib/Transforms/Scalar/LoopIndexSplit.cpp b/lib/Transforms/Scalar/LoopIndexSplit.cpp deleted file mode 100644 index a433674..0000000 --- a/lib/Transforms/Scalar/LoopIndexSplit.cpp +++ /dev/null @@ -1,1270 +0,0 @@ -//===- LoopIndexSplit.cpp - Loop Index Splitting Pass ---------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file implements Loop Index Splitting Pass. This pass handles three -// kinds of loops. -// -// [1] A loop may be eliminated if the body is executed exactly once. -// For example, -// -// for (i = 0; i < N; ++i) { -// if (i == X) { -// body; -// } -// } -// -// is transformed to -// -// i = X; -// body; -// -// [2] A loop's iteration space may be shrunk if the loop body is executed -// for a proper sub-range of the loop's iteration space. For example, -// -// for (i = 0; i < N; ++i) { -// if (i > A && i < B) { -// ... -// } -// } -// -// is transformed to iterators from A to B, if A > 0 and B < N. -// -// [3] A loop may be split if the loop body is dominated by a branch. 
-// For example, -// -// for (i = LB; i < UB; ++i) { if (i < SV) A; else B; } -// -// is transformed into -// -// AEV = BSV = SV -// for (i = LB; i < min(UB, AEV); ++i) -// A; -// for (i = max(LB, BSV); i < UB; ++i); -// B; -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "loop-index-split" -#include "llvm/Transforms/Scalar.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/LLVMContext.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/Dominators.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/Statistic.h" - -using namespace llvm; - -STATISTIC(NumIndexSplit, "Number of loop index split"); -STATISTIC(NumIndexSplitRemoved, "Number of loops eliminated by loop index split"); -STATISTIC(NumRestrictBounds, "Number of loop iteration space restricted"); - -namespace { - - class LoopIndexSplit : public LoopPass { - public: - static char ID; // Pass ID, replacement for typeid - LoopIndexSplit() : LoopPass(ID) {} - - // Index split Loop L. Return true if loop is split. - bool runOnLoop(Loop *L, LPPassManager &LPM); - - void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addPreserved<ScalarEvolution>(); - AU.addRequiredID(LCSSAID); - AU.addPreservedID(LCSSAID); - AU.addRequired<LoopInfo>(); - AU.addPreserved<LoopInfo>(); - AU.addRequiredID(LoopSimplifyID); - AU.addPreservedID(LoopSimplifyID); - AU.addRequired<DominatorTree>(); - AU.addRequired<DominanceFrontier>(); - AU.addPreserved<DominatorTree>(); - AU.addPreserved<DominanceFrontier>(); - } - - private: - /// processOneIterationLoop -- Eliminate loop if loop body is executed - /// only once. For example, - /// for (i = 0; i < N; ++i) { - /// if ( i == X) { - /// ... - /// } - /// } - /// - bool processOneIterationLoop(); - - // -- Routines used by updateLoopIterationSpace(); - - /// updateLoopIterationSpace -- Update loop's iteration space if loop - /// body is executed for certain IV range only. For example, - /// - /// for (i = 0; i < N; ++i) { - /// if ( i > A && i < B) { - /// ... - /// } - /// } - /// is transformed to iterators from A to B, if A > 0 and B < N. - /// - bool updateLoopIterationSpace(); - - /// restrictLoopBound - Op dominates loop body. Op compares an IV based value - /// with a loop invariant value. Update loop's lower and upper bound based on - /// the loop invariant value. - bool restrictLoopBound(ICmpInst &Op); - - // --- Routines used by splitLoop(). --- / - - bool splitLoop(); - - /// removeBlocks - Remove basic block DeadBB and all blocks dominated by - /// DeadBB. This routine is used to remove split condition's dead branch, - /// dominated by DeadBB. LiveBB dominates split conidition's other branch. - void removeBlocks(BasicBlock *DeadBB, Loop *LP, BasicBlock *LiveBB); - - /// moveExitCondition - Move exit condition EC into split condition block. - void moveExitCondition(BasicBlock *CondBB, BasicBlock *ActiveBB, - BasicBlock *ExitBB, ICmpInst *EC, ICmpInst *SC, - PHINode *IV, Instruction *IVAdd, Loop *LP, - unsigned); - - /// updatePHINodes - CFG has been changed. - /// Before - /// - ExitBB's single predecessor was Latch - /// - Latch's second successor was Header - /// Now - /// - ExitBB's single predecessor was Header - /// - Latch's one and only successor was Header - /// - /// Update ExitBB PHINodes' to reflect this change. 
- void updatePHINodes(BasicBlock *ExitBB, BasicBlock *Latch, - BasicBlock *Header, - PHINode *IV, Instruction *IVIncrement, Loop *LP); - - // --- Utility routines --- / - - /// cleanBlock - A block is considered clean if all non terminal - /// instructions are either PHINodes or IV based values. - bool cleanBlock(BasicBlock *BB); - - /// IVisLT - If Op is comparing IV based value with an loop invariant and - /// IV based value is less than the loop invariant then return the loop - /// invariant. Otherwise return NULL. - Value * IVisLT(ICmpInst &Op); - - /// IVisLE - If Op is comparing IV based value with an loop invariant and - /// IV based value is less than or equal to the loop invariant then - /// return the loop invariant. Otherwise return NULL. - Value * IVisLE(ICmpInst &Op); - - /// IVisGT - If Op is comparing IV based value with an loop invariant and - /// IV based value is greater than the loop invariant then return the loop - /// invariant. Otherwise return NULL. - Value * IVisGT(ICmpInst &Op); - - /// IVisGE - If Op is comparing IV based value with an loop invariant and - /// IV based value is greater than or equal to the loop invariant then - /// return the loop invariant. Otherwise return NULL. - Value * IVisGE(ICmpInst &Op); - - private: - - // Current Loop information. - Loop *L; - LPPassManager *LPM; - LoopInfo *LI; - DominatorTree *DT; - DominanceFrontier *DF; - - PHINode *IndVar; - ICmpInst *ExitCondition; - ICmpInst *SplitCondition; - Value *IVStartValue; - Value *IVExitValue; - Instruction *IVIncrement; - SmallPtrSet<Value *, 4> IVBasedValues; - }; -} - -char LoopIndexSplit::ID = 0; -INITIALIZE_PASS(LoopIndexSplit, "loop-index-split", - "Index Split Loops", false, false); - -Pass *llvm::createLoopIndexSplitPass() { - return new LoopIndexSplit(); -} - -// Index split Loop L. Return true if loop is split. -bool LoopIndexSplit::runOnLoop(Loop *IncomingLoop, LPPassManager &LPM_Ref) { - L = IncomingLoop; - LPM = &LPM_Ref; - - // If LoopSimplify form is not available, stay out of trouble. - if (!L->isLoopSimplifyForm()) - return false; - - // FIXME - Nested loops make dominator info updates tricky. - if (!L->getSubLoops().empty()) - return false; - - DT = &getAnalysis<DominatorTree>(); - LI = &getAnalysis<LoopInfo>(); - DF = &getAnalysis<DominanceFrontier>(); - - // Initialize loop data. - IndVar = L->getCanonicalInductionVariable(); - if (!IndVar) return false; - - bool P1InLoop = L->contains(IndVar->getIncomingBlock(1)); - IVStartValue = IndVar->getIncomingValue(!P1InLoop); - IVIncrement = dyn_cast<Instruction>(IndVar->getIncomingValue(P1InLoop)); - if (!IVIncrement) return false; - - IVBasedValues.clear(); - IVBasedValues.insert(IndVar); - IVBasedValues.insert(IVIncrement); - for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); - I != E; ++I) - for(BasicBlock::iterator BI = (*I)->begin(), BE = (*I)->end(); - BI != BE; ++BI) { - if (BinaryOperator *BO = dyn_cast<BinaryOperator>(BI)) - if (BO != IVIncrement - && (BO->getOpcode() == Instruction::Add - || BO->getOpcode() == Instruction::Sub)) - if (IVBasedValues.count(BO->getOperand(0)) - && L->isLoopInvariant(BO->getOperand(1))) - IVBasedValues.insert(BO); - } - - // Reject loop if loop exit condition is not suitable. 
- BasicBlock *ExitingBlock = L->getExitingBlock(); - if (!ExitingBlock) - return false; - BranchInst *EBR = dyn_cast<BranchInst>(ExitingBlock->getTerminator()); - if (!EBR) return false; - ExitCondition = dyn_cast<ICmpInst>(EBR->getCondition()); - if (!ExitCondition) return false; - if (ExitingBlock != L->getLoopLatch()) return false; - IVExitValue = ExitCondition->getOperand(1); - if (!L->isLoopInvariant(IVExitValue)) - IVExitValue = ExitCondition->getOperand(0); - if (!L->isLoopInvariant(IVExitValue)) - return false; - if (!IVBasedValues.count( - ExitCondition->getOperand(IVExitValue == ExitCondition->getOperand(0)))) - return false; - - // If start value is more then exit value where induction variable - // increments by 1 then we are potentially dealing with an infinite loop. - // Do not index split this loop. - if (ConstantInt *SV = dyn_cast<ConstantInt>(IVStartValue)) - if (ConstantInt *EV = dyn_cast<ConstantInt>(IVExitValue)) - if (SV->getSExtValue() > EV->getSExtValue()) - return false; - - if (processOneIterationLoop()) - return true; - - if (updateLoopIterationSpace()) - return true; - - if (splitLoop()) - return true; - - return false; -} - -// --- Helper routines --- -// isUsedOutsideLoop - Returns true iff V is used outside the loop L. -static bool isUsedOutsideLoop(Value *V, Loop *L) { - for(Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E; ++UI) - if (!L->contains(cast<Instruction>(*UI))) - return true; - return false; -} - -// Return V+1 -static Value *getPlusOne(Value *V, bool Sign, Instruction *InsertPt, - LLVMContext &Context) { - Constant *One = ConstantInt::get(V->getType(), 1, Sign); - return BinaryOperator::CreateAdd(V, One, "lsp", InsertPt); -} - -// Return V-1 -static Value *getMinusOne(Value *V, bool Sign, Instruction *InsertPt, - LLVMContext &Context) { - Constant *One = ConstantInt::get(V->getType(), 1, Sign); - return BinaryOperator::CreateSub(V, One, "lsp", InsertPt); -} - -// Return min(V1, V1) -static Value *getMin(Value *V1, Value *V2, bool Sign, Instruction *InsertPt) { - - Value *C = new ICmpInst(InsertPt, - Sign ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, - V1, V2, "lsp"); - return SelectInst::Create(C, V1, V2, "lsp", InsertPt); -} - -// Return max(V1, V2) -static Value *getMax(Value *V1, Value *V2, bool Sign, Instruction *InsertPt) { - - Value *C = new ICmpInst(InsertPt, - Sign ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, - V1, V2, "lsp"); - return SelectInst::Create(C, V2, V1, "lsp", InsertPt); -} - -/// processOneIterationLoop -- Eliminate loop if loop body is executed -/// only once. For example, -/// for (i = 0; i < N; ++i) { -/// if ( i == X) { -/// ... -/// } -/// } -/// -bool LoopIndexSplit::processOneIterationLoop() { - SplitCondition = NULL; - BasicBlock *Latch = L->getLoopLatch(); - BasicBlock *Header = L->getHeader(); - BranchInst *BR = dyn_cast<BranchInst>(Header->getTerminator()); - if (!BR) return false; - if (!isa<BranchInst>(Latch->getTerminator())) return false; - if (BR->isUnconditional()) return false; - SplitCondition = dyn_cast<ICmpInst>(BR->getCondition()); - if (!SplitCondition) return false; - if (SplitCondition == ExitCondition) return false; - if (SplitCondition->getPredicate() != ICmpInst::ICMP_EQ) return false; - if (BR->getOperand(1) != Latch) return false; - if (!IVBasedValues.count(SplitCondition->getOperand(0)) - && !IVBasedValues.count(SplitCondition->getOperand(1))) - return false; - - // If IV is used outside the loop then this loop traversal is required. - // FIXME: Calculate and use last IV value. 
- if (isUsedOutsideLoop(IVIncrement, L)) - return false; - - // If BR operands are not IV or not loop invariants then skip this loop. - Value *OPV = SplitCondition->getOperand(0); - Value *SplitValue = SplitCondition->getOperand(1); - if (!L->isLoopInvariant(SplitValue)) - std::swap(OPV, SplitValue); - if (!L->isLoopInvariant(SplitValue)) - return false; - Instruction *OPI = dyn_cast<Instruction>(OPV); - if (!OPI) - return false; - if (OPI->getParent() != Header || isUsedOutsideLoop(OPI, L)) - return false; - Value *StartValue = IVStartValue; - Value *ExitValue = IVExitValue;; - - if (OPV != IndVar) { - // If BR operand is IV based then use this operand to calculate - // effective conditions for loop body. - BinaryOperator *BOPV = dyn_cast<BinaryOperator>(OPV); - if (!BOPV) - return false; - if (BOPV->getOpcode() != Instruction::Add) - return false; - StartValue = BinaryOperator::CreateAdd(OPV, StartValue, "" , BR); - ExitValue = BinaryOperator::CreateAdd(OPV, ExitValue, "" , BR); - } - - if (!cleanBlock(Header)) - return false; - - if (!cleanBlock(Latch)) - return false; - - // If the merge point for BR is not loop latch then skip this loop. - if (BR->getSuccessor(0) != Latch) { - DominanceFrontier::iterator DF0 = DF->find(BR->getSuccessor(0)); - assert (DF0 != DF->end() && "Unable to find dominance frontier"); - if (!DF0->second.count(Latch)) - return false; - } - - if (BR->getSuccessor(1) != Latch) { - DominanceFrontier::iterator DF1 = DF->find(BR->getSuccessor(1)); - assert (DF1 != DF->end() && "Unable to find dominance frontier"); - if (!DF1->second.count(Latch)) - return false; - } - - // Now, Current loop L contains compare instruction - // that compares induction variable, IndVar, against loop invariant. And - // entire (i.e. meaningful) loop body is dominated by this compare - // instruction. In such case eliminate - // loop structure surrounding this loop body. For example, - // for (int i = start; i < end; ++i) { - // if ( i == somevalue) { - // loop_body - // } - // } - // can be transformed into - // if (somevalue >= start && somevalue < end) { - // i = somevalue; - // loop_body - // } - - // Replace index variable with split value in loop body. Loop body is executed - // only when index variable is equal to split value. - IndVar->replaceAllUsesWith(SplitValue); - - // Replace split condition in header. - // Transform - // SplitCondition : icmp eq i32 IndVar, SplitValue - // into - // c1 = icmp uge i32 SplitValue, StartValue - // c2 = icmp ult i32 SplitValue, ExitValue - // and i32 c1, c2 - Instruction *C1 = new ICmpInst(BR, ExitCondition->isSigned() ? - ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE, - SplitValue, StartValue, "lisplit"); - - CmpInst::Predicate C2P = ExitCondition->getPredicate(); - BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator()); - if (LatchBR->getOperand(1) != Header) - C2P = CmpInst::getInversePredicate(C2P); - Instruction *C2 = new ICmpInst(BR, C2P, SplitValue, ExitValue, "lisplit"); - Instruction *NSplitCond = BinaryOperator::CreateAnd(C1, C2, "lisplit", BR); - - SplitCondition->replaceAllUsesWith(NSplitCond); - SplitCondition->eraseFromParent(); - - // Remove Latch to Header edge. - BasicBlock *LatchSucc = NULL; - Header->removePredecessor(Latch); - for (succ_iterator SI = succ_begin(Latch), E = succ_end(Latch); - SI != E; ++SI) { - if (Header != *SI) - LatchSucc = *SI; - } - - // Clean up latch block. 
- Value *LatchBRCond = LatchBR->getCondition(); - LatchBR->setUnconditionalDest(LatchSucc); - RecursivelyDeleteTriviallyDeadInstructions(LatchBRCond); - - LPM->deleteLoopFromQueue(L); - - // Update Dominator Info. - // Only CFG change done is to remove Latch to Header edge. This - // does not change dominator tree because Latch did not dominate - // Header. - if (DF) { - DominanceFrontier::iterator HeaderDF = DF->find(Header); - if (HeaderDF != DF->end()) - DF->removeFromFrontier(HeaderDF, Header); - - DominanceFrontier::iterator LatchDF = DF->find(Latch); - if (LatchDF != DF->end()) - DF->removeFromFrontier(LatchDF, Header); - } - - ++NumIndexSplitRemoved; - return true; -} - -/// restrictLoopBound - Op dominates loop body. Op compares an IV based value -/// with a loop invariant value. Update loop's lower and upper bound based on -/// the loop invariant value. -bool LoopIndexSplit::restrictLoopBound(ICmpInst &Op) { - bool Sign = Op.isSigned(); - Instruction *PHTerm = L->getLoopPreheader()->getTerminator(); - - if (IVisGT(*ExitCondition) || IVisGE(*ExitCondition)) { - BranchInst *EBR = - cast<BranchInst>(ExitCondition->getParent()->getTerminator()); - ExitCondition->setPredicate(ExitCondition->getInversePredicate()); - BasicBlock *T = EBR->getSuccessor(0); - EBR->setSuccessor(0, EBR->getSuccessor(1)); - EBR->setSuccessor(1, T); - } - - LLVMContext &Context = Op.getContext(); - - // New upper and lower bounds. - Value *NLB = NULL; - Value *NUB = NULL; - if (Value *V = IVisLT(Op)) { - // Restrict upper bound. - if (IVisLE(*ExitCondition)) - V = getMinusOne(V, Sign, PHTerm, Context); - NUB = getMin(V, IVExitValue, Sign, PHTerm); - } else if (Value *V = IVisLE(Op)) { - // Restrict upper bound. - if (IVisLT(*ExitCondition)) - V = getPlusOne(V, Sign, PHTerm, Context); - NUB = getMin(V, IVExitValue, Sign, PHTerm); - } else if (Value *V = IVisGT(Op)) { - // Restrict lower bound. - V = getPlusOne(V, Sign, PHTerm, Context); - NLB = getMax(V, IVStartValue, Sign, PHTerm); - } else if (Value *V = IVisGE(Op)) - // Restrict lower bound. - NLB = getMax(V, IVStartValue, Sign, PHTerm); - - if (!NLB && !NUB) - return false; - - if (NLB) { - unsigned i = IndVar->getBasicBlockIndex(L->getLoopPreheader()); - IndVar->setIncomingValue(i, NLB); - } - - if (NUB) { - unsigned i = (ExitCondition->getOperand(0) != IVExitValue); - ExitCondition->setOperand(i, NUB); - } - return true; -} - -/// updateLoopIterationSpace -- Update loop's iteration space if loop -/// body is executed for certain IV range only. For example, -/// -/// for (i = 0; i < N; ++i) { -/// if ( i > A && i < B) { -/// ... -/// } -/// } -/// is transformed to iterators from A to B, if A > 0 and B < N. 
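A worked instance of the restriction that updateLoopIterationSpace (below) performed, using the bound logic of restrictLoopBound above (editorial sketch with hypothetical values; the pass operated on IR, not C++ source):

static void before(int *a, int N, int A, int B) {
  for (int i = 0; i < N; ++i)
    if (i > A && i < B)
      a[i] = 1;
}

// Equivalent form after the bounds were tightened: the lower bound becomes
// max(A + 1, 0), the upper bound becomes min(B, N), and the guard is gone.
static void after(int *a, int N, int A, int B) {
  int lo = (A + 1 > 0) ? A + 1 : 0;
  int hi = (B < N) ? B : N;
  for (int i = lo; i < hi; ++i)
    a[i] = 1;
}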
-/// -bool LoopIndexSplit::updateLoopIterationSpace() { - SplitCondition = NULL; - if (ExitCondition->getPredicate() == ICmpInst::ICMP_NE - || ExitCondition->getPredicate() == ICmpInst::ICMP_EQ) - return false; - BasicBlock *Latch = L->getLoopLatch(); - BasicBlock *Header = L->getHeader(); - BranchInst *BR = dyn_cast<BranchInst>(Header->getTerminator()); - if (!BR) return false; - if (!isa<BranchInst>(Latch->getTerminator())) return false; - if (BR->isUnconditional()) return false; - BinaryOperator *AND = dyn_cast<BinaryOperator>(BR->getCondition()); - if (!AND) return false; - if (AND->getOpcode() != Instruction::And) return false; - ICmpInst *Op0 = dyn_cast<ICmpInst>(AND->getOperand(0)); - ICmpInst *Op1 = dyn_cast<ICmpInst>(AND->getOperand(1)); - if (!Op0 || !Op1) - return false; - IVBasedValues.insert(AND); - IVBasedValues.insert(Op0); - IVBasedValues.insert(Op1); - if (!cleanBlock(Header)) return false; - BasicBlock *ExitingBlock = ExitCondition->getParent(); - if (!cleanBlock(ExitingBlock)) return false; - - // If the merge point for BR is not loop latch then skip this loop. - if (BR->getSuccessor(0) != Latch) { - DominanceFrontier::iterator DF0 = DF->find(BR->getSuccessor(0)); - assert (DF0 != DF->end() && "Unable to find dominance frontier"); - if (!DF0->second.count(Latch)) - return false; - } - - if (BR->getSuccessor(1) != Latch) { - DominanceFrontier::iterator DF1 = DF->find(BR->getSuccessor(1)); - assert (DF1 != DF->end() && "Unable to find dominance frontier"); - if (!DF1->second.count(Latch)) - return false; - } - - // Verify that loop exiting block has only two predecessor, where one pred - // is split condition block. The other predecessor will become exiting block's - // dominator after CFG is updated. TODO : Handle CFG's where exiting block has - // more then two predecessors. This requires extra work in updating dominator - // information. - BasicBlock *ExitingBBPred = NULL; - for (pred_iterator PI = pred_begin(ExitingBlock), PE = pred_end(ExitingBlock); - PI != PE; ++PI) { - BasicBlock *BB = *PI; - if (Header == BB) - continue; - if (ExitingBBPred) - return false; - else - ExitingBBPred = BB; - } - - if (!restrictLoopBound(*Op0)) - return false; - - if (!restrictLoopBound(*Op1)) - return false; - - // Update CFG. - if (BR->getSuccessor(0) == ExitingBlock) - BR->setUnconditionalDest(BR->getSuccessor(1)); - else - BR->setUnconditionalDest(BR->getSuccessor(0)); - - AND->eraseFromParent(); - if (Op0->use_empty()) - Op0->eraseFromParent(); - if (Op1->use_empty()) - Op1->eraseFromParent(); - - // Update domiantor info. Now, ExitingBlock has only one predecessor, - // ExitingBBPred, and it is ExitingBlock's immediate domiantor. 
- DT->changeImmediateDominator(ExitingBlock, ExitingBBPred); - - BasicBlock *ExitBlock = ExitingBlock->getTerminator()->getSuccessor(1); - if (L->contains(ExitBlock)) - ExitBlock = ExitingBlock->getTerminator()->getSuccessor(0); - - // If ExitingBlock is a member of the loop basic blocks' DF list then - // replace ExitingBlock with header and exit block in the DF list - DominanceFrontier::iterator ExitingBlockDF = DF->find(ExitingBlock); - for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); - I != E; ++I) { - BasicBlock *BB = *I; - if (BB == Header || BB == ExitingBlock) - continue; - DominanceFrontier::iterator BBDF = DF->find(BB); - DominanceFrontier::DomSetType::iterator DomSetI = BBDF->second.begin(); - DominanceFrontier::DomSetType::iterator DomSetE = BBDF->second.end(); - while (DomSetI != DomSetE) { - DominanceFrontier::DomSetType::iterator CurrentItr = DomSetI; - ++DomSetI; - BasicBlock *DFBB = *CurrentItr; - if (DFBB == ExitingBlock) { - BBDF->second.erase(DFBB); - for (DominanceFrontier::DomSetType::iterator - EBI = ExitingBlockDF->second.begin(), - EBE = ExitingBlockDF->second.end(); EBI != EBE; ++EBI) - BBDF->second.insert(*EBI); - } - } - } - ++NumRestrictBounds; - return true; -} - -/// removeBlocks - Remove basic block DeadBB and all blocks dominated by DeadBB. -/// This routine is used to remove split condition's dead branch, dominated by -/// DeadBB. LiveBB dominates split conidition's other branch. -void LoopIndexSplit::removeBlocks(BasicBlock *DeadBB, Loop *LP, - BasicBlock *LiveBB) { - - // First update DeadBB's dominance frontier. - SmallVector<BasicBlock *, 8> FrontierBBs; - DominanceFrontier::iterator DeadBBDF = DF->find(DeadBB); - if (DeadBBDF != DF->end()) { - SmallVector<BasicBlock *, 8> PredBlocks; - - DominanceFrontier::DomSetType DeadBBSet = DeadBBDF->second; - for (DominanceFrontier::DomSetType::iterator DeadBBSetI = DeadBBSet.begin(), - DeadBBSetE = DeadBBSet.end(); DeadBBSetI != DeadBBSetE; ++DeadBBSetI) - { - BasicBlock *FrontierBB = *DeadBBSetI; - FrontierBBs.push_back(FrontierBB); - - // Rremove any PHI incoming edge from blocks dominated by DeadBB. - PredBlocks.clear(); - for(pred_iterator PI = pred_begin(FrontierBB), PE = pred_end(FrontierBB); - PI != PE; ++PI) { - BasicBlock *P = *PI; - if (DT->dominates(DeadBB, P)) - PredBlocks.push_back(P); - } - - for(BasicBlock::iterator FBI = FrontierBB->begin(), FBE = FrontierBB->end(); - FBI != FBE; ++FBI) { - if (PHINode *PN = dyn_cast<PHINode>(FBI)) { - for(SmallVector<BasicBlock *, 8>::iterator PI = PredBlocks.begin(), - PE = PredBlocks.end(); PI != PE; ++PI) { - BasicBlock *P = *PI; - PN->removeIncomingValue(P); - } - } - else - break; - } - } - } - - // Now remove DeadBB and all nodes dominated by DeadBB in df order. 
- SmallVector<BasicBlock *, 32> WorkList; - DomTreeNode *DN = DT->getNode(DeadBB); - for (df_iterator<DomTreeNode*> DI = df_begin(DN), - E = df_end(DN); DI != E; ++DI) { - BasicBlock *BB = DI->getBlock(); - WorkList.push_back(BB); - BB->replaceAllUsesWith(UndefValue::get( - Type::getLabelTy(DeadBB->getContext()))); - } - - while (!WorkList.empty()) { - BasicBlock *BB = WorkList.pop_back_val(); - LPM->deleteSimpleAnalysisValue(BB, LP); - for(BasicBlock::iterator BBI = BB->begin(), BBE = BB->end(); - BBI != BBE; ) { - Instruction *I = BBI; - ++BBI; - I->replaceAllUsesWith(UndefValue::get(I->getType())); - LPM->deleteSimpleAnalysisValue(I, LP); - I->eraseFromParent(); - } - DT->eraseNode(BB); - DF->removeBlock(BB); - LI->removeBlock(BB); - BB->eraseFromParent(); - } - - // Update Frontier BBs' dominator info. - while (!FrontierBBs.empty()) { - BasicBlock *FBB = FrontierBBs.pop_back_val(); - BasicBlock *NewDominator = FBB->getSinglePredecessor(); - if (!NewDominator) { - pred_iterator PI = pred_begin(FBB), PE = pred_end(FBB); - NewDominator = *PI; - ++PI; - if (NewDominator != LiveBB) { - for(; PI != PE; ++PI) { - BasicBlock *P = *PI; - if (P == LiveBB) { - NewDominator = LiveBB; - break; - } - NewDominator = DT->findNearestCommonDominator(NewDominator, P); - } - } - } - assert (NewDominator && "Unable to fix dominator info."); - DT->changeImmediateDominator(FBB, NewDominator); - DF->changeImmediateDominator(FBB, NewDominator, DT); - } - -} - -// moveExitCondition - Move exit condition EC into split condition block CondBB. -void LoopIndexSplit::moveExitCondition(BasicBlock *CondBB, BasicBlock *ActiveBB, - BasicBlock *ExitBB, ICmpInst *EC, - ICmpInst *SC, PHINode *IV, - Instruction *IVAdd, Loop *LP, - unsigned ExitValueNum) { - - BasicBlock *ExitingBB = EC->getParent(); - Instruction *CurrentBR = CondBB->getTerminator(); - - // Move exit condition into split condition block. - EC->moveBefore(CurrentBR); - EC->setOperand(ExitValueNum == 0 ? 1 : 0, IV); - - // Move exiting block's branch into split condition block. Update its branch - // destination. - BranchInst *ExitingBR = cast<BranchInst>(ExitingBB->getTerminator()); - ExitingBR->moveBefore(CurrentBR); - BasicBlock *OrigDestBB = NULL; - if (ExitingBR->getSuccessor(0) == ExitBB) { - OrigDestBB = ExitingBR->getSuccessor(1); - ExitingBR->setSuccessor(1, ActiveBB); - } - else { - OrigDestBB = ExitingBR->getSuccessor(0); - ExitingBR->setSuccessor(0, ActiveBB); - } - - // Remove split condition and current split condition branch. - SC->eraseFromParent(); - CurrentBR->eraseFromParent(); - - // Connect exiting block to original destination. - BranchInst::Create(OrigDestBB, ExitingBB); - - // Update PHINodes - updatePHINodes(ExitBB, ExitingBB, CondBB, IV, IVAdd, LP); - - // Fix dominator info. - // ExitBB is now dominated by CondBB - DT->changeImmediateDominator(ExitBB, CondBB); - DF->changeImmediateDominator(ExitBB, CondBB, DT); - - // Blocks outside the loop may have been in the dominance frontier of blocks - // inside the condition; this is now impossible because the blocks inside the - // condition no loger dominate the exit. Remove the relevant blocks from - // the dominance frontiers. 
- for (Loop::block_iterator I = LP->block_begin(), E = LP->block_end(); - I != E; ++I) { - if (!DT->properlyDominates(CondBB, *I)) continue; - DominanceFrontier::iterator BBDF = DF->find(*I); - DominanceFrontier::DomSetType::iterator DomSetI = BBDF->second.begin(); - DominanceFrontier::DomSetType::iterator DomSetE = BBDF->second.end(); - while (DomSetI != DomSetE) { - DominanceFrontier::DomSetType::iterator CurrentItr = DomSetI; - ++DomSetI; - BasicBlock *DFBB = *CurrentItr; - if (!LP->contains(DFBB)) - BBDF->second.erase(DFBB); - } - } -} - -/// updatePHINodes - CFG has been changed. -/// Before -/// - ExitBB's single predecessor was Latch -/// - Latch's second successor was Header -/// Now -/// - ExitBB's single predecessor is Header -/// - Latch's one and only successor is Header -/// -/// Update ExitBB PHINodes' to reflect this change. -void LoopIndexSplit::updatePHINodes(BasicBlock *ExitBB, BasicBlock *Latch, - BasicBlock *Header, - PHINode *IV, Instruction *IVIncrement, - Loop *LP) { - - for (BasicBlock::iterator BI = ExitBB->begin(), BE = ExitBB->end(); - BI != BE; ) { - PHINode *PN = dyn_cast<PHINode>(BI); - ++BI; - if (!PN) - break; - - Value *V = PN->getIncomingValueForBlock(Latch); - if (PHINode *PHV = dyn_cast<PHINode>(V)) { - // PHV is in Latch. PHV has one use is in ExitBB PHINode. And one use - // in Header which is new incoming value for PN. - Value *NewV = NULL; - for (Value::use_iterator UI = PHV->use_begin(), E = PHV->use_end(); - UI != E; ++UI) - if (PHINode *U = dyn_cast<PHINode>(*UI)) - if (LP->contains(U)) { - NewV = U; - break; - } - - // Add incoming value from header only if PN has any use inside the loop. - if (NewV) - PN->addIncoming(NewV, Header); - - } else if (Instruction *PHI = dyn_cast<Instruction>(V)) { - // If this instruction is IVIncrement then IV is new incoming value - // from header otherwise this instruction must be incoming value from - // header because loop is in LCSSA form. - if (PHI == IVIncrement) - PN->addIncoming(IV, Header); - else - PN->addIncoming(V, Header); - } else - // Otherwise this is an incoming value from header because loop is in - // LCSSA form. - PN->addIncoming(V, Header); - - // Remove incoming value from Latch. - PN->removeIncomingValue(Latch); - } -} - -bool LoopIndexSplit::splitLoop() { - SplitCondition = NULL; - if (ExitCondition->getPredicate() == ICmpInst::ICMP_NE - || ExitCondition->getPredicate() == ICmpInst::ICMP_EQ) - return false; - BasicBlock *Header = L->getHeader(); - BasicBlock *Latch = L->getLoopLatch(); - BranchInst *SBR = NULL; // Split Condition Branch - BranchInst *EBR = cast<BranchInst>(ExitCondition->getParent()->getTerminator()); - // If Exiting block includes loop variant instructions then this - // loop may not be split safely. - BasicBlock *ExitingBlock = ExitCondition->getParent(); - if (!cleanBlock(ExitingBlock)) return false; - - LLVMContext &Context = Header->getContext(); - - for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); - I != E; ++I) { - BranchInst *BR = dyn_cast<BranchInst>((*I)->getTerminator()); - if (!BR || BR->isUnconditional()) continue; - ICmpInst *CI = dyn_cast<ICmpInst>(BR->getCondition()); - if (!CI || CI == ExitCondition - || CI->getPredicate() == ICmpInst::ICMP_NE - || CI->getPredicate() == ICmpInst::ICMP_EQ) - continue; - - // Unable to handle triangle loops at the moment. - // In triangle loop, split condition is in header and one of the - // the split destination is loop latch. 
If split condition is EQ - // then such loops are already handle in processOneIterationLoop(). - if (Header == (*I) - && (Latch == BR->getSuccessor(0) || Latch == BR->getSuccessor(1))) - continue; - - // If the block does not dominate the latch then this is not a diamond. - // Such loop may not benefit from index split. - if (!DT->dominates((*I), Latch)) - continue; - - // If split condition branches heads do not have single predecessor, - // SplitCondBlock, then is not possible to remove inactive branch. - if (!BR->getSuccessor(0)->getSinglePredecessor() - || !BR->getSuccessor(1)->getSinglePredecessor()) - return false; - - // If the merge point for BR is not loop latch then skip this condition. - if (BR->getSuccessor(0) != Latch) { - DominanceFrontier::iterator DF0 = DF->find(BR->getSuccessor(0)); - assert (DF0 != DF->end() && "Unable to find dominance frontier"); - if (!DF0->second.count(Latch)) - continue; - } - - if (BR->getSuccessor(1) != Latch) { - DominanceFrontier::iterator DF1 = DF->find(BR->getSuccessor(1)); - assert (DF1 != DF->end() && "Unable to find dominance frontier"); - if (!DF1->second.count(Latch)) - continue; - } - SplitCondition = CI; - SBR = BR; - break; - } - - if (!SplitCondition) - return false; - - // If the predicate sign does not match then skip. - if (ExitCondition->isSigned() != SplitCondition->isSigned()) - return false; - - unsigned EVOpNum = (ExitCondition->getOperand(1) == IVExitValue); - unsigned SVOpNum = IVBasedValues.count(SplitCondition->getOperand(0)); - Value *SplitValue = SplitCondition->getOperand(SVOpNum); - if (!L->isLoopInvariant(SplitValue)) - return false; - if (!IVBasedValues.count(SplitCondition->getOperand(!SVOpNum))) - return false; - - // Check for side effects. - for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); - I != E; ++I) { - BasicBlock *BB = *I; - - assert(DT->dominates(Header, BB)); - if (DT->properlyDominates(SplitCondition->getParent(), BB)) - continue; - - for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); - BI != BE; ++BI) { - Instruction *Inst = BI; - - if (!Inst->isSafeToSpeculativelyExecute() && !isa<PHINode>(Inst) - && !isa<BranchInst>(Inst) && !isa<DbgInfoIntrinsic>(Inst)) - return false; - } - } - - // Normalize loop conditions so that it is easier to calculate new loop - // bounds. - if (IVisGT(*ExitCondition) || IVisGE(*ExitCondition)) { - ExitCondition->setPredicate(ExitCondition->getInversePredicate()); - BasicBlock *T = EBR->getSuccessor(0); - EBR->setSuccessor(0, EBR->getSuccessor(1)); - EBR->setSuccessor(1, T); - } - - if (IVisGT(*SplitCondition) || IVisGE(*SplitCondition)) { - SplitCondition->setPredicate(SplitCondition->getInversePredicate()); - BasicBlock *T = SBR->getSuccessor(0); - SBR->setSuccessor(0, SBR->getSuccessor(1)); - SBR->setSuccessor(1, T); - } - - //[*] Calculate new loop bounds. 
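The bound calculation that follows is easiest to see on the example from the file header (editorial sketch; A and B are hypothetical helpers): with exit condition "i < UB" and split condition "i < SV", both new bounds are SV itself, while an "i <= SV" split uses SV + 1 instead.

#include <algorithm>

static void A(int) {}
static void B(int) {}

static void original(int LB, int UB, int SV) {
  for (int i = LB; i < UB; ++i) {
    if (i < SV) A(i); else B(i);
  }
}

static void split_form(int LB, int UB, int SV) {
  int AEV = SV, BSV = SV;                       // SV + 1 for an "i <= SV" split
  for (int i = LB; i < std::min(UB, AEV); ++i)  // first loop runs the A side
    A(i);
  for (int i = std::max(LB, BSV); i < UB; ++i)  // second loop runs the B side
    B(i);
}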
- Value *AEV = SplitValue; - Value *BSV = SplitValue; - bool Sign = SplitCondition->isSigned(); - Instruction *PHTerm = L->getLoopPreheader()->getTerminator(); - - if (IVisLT(*ExitCondition)) { - if (IVisLT(*SplitCondition)) { - /* Do nothing */ - } - else if (IVisLE(*SplitCondition)) { - AEV = getPlusOne(SplitValue, Sign, PHTerm, Context); - BSV = getPlusOne(SplitValue, Sign, PHTerm, Context); - } else { - assert (0 && "Unexpected split condition!"); - } - } - else if (IVisLE(*ExitCondition)) { - if (IVisLT(*SplitCondition)) { - AEV = getMinusOne(SplitValue, Sign, PHTerm, Context); - } - else if (IVisLE(*SplitCondition)) { - BSV = getPlusOne(SplitValue, Sign, PHTerm, Context); - } else { - assert (0 && "Unexpected split condition!"); - } - } else { - assert (0 && "Unexpected exit condition!"); - } - AEV = getMin(AEV, IVExitValue, Sign, PHTerm); - BSV = getMax(BSV, IVStartValue, Sign, PHTerm); - - // [*] Clone Loop - ValueMap<const Value *, Value *> VMap; - Loop *BLoop = CloneLoop(L, LPM, LI, VMap, this); - Loop *ALoop = L; - - // [*] ALoop's exiting edge enters BLoop's header. - // ALoop's original exit block becomes BLoop's exit block. - PHINode *B_IndVar = cast<PHINode>(VMap[IndVar]); - BasicBlock *A_ExitingBlock = ExitCondition->getParent(); - BranchInst *A_ExitInsn = - dyn_cast<BranchInst>(A_ExitingBlock->getTerminator()); - assert (A_ExitInsn && "Unable to find suitable loop exit branch"); - BasicBlock *B_ExitBlock = A_ExitInsn->getSuccessor(1); - BasicBlock *B_Header = BLoop->getHeader(); - if (ALoop->contains(B_ExitBlock)) { - B_ExitBlock = A_ExitInsn->getSuccessor(0); - A_ExitInsn->setSuccessor(0, B_Header); - } else - A_ExitInsn->setSuccessor(1, B_Header); - - // [*] Update ALoop's exit value using new exit value. - ExitCondition->setOperand(EVOpNum, AEV); - - // [*] Update BLoop's header phi nodes. Remove incoming PHINode's from - // original loop's preheader. Add incoming PHINode values from - // ALoop's exiting block. Update BLoop header's domiantor info. - - // Collect inverse map of Header PHINodes. - DenseMap<Value *, Value *> InverseMap; - for (BasicBlock::iterator BI = ALoop->getHeader()->begin(), - BE = ALoop->getHeader()->end(); BI != BE; ++BI) { - if (PHINode *PN = dyn_cast<PHINode>(BI)) { - PHINode *PNClone = cast<PHINode>(VMap[PN]); - InverseMap[PNClone] = PN; - } else - break; - } - - BasicBlock *A_Preheader = ALoop->getLoopPreheader(); - for (BasicBlock::iterator BI = B_Header->begin(), BE = B_Header->end(); - BI != BE; ++BI) { - if (PHINode *PN = dyn_cast<PHINode>(BI)) { - // Remove incoming value from original preheader. - PN->removeIncomingValue(A_Preheader); - - // Add incoming value from A_ExitingBlock. - if (PN == B_IndVar) - PN->addIncoming(BSV, A_ExitingBlock); - else { - PHINode *OrigPN = cast<PHINode>(InverseMap[PN]); - Value *V2 = NULL; - // If loop header is also loop exiting block then - // OrigPN is incoming value for B loop header. - if (A_ExitingBlock == ALoop->getHeader()) - V2 = OrigPN; - else - V2 = OrigPN->getIncomingValueForBlock(A_ExitingBlock); - PN->addIncoming(V2, A_ExitingBlock); - } - } else - break; - } - - DT->changeImmediateDominator(B_Header, A_ExitingBlock); - DF->changeImmediateDominator(B_Header, A_ExitingBlock, DT); - - // [*] Update BLoop's exit block. Its new predecessor is BLoop's exit - // block. Remove incoming PHINode values from ALoop's exiting block. - // Add new incoming values from BLoop's incoming exiting value. - // Update BLoop exit block's dominator info.. 
- BasicBlock *B_ExitingBlock = cast<BasicBlock>(VMap[A_ExitingBlock]); - for (BasicBlock::iterator BI = B_ExitBlock->begin(), BE = B_ExitBlock->end(); - BI != BE; ++BI) { - if (PHINode *PN = dyn_cast<PHINode>(BI)) { - PN->addIncoming(VMap[PN->getIncomingValueForBlock(A_ExitingBlock)], - B_ExitingBlock); - PN->removeIncomingValue(A_ExitingBlock); - } else - break; - } - - DT->changeImmediateDominator(B_ExitBlock, B_ExitingBlock); - DF->changeImmediateDominator(B_ExitBlock, B_ExitingBlock, DT); - - //[*] Split ALoop's exit edge. This creates a new block which - // serves two purposes. First one is to hold PHINode defnitions - // to ensure that ALoop's LCSSA form. Second use it to act - // as a preheader for BLoop. - BasicBlock *A_ExitBlock = SplitEdge(A_ExitingBlock, B_Header, this); - - //[*] Preserve ALoop's LCSSA form. Create new forwarding PHINodes - // in A_ExitBlock to redefine outgoing PHI definitions from ALoop. - for(BasicBlock::iterator BI = B_Header->begin(), BE = B_Header->end(); - BI != BE; ++BI) { - if (PHINode *PN = dyn_cast<PHINode>(BI)) { - Value *V1 = PN->getIncomingValueForBlock(A_ExitBlock); - PHINode *newPHI = PHINode::Create(PN->getType(), PN->getName()); - newPHI->addIncoming(V1, A_ExitingBlock); - A_ExitBlock->getInstList().push_front(newPHI); - PN->removeIncomingValue(A_ExitBlock); - PN->addIncoming(newPHI, A_ExitBlock); - } else - break; - } - - //[*] Eliminate split condition's inactive branch from ALoop. - BasicBlock *A_SplitCondBlock = SplitCondition->getParent(); - BranchInst *A_BR = cast<BranchInst>(A_SplitCondBlock->getTerminator()); - BasicBlock *A_InactiveBranch = NULL; - BasicBlock *A_ActiveBranch = NULL; - A_ActiveBranch = A_BR->getSuccessor(0); - A_InactiveBranch = A_BR->getSuccessor(1); - A_BR->setUnconditionalDest(A_ActiveBranch); - removeBlocks(A_InactiveBranch, L, A_ActiveBranch); - - //[*] Eliminate split condition's inactive branch in from BLoop. - BasicBlock *B_SplitCondBlock = cast<BasicBlock>(VMap[A_SplitCondBlock]); - BranchInst *B_BR = cast<BranchInst>(B_SplitCondBlock->getTerminator()); - BasicBlock *B_InactiveBranch = NULL; - BasicBlock *B_ActiveBranch = NULL; - B_ActiveBranch = B_BR->getSuccessor(1); - B_InactiveBranch = B_BR->getSuccessor(0); - B_BR->setUnconditionalDest(B_ActiveBranch); - removeBlocks(B_InactiveBranch, BLoop, B_ActiveBranch); - - BasicBlock *A_Header = ALoop->getHeader(); - if (A_ExitingBlock == A_Header) - return true; - - //[*] Move exit condition into split condition block to avoid - // executing dead loop iteration. - ICmpInst *B_ExitCondition = cast<ICmpInst>(VMap[ExitCondition]); - Instruction *B_IndVarIncrement = cast<Instruction>(VMap[IVIncrement]); - ICmpInst *B_SplitCondition = cast<ICmpInst>(VMap[SplitCondition]); - - moveExitCondition(A_SplitCondBlock, A_ActiveBranch, A_ExitBlock, ExitCondition, - cast<ICmpInst>(SplitCondition), IndVar, IVIncrement, - ALoop, EVOpNum); - - moveExitCondition(B_SplitCondBlock, B_ActiveBranch, - B_ExitBlock, B_ExitCondition, - B_SplitCondition, B_IndVar, B_IndVarIncrement, - BLoop, EVOpNum); - - ++NumIndexSplit; - return true; -} - -/// cleanBlock - A block is considered clean if all non terminal instructions -/// are either, PHINodes, IV based. 
-bool LoopIndexSplit::cleanBlock(BasicBlock *BB) { - Instruction *Terminator = BB->getTerminator(); - for(BasicBlock::iterator BI = BB->begin(), BE = BB->end(); - BI != BE; ++BI) { - Instruction *I = BI; - - if (isa<PHINode>(I) || I == Terminator || I == ExitCondition - || I == SplitCondition || IVBasedValues.count(I) - || isa<DbgInfoIntrinsic>(I)) - continue; - - if (I->mayHaveSideEffects()) - return false; - - // I is used only inside this block then it is OK. - bool usedOutsideBB = false; - for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); - UI != UE; ++UI) { - Instruction *U = cast<Instruction>(*UI); - if (U->getParent() != BB) - usedOutsideBB = true; - } - if (!usedOutsideBB) - continue; - - // Otherwise we have a instruction that may not allow loop spliting. - return false; - } - return true; -} - -/// IVisLT - If Op is comparing IV based value with an loop invariant and -/// IV based value is less than the loop invariant then return the loop -/// invariant. Otherwise return NULL. -Value * LoopIndexSplit::IVisLT(ICmpInst &Op) { - ICmpInst::Predicate P = Op.getPredicate(); - if ((P == ICmpInst::ICMP_SLT || P == ICmpInst::ICMP_ULT) - && IVBasedValues.count(Op.getOperand(0)) - && L->isLoopInvariant(Op.getOperand(1))) - return Op.getOperand(1); - - if ((P == ICmpInst::ICMP_SGT || P == ICmpInst::ICMP_UGT) - && IVBasedValues.count(Op.getOperand(1)) - && L->isLoopInvariant(Op.getOperand(0))) - return Op.getOperand(0); - - return NULL; -} - -/// IVisLE - If Op is comparing IV based value with an loop invariant and -/// IV based value is less than or equal to the loop invariant then -/// return the loop invariant. Otherwise return NULL. -Value * LoopIndexSplit::IVisLE(ICmpInst &Op) { - ICmpInst::Predicate P = Op.getPredicate(); - if ((P == ICmpInst::ICMP_SLE || P == ICmpInst::ICMP_ULE) - && IVBasedValues.count(Op.getOperand(0)) - && L->isLoopInvariant(Op.getOperand(1))) - return Op.getOperand(1); - - if ((P == ICmpInst::ICMP_SGE || P == ICmpInst::ICMP_UGE) - && IVBasedValues.count(Op.getOperand(1)) - && L->isLoopInvariant(Op.getOperand(0))) - return Op.getOperand(0); - - return NULL; -} - -/// IVisGT - If Op is comparing IV based value with an loop invariant and -/// IV based value is greater than the loop invariant then return the loop -/// invariant. Otherwise return NULL. -Value * LoopIndexSplit::IVisGT(ICmpInst &Op) { - ICmpInst::Predicate P = Op.getPredicate(); - if ((P == ICmpInst::ICMP_SGT || P == ICmpInst::ICMP_UGT) - && IVBasedValues.count(Op.getOperand(0)) - && L->isLoopInvariant(Op.getOperand(1))) - return Op.getOperand(1); - - if ((P == ICmpInst::ICMP_SLT || P == ICmpInst::ICMP_ULT) - && IVBasedValues.count(Op.getOperand(1)) - && L->isLoopInvariant(Op.getOperand(0))) - return Op.getOperand(0); - - return NULL; -} - -/// IVisGE - If Op is comparing IV based value with an loop invariant and -/// IV based value is greater than or equal to the loop invariant then -/// return the loop invariant. Otherwise return NULL. 
-Value * LoopIndexSplit::IVisGE(ICmpInst &Op) { - ICmpInst::Predicate P = Op.getPredicate(); - if ((P == ICmpInst::ICMP_SGE || P == ICmpInst::ICMP_UGE) - && IVBasedValues.count(Op.getOperand(0)) - && L->isLoopInvariant(Op.getOperand(1))) - return Op.getOperand(1); - - if ((P == ICmpInst::ICMP_SLE || P == ICmpInst::ICMP_ULE) - && IVBasedValues.count(Op.getOperand(1)) - && L->isLoopInvariant(Op.getOperand(0))) - return Op.getOperand(0); - - return NULL; -} - diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp new file mode 100644 index 0000000..af25c5c --- /dev/null +++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -0,0 +1,170 @@ +//===- LoopInstSimplify.cpp - Loop Instruction Simplification Pass --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass performs lightweight instruction simplification on loop bodies. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "loop-instsimplify" +#include "llvm/Instructions.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/ADT/Statistic.h" +using namespace llvm; + +STATISTIC(NumSimplified, "Number of redundant instructions simplified"); + +namespace { + class LoopInstSimplify : public LoopPass { + public: + static char ID; // Pass ID, replacement for typeid + LoopInstSimplify() : LoopPass(ID) { + initializeLoopInstSimplifyPass(*PassRegistry::getPassRegistry()); + } + + bool runOnLoop(Loop*, LPPassManager&); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + AU.addRequired<LoopInfo>(); + AU.addRequiredID(LoopSimplifyID); + AU.addPreservedID(LoopSimplifyID); + AU.addPreservedID(LCSSAID); + AU.addPreserved("scalar-evolution"); + } + }; +} + +char LoopInstSimplify::ID = 0; +INITIALIZE_PASS_BEGIN(LoopInstSimplify, "loop-instsimplify", + "Simplify instructions in loops", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_END(LoopInstSimplify, "loop-instsimplify", + "Simplify instructions in loops", false, false) + +Pass *llvm::createLoopInstSimplifyPass() { + return new LoopInstSimplify(); +} + +bool LoopInstSimplify::runOnLoop(Loop *L, LPPassManager &LPM) { + DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>(); + LoopInfo *LI = &getAnalysis<LoopInfo>(); + const TargetData *TD = getAnalysisIfAvailable<TargetData>(); + + SmallVector<BasicBlock*, 8> ExitBlocks; + L->getUniqueExitBlocks(ExitBlocks); + array_pod_sort(ExitBlocks.begin(), ExitBlocks.end()); + + SmallPtrSet<const Instruction*, 8> S1, S2, *ToSimplify = &S1, *Next = &S2; + + // The bit we are stealing from the pointer represents whether this basic + // block is the header of a subloop, in which case we only process its phis. 
+ typedef PointerIntPair<BasicBlock*, 1> WorklistItem; + SmallVector<WorklistItem, 16> VisitStack; + SmallPtrSet<BasicBlock*, 32> Visited; + + bool Changed = false; + bool LocalChanged; + do { + LocalChanged = false; + + VisitStack.clear(); + Visited.clear(); + + VisitStack.push_back(WorklistItem(L->getHeader(), false)); + + while (!VisitStack.empty()) { + WorklistItem Item = VisitStack.pop_back_val(); + BasicBlock *BB = Item.getPointer(); + bool IsSubloopHeader = Item.getInt(); + + // Simplify instructions in the current basic block. + for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { + Instruction *I = BI++; + + // The first time through the loop ToSimplify is empty and we try to + // simplify all instructions. On later iterations ToSimplify is not + // empty and we only bother simplifying instructions that are in it. + if (!ToSimplify->empty() && !ToSimplify->count(I)) + continue; + + // Don't bother simplifying unused instructions. + if (!I->use_empty()) { + Value *V = SimplifyInstruction(I, TD, DT); + if (V && LI->replacementPreservesLCSSAForm(I, V)) { + // Mark all uses for resimplification next time round the loop. + for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); + UI != UE; ++UI) + Next->insert(cast<Instruction>(*UI)); + + I->replaceAllUsesWith(V); + LocalChanged = true; + ++NumSimplified; + } + } + LocalChanged |= RecursivelyDeleteTriviallyDeadInstructions(I); + + if (IsSubloopHeader && !isa<PHINode>(I)) + break; + } + + // Add all successors to the worklist, except for loop exit blocks and the + // bodies of subloops. We visit the headers of loops so that we can process + // their phis, but we contract the rest of the subloop body and only follow + // edges leading back to the original loop. + for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; + ++SI) { + BasicBlock *SuccBB = *SI; + if (!Visited.insert(SuccBB)) + continue; + + const Loop *SuccLoop = LI->getLoopFor(SuccBB); + if (SuccLoop && SuccLoop->getHeader() == SuccBB + && L->contains(SuccLoop)) { + VisitStack.push_back(WorklistItem(SuccBB, true)); + + SmallVector<BasicBlock*, 8> SubLoopExitBlocks; + SuccLoop->getExitBlocks(SubLoopExitBlocks); + + for (unsigned i = 0; i < SubLoopExitBlocks.size(); ++i) { + BasicBlock *ExitBB = SubLoopExitBlocks[i]; + if (LI->getLoopFor(ExitBB) == L && Visited.insert(ExitBB)) + VisitStack.push_back(WorklistItem(ExitBB, false)); + } + + continue; + } + + bool IsExitBlock = std::binary_search(ExitBlocks.begin(), + ExitBlocks.end(), SuccBB); + if (IsExitBlock) + continue; + + VisitStack.push_back(WorklistItem(SuccBB, false)); + } + } + + // Place the list of instructions to simplify on the next loop iteration + // into ToSimplify. 
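The lines completing the loop below implement a small two-set fixpoint driver: users of instructions that were just simplified are queued into *Next, and swapping the sets makes exactly those instructions the candidates of the following sweep (on the first sweep ToSimplify is empty and every instruction is tried). A minimal standalone sketch of the same pattern, with hypothetical names and plain integers standing in for instructions:

#include <set>
#include <utility>

static bool sweep_to_fixpoint(std::set<int> &initialWork) {
  std::set<int> s1(initialWork), s2;
  std::set<int> *toSimplify = &s1, *next = &s2;
  bool changed = false, localChanged;
  do {
    localChanged = false;
    for (std::set<int>::iterator I = toSimplify->begin(),
                                 E = toSimplify->end(); I != E; ++I) {
      // ... attempt to simplify *I; on success insert its users into *next
      //     and set localChanged = true ...
    }
    std::swap(toSimplify, next);  // next sweep revisits only affected users
    next->clear();
    changed |= localChanged;
  } while (localChanged);
  return changed;
}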
+ std::swap(ToSimplify, Next); + Next->clear(); + + Changed |= LocalChanged; + } while (LocalChanged); + + return Changed; +} diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index 65acc1d..95e1578 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -15,16 +15,16 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Function.h" #include "llvm/IntrinsicInst.h" +#include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" -#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Utils/ValueMapper.h" #include "llvm/Support/Debug.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/SmallVector.h" using namespace llvm; #define MAX_HEADER_SIZE 16 @@ -35,16 +35,13 @@ namespace { class LoopRotate : public LoopPass { public: static char ID; // Pass ID, replacement for typeid - LoopRotate() : LoopPass(ID) {} - - // Rotate Loop L as many times as possible. Return true if - // loop is rotated at least once. - bool runOnLoop(Loop *L, LPPassManager &LPM); + LoopRotate() : LoopPass(ID) { + initializeLoopRotatePass(*PassRegistry::getPassRegistry()); + } // LCSSA form makes instruction renaming easier. virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved<DominatorTree>(); - AU.addPreserved<DominanceFrontier>(); AU.addRequired<LoopInfo>(); AU.addPreserved<LoopInfo>(); AU.addRequiredID(LoopSimplifyID); @@ -54,79 +51,119 @@ namespace { AU.addPreserved<ScalarEvolution>(); } - // Helper functions - - /// Do actual work - bool rotateLoop(Loop *L, LPPassManager &LPM); + bool runOnLoop(Loop *L, LPPassManager &LPM); + bool rotateLoop(Loop *L); - /// Initialize local data - void initialize(); - - /// After loop rotation, loop pre-header has multiple sucessors. - /// Insert one forwarding basic block to ensure that loop pre-header - /// has only one successor. - void preserveCanonicalLoopForm(LPPassManager &LPM); - private: - Loop *L; - BasicBlock *OrigHeader; - BasicBlock *OrigPreHeader; - BasicBlock *OrigLatch; - BasicBlock *NewHeader; - BasicBlock *Exit; - LPPassManager *LPM_Ptr; + LoopInfo *LI; }; } char LoopRotate::ID = 0; -INITIALIZE_PASS(LoopRotate, "loop-rotate", "Rotate Loops", false, false); +INITIALIZE_PASS_BEGIN(LoopRotate, "loop-rotate", "Rotate Loops", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_END(LoopRotate, "loop-rotate", "Rotate Loops", false, false) Pass *llvm::createLoopRotatePass() { return new LoopRotate(); } /// Rotate Loop L as many times as possible. Return true if /// the loop is rotated at least once. -bool LoopRotate::runOnLoop(Loop *Lp, LPPassManager &LPM) { - - bool RotatedOneLoop = false; - initialize(); - LPM_Ptr = &LPM; +bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) { + LI = &getAnalysis<LoopInfo>(); // One loop can be rotated multiple times. - while (rotateLoop(Lp,LPM)) { - RotatedOneLoop = true; - initialize(); - } + bool MadeChange = false; + while (rotateLoop(L)) + MadeChange = true; - return RotatedOneLoop; + return MadeChange; } -/// Rotate loop LP. Return true if the loop is rotated. 
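The pattern just above recurs in nearly every file of this commit: the old single INITIALIZE_PASS(...) line (with its trailing semicolon) becomes an INITIALIZE_PASS_BEGIN / INITIALIZE_PASS_DEPENDENCY / INITIALIZE_PASS_END block, and the constructor gains an initializeXPass() call, so registering a pass also registers the analyses it depends on. A condensed sketch of the new shape, using a hypothetical pass named MyLoopPass (the surrounding class definition and header declarations are omitted):

    // Constructor: make sure this pass (and, via the macros below, its
    // dependencies) is known to the global PassRegistry.
    MyLoopPass() : LoopPass(ID) {
      initializeMyLoopPassPass(*PassRegistry::getPassRegistry());
    }

    char MyLoopPass::ID = 0;
    INITIALIZE_PASS_BEGIN(MyLoopPass, "my-loop-pass",
                          "Example loop pass", false, false)
    INITIALIZE_PASS_DEPENDENCY(LoopInfo)
    INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
    INITIALIZE_PASS_DEPENDENCY(LCSSA)
    INITIALIZE_PASS_END(MyLoopPass, "my-loop-pass",
                        "Example loop pass", false, false)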
-bool LoopRotate::rotateLoop(Loop *Lp, LPPassManager &LPM) { - L = Lp; - - OrigPreHeader = L->getLoopPreheader(); - if (!OrigPreHeader) return false; - - OrigLatch = L->getLoopLatch(); - if (!OrigLatch) return false; +/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the +/// old header into the preheader. If there were uses of the values produced by +/// these instruction that were outside of the loop, we have to insert PHI nodes +/// to merge the two values. Do this now. +static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, + BasicBlock *OrigPreheader, + ValueToValueMapTy &ValueMap) { + // Remove PHI node entries that are no longer live. + BasicBlock::iterator I, E = OrigHeader->end(); + for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I) + PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader)); + + // Now fix up users of the instructions in OrigHeader, inserting PHI nodes + // as necessary. + SSAUpdater SSA; + for (I = OrigHeader->begin(); I != E; ++I) { + Value *OrigHeaderVal = I; + + // If there are no uses of the value (e.g. because it returns void), there + // is nothing to rewrite. + if (OrigHeaderVal->use_empty()) + continue; + + Value *OrigPreHeaderVal = ValueMap[OrigHeaderVal]; - OrigHeader = L->getHeader(); + // The value now exits in two versions: the initial value in the preheader + // and the loop "next" value in the original header. + SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName()); + SSA.AddAvailableValue(OrigHeader, OrigHeaderVal); + SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal); + + // Visit each use of the OrigHeader instruction. + for (Value::use_iterator UI = OrigHeaderVal->use_begin(), + UE = OrigHeaderVal->use_end(); UI != UE; ) { + // Grab the use before incrementing the iterator. + Use &U = UI.getUse(); + + // Increment the iterator before removing the use from the list. + ++UI; + + // SSAUpdater can't handle a non-PHI use in the same block as an + // earlier def. We can easily handle those cases manually. + Instruction *UserInst = cast<Instruction>(U.getUser()); + if (!isa<PHINode>(UserInst)) { + BasicBlock *UserBB = UserInst->getParent(); + + // The original users in the OrigHeader are already using the + // original definitions. + if (UserBB == OrigHeader) + continue; + + // Users in the OrigPreHeader need to use the value to which the + // original definitions are mapped. + if (UserBB == OrigPreheader) { + U = OrigPreHeaderVal; + continue; + } + } + + // Anything else can be handled by SSAUpdater. + SSA.RewriteUse(U); + } + } +} +/// Rotate loop LP. Return true if the loop is rotated. +bool LoopRotate::rotateLoop(Loop *L) { // If the loop has only one block then there is not much to rotate. if (L->getBlocks().size() == 1) return false; - + + BasicBlock *OrigHeader = L->getHeader(); + + BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator()); + if (BI == 0 || BI->isUnconditional()) + return false; + // If the loop header is not one of the loop exiting blocks then // either this loop is already rotated or it is not // suitable for loop rotation transformations. if (!L->isLoopExiting(OrigHeader)) return false; - BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator()); - if (!BI) - return false; - assert(BI->isConditional() && "Branch Instruction is not conditional"); - // Updating PHInodes in loops with multiple exits adds complexity. // Keep it simple, and restrict loop rotation to loops with one exit only. 
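RewriteUsesOfClonedInstructions above is a textbook SSAUpdater use: declare one value as available in two blocks and let RewriteUse place whatever PHI nodes the CFG requires. A trimmed sketch of that call sequence, reusing the same 2.9-era API and the OrigHeader/OrigPreheader/ValueMap names from the hunk above (the special case for non-PHI uses inside those two blocks, which the real code handles manually, is omitted here):

    SSAUpdater SSA;
    for (BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
         I != E; ++I) {
      Value *OrigV = I;
      if (OrigV->use_empty())
        continue;                      // nothing outside this block to fix up

      // The value now has two definitions: the clone in the preheader and the
      // original in the old header; tell SSAUpdater about both.
      SSA.Initialize(OrigV->getType(), OrigV->getName());
      SSA.AddAvailableValue(OrigHeader, OrigV);
      SSA.AddAvailableValue(OrigPreheader, ValueMap[OrigV]);

      for (Value::use_iterator UI = OrigV->use_begin(), UE = OrigV->use_end();
           UI != UE; ) {
        Use &U = UI.getUse();
        ++UI;                          // advance first; RewriteUse edits the list
        SSA.RewriteUse(U);             // inserts PHIs at CFG merge points
      }
    }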
// In future, lift this restriction and support for multiple exits if @@ -136,24 +173,18 @@ bool LoopRotate::rotateLoop(Loop *Lp, LPPassManager &LPM) { if (ExitBlocks.size() > 1) return false; - // Check size of original header and reject - // loop if it is very big. - unsigned Size = 0; - - // FIXME: Use common api to estimate size. - for (BasicBlock::const_iterator OI = OrigHeader->begin(), - OE = OrigHeader->end(); OI != OE; ++OI) { - if (isa<PHINode>(OI)) - continue; // PHI nodes don't count. - if (isa<DbgInfoIntrinsic>(OI)) - continue; // Debug intrinsics don't count as size. - ++Size; + // Check size of original header and reject loop if it is very big. + { + CodeMetrics Metrics; + Metrics.analyzeBasicBlock(OrigHeader); + if (Metrics.NumInsts > MAX_HEADER_SIZE) + return false; } - if (Size > MAX_HEADER_SIZE) - return false; - // Now, this loop is suitable for rotation. + BasicBlock *OrigPreheader = L->getLoopPreheader(); + BasicBlock *OrigLatch = L->getLoopLatch(); + assert(OrigPreheader && OrigLatch && "Loop not in canonical form?"); // Anything ScalarEvolution may know about this loop or the PHI nodes // in its header will soon be invalidated. @@ -163,8 +194,8 @@ bool LoopRotate::rotateLoop(Loop *Lp, LPPassManager &LPM) { // Find new Loop header. NewHeader is a Header's one and only successor // that is inside loop. Header's other successor is outside the // loop. Otherwise loop is not suitable for rotation. - Exit = BI->getSuccessor(0); - NewHeader = BI->getSuccessor(1); + BasicBlock *Exit = BI->getSuccessor(0); + BasicBlock *NewHeader = BI->getSuccessor(1); if (L->contains(Exit)) std::swap(Exit, NewHeader); assert(NewHeader && "Unable to determine new loop header"); @@ -180,20 +211,54 @@ bool LoopRotate::rotateLoop(Loop *Lp, LPPassManager &LPM) { // Begin by walking OrigHeader and populating ValueMap with an entry for // each Instruction. BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end(); - DenseMap<const Value *, Value *> ValueMap; + ValueToValueMapTy ValueMap; // For PHI nodes, the value available in OldPreHeader is just the // incoming value from OldPreHeader. for (; PHINode *PN = dyn_cast<PHINode>(I); ++I) - ValueMap[PN] = PN->getIncomingValue(PN->getBasicBlockIndex(OrigPreHeader)); + ValueMap[PN] = PN->getIncomingValue(PN->getBasicBlockIndex(OrigPreheader)); - // For the rest of the instructions, create a clone in the OldPreHeader. - TerminatorInst *LoopEntryBranch = OrigPreHeader->getTerminator(); - for (; I != E; ++I) { - Instruction *C = I->clone(); - C->setName(I->getName()); - C->insertBefore(LoopEntryBranch); - ValueMap[I] = C; + // For the rest of the instructions, either hoist to the OrigPreheader if + // possible or create a clone in the OldPreHeader if not. + TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator(); + while (I != E) { + Instruction *Inst = I++; + + // If the instruction's operands are invariant and it doesn't read or write + // memory, then it is safe to hoist. Doing this doesn't change the order of + // execution in the preheader, but does prevent the instruction from + // executing in each iteration of the loop. This means it is safe to hoist + // something that might trap, but isn't safe to hoist something that reads + // memory (without proving that the loop doesn't write). 
+ if (L->hasLoopInvariantOperands(Inst) && + !Inst->mayReadFromMemory() && !Inst->mayWriteToMemory() && + !isa<TerminatorInst>(Inst) && !isa<DbgInfoIntrinsic>(Inst)) { + Inst->moveBefore(LoopEntryBranch); + continue; + } + + // Otherwise, create a duplicate of the instruction. + Instruction *C = Inst->clone(); + + // Eagerly remap the operands of the instruction. + RemapInstruction(C, ValueMap, + RF_NoModuleLevelChanges|RF_IgnoreMissingEntries); + + // With the operands remapped, see if the instruction constant folds or is + // otherwise simplifyable. This commonly occurs because the entry from PHI + // nodes allows icmps and other instructions to fold. + Value *V = SimplifyInstruction(C); + if (V && LI->replacementPreservesLCSSAForm(C, V)) { + // If so, then delete the temporary instruction and stick the folded value + // in the map. + delete C; + ValueMap[Inst] = V; + } else { + // Otherwise, stick the new instruction into the new block! + C->setName(Inst->getName()); + C->insertBefore(LoopEntryBranch); + ValueMap[Inst] = C; + } } // Along with all the other instructions, we just cloned OrigHeader's @@ -203,221 +268,81 @@ bool LoopRotate::rotateLoop(Loop *Lp, LPPassManager &LPM) { for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) for (BasicBlock::iterator BI = TI->getSuccessor(i)->begin(); PHINode *PN = dyn_cast<PHINode>(BI); ++BI) - PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreHeader); + PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove // OrigPreHeader's old terminator (the original branch into the loop), and // remove the corresponding incoming values from the PHI nodes in OrigHeader. LoopEntryBranch->eraseFromParent(); - for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I) - PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreHeader)); - // Now fix up users of the instructions in OrigHeader, inserting PHI nodes - // as necessary. - SSAUpdater SSA; - for (I = OrigHeader->begin(); I != E; ++I) { - Value *OrigHeaderVal = I; - Value *OrigPreHeaderVal = ValueMap[OrigHeaderVal]; - - // The value now exits in two versions: the initial value in the preheader - // and the loop "next" value in the original header. - SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName()); - SSA.AddAvailableValue(OrigHeader, OrigHeaderVal); - SSA.AddAvailableValue(OrigPreHeader, OrigPreHeaderVal); - - // Visit each use of the OrigHeader instruction. - for (Value::use_iterator UI = OrigHeaderVal->use_begin(), - UE = OrigHeaderVal->use_end(); UI != UE; ) { - // Grab the use before incrementing the iterator. - Use &U = UI.getUse(); - - // Increment the iterator before removing the use from the list. - ++UI; - - // SSAUpdater can't handle a non-PHI use in the same block as an - // earlier def. We can easily handle those cases manually. - Instruction *UserInst = cast<Instruction>(U.getUser()); - if (!isa<PHINode>(UserInst)) { - BasicBlock *UserBB = UserInst->getParent(); - - // The original users in the OrigHeader are already using the - // original definitions. - if (UserBB == OrigHeader) - continue; - - // Users in the OrigPreHeader need to use the value to which the - // original definitions are mapped. - if (UserBB == OrigPreHeader) { - U = OrigPreHeaderVal; - continue; - } - } - - // Anything else can be handled by SSAUpdater. 
- SSA.RewriteUse(U); - } - } + // If there were any uses of instructions in the duplicated block outside the + // loop, update them, inserting PHI nodes as required + RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap); // NewHeader is now the header of the loop. L->moveToHeader(NewHeader); + assert(L->getHeader() == NewHeader && "Latch block is our new header"); - // Move the original header to the bottom of the loop, where it now more - // naturally belongs. This isn't necessary for correctness, and CodeGen can - // usually reorder blocks on its own to fix things like this up, but it's - // still nice to keep the IR readable. - // - // The original header should have only one predecessor at this point, since - // we checked that the loop had a proper preheader and unique backedge before - // we started. - assert(OrigHeader->getSinglePredecessor() && - "Original loop header has too many predecessors after loop rotation!"); - OrigHeader->moveAfter(OrigHeader->getSinglePredecessor()); - - // Also, since this original header only has one predecessor, zap its - // PHI nodes, which are now trivial. - FoldSingleEntryPHINodes(OrigHeader); - - // TODO: We could just go ahead and merge OrigHeader into its predecessor - // at this point, if we don't mind updating dominator info. - - // Establish a new preheader, update dominators, etc. - preserveCanonicalLoopForm(LPM); - - ++NumRotated; - return true; -} - -/// Initialize local data -void LoopRotate::initialize() { - L = NULL; - OrigHeader = NULL; - OrigPreHeader = NULL; - NewHeader = NULL; - Exit = NULL; -} - -/// After loop rotation, loop pre-header has multiple sucessors. -/// Insert one forwarding basic block to ensure that loop pre-header -/// has only one successor. -void LoopRotate::preserveCanonicalLoopForm(LPPassManager &LPM) { - - // Right now original pre-header has two successors, new header and - // exit block. Insert new block between original pre-header and - // new header such that loop's new pre-header has only one successor. 
- BasicBlock *NewPreHeader = BasicBlock::Create(OrigHeader->getContext(), - "bb.nph", - OrigHeader->getParent(), - NewHeader); - LoopInfo &LI = getAnalysis<LoopInfo>(); - if (Loop *PL = LI.getLoopFor(OrigPreHeader)) - PL->addBasicBlockToLoop(NewPreHeader, LI.getBase()); - BranchInst::Create(NewHeader, NewPreHeader); - BranchInst *OrigPH_BI = cast<BranchInst>(OrigPreHeader->getTerminator()); - if (OrigPH_BI->getSuccessor(0) == NewHeader) - OrigPH_BI->setSuccessor(0, NewPreHeader); - else { - assert(OrigPH_BI->getSuccessor(1) == NewHeader && - "Unexpected original pre-header terminator"); - OrigPH_BI->setSuccessor(1, NewPreHeader); - } - - PHINode *PN; - for (BasicBlock::iterator I = NewHeader->begin(); - (PN = dyn_cast<PHINode>(I)); ++I) { - int index = PN->getBasicBlockIndex(OrigPreHeader); - assert(index != -1 && "Expected incoming value from Original PreHeader"); - PN->setIncomingBlock(index, NewPreHeader); - assert(PN->getBasicBlockIndex(OrigPreHeader) == -1 && - "Expected only one incoming value from Original PreHeader"); - } - - if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) { - DT->addNewBlock(NewPreHeader, OrigPreHeader); - DT->changeImmediateDominator(L->getHeader(), NewPreHeader); - DT->changeImmediateDominator(Exit, OrigPreHeader); - for (Loop::block_iterator BI = L->block_begin(), BE = L->block_end(); - BI != BE; ++BI) { - BasicBlock *B = *BI; - if (L->getHeader() != B) { - DomTreeNode *Node = DT->getNode(B); - if (Node && Node->getBlock() == OrigHeader) - DT->changeImmediateDominator(*BI, L->getHeader()); - } - } - DT->changeImmediateDominator(OrigHeader, OrigLatch); - } - - if (DominanceFrontier *DF = getAnalysisIfAvailable<DominanceFrontier>()) { - // New Preheader's dominance frontier is Exit block. - DominanceFrontier::DomSetType NewPHSet; - NewPHSet.insert(Exit); - DF->addBasicBlock(NewPreHeader, NewPHSet); - - // New Header's dominance frontier now includes itself and Exit block - DominanceFrontier::iterator HeadI = DF->find(L->getHeader()); - if (HeadI != DF->end()) { - DominanceFrontier::DomSetType & HeaderSet = HeadI->second; - HeaderSet.clear(); - HeaderSet.insert(L->getHeader()); - HeaderSet.insert(Exit); - } else { - DominanceFrontier::DomSetType HeaderSet; - HeaderSet.insert(L->getHeader()); - HeaderSet.insert(Exit); - DF->addBasicBlock(L->getHeader(), HeaderSet); - } - - // Original header (new Loop Latch)'s dominance frontier is Exit. - DominanceFrontier::iterator LatchI = DF->find(L->getLoopLatch()); - if (LatchI != DF->end()) { - DominanceFrontier::DomSetType &LatchSet = LatchI->second; - LatchSet = LatchI->second; - LatchSet.clear(); - LatchSet.insert(Exit); - } else { - DominanceFrontier::DomSetType LatchSet; - LatchSet.insert(Exit); - DF->addBasicBlock(L->getHeader(), LatchSet); + // At this point, we've finished our major CFG changes. As part of cloning + // the loop into the preheader we've simplified instructions and the + // duplicated conditional branch may now be branching on a constant. If it is + // branching on a constant and if that constant means that we enter the loop, + // then we fold away the cond branch to an uncond branch. This simplifies the + // loop in cases important for nested loops, and it also means we don't have + // to split as many edges. 
+ BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator()); + assert(PHBI->isConditional() && "Should be clone of BI condbr!"); + if (!isa<ConstantInt>(PHBI->getCondition()) || + PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) + != NewHeader) { + // The conditional branch can't be folded, handle the general case. + // Update DominatorTree to reflect the CFG change we just made. Then split + // edges as necessary to preserve LoopSimplify form. + if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) { + // Since OrigPreheader now has the conditional branch to Exit block, it is + // the dominator of Exit. + DT->changeImmediateDominator(Exit, OrigPreheader); + DT->changeImmediateDominator(NewHeader, OrigPreheader); + + // Update OrigHeader to be dominated by the new header block. + DT->changeImmediateDominator(OrigHeader, OrigLatch); } - - // If a loop block dominates new loop latch then add to its frontiers - // new header and Exit and remove new latch (which is equal to original - // header). - BasicBlock *NewLatch = L->getLoopLatch(); - - assert(NewLatch == OrigHeader && "NewLatch is inequal to OrigHeader"); - + + // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and + // thus is not a preheader anymore. Split the edge to form a real preheader. + BasicBlock *NewPH = SplitCriticalEdge(OrigPreheader, NewHeader, this); + NewPH->setName(NewHeader->getName() + ".lr.ph"); + + // Preserve canonical loop form, which means that 'Exit' should have only one + // predecessor. + BasicBlock *ExitSplit = SplitCriticalEdge(L->getLoopLatch(), Exit, this); + ExitSplit->moveBefore(Exit); + } else { + // We can fold the conditional branch in the preheader, this makes things + // simpler. The first step is to remove the extra edge to the Exit block. + Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/); + BranchInst::Create(NewHeader, PHBI); + PHBI->eraseFromParent(); + + // With our CFG finalized, update DomTree if it is available. if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) { - for (Loop::block_iterator BI = L->block_begin(), BE = L->block_end(); - BI != BE; ++BI) { - BasicBlock *B = *BI; - if (DT->dominates(B, NewLatch)) { - DominanceFrontier::iterator BDFI = DF->find(B); - if (BDFI != DF->end()) { - DominanceFrontier::DomSetType &BSet = BDFI->second; - BSet.erase(NewLatch); - BSet.insert(L->getHeader()); - BSet.insert(Exit); - } else { - DominanceFrontier::DomSetType BSet; - BSet.insert(L->getHeader()); - BSet.insert(Exit); - DF->addBasicBlock(B, BSet); - } - } - } + // Update OrigHeader to be dominated by the new header block. + DT->changeImmediateDominator(NewHeader, OrigPreheader); + DT->changeImmediateDominator(OrigHeader, OrigLatch); } } - - // Preserve canonical loop form, which means Exit block should - // have only one predecessor. - SplitEdge(L->getLoopLatch(), Exit, this); - - assert(NewHeader && L->getHeader() == NewHeader && - "Invalid loop header after loop rotation"); - assert(NewPreHeader && L->getLoopPreheader() == NewPreHeader && - "Invalid loop preheader after loop rotation"); - assert(L->getLoopLatch() && - "Invalid loop latch after loop rotation"); + + assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation"); + assert(L->getLoopLatch() && "Invalid loop latch after loop rotation"); + + // Now that the CFG and DomTree are in a consistent state again, try to merge + // the OrigHeader block into OrigLatch. 
This will succeed if they are + // connected by an unconditional branch. This is just a cleanup so the + // emitted code isn't too gross in this common case. + MergeBlockIntoPredecessor(OrigHeader, this); + + ++NumRotated; + return true; } + diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index e8dc5d3..ac4aea2 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -63,6 +63,7 @@ #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Assembly/Writer.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/ADT/SmallBitVector.h" @@ -113,7 +114,7 @@ class RegUseTracker { public: void CountRegister(const SCEV *Reg, size_t LUIdx); void DropRegister(const SCEV *Reg, size_t LUIdx); - void DropUse(size_t LUIdx); + void SwapAndDropUse(size_t LUIdx, size_t LastLUIdx); bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const; @@ -152,11 +153,19 @@ RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) { } void -RegUseTracker::DropUse(size_t LUIdx) { - // Remove the use index from every register's use list. +RegUseTracker::SwapAndDropUse(size_t LUIdx, size_t LastLUIdx) { + assert(LUIdx <= LastLUIdx); + + // Update RegUses. The data structure is not optimized for this purpose; + // we must iterate through it and update each of the bit vectors. for (RegUsesTy::iterator I = RegUsesMap.begin(), E = RegUsesMap.end(); - I != E; ++I) - I->second.UsedByIndices.reset(LUIdx); + I != E; ++I) { + SmallBitVector &UsedByIndices = I->second.UsedByIndices; + if (LUIdx < UsedByIndices.size()) + UsedByIndices[LUIdx] = + LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : 0; + UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx)); + } } bool @@ -202,8 +211,7 @@ struct Formula { Formula() : ScaledReg(0) {} - void InitialMatch(const SCEV *S, Loop *L, - ScalarEvolution &SE, DominatorTree &DT); + void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE); unsigned getNumRegs() const; const Type *getType() const; @@ -224,9 +232,9 @@ struct Formula { static void DoInitialMatch(const SCEV *S, Loop *L, SmallVectorImpl<const SCEV *> &Good, SmallVectorImpl<const SCEV *> &Bad, - ScalarEvolution &SE, DominatorTree &DT) { + ScalarEvolution &SE) { // Collect expressions which properly dominate the loop header. - if (S->properlyDominates(L->getHeader(), &DT)) { + if (SE.properlyDominates(S, L->getHeader())) { Good.push_back(S); return; } @@ -235,18 +243,18 @@ static void DoInitialMatch(const SCEV *S, Loop *L, if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); I != E; ++I) - DoInitialMatch(*I, L, Good, Bad, SE, DT); + DoInitialMatch(*I, L, Good, Bad, SE); return; } // Look at addrec operands. 
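DeleteUse and SwapAndDropUse above implement swap-with-last-then-pop deletion: the use list stays dense, so every side table indexed by use number must copy the last slot into the vacated one and then shrink. A self-contained sketch of that bookkeeping with plain std::vector<bool> rows standing in for LSR's SmallBitVectors:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Remove Items[Idx] in O(1) by swapping with the last element, then keep
    // the parallel "which uses touch this register" bit rows consistent.
    void swapAndDrop(std::vector<int> &Items,
                     std::vector<std::vector<bool> > &UsedByIndices,
                     std::size_t Idx) {
      std::size_t Last = Items.size() - 1;
      std::swap(Items[Idx], Items[Last]);
      Items.pop_back();

      for (std::size_t r = 0; r != UsedByIndices.size(); ++r) {
        std::vector<bool> &Bits = UsedByIndices[r];
        if (Idx < Bits.size())
          Bits[Idx] = Last < Bits.size() ? Bits[Last] : false;  // inherit last
        if (Bits.size() > Last)
          Bits.resize(Last);                                    // drop old tail
      }
    }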
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) if (!AR->getStart()->isZero()) { - DoInitialMatch(AR->getStart(), L, Good, Bad, SE, DT); + DoInitialMatch(AR->getStart(), L, Good, Bad, SE); DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0), AR->getStepRecurrence(SE), AR->getLoop()), - L, Good, Bad, SE, DT); + L, Good, Bad, SE); return; } @@ -258,7 +266,7 @@ static void DoInitialMatch(const SCEV *S, Loop *L, SmallVector<const SCEV *, 4> MyGood; SmallVector<const SCEV *, 4> MyBad; - DoInitialMatch(NewMul, L, MyGood, MyBad, SE, DT); + DoInitialMatch(NewMul, L, MyGood, MyBad, SE); const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue( SE.getEffectiveSCEVType(NewMul->getType()))); for (SmallVectorImpl<const SCEV *>::const_iterator I = MyGood.begin(), @@ -278,11 +286,10 @@ static void DoInitialMatch(const SCEV *S, Loop *L, /// InitialMatch - Incorporate loop-variant parts of S into this Formula, /// attempting to keep all loop-invariant and loop-computable values in a /// single base register. -void Formula::InitialMatch(const SCEV *S, Loop *L, - ScalarEvolution &SE, DominatorTree &DT) { +void Formula::InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) { SmallVector<const SCEV *, 4> Good; SmallVector<const SCEV *, 4> Bad; - DoInitialMatch(S, L, Good, Bad, SE, DT); + DoInitialMatch(S, L, Good, Bad, SE); if (!Good.empty()) { const SCEV *Sum = SE.getAddExpr(Good); if (!Sum->isZero()) @@ -608,7 +615,7 @@ DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) { bool Changed = false; while (!DeadInsts.empty()) { - Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val()); + Instruction *I = dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val()); if (I == 0 || !isInstructionTriviallyDead(I)) continue; @@ -645,8 +652,6 @@ public: : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0), SetupCost(0) {} - unsigned getNumRegs() const { return NumRegs; } - bool operator<(const Cost &Other) const; void Loose(); @@ -722,6 +727,9 @@ void Cost::RateRegister(const SCEV *Reg, (isa<SCEVUnknown>(cast<SCEVAddRecExpr>(Reg)->getStart()) || isa<SCEVConstant>(cast<SCEVAddRecExpr>(Reg)->getStart())))) ++SetupCost; + + NumIVMuls += isa<SCEVMulExpr>(Reg) && + SE.hasComputableLoopEvolution(Reg, L); } /// RatePrimaryRegister - Record this register in the set. If we haven't seen it @@ -756,9 +764,6 @@ void Cost::RateFormula(const Formula &F, return; } RatePrimaryRegister(BaseReg, Regs, L, SE, DT); - - NumIVMuls += isa<SCEVMulExpr>(BaseReg) && - BaseReg->hasComputableLoopEvolution(L); } if (F.BaseRegs.size() > 1) @@ -1257,32 +1262,6 @@ struct UseMapDenseMapInfo { } }; -/// FormulaSorter - This class implements an ordering for formulae which sorts -/// the by their standalone cost. -class FormulaSorter { - /// These two sets are kept empty, so that we compute standalone costs. - DenseSet<const SCEV *> VisitedRegs; - SmallPtrSet<const SCEV *, 16> Regs; - Loop *L; - LSRUse *LU; - ScalarEvolution &SE; - DominatorTree &DT; - -public: - FormulaSorter(Loop *l, LSRUse &lu, ScalarEvolution &se, DominatorTree &dt) - : L(l), LU(&lu), SE(se), DT(dt) {} - - bool operator()(const Formula &A, const Formula &B) { - Cost CostA; - CostA.RateFormula(A, Regs, VisitedRegs, L, LU->Offsets, SE, DT); - Regs.clear(); - Cost CostB; - CostB.RateFormula(B, Regs, VisitedRegs, L, LU->Offsets, SE, DT); - Regs.clear(); - return CostA < CostB; - } -}; - /// LSRInstance - This class holds state for the main loop strength reduction /// logic. 
class LSRInstance { @@ -1341,7 +1320,7 @@ class LSRInstance { LSRUse::KindType Kind, const Type *AccessTy); - void DeleteUse(LSRUse &LU); + void DeleteUse(LSRUse &LU, size_t LUIdx); LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU); @@ -1925,10 +1904,13 @@ LSRInstance::getUse(const SCEV *&Expr, } /// DeleteUse - Delete the given use from the Uses list. -void LSRInstance::DeleteUse(LSRUse &LU) { +void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) { if (&LU != &Uses.back()) std::swap(LU, Uses.back()); Uses.pop_back(); + + // Update RegUses. + RegUses.SwapAndDropUse(LUIdx, Uses.size()); } /// FindUseWithFormula - Look for a use distinct from OrigLU which is has @@ -2073,7 +2055,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { // x == y --> x - y == 0 const SCEV *N = SE.getSCEV(NV); - if (N->isLoopInvariant(L)) { + if (SE.isLoopInvariant(N, L)) { Kind = LSRUse::ICmpZero; S = SE.getMinusSCEV(N, S); } @@ -2113,7 +2095,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { void LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) { Formula F; - F.InitialMatch(S, L, SE, DT); + F.InitialMatch(S, L, SE); bool Inserted = InsertFormula(LU, LUIdx, F); assert(Inserted && "Initial formula already exists!"); (void)Inserted; } @@ -2213,7 +2195,7 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) { unsigned OtherIdx = !UI.getOperandNo(); Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx)); - if (SE.getSCEV(OtherOp)->hasComputableLoopEvolution(L)) + if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L)) continue; } @@ -2296,7 +2278,7 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, // Loop-variant "unknown" values are uninteresting; we won't be able to // do anything meaningful with them. - if (isa<SCEVUnknown>(*J) && !(*J)->isLoopInvariant(L)) + if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L)) continue; // Don't pull a constant into a register if the constant could be folded @@ -2347,8 +2329,8 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, for (SmallVectorImpl<const SCEV *>::const_iterator I = Base.BaseRegs.begin(), E = Base.BaseRegs.end(); I != E; ++I) { const SCEV *BaseReg = *I; - if (BaseReg->properlyDominates(L->getHeader(), &DT) && - !BaseReg->hasComputableLoopEvolution(L)) + if (SE.properlyDominates(BaseReg, L->getHeader()) && + !SE.hasComputableLoopEvolution(BaseReg, L)) Ops.push_back(BaseReg); else F.BaseRegs.push_back(BaseReg); @@ -2813,9 +2795,11 @@ LSRInstance::GenerateAllReuseFormulae() { print_uses(dbgs())); } -/// If their are multiple formulae with the same set of registers used +/// If there are multiple formulae with the same set of registers used /// by other uses, pick the best one and delete the others. 
void LSRInstance::FilterOutUndesirableDedicatedRegisters() { + DenseSet<const SCEV *> VisitedRegs; + SmallPtrSet<const SCEV *, 16> Regs; #ifndef NDEBUG bool ChangedFormulae = false; #endif @@ -2828,7 +2812,6 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { LSRUse &LU = Uses[LUIdx]; - FormulaSorter Sorter(L, LU, SE, DT); DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs()); dbgs() << '\n'); bool Any = false; @@ -2854,7 +2837,14 @@ void LSRInstance::FilterOutUndesirableDedicatedRegisters() { BestFormulae.insert(std::make_pair(Key, FIdx)); if (!P.second) { Formula &Best = LU.Formulae[P.first->second]; - if (Sorter.operator()(F, Best)) + + Cost CostF; + CostF.RateFormula(F, Regs, VisitedRegs, L, LU.Offsets, SE, DT); + Regs.clear(); + Cost CostBest; + CostBest.RateFormula(Best, Regs, VisitedRegs, L, LU.Offsets, SE, DT); + Regs.clear(); + if (CostF < CostBest) std::swap(F, Best); DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs()); dbgs() << "\n" @@ -2894,7 +2884,7 @@ static const size_t ComplexityLimit = UINT16_MAX; /// this many solutions because it prune the search space, but the pruning /// isn't always sufficient. size_t LSRInstance::EstimateSearchSpaceComplexity() const { - uint32_t Power = 1; + size_t Power = 1; for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), E = Uses.end(); I != E; ++I) { size_t FSize = I->Formulae.size(); @@ -3001,6 +2991,28 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop; + // Update the relocs to reference the new use. + for (SmallVectorImpl<LSRFixup>::iterator I = Fixups.begin(), + E = Fixups.end(); I != E; ++I) { + LSRFixup &Fixup = *I; + if (Fixup.LUIdx == LUIdx) { + Fixup.LUIdx = LUThatHas - &Uses.front(); + Fixup.Offset += F.AM.BaseOffs; + // Add the new offset to LUThatHas' offset list. + if (LUThatHas->Offsets.back() != Fixup.Offset) { + LUThatHas->Offsets.push_back(Fixup.Offset); + if (Fixup.Offset > LUThatHas->MaxOffset) + LUThatHas->MaxOffset = Fixup.Offset; + if (Fixup.Offset < LUThatHas->MinOffset) + LUThatHas->MinOffset = Fixup.Offset; + } + DEBUG(dbgs() << "New fixup has offset " + << Fixup.Offset << '\n'); + } + if (Fixup.LUIdx == NumUses-1) + Fixup.LUIdx = LUIdx; + } + // Delete formulae from the new use which are no longer legal. bool Any = false; for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) { @@ -3019,22 +3031,8 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { if (Any) LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses); - // Update the relocs to reference the new use. - for (SmallVectorImpl<LSRFixup>::iterator I = Fixups.begin(), - E = Fixups.end(); I != E; ++I) { - LSRFixup &Fixup = *I; - if (Fixup.LUIdx == LUIdx) { - Fixup.LUIdx = LUThatHas - &Uses.front(); - Fixup.Offset += F.AM.BaseOffs; - DEBUG(dbgs() << "New fixup has offset " - << Fixup.Offset << '\n'); - } - if (Fixup.LUIdx == NumUses-1) - Fixup.LUIdx = LUIdx; - } - // Delete the old use. - DeleteUse(LU); + DeleteUse(LU, LUIdx); --LUIdx; --NumUses; break; @@ -3546,21 +3544,23 @@ void LSRInstance::RewriteForPHI(PHINode *PN, // is the canonical backedge for this loop, which complicates post-inc // users. if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 && - !isa<IndirectBrInst>(BB->getTerminator()) && - (PN->getParent() != L->getHeader() || !L->contains(BB))) { - // Split the critical edge. 
- BasicBlock *NewBB = SplitCriticalEdge(BB, PN->getParent(), P); - - // If PN is outside of the loop and BB is in the loop, we want to - // move the block to be immediately before the PHI block, not - // immediately after BB. - if (L->contains(BB) && !L->contains(PN)) - NewBB->moveBefore(PN->getParent()); - - // Splitting the edge can reduce the number of PHI entries we have. - e = PN->getNumIncomingValues(); - BB = NewBB; - i = PN->getBasicBlockIndex(BB); + !isa<IndirectBrInst>(BB->getTerminator())) { + Loop *PNLoop = LI.getLoopFor(PN->getParent()); + if (!PNLoop || PN->getParent() != PNLoop->getHeader()) { + // Split the critical edge. + BasicBlock *NewBB = SplitCriticalEdge(BB, PN->getParent(), P); + + // If PN is outside of the loop and BB is in the loop, we want to + // move the block to be immediately before the PHI block, not + // immediately after BB. + if (L->contains(BB) && !L->contains(PN)) + NewBB->moveBefore(PN->getParent()); + + // Splitting the edge can reduce the number of PHI entries we have. + e = PN->getNumIncomingValues(); + BB = NewBB; + i = PN->getBasicBlockIndex(BB); + } } std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair = @@ -3792,21 +3792,30 @@ private: } char LoopStrengthReduce::ID = 0; -INITIALIZE_PASS(LoopStrengthReduce, "loop-reduce", - "Loop Strength Reduction", false, false); +INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce", + "Loop Strength Reduction", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(IVUsers) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce", + "Loop Strength Reduction", false, false) + Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) { return new LoopStrengthReduce(TLI); } LoopStrengthReduce::LoopStrengthReduce(const TargetLowering *tli) - : LoopPass(ID), TLI(tli) {} + : LoopPass(ID), TLI(tli) { + initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry()); + } void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { // We split critical edges, so we change the CFG. However, we do update // many analyses if they are around. AU.addPreservedID(LoopSimplifyID); - AU.addPreserved("domfrontier"); AU.addRequired<LoopInfo>(); AU.addPreserved<LoopInfo>(); @@ -3815,6 +3824,9 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved<DominatorTree>(); AU.addRequired<ScalarEvolution>(); AU.addPreserved<ScalarEvolution>(); + // Requiring LoopSimplify a second time here prevents IVUsers from running + // twice, since LoopSimplify was invalidated by running ScalarEvolution. 
+ AU.addRequiredID(LoopSimplifyID); AU.addRequired<IVUsers>(); AU.addPreserved<IVUsers>(); } diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp index d0edfa2..80b263a 100644 --- a/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -16,7 +16,7 @@ #include "llvm/IntrinsicInst.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -27,7 +27,7 @@ using namespace llvm; static cl::opt<unsigned> -UnrollThreshold("unroll-threshold", cl::init(200), cl::Hidden, +UnrollThreshold("unroll-threshold", cl::init(150), cl::Hidden, cl::desc("The cut-off point for automatic loop unrolling")); static cl::opt<unsigned> @@ -43,12 +43,20 @@ namespace { class LoopUnroll : public LoopPass { public: static char ID; // Pass ID, replacement for typeid - LoopUnroll() : LoopPass(ID) {} + LoopUnroll() : LoopPass(ID) { + initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); + } /// A magic value for use with the Threshold parameter to indicate /// that the loop unroll should be performed regardless of how much /// code expansion would result. static const unsigned NoThreshold = UINT_MAX; + + // Threshold to use when optsize is specified (and there is no + // explicit -unroll-threshold). + static const unsigned OptSizeUnrollThreshold = 50; + + unsigned CurrentThreshold; bool runOnLoop(Loop *L, LPPassManager &LPM); @@ -73,7 +81,11 @@ namespace { } char LoopUnroll::ID = 0; -INITIALIZE_PASS(LoopUnroll, "loop-unroll", "Unroll loops", false, false); +INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false) Pass *llvm::createLoopUnrollPass() { return new LoopUnroll(); } @@ -83,8 +95,16 @@ static unsigned ApproximateLoopSize(const Loop *L, unsigned &NumCalls) { for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E; ++I) Metrics.analyzeBasicBlock(*I); - NumCalls = Metrics.NumCalls; - return Metrics.NumInsts; + NumCalls = Metrics.NumInlineCandidates; + + unsigned LoopSize = Metrics.NumInsts; + + // Don't allow an estimate of size zero. This would allows unrolling of loops + // with huge iteration counts, which is a compile time problem even if it's + // not a problem for code quality. + if (LoopSize == 0) LoopSize = 1; + + return LoopSize; } bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { @@ -94,6 +114,15 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { DEBUG(dbgs() << "Loop Unroll: F[" << Header->getParent()->getName() << "] Loop %" << Header->getName() << "\n"); (void)Header; + + // Determine the current unrolling threshold. While this is normally set + // from UnrollThreshold, it is overridden to a smaller value if the current + // function is marked as optimize-for-size, and the unroll threshold was + // not user specified. 
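The threshold handling that follows boils down to simple arithmetic: estimated loop size times unroll count must stay under the (possibly optsize-reduced) threshold, and a partial count has to divide the trip count evenly. A self-contained sketch of that selection logic (pickUnrollCount is an illustrative helper, not LLVM API; a result of 0 means "don't unroll"):

    #include <stdint.h>

    unsigned pickUnrollCount(unsigned LoopSize, unsigned TripCount,
                             unsigned Threshold, unsigned Count,
                             bool AllowPartial) {
      if (LoopSize == 0) LoopSize = 1;       // a size of 0 would defeat the cap
      uint64_t Size = (uint64_t)LoopSize * Count;
      if (TripCount == 1 || Size <= Threshold)
        return Count;                        // full unroll fits the budget
      if (!AllowPartial)
        return 0;                            // too big, and partial not allowed
      Count = Threshold / LoopSize;          // shrink to fit, then make it a
      while (Count != 0 && TripCount % Count != 0)
        --Count;                             // divisor of the trip count
      return Count;
    }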
+ CurrentThreshold = UnrollThreshold; + if (Header->getParent()->hasFnAttr(Attribute::OptimizeForSize) && + UnrollThreshold.getNumOccurrences() == 0) + CurrentThreshold = OptSizeUnrollThreshold; // Find trip count unsigned TripCount = L->getSmallConstantTripCount(); @@ -111,25 +140,25 @@ bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) { } // Enforce the threshold. - if (UnrollThreshold != NoThreshold) { - unsigned NumCalls; - unsigned LoopSize = ApproximateLoopSize(L, NumCalls); + if (CurrentThreshold != NoThreshold) { + unsigned NumInlineCandidates; + unsigned LoopSize = ApproximateLoopSize(L, NumInlineCandidates); DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n"); - if (NumCalls != 0) { - DEBUG(dbgs() << " Not unrolling loop with function calls.\n"); + if (NumInlineCandidates != 0) { + DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n"); return false; } uint64_t Size = (uint64_t)LoopSize*Count; - if (TripCount != 1 && Size > UnrollThreshold) { + if (TripCount != 1 && Size > CurrentThreshold) { DEBUG(dbgs() << " Too large to fully unroll with count: " << Count - << " because size: " << Size << ">" << UnrollThreshold << "\n"); + << " because size: " << Size << ">" << CurrentThreshold << "\n"); if (!UnrollAllowPartial) { DEBUG(dbgs() << " will not try to unroll partially because " << "-unroll-allow-partial not given\n"); return false; } // Reduce unroll count to be modulo of TripCount for partial unrolling - Count = UnrollThreshold / LoopSize; + Count = CurrentThreshold / LoopSize; while (Count != 0 && TripCount%Count != 0) { Count--; } diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 9afe428..b4e3d31 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -32,12 +32,12 @@ #include "llvm/DerivedTypes.h" #include "llvm/Function.h" #include "llvm/Instructions.h" -#include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -93,7 +93,9 @@ namespace { explicit LoopUnswitch(bool Os = false) : LoopPass(ID), OptimizeForSize(Os), redoLoop(false), currentLoop(NULL), DT(NULL), loopHeader(NULL), - loopPreheader(NULL) {} + loopPreheader(NULL) { + initializeLoopUnswitchPass(*PassRegistry::getPassRegistry()); + } bool runOnLoop(Loop *L, LPPassManager &LPM); bool processCurrentLoop(); @@ -109,6 +111,7 @@ namespace { AU.addRequiredID(LCSSAID); AU.addPreservedID(LCSSAID); AU.addPreserved<DominatorTree>(); + AU.addPreserved<ScalarEvolution>(); } private: @@ -158,7 +161,13 @@ namespace { }; } char LoopUnswitch::ID = 0; -INITIALIZE_PASS(LoopUnswitch, "loop-unswitch", "Unswitch loops", false, false); +INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops", + false, false) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(LCSSA) +INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops", + false, false) Pass *llvm::createLoopUnswitchPass(bool Os) { return new LoopUnswitch(Os); @@ -450,22 +459,9 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val) { return true; } -// RemapInstruction - Convert the instruction operands from referencing the -// 
current values into those specified by VMap. -// -static inline void RemapInstruction(Instruction *I, - ValueMap<const Value *, Value*> &VMap) { - for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) { - Value *Op = I->getOperand(op); - ValueMap<const Value *, Value*>::iterator It = VMap.find(Op); - if (It != VMap.end()) Op = It->second; - I->setOperand(op, Op); - } -} - /// CloneLoop - Recursively clone the specified loop and all of its children, /// mapping the blocks with the specified map. -static Loop *CloneLoop(Loop *L, Loop *PL, ValueMap<const Value*, Value*> &VM, +static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM, LoopInfo *LI, LPPassManager *LPM) { Loop *New = new Loop(); LPM->insertLoop(New, PL); @@ -580,6 +576,9 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, << " blocks] in Function " << F->getName() << " when '" << *Val << "' == " << *LIC << "\n"); + if (ScalarEvolution *SE = getAnalysisIfAvailable<ScalarEvolution>()) + SE->forgetLoop(L); + LoopBlocks.clear(); NewBlocks.clear(); @@ -609,7 +608,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, // the loop preheader and exit blocks), keeping track of the mapping between // the instructions and blocks. NewBlocks.reserve(LoopBlocks.size()); - ValueMap<const Value*, Value*> VMap; + ValueToValueMapTy VMap; for (unsigned i = 0, e = LoopBlocks.size(); i != e; ++i) { BasicBlock *NewBB = CloneBasicBlock(LoopBlocks[i], VMap, ".us", F); NewBlocks.push_back(NewBB); @@ -647,7 +646,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, for (BasicBlock::iterator I = ExitSucc->begin(); isa<PHINode>(I); ++I) { PN = cast<PHINode>(I); Value *V = PN->getIncomingValueForBlock(ExitBlocks[i]); - ValueMap<const Value *, Value*>::iterator It = VMap.find(V); + ValueToValueMapTy::iterator It = VMap.find(V); if (It != VMap.end()) V = It->second; PN->addIncoming(V, NewExit); } @@ -657,7 +656,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) for (BasicBlock::iterator I = NewBlocks[i]->begin(), E = NewBlocks[i]->end(); I != E; ++I) - RemapInstruction(I, VMap); + RemapInstruction(I, VMap,RF_NoModuleLevelChanges|RF_IgnoreMissingEntries); // Rewrite the original preheader to select between versions of the loop. BranchInst *OldBR = cast<BranchInst>(loopPreheader->getTerminator()); @@ -961,13 +960,7 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { while (!Worklist.empty()) { Instruction *I = Worklist.back(); Worklist.pop_back(); - - // Simple constant folding. - if (Constant *C = ConstantFoldInstruction(I)) { - ReplaceUsesOfWith(I, C, Worklist, L, LPM); - continue; - } - + // Simple DCE. if (isInstructionTriviallyDead(I)) { DEBUG(dbgs() << "Remove dead instruction '" << *I); @@ -982,15 +975,16 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { ++NumSimplify; continue; } - + // See if instruction simplification can hack this up. This is common for // things like "select false, X, Y" after unswitching made the condition be // 'false'. - if (Value *V = SimplifyInstruction(I)) { - ReplaceUsesOfWith(I, V, Worklist, L, LPM); - continue; - } - + if (Value *V = SimplifyInstruction(I, 0, DT)) + if (LI->replacementPreservesLCSSAForm(I, V)) { + ReplaceUsesOfWith(I, V, Worklist, L, LPM); + continue; + } + // Special case hacks that appear commonly in unswitched code. 
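The unswitching rewrite above follows the standard clone-then-remap recipe: CloneBasicBlock fills a ValueToValueMapTy with old-to-new mappings, and a second walk rewrites the clones' operands through that map. A trimmed sketch using the same 2.9-era calls as this hunk (F, LoopBlocks and the ".us" suffix are taken from the surrounding code; values defined outside the cloned region are deliberately left unmapped, hence RF_IgnoreMissingEntries):

    ValueToValueMapTy VMap;
    std::vector<BasicBlock*> NewBlocks;
    NewBlocks.reserve(LoopBlocks.size());

    for (unsigned i = 0, e = LoopBlocks.size(); i != e; ++i) {
      // Clone the block; VMap records old-instruction -> new-instruction.
      BasicBlock *NewBB = CloneBasicBlock(LoopBlocks[i], VMap, ".us", F);
      NewBlocks.push_back(NewBB);
      VMap[LoopBlocks[i]] = NewBB;      // map the block itself as well
    }

    // The clones still reference the original values; rewrite their operands
    // (and PHI incoming blocks) through the map.
    for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i)
      for (BasicBlock::iterator I = NewBlocks[i]->begin(),
             E = NewBlocks[i]->end(); I != E; ++I)
        RemapInstruction(I, VMap,
                         RF_NoModuleLevelChanges | RF_IgnoreMissingEntries);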
if (BranchInst *BI = dyn_cast<BranchInst>(I)) { if (BI->isUnconditional()) { diff --git a/lib/Transforms/Scalar/LowerAtomic.cpp b/lib/Transforms/Scalar/LowerAtomic.cpp index 973ffe7..9087b46 100644 --- a/lib/Transforms/Scalar/LowerAtomic.cpp +++ b/lib/Transforms/Scalar/LowerAtomic.cpp @@ -14,26 +14,15 @@ #define DEBUG_TYPE "loweratomic" #include "llvm/Transforms/Scalar.h" -#include "llvm/BasicBlock.h" #include "llvm/Function.h" -#include "llvm/Instruction.h" -#include "llvm/Instructions.h" -#include "llvm/Intrinsics.h" +#include "llvm/IntrinsicInst.h" #include "llvm/Pass.h" #include "llvm/Support/IRBuilder.h" - using namespace llvm; -namespace { - -bool LowerAtomicIntrinsic(CallInst *CI) { - IRBuilder<> Builder(CI->getParent(), CI); - - Function *Callee = CI->getCalledFunction(); - if (!Callee) - return false; - - unsigned IID = Callee->getIntrinsicID(); +static bool LowerAtomicIntrinsic(IntrinsicInst *II) { + IRBuilder<> Builder(II->getParent(), II); + unsigned IID = II->getIntrinsicID(); switch (IID) { case Intrinsic::memory_barrier: break; @@ -48,80 +37,70 @@ bool LowerAtomicIntrinsic(CallInst *CI) { case Intrinsic::atomic_load_min: case Intrinsic::atomic_load_umax: case Intrinsic::atomic_load_umin: { - Value *Ptr = CI->getArgOperand(0); - Value *Delta = CI->getArgOperand(1); + Value *Ptr = II->getArgOperand(0), *Delta = II->getArgOperand(1); LoadInst *Orig = Builder.CreateLoad(Ptr); Value *Res = NULL; switch (IID) { - default: assert(0 && "Unrecognized atomic modify operation"); - case Intrinsic::atomic_load_add: - Res = Builder.CreateAdd(Orig, Delta); - break; - case Intrinsic::atomic_load_sub: - Res = Builder.CreateSub(Orig, Delta); - break; - case Intrinsic::atomic_load_and: - Res = Builder.CreateAnd(Orig, Delta); - break; - case Intrinsic::atomic_load_nand: - Res = Builder.CreateNot(Builder.CreateAnd(Orig, Delta)); - break; - case Intrinsic::atomic_load_or: - Res = Builder.CreateOr(Orig, Delta); - break; - case Intrinsic::atomic_load_xor: - Res = Builder.CreateXor(Orig, Delta); - break; - case Intrinsic::atomic_load_max: - Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Delta), - Delta, - Orig); - break; - case Intrinsic::atomic_load_min: - Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Delta), - Orig, - Delta); - break; - case Intrinsic::atomic_load_umax: - Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Delta), - Delta, - Orig); - break; - case Intrinsic::atomic_load_umin: - Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Delta), - Orig, - Delta); - break; + default: assert(0 && "Unrecognized atomic modify operation"); + case Intrinsic::atomic_load_add: + Res = Builder.CreateAdd(Orig, Delta); + break; + case Intrinsic::atomic_load_sub: + Res = Builder.CreateSub(Orig, Delta); + break; + case Intrinsic::atomic_load_and: + Res = Builder.CreateAnd(Orig, Delta); + break; + case Intrinsic::atomic_load_nand: + Res = Builder.CreateNot(Builder.CreateAnd(Orig, Delta)); + break; + case Intrinsic::atomic_load_or: + Res = Builder.CreateOr(Orig, Delta); + break; + case Intrinsic::atomic_load_xor: + Res = Builder.CreateXor(Orig, Delta); + break; + case Intrinsic::atomic_load_max: + Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Delta), + Delta, Orig); + break; + case Intrinsic::atomic_load_min: + Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Delta), + Orig, Delta); + break; + case Intrinsic::atomic_load_umax: + Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Delta), + Delta, Orig); + break; + case Intrinsic::atomic_load_umin: + Res = 
Builder.CreateSelect(Builder.CreateICmpULT(Orig, Delta), + Orig, Delta); + break; } Builder.CreateStore(Res, Ptr); - CI->replaceAllUsesWith(Orig); + II->replaceAllUsesWith(Orig); break; } case Intrinsic::atomic_swap: { - Value *Ptr = CI->getArgOperand(0); - Value *Val = CI->getArgOperand(1); - + Value *Ptr = II->getArgOperand(0), *Val = II->getArgOperand(1); LoadInst *Orig = Builder.CreateLoad(Ptr); Builder.CreateStore(Val, Ptr); - - CI->replaceAllUsesWith(Orig); + II->replaceAllUsesWith(Orig); break; } case Intrinsic::atomic_cmp_swap: { - Value *Ptr = CI->getArgOperand(0); - Value *Cmp = CI->getArgOperand(1); - Value *Val = CI->getArgOperand(2); + Value *Ptr = II->getArgOperand(0), *Cmp = II->getArgOperand(1); + Value *Val = II->getArgOperand(2); LoadInst *Orig = Builder.CreateLoad(Ptr); Value *Equal = Builder.CreateICmpEQ(Orig, Cmp); Value *Res = Builder.CreateSelect(Equal, Val, Orig); Builder.CreateStore(Res, Ptr); - - CI->replaceAllUsesWith(Orig); + II->replaceAllUsesWith(Orig); break; } @@ -129,33 +108,32 @@ bool LowerAtomicIntrinsic(CallInst *CI) { return false; } - assert(CI->use_empty() && + assert(II->use_empty() && "Lowering should have eliminated any uses of the intrinsic call!"); - CI->eraseFromParent(); + II->eraseFromParent(); return true; } -struct LowerAtomic : public BasicBlockPass { - static char ID; - LowerAtomic() : BasicBlockPass(ID) {} - bool runOnBasicBlock(BasicBlock &BB) { - bool Changed = false; - for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; ) { - Instruction *Inst = DI++; - if (CallInst *CI = dyn_cast<CallInst>(Inst)) - Changed |= LowerAtomicIntrinsic(CI); +namespace { + struct LowerAtomic : public BasicBlockPass { + static char ID; + LowerAtomic() : BasicBlockPass(ID) { + initializeLowerAtomicPass(*PassRegistry::getPassRegistry()); } - return Changed; - } - -}; - + bool runOnBasicBlock(BasicBlock &BB) { + bool Changed = false; + for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE; ) + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(DI++)) + Changed |= LowerAtomicIntrinsic(II); + return Changed; + } + }; } char LowerAtomic::ID = 0; INITIALIZE_PASS(LowerAtomic, "loweratomic", "Lower atomic intrinsics to non-atomic form", - false, false); + false, false) Pass *llvm::createLowerAtomicPass() { return new LowerAtomic(); } diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 24fae42..bde0e53 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -14,16 +14,18 @@ #define DEBUG_TYPE "memcpyopt" #include "llvm/Transforms/Scalar.h" +#include "llvm/GlobalVariable.h" #include "llvm/IntrinsicInst.h" #include "llvm/Instructions.h" -#include "llvm/LLVMContext.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GetElementPtrTypeIterator.h" +#include "llvm/Support/IRBuilder.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetData.h" #include <list> @@ -32,62 +34,10 @@ using namespace llvm; STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted"); STATISTIC(NumMemSetInfer, "Number of memsets inferred"); STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy"); - -/// isBytewiseValue - If the specified value can be set by repeating the same -/// byte in memory, return 
the i8 value that it is represented with. This is -/// true for all i8 values obviously, but is also true for i32 0, i32 -1, -/// i16 0xF0F0, double 0.0 etc. If the value can't be handled with a repeated -/// byte store (e.g. i16 0x1234), return null. -static Value *isBytewiseValue(Value *V) { - LLVMContext &Context = V->getContext(); - - // All byte-wide stores are splatable, even of arbitrary variables. - if (V->getType()->isIntegerTy(8)) return V; - - // Constant float and double values can be handled as integer values if the - // corresponding integer value is "byteable". An important case is 0.0. - if (ConstantFP *CFP = dyn_cast<ConstantFP>(V)) { - if (CFP->getType()->isFloatTy()) - V = ConstantExpr::getBitCast(CFP, Type::getInt32Ty(Context)); - if (CFP->getType()->isDoubleTy()) - V = ConstantExpr::getBitCast(CFP, Type::getInt64Ty(Context)); - // Don't handle long double formats, which have strange constraints. - } - - // We can handle constant integers that are power of two in size and a - // multiple of 8 bits. - if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) { - unsigned Width = CI->getBitWidth(); - if (isPowerOf2_32(Width) && Width > 8) { - // We can handle this value if the recursive binary decomposition is the - // same at all levels. - APInt Val = CI->getValue(); - APInt Val2; - while (Val.getBitWidth() != 8) { - unsigned NextWidth = Val.getBitWidth()/2; - Val2 = Val.lshr(NextWidth); - Val2.trunc(Val.getBitWidth()/2); - Val.trunc(Val.getBitWidth()/2); - - // If the top/bottom halves aren't the same, reject it. - if (Val != Val2) - return 0; - } - return ConstantInt::get(Context, Val); - } - } - - // Conceptually, we could handle things like: - // %a = zext i8 %X to i16 - // %b = shl i16 %a, 8 - // %c = or i16 %a, %b - // but until there is an example that actually needs this, it doesn't seem - // worth worrying about. - return 0; -} +STATISTIC(NumCpyToSet, "Number of memcpys converted to memset"); static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx, - bool &VariableIdxFound, TargetData &TD) { + bool &VariableIdxFound, const TargetData &TD){ // Skip over the first indices. gep_type_iterator GTI = gep_type_begin(GEP); for (unsigned i = 1; i != Idx; ++i, ++GTI) @@ -120,14 +70,31 @@ static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx, /// constant offset, and return that constant offset. For example, Ptr1 might /// be &A[42], and Ptr2 might be &A[40]. In this case offset would be -8. static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, - TargetData &TD) { + const TargetData &TD) { + Ptr1 = Ptr1->stripPointerCasts(); + Ptr2 = Ptr2->stripPointerCasts(); + GetElementPtrInst *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1); + GetElementPtrInst *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2); + + bool VariableIdxFound = false; + + // If one pointer is a GEP and the other isn't, then see if the GEP is a + // constant offset from the base, as in "P" and "gep P, 1". + if (GEP1 && GEP2 == 0 && GEP1->getOperand(0)->stripPointerCasts() == Ptr2) { + Offset = -GetOffsetFromIndex(GEP1, 1, VariableIdxFound, TD); + return !VariableIdxFound; + } + + if (GEP2 && GEP1 == 0 && GEP2->getOperand(0)->stripPointerCasts() == Ptr1) { + Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, TD); + return !VariableIdxFound; + } + // Right now we handle the case when Ptr1/Ptr2 are both GEPs with an identical // base. After that base, they may have some number of common (and // potentially variable) indices. 
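isBytewiseValue, deleted here (ValueTracking.h, now included above, provides it instead), decides whether a constant is a single byte repeated by halving the value and comparing the two halves until only eight bits remain. The same check on a plain 64-bit integer, as a self-contained sketch (isRepeatedByte is an illustrative name):

    #include <stdint.h>

    // Return true (and the byte) if the Width-bit value V is one byte repeated,
    // e.g. 0x0000 (byte 0x00) or 0xF0F0 (byte 0xF0), but not 0x1234.
    bool isRepeatedByte(uint64_t V, unsigned Width, unsigned char &Byte) {
      if (Width < 8 || (Width & (Width - 1)) != 0)   // need a power of two >= 8
        return false;
      while (Width > 8) {
        unsigned Half = Width / 2;
        uint64_t Lo = V & ((uint64_t(1) << Half) - 1);
        uint64_t Hi = V >> Half;
        if (Lo != Hi)                                // halves differ: no splat
          return false;
        V = Lo;
        Width = Half;
      }
      Byte = (unsigned char)V;
      return true;
    }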
After that they handle some constant // offset, which determines their offset from each other. At this point, we // handle no other case. - GetElementPtrInst *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1); - GetElementPtrInst *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2); if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0)) return false; @@ -137,7 +104,6 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, if (GEP1->getOperand(Idx) != GEP2->getOperand(Idx)) break; - bool VariableIdxFound = false; int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, TD); int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, TD); if (VariableIdxFound) return false; @@ -171,7 +137,7 @@ struct MemsetRange { unsigned Alignment; /// TheStores - The actual stores that make up this range. - SmallVector<StoreInst*, 16> TheStores; + SmallVector<Instruction*, 16> TheStores; bool isProfitableToUseMemset(const TargetData &TD) const; @@ -181,10 +147,19 @@ struct MemsetRange { bool MemsetRange::isProfitableToUseMemset(const TargetData &TD) const { // If we found more than 8 stores to merge or 64 bytes, use memset. if (TheStores.size() >= 8 || End-Start >= 64) return true; + + // If there is nothing to merge, don't do anything. + if (TheStores.size() < 2) return false; + + // If any of the stores are a memset, then it is always good to extend the + // memset. + for (unsigned i = 0, e = TheStores.size(); i != e; ++i) + if (!isa<StoreInst>(TheStores[i])) + return true; // Assume that the code generator is capable of merging pairs of stores // together if it wants to. - if (TheStores.size() <= 2) return false; + if (TheStores.size() == 2) return false; // If we have fewer than 8 stores, it can still be worthwhile to do this. // For example, merging 4 i8 stores into an i32 store is useful almost always. @@ -215,31 +190,53 @@ class MemsetRanges { /// because each element is relatively large and expensive to copy. std::list<MemsetRange> Ranges; typedef std::list<MemsetRange>::iterator range_iterator; - TargetData &TD; + const TargetData &TD; public: - MemsetRanges(TargetData &td) : TD(td) {} + MemsetRanges(const TargetData &td) : TD(td) {} typedef std::list<MemsetRange>::const_iterator const_iterator; const_iterator begin() const { return Ranges.begin(); } const_iterator end() const { return Ranges.end(); } bool empty() const { return Ranges.empty(); } - void addStore(int64_t OffsetFromFirst, StoreInst *SI); + void addInst(int64_t OffsetFromFirst, Instruction *Inst) { + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) + addStore(OffsetFromFirst, SI); + else + addMemSet(OffsetFromFirst, cast<MemSetInst>(Inst)); + } + + void addStore(int64_t OffsetFromFirst, StoreInst *SI) { + int64_t StoreSize = TD.getTypeStoreSize(SI->getOperand(0)->getType()); + + addRange(OffsetFromFirst, StoreSize, + SI->getPointerOperand(), SI->getAlignment(), SI); + } + + void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) { + int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue(); + addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getAlignment(), MSI); + } + + void addRange(int64_t Start, int64_t Size, Value *Ptr, + unsigned Alignment, Instruction *Inst); + }; } // end anon namespace -/// addStore - Add a new store to the MemsetRanges data structure. This adds a +/// addRange - Add a new store to the MemsetRanges data structure. This adds a /// new range for the specified store at the specified offset, merging into /// existing ranges as appropriate. 
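As a rough, self-contained model of the interval bookkeeping described in the comment above (the Range struct, addRange helper and main driver below are illustrative stand-ins, not the LLVM types), sorted [Start, End) byte ranges are coalesced whenever a new store touches or overlaps an existing range, which is what later lets a run of neighboring stores or small memsets become a single memset:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <list>

// Illustrative stand-in for MemsetRange: a half-open byte interval.
struct Range { int64_t Start, End; };

// Keep the list sorted by Start and merge any interval the new store touches,
// mirroring the linear scan over the std::list that the pass performs.
static void addRange(std::list<Range> &Ranges, int64_t Start, int64_t Size) {
  int64_t End = Start + Size;
  std::list<Range>::iterator I = Ranges.begin(), E = Ranges.end();
  while (I != E && Start > I->End)
    ++I;
  if (I == E || End < I->Start) {       // Disjoint from everything: new range.
    Ranges.insert(I, Range{Start, End});
    return;
  }
  I->Start = std::min(I->Start, Start);
  I->End   = std::max(I->End, End);
  std::list<Range>::iterator N = std::next(I);
  while (N != Ranges.end() && I->End >= N->Start) {  // Swallow later ranges we now reach.
    I->End = std::max(I->End, N->End);
    N = Ranges.erase(N);
  }
}

int main() {
  std::list<Range> Ranges;
  for (int64_t Off : {0, 8, 4, 12})      // four adjacent 4-byte stores, out of order
    addRange(Ranges, Off, 4);
  for (const Range &R : Ranges)
    std::cout << '[' << R.Start << ", " << R.End << ")\n";  // prints [0, 16)
  return 0;
}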
-void MemsetRanges::addStore(int64_t Start, StoreInst *SI) { - int64_t End = Start+TD.getTypeStoreSize(SI->getOperand(0)->getType()); - - // Do a linear search of the ranges to see if this can be joined and/or to - // find the insertion point in the list. We keep the ranges sorted for - // simplicity here. This is a linear search of a linked list, which is ugly, - // however the number of ranges is limited, so this won't get crazy slow. +/// +/// Do a linear search of the ranges to see if this can be joined and/or to +/// find the insertion point in the list. We keep the ranges sorted for +/// simplicity here. This is a linear search of a linked list, which is ugly, +/// however the number of ranges is limited, so this won't get crazy slow. +void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr, + unsigned Alignment, Instruction *Inst) { + int64_t End = Start+Size; range_iterator I = Ranges.begin(), E = Ranges.end(); while (I != E && Start > I->End) @@ -252,14 +249,14 @@ void MemsetRanges::addStore(int64_t Start, StoreInst *SI) { MemsetRange &R = *Ranges.insert(I, MemsetRange()); R.Start = Start; R.End = End; - R.StartPtr = SI->getPointerOperand(); - R.Alignment = SI->getAlignment(); - R.TheStores.push_back(SI); + R.StartPtr = Ptr; + R.Alignment = Alignment; + R.TheStores.push_back(Inst); return; } - + // This store overlaps with I, add it. - I->TheStores.push_back(SI); + I->TheStores.push_back(Inst); // At this point, we may have an interval that completely contains our store. // If so, just add it to the interval and return. @@ -274,8 +271,8 @@ void MemsetRanges::addStore(int64_t Start, StoreInst *SI) { // stopped on *it*. if (Start < I->Start) { I->Start = Start; - I->StartPtr = SI->getPointerOperand(); - I->Alignment = SI->getAlignment(); + I->StartPtr = Ptr; + I->Alignment = Alignment; } // Now we know that Start <= I->End and Start >= I->Start (so the startpoint @@ -301,10 +298,16 @@ void MemsetRanges::addStore(int64_t Start, StoreInst *SI) { namespace { class MemCpyOpt : public FunctionPass { - bool runOnFunction(Function &F); + MemoryDependenceAnalysis *MD; + const TargetData *TD; public: static char ID; // Pass identification, replacement for typeid - MemCpyOpt() : FunctionPass(ID) {} + MemCpyOpt() : FunctionPass(ID) { + initializeMemCpyOptPass(*PassRegistry::getPassRegistry()); + MD = 0; + } + + bool runOnFunction(Function &F); private: // This transformation requires dominator postdominator info @@ -319,9 +322,17 @@ namespace { // Helper fuctions bool processStore(StoreInst *SI, BasicBlock::iterator &BBI); + bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI); bool processMemCpy(MemCpyInst *M); bool processMemMove(MemMoveInst *M); - bool performCallSlotOptzn(MemCpyInst *cpy, CallInst *C); + bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc, + uint64_t cpyLen, CallInst *C); + bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, + uint64_t MSize); + bool processByValArgument(CallSite CS, unsigned ArgNo); + Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr, + Value *ByteVal); + bool iterateOnFunction(Function &F); }; @@ -331,165 +342,199 @@ namespace { // createMemCpyOptPass - The public interface to this file... 
FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOpt(); } -INITIALIZE_PASS(MemCpyOpt, "memcpyopt", "MemCpy Optimization", false, false); - - +INITIALIZE_PASS_BEGIN(MemCpyOpt, "memcpyopt", "MemCpy Optimization", + false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_DEPENDENCY(MemoryDependenceAnalysis) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization", + false, false) -/// processStore - When GVN is scanning forward over instructions, we look for +/// tryMergingIntoMemset - When scanning forward over instructions, we look for /// some other patterns to fold away. In particular, this looks for stores to -/// neighboring locations of memory. If it sees enough consequtive ones -/// (currently 4) it attempts to merge them together into a memcpy/memset. -bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { - if (SI->isVolatile()) return false; - - LLVMContext &Context = SI->getContext(); - - // There are two cases that are interesting for this code to handle: memcpy - // and memset. Right now we only handle memset. +/// neighboring locations of memory. If it sees enough consecutive ones, it +/// attempts to merge them together into a memcpy/memset. +Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, + Value *StartPtr, Value *ByteVal) { + if (TD == 0) return 0; - // Ensure that the value being stored is something that can be memset'able a - // byte at a time like "0" or "-1" or any width, as well as things like - // 0xA0A0A0A0 and 0.0. - Value *ByteVal = isBytewiseValue(SI->getOperand(0)); - if (!ByteVal) - return false; - - TargetData *TD = getAnalysisIfAvailable<TargetData>(); - if (!TD) return false; - AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); - Module *M = SI->getParent()->getParent()->getParent(); - // Okay, so we now have a single store that can be splatable. Scan to find // all subsequent stores of the same value to offset from the same pointer. // Join these together into ranges, so we can decide whether contiguous blocks // are stored. MemsetRanges Ranges(*TD); - Value *StartPtr = SI->getPointerOperand(); - - BasicBlock::iterator BI = SI; + BasicBlock::iterator BI = StartInst; for (++BI; !isa<TerminatorInst>(BI); ++BI) { - if (isa<CallInst>(BI) || isa<InvokeInst>(BI)) { - // If the call is readnone, ignore it, otherwise bail out. We don't even - // allow readonly here because we don't want something like: + if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) { + // If the instruction is readnone, ignore it, otherwise bail out. We + // don't even allow readonly here because we don't want something like: // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A). - if (AA.getModRefBehavior(CallSite(BI)) == - AliasAnalysis::DoesNotAccessMemory) - continue; - - // TODO: If this is a memset, try to join it in. - - break; - } else if (isa<VAArgInst>(BI) || isa<LoadInst>(BI)) - break; - - // If this is a non-store instruction it is fine, ignore it. - StoreInst *NextStore = dyn_cast<StoreInst>(BI); - if (NextStore == 0) continue; + if (BI->mayWriteToMemory() || BI->mayReadFromMemory()) + break; + continue; + } - // If this is a store, see if we can merge it in. - if (NextStore->isVolatile()) break; + if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) { + // If this is a store, see if we can merge it in. + if (NextStore->isVolatile()) break; - // Check to see if this stored value is of the same byte-splattable value. 
- if (ByteVal != isBytewiseValue(NextStore->getOperand(0))) - break; - - // Check to see if this store is to a constant offset from the start ptr. - int64_t Offset; - if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset, *TD)) - break; - - Ranges.addStore(Offset, NextStore); + // Check to see if this stored value is of the same byte-splattable value. + if (ByteVal != isBytewiseValue(NextStore->getOperand(0))) + break; + + // Check to see if this store is to a constant offset from the start ptr. + int64_t Offset; + if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), + Offset, *TD)) + break; + + Ranges.addStore(Offset, NextStore); + } else { + MemSetInst *MSI = cast<MemSetInst>(BI); + + if (MSI->isVolatile() || ByteVal != MSI->getValue() || + !isa<ConstantInt>(MSI->getLength())) + break; + + // Check to see if this store is to a constant offset from the start ptr. + int64_t Offset; + if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, *TD)) + break; + + Ranges.addMemSet(Offset, MSI); + } } - + // If we have no ranges, then we just had a single store with nothing that // could be merged in. This is a very common case of course. if (Ranges.empty()) - return false; + return 0; // If we had at least one store that could be merged in, add the starting // store as well. We try to avoid this unless there is at least something // interesting as a small compile-time optimization. - Ranges.addStore(0, SI); - - + Ranges.addInst(0, StartInst); + + // If we create any memsets, we put it right before the first instruction that + // isn't part of the memset block. This ensure that the memset is dominated + // by any addressing instruction needed by the start of the block. + IRBuilder<> Builder(BI); + // Now that we have full information about ranges, loop over the ranges and // emit memset's for anything big enough to be worthwhile. - bool MadeChange = false; + Instruction *AMemSet = 0; for (MemsetRanges::const_iterator I = Ranges.begin(), E = Ranges.end(); I != E; ++I) { const MemsetRange &Range = *I; - + if (Range.TheStores.size() == 1) continue; // If it is profitable to lower this range to memset, do so now. if (!Range.isProfitableToUseMemset(*TD)) continue; - // Otherwise, we do want to transform this! Create a new memset. We put - // the memset right before the first instruction that isn't part of this - // memset block. This ensure that the memset is dominated by any addressing - // instruction needed by the start of the block. - BasicBlock::iterator InsertPt = BI; - + // Otherwise, we do want to transform this! Create a new memset. // Get the starting pointer of the block. StartPtr = Range.StartPtr; - + // Determine alignment unsigned Alignment = Range.Alignment; if (Alignment == 0) { const Type *EltType = - cast<PointerType>(StartPtr->getType())->getElementType(); + cast<PointerType>(StartPtr->getType())->getElementType(); Alignment = TD->getABITypeAlignment(EltType); } - - // Cast the start ptr to be i8* as memset requires. 
- const PointerType* StartPTy = cast<PointerType>(StartPtr->getType()); - const PointerType *i8Ptr = Type::getInt8PtrTy(Context, - StartPTy->getAddressSpace()); - if (StartPTy!= i8Ptr) - StartPtr = new BitCastInst(StartPtr, i8Ptr, StartPtr->getName(), - InsertPt); - - Value *Ops[] = { - StartPtr, ByteVal, // Start, value - // size - ConstantInt::get(Type::getInt64Ty(Context), Range.End-Range.Start), - // align - ConstantInt::get(Type::getInt32Ty(Context), Alignment), - // volatile - ConstantInt::get(Type::getInt1Ty(Context), 0), - }; - const Type *Tys[] = { Ops[0]->getType(), Ops[2]->getType() }; - - Function *MemSetF = Intrinsic::getDeclaration(M, Intrinsic::memset, Tys, 2); - - Value *C = CallInst::Create(MemSetF, Ops, Ops+5, "", InsertPt); + + AMemSet = + Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment); + DEBUG(dbgs() << "Replace stores:\n"; for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i) - dbgs() << *Range.TheStores[i]; - dbgs() << "With: " << *C); C=C; - - // Don't invalidate the iterator - BBI = BI; - + dbgs() << *Range.TheStores[i] << '\n'; + dbgs() << "With: " << *AMemSet << '\n'); + // Zap all the stores. - for (SmallVector<StoreInst*, 16>::const_iterator + for (SmallVector<Instruction*, 16>::const_iterator SI = Range.TheStores.begin(), - SE = Range.TheStores.end(); SI != SE; ++SI) + SE = Range.TheStores.end(); SI != SE; ++SI) { + MD->removeInstruction(*SI); (*SI)->eraseFromParent(); + } ++NumMemSetInfer; - MadeChange = true; } - return MadeChange; + return AMemSet; +} + + +bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { + if (SI->isVolatile()) return false; + + if (TD == 0) return false; + + // Detect cases where we're performing call slot forwarding, but + // happen to be using a load-store pair to implement it, rather than + // a memcpy. + if (LoadInst *LI = dyn_cast<LoadInst>(SI->getOperand(0))) { + if (!LI->isVolatile() && LI->hasOneUse()) { + MemDepResult dep = MD->getDependency(LI); + CallInst *C = 0; + if (dep.isClobber() && !isa<MemCpyInst>(dep.getInst())) + C = dyn_cast<CallInst>(dep.getInst()); + + if (C) { + bool changed = performCallSlotOptzn(LI, + SI->getPointerOperand()->stripPointerCasts(), + LI->getPointerOperand()->stripPointerCasts(), + TD->getTypeStoreSize(SI->getOperand(0)->getType()), C); + if (changed) { + MD->removeInstruction(SI); + SI->eraseFromParent(); + MD->removeInstruction(LI); + LI->eraseFromParent(); + ++NumMemCpyInstr; + return true; + } + } + } + } + + // There are two cases that are interesting for this code to handle: memcpy + // and memset. Right now we only handle memset. + + // Ensure that the value being stored is something that can be memset'able a + // byte at a time like "0" or "-1" or any width, as well as things like + // 0xA0A0A0A0 and 0.0. + if (Value *ByteVal = isBytewiseValue(SI->getOperand(0))) + if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(), + ByteVal)) { + BBI = I; // Don't invalidate iterator. + return true; + } + + return false; +} + +bool MemCpyOpt::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) { + // See if there is another memset or store neighboring this memset which + // allows us to widen out the memset to do a single larger store. + if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile()) + if (Instruction *I = tryMergingIntoMemset(MSI, MSI->getDest(), + MSI->getValue())) { + BBI = I; // Don't invalidate iterator. 
+ return true; + } + return false; } /// performCallSlotOptzn - takes a memcpy and a call that it depends on, /// and checks for the possibility of a call slot optimization by having /// the call write its result directly into the destination of the memcpy. -bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) { +bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, + Value *cpyDest, Value *cpySrc, + uint64_t cpyLen, CallInst *C) { // The general transformation to keep in mind is // // call @func(..., src, ...) @@ -506,24 +551,15 @@ bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) { // Deliberately get the source and destination with bitcasts stripped away, // because we'll need to do type comparisons based on the underlying type. - Value *cpyDest = cpy->getDest(); - Value *cpySrc = cpy->getSource(); CallSite CS(C); - // We need to be able to reason about the size of the memcpy, so we require - // that it be a constant. - ConstantInt *cpyLength = dyn_cast<ConstantInt>(cpy->getLength()); - if (!cpyLength) - return false; - // Require that src be an alloca. This simplifies the reasoning considerably. AllocaInst *srcAlloca = dyn_cast<AllocaInst>(cpySrc); if (!srcAlloca) return false; // Check that all of src is copied to dest. - TargetData *TD = getAnalysisIfAvailable<TargetData>(); - if (!TD) return false; + if (TD == 0) return false; ConstantInt *srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize()); if (!srcArraySize) @@ -532,7 +568,7 @@ bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) { uint64_t srcSize = TD->getTypeAllocSize(srcAlloca->getAllocatedType()) * srcArraySize->getZExtValue(); - if (cpyLength->getZExtValue() < srcSize) + if (cpyLen < srcSize) return false; // Check that accessing the first srcSize bytes of dest will not cause a @@ -601,8 +637,7 @@ bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) { // the use analysis, we also need to know that it does not sneakily // access dest. We rely on AA to figure this out for us. AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); - if (AA.getModRefInfo(C, cpy->getRawDest(), srcSize) != - AliasAnalysis::NoModRef) + if (AA.getModRefInfo(C, cpyDest, srcSize) != AliasAnalysis::NoModRef) return false; // All the checks have passed, so do the transformation. @@ -625,99 +660,142 @@ bool MemCpyOpt::performCallSlotOptzn(MemCpyInst *cpy, CallInst *C) { // Drop any cached information about the call, because we may have changed // its dependence information by changing its parameter. - MemoryDependenceAnalysis &MD = getAnalysis<MemoryDependenceAnalysis>(); - MD.removeInstruction(C); + MD->removeInstruction(C); - // Remove the memcpy - MD.removeInstruction(cpy); - cpy->eraseFromParent(); + // Remove the memcpy. + MD->removeInstruction(cpy); ++NumMemCpyInstr; return true; } -/// processMemCpy - perform simplification of memcpy's. If we have memcpy A -/// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite -/// B to be a memcpy from X to Z (or potentially a memmove, depending on -/// circumstances). This allows later passes to remove the first memcpy -/// altogether. -bool MemCpyOpt::processMemCpy(MemCpyInst *M) { - MemoryDependenceAnalysis &MD = getAnalysis<MemoryDependenceAnalysis>(); - - // The are two possible optimizations we can do for memcpy: - // a) memcpy-memcpy xform which exposes redundance for DSE. - // b) call-memcpy xform for return slot optimization. 
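At the source level, the memcpy-memcpy case (a) corresponds to the pattern sketched below; this is a hedged illustration rather than code from this import, but it shows why rewriting the second copy to read from the original buffer leaves the first copy dead for DSE, and why a memmove must be used when the final destination may overlap the original source:

#include <cstddef>
#include <cstring>

// memcpy A copies src into tmp, memcpy B copies tmp into dst.
void before(char *dst, const char *src, char *tmp, std::size_t n) {
  std::memcpy(tmp, src, n);   // A: X -> Y
  std::memcpy(dst, tmp, n);   // B: Y -> Z
}

// After the transform B reads from src directly, so A is dead once tmp has no
// other uses.  If dst could alias src, the pass emits memmove here instead,
// because overlapping memcpy operands are undefined.
void after(char *dst, const char *src, char *tmp, std::size_t n) {
  std::memcpy(tmp, src, n);
  std::memcpy(dst, src, n);   // B rewritten: X -> Z
}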
- MemDepResult dep = MD.getDependency(M); - if (!dep.isClobber()) - return false; - if (!isa<MemCpyInst>(dep.getInst())) { - if (CallInst *C = dyn_cast<CallInst>(dep.getInst())) - return performCallSlotOptzn(M, C); +/// processMemCpyMemCpyDependence - We've found that the (upward scanning) +/// memory dependence of memcpy 'M' is the memcpy 'MDep'. Try to simplify M to +/// copy from MDep's input if we can. MSize is the size of M's copy. +/// +bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, + uint64_t MSize) { + // We can only transforms memcpy's where the dest of one is the source of the + // other. + if (M->getSource() != MDep->getDest() || MDep->isVolatile()) return false; - } - - MemCpyInst *MDep = cast<MemCpyInst>(dep.getInst()); - // We can only transforms memcpy's where the dest of one is the source of the - // other - if (M->getSource() != MDep->getDest()) + // If dep instruction is reading from our current input, then it is a noop + // transfer and substituting the input won't change this instruction. Just + // ignore the input and let someone else zap MDep. This handles cases like: + // memcpy(a <- a) + // memcpy(b <- a) + if (M->getSource() == MDep->getSource()) return false; // Second, the length of the memcpy's must be the same, or the preceeding one // must be larger than the following one. - ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength()); - ConstantInt *C2 = dyn_cast<ConstantInt>(M->getLength()); - if (!C1 || !C2) - return false; - - uint64_t DepSize = C1->getValue().getZExtValue(); - uint64_t CpySize = C2->getValue().getZExtValue(); - - if (DepSize < CpySize) + ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength()); + ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength()); + if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue()) return false; - // Finally, we have to make sure that the dest of the second does not - // alias the source of the first AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); - if (AA.alias(M->getRawDest(), CpySize, MDep->getRawSource(), DepSize) != - AliasAnalysis::NoAlias) + + // Verify that the copied-from memory doesn't change in between the two + // transfers. For example, in: + // memcpy(a <- b) + // *b = 42; + // memcpy(c <- a) + // It would be invalid to transform the second memcpy into memcpy(c <- b). + // + // TODO: If the code between M and MDep is transparent to the destination "c", + // then we could still perform the xform by moving M up to the first memcpy. + // + // NOTE: This is conservative, it will stop on any read from the source loc, + // not just the defining memcpy. + MemDepResult SourceDep = + MD->getPointerDependencyFrom(AA.getLocationForSource(MDep), + false, M, M->getParent()); + if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) return false; - else if (AA.alias(M->getRawDest(), CpySize, M->getRawSource(), CpySize) != - AliasAnalysis::NoAlias) + + // If the dest of the second might alias the source of the first, then the + // source and dest might overlap. We still want to eliminate the intermediate + // value, but we have to generate a memmove instead of memcpy. + bool UseMemMove = false; + if (!AA.isNoAlias(AA.getLocationForDest(M), AA.getLocationForSource(MDep))) + UseMemMove = true; + + // If all checks passed, then we can transform M. 
+ + // Make sure to use the lesser of the alignment of the source and the dest + // since we're changing where we're reading from, but don't want to increase + // the alignment past what can be read from or written to. + // TODO: Is this worth it if we're creating a less aligned memcpy? For + // example we could be moving from movaps -> movq on x86. + unsigned Align = std::min(MDep->getAlignment(), M->getAlignment()); + + IRBuilder<> Builder(M); + if (UseMemMove) + Builder.CreateMemMove(M->getRawDest(), MDep->getRawSource(), M->getLength(), + Align, M->isVolatile()); + else + Builder.CreateMemCpy(M->getRawDest(), MDep->getRawSource(), M->getLength(), + Align, M->isVolatile()); + + // Remove the instruction we're replacing. + MD->removeInstruction(M); + M->eraseFromParent(); + ++NumMemCpyInstr; + return true; +} + + +/// processMemCpy - perform simplification of memcpy's. If we have memcpy A +/// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite +/// B to be a memcpy from X to Z (or potentially a memmove, depending on +/// circumstances). This allows later passes to remove the first memcpy +/// altogether. +bool MemCpyOpt::processMemCpy(MemCpyInst *M) { + // We can only optimize statically-sized memcpy's that are non-volatile. + ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength()); + if (CopySize == 0 || M->isVolatile()) return false; + + // If the source and destination of the memcpy are the same, then zap it. + if (M->getSource() == M->getDest()) { + MD->removeInstruction(M); + M->eraseFromParent(); return false; - else if (AA.alias(MDep->getRawDest(), DepSize, MDep->getRawSource(), DepSize) - != AliasAnalysis::NoAlias) + } + + // If copying from a constant, try to turn the memcpy into a memset. + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(M->getSource())) + if (GV->isConstant() && GV->hasDefinitiveInitializer()) + if (Value *ByteVal = isBytewiseValue(GV->getInitializer())) { + IRBuilder<> Builder(M); + Builder.CreateMemSet(M->getRawDest(), ByteVal, CopySize, + M->getAlignment(), false); + MD->removeInstruction(M); + M->eraseFromParent(); + ++NumCpyToSet; + return true; + } + + // The are two possible optimizations we can do for memcpy: + // a) memcpy-memcpy xform which exposes redundance for DSE. + // b) call-memcpy xform for return slot optimization. + MemDepResult DepInfo = MD->getDependency(M); + if (!DepInfo.isClobber()) return false; - // If all checks passed, then we can transform these memcpy's - const Type *ArgTys[3] = { M->getRawDest()->getType(), - MDep->getRawSource()->getType(), - M->getLength()->getType() }; - Function *MemCpyFun = Intrinsic::getDeclaration( - M->getParent()->getParent()->getParent(), - M->getIntrinsicID(), ArgTys, 3); + if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst())) + return processMemCpyMemCpyDependence(M, MDep, CopySize->getZExtValue()); - Value *Args[5] = { - M->getRawDest(), MDep->getRawSource(), M->getLength(), - M->getAlignmentCst(), M->getVolatileCst() - }; - - CallInst *C = CallInst::Create(MemCpyFun, Args, Args+5, "", M); - - - // If C and M don't interfere, then this is a valid transformation. If they - // did, this would mean that the two sources overlap, which would be bad. 
- if (MD.getDependency(C) == dep) { - MD.removeInstruction(M); - M->eraseFromParent(); - ++NumMemCpyInstr; - return true; + if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) { + if (performCallSlotOptzn(M, M->getDest(), M->getSource(), + CopySize->getZExtValue(), C)) { + MD->removeInstruction(M); + M->eraseFromParent(); + return true; + } } - // Otherwise, there was no point in doing this, so we remove the call we - // inserted and act like nothing happened. - MD.removeInstruction(C); - C->eraseFromParent(); return false; } @@ -726,15 +804,8 @@ bool MemCpyOpt::processMemCpy(MemCpyInst *M) { bool MemCpyOpt::processMemMove(MemMoveInst *M) { AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); - // If the memmove is a constant size, use it for the alias query, this allows - // us to optimize things like: memmove(P, P+64, 64); - uint64_t MemMoveSize = ~0ULL; - if (ConstantInt *Len = dyn_cast<ConstantInt>(M->getLength())) - MemMoveSize = Len->getZExtValue(); - // See if the pointers alias. - if (AA.alias(M->getRawDest(), MemMoveSize, M->getRawSource(), MemMoveSize) != - AliasAnalysis::NoAlias) + if (!AA.isNoAlias(AA.getLocationForDest(M), AA.getLocationForSource(M))) return false; DEBUG(dbgs() << "MemCpyOpt: Optimizing memmove -> memcpy: " << *M << "\n"); @@ -749,33 +820,107 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) { // MemDep may have over conservative information about this instruction, just // conservatively flush it from the cache. - getAnalysis<MemoryDependenceAnalysis>().removeInstruction(M); + MD->removeInstruction(M); ++NumMoveToCpy; return true; } +/// processByValArgument - This is called on every byval argument in call sites. +bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { + if (TD == 0) return false; + + // Find out what feeds this byval argument. + Value *ByValArg = CS.getArgument(ArgNo); + const Type *ByValTy =cast<PointerType>(ByValArg->getType())->getElementType(); + uint64_t ByValSize = TD->getTypeAllocSize(ByValTy); + MemDepResult DepInfo = + MD->getPointerDependencyFrom(AliasAnalysis::Location(ByValArg, ByValSize), + true, CS.getInstruction(), + CS.getInstruction()->getParent()); + if (!DepInfo.isClobber()) + return false; + + // If the byval argument isn't fed by a memcpy, ignore it. If it is fed by + // a memcpy, see if we can byval from the source of the memcpy instead of the + // result. + MemCpyInst *MDep = dyn_cast<MemCpyInst>(DepInfo.getInst()); + if (MDep == 0 || MDep->isVolatile() || + ByValArg->stripPointerCasts() != MDep->getDest()) + return false; + + // The length of the memcpy must be larger or equal to the size of the byval. + ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength()); + if (C1 == 0 || C1->getValue().getZExtValue() < ByValSize) + return false; + + // Get the alignment of the byval. If it is greater than the memcpy, then we + // can't do the substitution. If the call doesn't specify the alignment, then + // it is some target specific value that we can't know. + unsigned ByValAlign = CS.getParamAlignment(ArgNo+1); + if (ByValAlign == 0 || MDep->getAlignment() < ByValAlign) + return false; + + // Verify that the copied-from memory doesn't change in between the memcpy and + // the byval call. + // memcpy(a <- b) + // *b = 42; + // foo(*a) + // It would be invalid to transform the second memcpy into foo(*b). + // + // NOTE: This is conservative, it will stop on any read from the source loc, + // not just the defining memcpy. 
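The byval forwarding has a simple source-level analogue, shown here as a hedged sketch (Big, callee, before and after are invented names): when a temporary exists only to carry a copy of 's' into a by-value argument, the call can take its copy straight from 's', provided nothing writes to 's' between the copy and the call, which is exactly what the dependence check described in the comment above establishes:

#include <cstring>

struct Big { char bytes[256]; };

// Models a byval parameter: the call site itself makes the callee's copy.
void callee(Big arg) { (void)arg; }

void before(const Big &s) {
  Big tmp;
  std::memcpy(&tmp, &s, sizeof(Big));  // memcpy that only feeds the argument
  callee(tmp);                         // second copy made at the call
}

// What the transform amounts to: take the argument's copy directly from the
// memcpy source, so tmp and the intermediate copy become dead.
void after(const Big &s) {
  callee(s);
}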
+ MemDepResult SourceDep = + MD->getPointerDependencyFrom(AliasAnalysis::getLocationForSource(MDep), + false, CS.getInstruction(), MDep->getParent()); + if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) + return false; + + Value *TmpCast = MDep->getSource(); + if (MDep->getSource()->getType() != ByValArg->getType()) + TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(), + "tmpcast", CS.getInstruction()); + + DEBUG(dbgs() << "MemCpyOpt: Forwarding memcpy to byval:\n" + << " " << *MDep << "\n" + << " " << *CS.getInstruction() << "\n"); + + // Otherwise we're good! Update the byval argument. + CS.setArgument(ArgNo, TmpCast); + ++NumMemCpyInstr; + return true; +} -// MemCpyOpt::iterateOnFunction - Executes one iteration of GVN. +/// iterateOnFunction - Executes one iteration of MemCpyOpt. bool MemCpyOpt::iterateOnFunction(Function &F) { bool MadeChange = false; // Walk all instruction in the function. for (Function::iterator BB = F.begin(), BBE = F.end(); BB != BBE; ++BB) { - for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); - BI != BE;) { + for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { // Avoid invalidating the iterator. Instruction *I = BI++; + bool RepeatInstruction = false; + if (StoreInst *SI = dyn_cast<StoreInst>(I)) MadeChange |= processStore(SI, BI); + else if (MemSetInst *M = dyn_cast<MemSetInst>(I)) + RepeatInstruction = processMemSet(M, BI); else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I)) - MadeChange |= processMemCpy(M); - else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I)) { - if (processMemMove(M)) { - --BI; // Reprocess the new memcpy. - MadeChange = true; - } + RepeatInstruction = processMemCpy(M); + else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I)) + RepeatInstruction = processMemMove(M); + else if (CallSite CS = (Value*)I) { + for (unsigned i = 0, e = CS.arg_size(); i != e; ++i) + if (CS.paramHasAttr(i+1, Attribute::ByVal)) + MadeChange |= processByValArgument(CS, i); + } + + // Reprocess the instruction if desired. + if (RepeatInstruction) { + if (BI != BB->begin()) --BI; + MadeChange = true; } } } @@ -788,14 +933,14 @@ bool MemCpyOpt::iterateOnFunction(Function &F) { // bool MemCpyOpt::runOnFunction(Function &F) { bool MadeChange = false; + MD = &getAnalysis<MemoryDependenceAnalysis>(); + TD = getAnalysisIfAvailable<TargetData>(); while (1) { if (!iterateOnFunction(F)) break; MadeChange = true; } + MD = 0; return MadeChange; } - - - diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index b8afcc1..e093b52 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -77,7 +77,9 @@ namespace { bool MadeChange; public: static char ID; // Pass identification, replacement for typeid - Reassociate() : FunctionPass(ID) {} + Reassociate() : FunctionPass(ID) { + initializeReassociatePass(*PassRegistry::getPassRegistry()); + } bool runOnFunction(Function &F); @@ -104,7 +106,7 @@ namespace { char Reassociate::ID = 0; INITIALIZE_PASS(Reassociate, "reassociate", - "Reassociate expressions", false, false); + "Reassociate expressions", false, false) // Public interface to the Reassociate pass FunctionPass *llvm::createReassociatePass() { return new Reassociate(); } @@ -238,6 +240,12 @@ void Reassociate::LinearizeExpr(BinaryOperator *I) { RHS->setOperand(0, LHS); I->setOperand(0, RHS); + // Conservatively clear all the optional flags, which may not hold + // after the reassociation. 
+ I->clearSubclassOptionalData(); + LHS->clearSubclassOptionalData(); + RHS->clearSubclassOptionalData(); + ++NumLinear; MadeChange = true; DEBUG(dbgs() << "Linearized: " << *I << '\n'); @@ -339,6 +347,12 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, DEBUG(dbgs() << "RA: " << *I << '\n'); I->setOperand(0, Ops[i].Op); I->setOperand(1, Ops[i+1].Op); + + // Clear all the optional flags, which may not hold after the + // reassociation if the expression involved more than just this operation. + if (Ops.size() != 2) + I->clearSubclassOptionalData(); + DEBUG(dbgs() << "TO: " << *I << '\n'); MadeChange = true; ++NumChanged; @@ -354,6 +368,11 @@ void Reassociate::RewriteExprTree(BinaryOperator *I, if (I->getOperand(1) != Ops[i].Op) { DEBUG(dbgs() << "RA: " << *I << '\n'); I->setOperand(1, Ops[i].Op); + + // Conservatively clear all the optional flags, which may not hold + // after the reassociation. + I->clearSubclassOptionalData(); + DEBUG(dbgs() << "TO: " << *I << '\n'); MadeChange = true; ++NumChanged; @@ -809,16 +828,23 @@ Value *Reassociate::OptimizeAdd(Instruction *I, // RemoveFactorFromExpression on successive values to behave differently. Instruction *DummyInst = BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal); SmallVector<Value*, 4> NewMulOps; - for (unsigned i = 0, e = Ops.size(); i != e; ++i) { + for (unsigned i = 0; i != Ops.size(); ++i) { // Only try to remove factors from expressions we're allowed to. BinaryOperator *BOp = dyn_cast<BinaryOperator>(Ops[i].Op); if (BOp == 0 || BOp->getOpcode() != Instruction::Mul || !BOp->use_empty()) continue; if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal)) { - NewMulOps.push_back(V); - Ops.erase(Ops.begin()+i); - --i; --e; + // The factorized operand may occur several times. Convert them all in + // one fell swoop. + for (unsigned j = Ops.size(); j != i;) { + --j; + if (Ops[j].Op == Ops[i].Op) { + NewMulOps.push_back(V); + Ops.erase(Ops.begin()+j); + } + } + --i; } } diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp index 506b72a..459bb06 100644 --- a/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/lib/Transforms/Scalar/Reg2Mem.cpp @@ -36,7 +36,9 @@ STATISTIC(NumPhisDemoted, "Number of phi-nodes demoted"); namespace { struct RegToMem : public FunctionPass { static char ID; // Pass identification, replacement for typeid - RegToMem() : FunctionPass(ID) {} + RegToMem() : FunctionPass(ID) { + initializeRegToMemPass(*PassRegistry::getPassRegistry()); + } virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequiredID(BreakCriticalEdgesID); @@ -59,9 +61,11 @@ namespace { } char RegToMem::ID = 0; -INITIALIZE_PASS(RegToMem, "reg2mem", "Demote all values to stack slots", - false, false); - +INITIALIZE_PASS_BEGIN(RegToMem, "reg2mem", "Demote all values to stack slots", + false, false) +INITIALIZE_PASS_DEPENDENCY(BreakCriticalEdges) +INITIALIZE_PASS_END(RegToMem, "reg2mem", "Demote all values to stack slots", + false, false) bool RegToMem::runOnFunction(Function &F) { if (F.isDeclaration()) diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 6115c05..c82e929 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -481,6 +481,19 @@ private: } } + /// InsertInOverdefinedPHIs - Insert an entry in the UsersOfOverdefinedPHIS + /// map for I and PN, but if one is there already, do not create another. + /// (Duplicate entries do not break anything directly, but can lead to + /// exponential growth of the table in rare cases.) 
+ void InsertInOverdefinedPHIs(Instruction *I, PHINode *PN) { + std::multimap<PHINode*, Instruction*>::iterator J, E; + tie(J, E) = UsersOfOverdefinedPHIs.equal_range(PN); + for (; J != E; ++J) + if (J->second == I) + return; + UsersOfOverdefinedPHIs.insert(std::make_pair(PN, I)); + } + private: friend class InstVisitor<SCCPSolver>; @@ -973,9 +986,9 @@ void SCCPSolver::visitBinaryOperator(Instruction &I) { if (Result.isConstant()) { markConstant(IV, &I, Result.getConstant()); // Remember that this instruction is virtually using the PHI node - // operands. - UsersOfOverdefinedPHIs.insert(std::make_pair(PN1, &I)); - UsersOfOverdefinedPHIs.insert(std::make_pair(PN2, &I)); + // operands. + InsertInOverdefinedPHIs(&I, PN1); + InsertInOverdefinedPHIs(&I, PN2); return; } @@ -1056,8 +1069,8 @@ void SCCPSolver::visitCmpInst(CmpInst &I) { markConstant(&I, Result.getConstant()); // Remember that this instruction is virtually using the PHI node // operands. - UsersOfOverdefinedPHIs.insert(std::make_pair(PN1, &I)); - UsersOfOverdefinedPHIs.insert(std::make_pair(PN2, &I)); + InsertInOverdefinedPHIs(&I, PN1); + InsertInOverdefinedPHIs(&I, PN2); return; } @@ -1585,22 +1598,20 @@ namespace { /// struct SCCP : public FunctionPass { static char ID; // Pass identification, replacement for typeid - SCCP() : FunctionPass(ID) {} + SCCP() : FunctionPass(ID) { + initializeSCCPPass(*PassRegistry::getPassRegistry()); + } // runOnFunction - Run the Sparse Conditional Constant Propagation // algorithm, and return true if the function was modified. // bool runOnFunction(Function &F); - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.setPreservesCFG(); - } }; } // end anonymous namespace char SCCP::ID = 0; INITIALIZE_PASS(SCCP, "sccp", - "Sparse Conditional Constant Propagation", false, false); + "Sparse Conditional Constant Propagation", false, false) // createSCCPPass - This is the public interface to this file. FunctionPass *llvm::createSCCPPass() { @@ -1701,7 +1712,9 @@ namespace { /// struct IPSCCP : public ModulePass { static char ID; - IPSCCP() : ModulePass(ID) {} + IPSCCP() : ModulePass(ID) { + initializeIPSCCPPass(*PassRegistry::getPassRegistry()); + } bool runOnModule(Module &M); }; } // end anonymous namespace @@ -1709,7 +1722,7 @@ namespace { char IPSCCP::ID = 0; INITIALIZE_PASS(IPSCCP, "ipsccp", "Interprocedural Sparse Conditional Constant Propagation", - false, false); + false, false) // createIPSCCPPass - This is the public interface to this file. ModulePass *llvm::createIPSCCPPass() { diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index cb03423..bf9ca6d 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -7,12 +7,15 @@ // //===----------------------------------------------------------------------===// // -// This file implements the C bindings for libLLVMScalarOpts.a, which implements -// several scalar transformations over the LLVM intermediate representation. +// This file implements common infrastructure for libLLVMScalarOpts.a, which +// implements several scalar transformations over the LLVM intermediate +// representation, including the C bindings for that library. 
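For context on the new entry points that follow, a client linking LLVMScalarOpts would now register these passes once at startup; the snippet below is a hedged usage sketch (the setUpScalarOpts wrapper and the llvm/PassRegistry.h include are assumptions of the sketch, while initializeScalarOpts and PassRegistry::getPassRegistry appear in this diff):

#include "llvm/InitializePasses.h"
#include "llvm/PassRegistry.h"

// Register every ScalarOpts pass with the global registry so that later
// lookups by name (for example from pass-manager options) succeed.
void setUpScalarOpts() {
  llvm::PassRegistry &Registry = *llvm::PassRegistry::getPassRegistry();
  llvm::initializeScalarOpts(Registry);
}

C clients get the same effect through the LLVMInitializeScalarOpts binding defined a few lines further down.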
// //===----------------------------------------------------------------------===// #include "llvm-c/Transforms/Scalar.h" +#include "llvm-c/Initialization.h" +#include "llvm/InitializePasses.h" #include "llvm/PassManager.h" #include "llvm/Analysis/Verifier.h" #include "llvm/Target/TargetData.h" @@ -20,6 +23,50 @@ using namespace llvm; +/// initializeScalarOptsPasses - Initialize all passes linked into the +/// ScalarOpts library. +void llvm::initializeScalarOpts(PassRegistry &Registry) { + initializeADCEPass(Registry); + initializeBlockPlacementPass(Registry); + initializeCodeGenPreparePass(Registry); + initializeConstantPropagationPass(Registry); + initializeCorrelatedValuePropagationPass(Registry); + initializeDCEPass(Registry); + initializeDeadInstEliminationPass(Registry); + initializeDSEPass(Registry); + initializeGEPSplitterPass(Registry); + initializeGVNPass(Registry); + initializeEarlyCSEPass(Registry); + initializeIndVarSimplifyPass(Registry); + initializeJumpThreadingPass(Registry); + initializeLICMPass(Registry); + initializeLoopDeletionPass(Registry); + initializeLoopInstSimplifyPass(Registry); + initializeLoopRotatePass(Registry); + initializeLoopStrengthReducePass(Registry); + initializeLoopUnrollPass(Registry); + initializeLoopUnswitchPass(Registry); + initializeLoopIdiomRecognizePass(Registry); + initializeLowerAtomicPass(Registry); + initializeMemCpyOptPass(Registry); + initializeReassociatePass(Registry); + initializeRegToMemPass(Registry); + initializeSCCPPass(Registry); + initializeIPSCCPPass(Registry); + initializeSROA_DTPass(Registry); + initializeSROA_SSAUpPass(Registry); + initializeCFGSimplifyPassPass(Registry); + initializeSimplifyHalfPowrLibCallsPass(Registry); + initializeSimplifyLibCallsPass(Registry); + initializeSinkingPass(Registry); + initializeTailDupPass(Registry); + initializeTailCallElimPass(Registry); +} + +void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) { + initializeScalarOpts(*unwrap(R)); +} + void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createAggressiveDCEPass()); } @@ -56,10 +103,6 @@ void LLVMAddLoopDeletionPass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLoopDeletionPass()); } -void LLVMAddLoopIndexSplitPass(LLVMPassManagerRef PM) { - unwrap(PM)->add(createLoopIndexSplitPass()); -} - void LLVMAddLoopRotatePass(LLVMPassManagerRef PM) { unwrap(PM)->add(createLoopRotatePass()); } diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index fee317d..c3ca852 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -31,28 +31,34 @@ #include "llvm/Module.h" #include "llvm/Pass.h" #include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/Loads.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Target/TargetData.h" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/IRBuilder.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" using namespace llvm; STATISTIC(NumReplaced, "Number of allocas broken up"); STATISTIC(NumPromoted, "Number of allocas promoted"); +STATISTIC(NumAdjusted, "Number of scalar allocas 
adjusted to allow promotion"); STATISTIC(NumConverted, "Number of aggregates converted to scalar"); STATISTIC(NumGlobals, "Number of allocas copied from constant global"); namespace { struct SROA : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - explicit SROA(signed T = -1) : FunctionPass(ID) { + SROA(int T, bool hasDT, char &ID) + : FunctionPass(ID), HasDomTree(hasDT) { if (T == -1) SRThreshold = 128; else @@ -64,17 +70,10 @@ namespace { bool performScalarRepl(Function &F); bool performPromotion(Function &F); - // getAnalysisUsage - This pass does not require any passes, but we know it - // will not alter the CFG, so say so. - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<DominatorTree>(); - AU.addRequired<DominanceFrontier>(); - AU.setPreservesCFG(); - } - private: + bool HasDomTree; TargetData *TD; - + /// DeadInsts - Keep track of instructions we have made dead, so that /// we can remove them after we are done working. SmallVector<Value*, 32> DeadInsts; @@ -83,39 +82,61 @@ namespace { /// information about the uses. All these fields are initialized to false /// and set to true when something is learned. struct AllocaInfo { + /// The alloca to promote. + AllocaInst *AI; + + /// CheckedPHIs - This is a set of verified PHI nodes, to prevent infinite + /// looping and avoid redundant work. + SmallPtrSet<PHINode*, 8> CheckedPHIs; + /// isUnsafe - This is set to true if the alloca cannot be SROA'd. bool isUnsafe : 1; - + /// isMemCpySrc - This is true if this aggregate is memcpy'd from. bool isMemCpySrc : 1; /// isMemCpyDst - This is true if this aggregate is memcpy'd into. bool isMemCpyDst : 1; - AllocaInfo() - : isUnsafe(false), isMemCpySrc(false), isMemCpyDst(false) {} + /// hasSubelementAccess - This is true if a subelement of the alloca is + /// ever accessed, or false if the alloca is only accessed with mem + /// intrinsics or load/store that only access the entire alloca at once. + bool hasSubelementAccess : 1; + + /// hasALoadOrStore - This is true if there are any loads or stores to it. + /// The alloca may just be accessed with memcpy, for example, which would + /// not set this. 
+ bool hasALoadOrStore : 1; + + explicit AllocaInfo(AllocaInst *ai) + : AI(ai), isUnsafe(false), isMemCpySrc(false), isMemCpyDst(false), + hasSubelementAccess(false), hasALoadOrStore(false) {} }; - + unsigned SRThreshold; - void MarkUnsafe(AllocaInfo &I) { I.isUnsafe = true; } + void MarkUnsafe(AllocaInfo &I, Instruction *User) { + I.isUnsafe = true; + DEBUG(dbgs() << " Transformation preventing inst: " << *User << '\n'); + } bool isSafeAllocaToScalarRepl(AllocaInst *AI); - void isSafeForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, - AllocaInfo &Info); - void isSafeGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t &Offset, - AllocaInfo &Info); - void isSafeMemAccess(AllocaInst *AI, uint64_t Offset, uint64_t MemSize, - const Type *MemOpType, bool isStore, AllocaInfo &Info); + void isSafeForScalarRepl(Instruction *I, uint64_t Offset, AllocaInfo &Info); + void isSafePHISelectUseForScalarRepl(Instruction *User, uint64_t Offset, + AllocaInfo &Info); + void isSafeGEP(GetElementPtrInst *GEPI, uint64_t &Offset, AllocaInfo &Info); + void isSafeMemAccess(uint64_t Offset, uint64_t MemSize, + const Type *MemOpType, bool isStore, AllocaInfo &Info, + Instruction *TheAccess, bool AllowWholeAccess); bool TypeHasComponent(const Type *T, uint64_t Offset, uint64_t Size); uint64_t FindElementAndOffset(const Type *&T, uint64_t &Offset, const Type *&IdxTy); - - void DoScalarReplacement(AllocaInst *AI, + + void DoScalarReplacement(AllocaInst *AI, std::vector<AllocaInst*> &WorkList); void DeleteDeadInstructions(); - + void RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, SmallVector<AllocaInst*, 32> &NewElts); void RewriteBitCast(BitCastInst *BC, AllocaInst *AI, uint64_t Offset, @@ -129,18 +150,63 @@ namespace { SmallVector<AllocaInst*, 32> &NewElts); void RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, SmallVector<AllocaInst*, 32> &NewElts); - + static MemTransferInst *isOnlyCopiedFromConstantGlobal(AllocaInst *AI); }; + + // SROA_DT - SROA that uses DominatorTree. + struct SROA_DT : public SROA { + static char ID; + public: + SROA_DT(int T = -1) : SROA(T, true, ID) { + initializeSROA_DTPass(*PassRegistry::getPassRegistry()); + } + + // getAnalysisUsage - This pass does not require any passes, but we know it + // will not alter the CFG, so say so. + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<DominatorTree>(); + AU.setPreservesCFG(); + } + }; + + // SROA_SSAUp - SROA that uses SSAUpdater. + struct SROA_SSAUp : public SROA { + static char ID; + public: + SROA_SSAUp(int T = -1) : SROA(T, false, ID) { + initializeSROA_SSAUpPass(*PassRegistry::getPassRegistry()); + } + + // getAnalysisUsage - This pass does not require any passes, but we know it + // will not alter the CFG, so say so. 
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesCFG(); + } + }; + } -char SROA::ID = 0; -INITIALIZE_PASS(SROA, "scalarrepl", - "Scalar Replacement of Aggregates", false, false); +char SROA_DT::ID = 0; +char SROA_SSAUp::ID = 0; + +INITIALIZE_PASS_BEGIN(SROA_DT, "scalarrepl", + "Scalar Replacement of Aggregates (DT)", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_PASS_END(SROA_DT, "scalarrepl", + "Scalar Replacement of Aggregates (DT)", false, false) + +INITIALIZE_PASS_BEGIN(SROA_SSAUp, "scalarrepl-ssa", + "Scalar Replacement of Aggregates (SSAUp)", false, false) +INITIALIZE_PASS_END(SROA_SSAUp, "scalarrepl-ssa", + "Scalar Replacement of Aggregates (SSAUp)", false, false) // Public interface to the ScalarReplAggregates pass -FunctionPass *llvm::createScalarReplAggregatesPass(signed int Threshold) { - return new SROA(Threshold); +FunctionPass *llvm::createScalarReplAggregatesPass(int Threshold, + bool UseDomTree) { + if (UseDomTree) + return new SROA_DT(Threshold); + return new SROA_SSAUp(Threshold); } @@ -156,16 +222,16 @@ class ConvertToScalarInfo { /// AllocaSize - The size of the alloca being considered. unsigned AllocaSize; const TargetData &TD; - + /// IsNotTrivial - This is set to true if there is some access to the object /// which means that mem2reg can't promote it. bool IsNotTrivial; - + /// VectorTy - This tracks the type that we should promote the vector to if /// it is possible to turn it into a vector. This starts out null, and if it /// isn't possible to turn into a vector type, it gets set to VoidTy. const Type *VectorTy; - + /// HadAVector - True if there is at least one vector access to the alloca. /// We don't want to turn random arrays into vectors and use vector element /// insert/extract, but if there are element accesses to something that is @@ -179,14 +245,14 @@ public: VectorTy = 0; HadAVector = false; } - + AllocaInst *TryConvert(AllocaInst *AI); - + private: bool CanConvertToScalar(Value *V, uint64_t Offset); void MergeInType(const Type *In, uint64_t Offset); void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset); - + Value *ConvertScalar_ExtractValue(Value *NV, const Type *ToType, uint64_t Offset, IRBuilder<> &Builder); Value *ConvertScalar_InsertValue(Value *StoredVal, Value *ExistingVal, @@ -195,26 +261,6 @@ private: } // end anonymous namespace. -/// IsVerbotenVectorType - Return true if this is a vector type ScalarRepl isn't -/// allowed to form. We do this to avoid MMX types, which is a complete hack, -/// but is required until the backend is fixed. -static bool IsVerbotenVectorType(const VectorType *VTy, const Instruction *I) { - StringRef Triple(I->getParent()->getParent()->getParent()->getTargetTriple()); - if (!Triple.startswith("i386") && - !Triple.startswith("x86_64")) - return false; - - // Reject all the MMX vector types. - switch (VTy->getNumElements()) { - default: return false; - case 1: return VTy->getElementType()->isIntegerTy(64); - case 2: return VTy->getElementType()->isIntegerTy(32); - case 4: return VTy->getElementType()->isIntegerTy(16); - case 8: return VTy->getElementType()->isIntegerTy(8); - } -} - - /// TryConvert - Analyze the specified alloca, and if it is safe to do so, /// rewrite it to be a new alloca which is mem2reg'able. This returns the new /// alloca if possible or null if not. @@ -223,7 +269,7 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { // out. 
if (!CanConvertToScalar(AI, 0) || !IsNotTrivial) return 0; - + // If we were able to find a vector type that can handle this with // insert/extract elements, and if there was at least one use that had // a vector type, promote this to a vector. We don't want to promote @@ -231,8 +277,7 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { // we just get a lot of insert/extracts. If at least one vector is // involved, then we probably really do have a union of vector/array. const Type *NewTy; - if (VectorTy && VectorTy->isVectorTy() && HadAVector && - !IsVerbotenVectorType(cast<VectorType>(VectorTy), AI)) { + if (VectorTy && VectorTy->isVectorTy() && HadAVector) { DEBUG(dbgs() << "CONVERT TO VECTOR: " << *AI << "\n TYPE = " << *VectorTy << '\n'); NewTy = VectorTy; // Use the vector type. @@ -263,7 +308,7 @@ void ConvertToScalarInfo::MergeInType(const Type *In, uint64_t Offset) { // nothing to be done. if (VectorTy && VectorTy->isVoidTy()) return; - + // If this could be contributing to a vector, analyze it. // If the In type is a vector that is the same size as the alloca, see if it @@ -271,7 +316,7 @@ void ConvertToScalarInfo::MergeInType(const Type *In, uint64_t Offset) { if (const VectorType *VInTy = dyn_cast<VectorType>(In)) { // Remember if we saw a vector type. HadAVector = true; - + if (VInTy->getBitWidth()/8 == AllocaSize && Offset == 0) { // If we're storing/loading a vector of the right size, allow it as a // vector. If this the first vector we see, remember the type so that @@ -290,7 +335,7 @@ void ConvertToScalarInfo::MergeInType(const Type *In, uint64_t Offset) { // compatible with it. unsigned EltSize = In->getPrimitiveSizeInBits()/8; if (Offset % EltSize == 0 && AllocaSize % EltSize == 0 && - (VectorTy == 0 || + (VectorTy == 0 || cast<VectorType>(VectorTy)->getElementType() ->getPrimitiveSizeInBits()/8 == EltSize)) { if (VectorTy == 0) @@ -298,7 +343,7 @@ void ConvertToScalarInfo::MergeInType(const Type *In, uint64_t Offset) { return; } } - + // Otherwise, we have a case that we can't handle with an optimized vector // form. We can still turn this into a large integer. VectorTy = Type::getVoidTy(In->getContext()); @@ -316,22 +361,28 @@ void ConvertToScalarInfo::MergeInType(const Type *In, uint64_t Offset) { bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) { Instruction *User = cast<Instruction>(*UI); - + if (LoadInst *LI = dyn_cast<LoadInst>(User)) { // Don't break volatile loads. if (LI->isVolatile()) return false; + // Don't touch MMX operations. + if (LI->getType()->isX86_MMXTy()) + return false; MergeInType(LI->getType(), Offset); continue; } - + if (StoreInst *SI = dyn_cast<StoreInst>(User)) { // Storing the pointer, not into the value? if (SI->getOperand(0) == V || SI->isVolatile()) return false; + // Don't touch MMX operations. + if (SI->getOperand(0)->getType()->isX86_MMXTy()) + return false; MergeInType(SI->getOperand(0)->getType(), Offset); continue; } - + if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) { IsNotTrivial = true; // Can't be mem2reg'd. if (!CanConvertToScalar(BCI, Offset)) @@ -343,7 +394,7 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { // If this is a GEP with a variable indices, we can't handle it. if (!GEP->hasAllConstantIndices()) return false; - + // Compute the offset that this GEP adds to the pointer. 
SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(), @@ -372,15 +423,15 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { ConstantInt *Len = dyn_cast<ConstantInt>(MTI->getLength()); if (Len == 0 || Len->getZExtValue() != AllocaSize || Offset != 0) return false; - + IsNotTrivial = true; // Can't be mem2reg'd. continue; } - + // Otherwise, we cannot handle this! return false; } - + return true; } @@ -411,9 +462,9 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, GEP->eraseFromParent(); continue; } - - IRBuilder<> Builder(User->getParent(), User); - + + IRBuilder<> Builder(User); + if (LoadInst *LI = dyn_cast<LoadInst>(User)) { // The load is a bit extract from NewAI shifted right by Offset bits. Value *LoadedVal = Builder.CreateLoad(NewAI, "tmp"); @@ -423,7 +474,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, LI->eraseFromParent(); continue; } - + if (StoreInst *SI = dyn_cast<StoreInst>(User)) { assert(SI->getOperand(0) != Ptr && "Consistency error!"); Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in"); @@ -431,14 +482,14 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, Builder); Builder.CreateStore(New, NewAI); SI->eraseFromParent(); - + // If the load we just inserted is now dead, then the inserted store // overwrote the entire thing. if (Old->use_empty()) Old->eraseFromParent(); continue; } - + // If this is a constant sized memset of a constant value (e.g. 0) we can // transform it into a store of the expanded constant value. if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) { @@ -446,7 +497,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, unsigned NumBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue(); if (NumBytes != 0) { unsigned Val = cast<ConstantInt>(MSI->getValue())->getZExtValue(); - + // Compute the value replicated the right number of times. APInt APVal(NumBytes*8, Val); @@ -454,17 +505,17 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, if (Val) for (unsigned i = 1; i != NumBytes; ++i) APVal |= APVal << 8; - + Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in"); Value *New = ConvertScalar_InsertValue( ConstantInt::get(User->getContext(), APVal), Old, Offset, Builder); Builder.CreateStore(New, NewAI); - + // If the load we just inserted is now dead, then the memset overwrote // the entire thing. if (Old->use_empty()) - Old->eraseFromParent(); + Old->eraseFromParent(); } MSI->eraseFromParent(); continue; @@ -474,29 +525,42 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, // can handle it like a load or store of the scalar type. if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) { assert(Offset == 0 && "must be store to start of alloca"); - + // If the source and destination are both to the same alloca, then this is // a noop copy-to-self, just delete it. Otherwise, emit a load and store // as appropriate. - AllocaInst *OrigAI = cast<AllocaInst>(Ptr->getUnderlyingObject(0)); - - if (MTI->getSource()->getUnderlyingObject(0) != OrigAI) { + AllocaInst *OrigAI = cast<AllocaInst>(GetUnderlyingObject(Ptr, &TD, 0)); + + if (GetUnderlyingObject(MTI->getSource(), &TD, 0) != OrigAI) { // Dest must be OrigAI, change this to be a load from the original // pointer (bitcasted), then a store to our new alloca. 
assert(MTI->getRawDest() == Ptr && "Neither use is of pointer?"); Value *SrcPtr = MTI->getSource(); - SrcPtr = Builder.CreateBitCast(SrcPtr, NewAI->getType()); - + const PointerType* SPTy = cast<PointerType>(SrcPtr->getType()); + const PointerType* AIPTy = cast<PointerType>(NewAI->getType()); + if (SPTy->getAddressSpace() != AIPTy->getAddressSpace()) { + AIPTy = PointerType::get(AIPTy->getElementType(), + SPTy->getAddressSpace()); + } + SrcPtr = Builder.CreateBitCast(SrcPtr, AIPTy); + LoadInst *SrcVal = Builder.CreateLoad(SrcPtr, "srcval"); SrcVal->setAlignment(MTI->getAlignment()); Builder.CreateStore(SrcVal, NewAI); - } else if (MTI->getDest()->getUnderlyingObject(0) != OrigAI) { + } else if (GetUnderlyingObject(MTI->getDest(), &TD, 0) != OrigAI) { // Src must be OrigAI, change this to be a load from NewAI then a store // through the original dest pointer (bitcasted). assert(MTI->getRawSource() == Ptr && "Neither use is of pointer?"); LoadInst *SrcVal = Builder.CreateLoad(NewAI, "srcval"); - Value *DstPtr = Builder.CreateBitCast(MTI->getDest(), NewAI->getType()); + const PointerType* DPTy = cast<PointerType>(MTI->getDest()->getType()); + const PointerType* AIPTy = cast<PointerType>(NewAI->getType()); + if (DPTy->getAddressSpace() != AIPTy->getAddressSpace()) { + AIPTy = PointerType::get(AIPTy->getElementType(), + DPTy->getAddressSpace()); + } + Value *DstPtr = Builder.CreateBitCast(MTI->getDest(), AIPTy); + StoreInst *NewStore = Builder.CreateStore(SrcVal, DstPtr); NewStore->setAlignment(MTI->getAlignment()); } else { @@ -506,7 +570,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, MTI->eraseFromParent(); continue; } - + llvm_unreachable("Unsupported operation!"); } } @@ -548,7 +612,7 @@ ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType, V = Builder.CreateBitCast(V, ToType, "tmp"); return V; } - + // If ToType is a first class aggregate, extract out each of the pieces and // use insertvalue's to form the FCA. if (const StructType *ST = dyn_cast<StructType>(ToType)) { @@ -562,7 +626,7 @@ ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType, } return Res; } - + if (const ArrayType *AT = dyn_cast<ArrayType>(ToType)) { uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType()); Value *Res = UndefValue::get(AT); @@ -598,7 +662,7 @@ ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType, ConstantInt::get(FromVal->getType(), ShAmt), "tmp"); else if (ShAmt < 0 && (unsigned)-ShAmt < NTy->getBitWidth()) - FromVal = Builder.CreateShl(FromVal, + FromVal = Builder.CreateShl(FromVal, ConstantInt::get(FromVal->getType(), -ShAmt), "tmp"); @@ -606,11 +670,11 @@ ConvertScalar_ExtractValue(Value *FromVal, const Type *ToType, unsigned LIBitWidth = TD.getTypeSizeInBits(ToType); if (LIBitWidth < NTy->getBitWidth()) FromVal = - Builder.CreateTrunc(FromVal, IntegerType::get(FromVal->getContext(), + Builder.CreateTrunc(FromVal, IntegerType::get(FromVal->getContext(), LIBitWidth), "tmp"); else if (LIBitWidth > NTy->getBitWidth()) FromVal = - Builder.CreateZExt(FromVal, IntegerType::get(FromVal->getContext(), + Builder.CreateZExt(FromVal, IntegerType::get(FromVal->getContext(), LIBitWidth), "tmp"); // If the result is an integer, this is a trunc or bitcast. 
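ConvertScalar_ExtractValue, above, pulls a sub-field out of the wide integer that now backs the alloca: shift the loaded value by the field's bit offset (a left shift via the negative-ShAmt path on big-endian layouts) and then truncate to the field width. A little-endian-only sketch with plain integers, using a mask in place of the IR truncation; names are illustrative.

#include <cassert>
#include <cstdint>

// Extract `fieldBits` bits starting `bitOffset` bits into `whole`
// (little-endian layout: lower offsets live in lower-order bits).
static uint64_t extractField(uint64_t whole, unsigned bitOffset, unsigned fieldBits) {
  uint64_t shifted = whole >> bitOffset;                            // the LShr step
  uint64_t mask = (fieldBits >= 64) ? ~0ULL : ((1ULL << fieldBits) - 1);
  return shifted & mask;                                            // the Trunc step
}

int main() {
  uint64_t packed = 0x11223344AABBCCDDULL;
  assert(extractField(packed, 0, 32)  == 0xAABBCCDDu);   // low four bytes
  assert(extractField(packed, 32, 16) == 0x3344u);       // two bytes in the middle
}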
@@ -647,7 +711,7 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, if (const VectorType *VTy = dyn_cast<VectorType>(AllocaType)) { uint64_t VecSize = TD.getTypeAllocSizeInBits(VTy); uint64_t ValSize = TD.getTypeAllocSizeInBits(SV->getType()); - + // Changing the whole vector with memset or with an access of a different // vector type? if (ValSize == VecSize) @@ -657,28 +721,28 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, // Must be an element insertion. unsigned Elt = Offset/EltSize; - + if (SV->getType() != VTy->getElementType()) SV = Builder.CreateBitCast(SV, VTy->getElementType(), "tmp"); - - SV = Builder.CreateInsertElement(Old, SV, + + SV = Builder.CreateInsertElement(Old, SV, ConstantInt::get(Type::getInt32Ty(SV->getContext()), Elt), "tmp"); return SV; } - + // If SV is a first-class aggregate value, insert each value recursively. if (const StructType *ST = dyn_cast<StructType>(SV->getType())) { const StructLayout &Layout = *TD.getStructLayout(ST); for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { Value *Elt = Builder.CreateExtractValue(SV, i, "tmp"); - Old = ConvertScalar_InsertValue(Elt, Old, + Old = ConvertScalar_InsertValue(Elt, Old, Offset+Layout.getElementOffsetInBits(i), Builder); } return Old; } - + if (const ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) { uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType()); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { @@ -778,16 +842,298 @@ bool SROA::runOnFunction(Function &F) { return Changed; } +namespace { +class AllocaPromoter : public LoadAndStorePromoter { + AllocaInst *AI; +public: + AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S) + : LoadAndStorePromoter(Insts, S), AI(0) {} + + void run(AllocaInst *AI, const SmallVectorImpl<Instruction*> &Insts) { + // Remember which alloca we're promoting (for isInstInList). + this->AI = AI; + LoadAndStorePromoter::run(Insts); + AI->eraseFromParent(); + } + + virtual bool isInstInList(Instruction *I, + const SmallVectorImpl<Instruction*> &Insts) const { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + return LI->getOperand(0) == AI; + return cast<StoreInst>(I)->getPointerOperand() == AI; + } +}; +} // end anon namespace + +/// isSafeSelectToSpeculate - Select instructions that use an alloca and are +/// subsequently loaded can be rewritten to load both input pointers and then +/// select between the result, allowing the load of the alloca to be promoted. +/// From this: +/// %P2 = select i1 %cond, i32* %Alloca, i32* %Other +/// %V = load i32* %P2 +/// to: +/// %V1 = load i32* %Alloca -> will be mem2reg'd +/// %V2 = load i32* %Other +/// %V = select i1 %cond, i32 %V1, i32 %V2 +/// +/// We can do this to a select if its only uses are loads and if the operand to +/// the select can be loaded unconditionally. +static bool isSafeSelectToSpeculate(SelectInst *SI, const TargetData *TD) { + bool TDerefable = SI->getTrueValue()->isDereferenceablePointer(); + bool FDerefable = SI->getFalseValue()->isDereferenceablePointer(); + + for (Value::use_iterator UI = SI->use_begin(), UE = SI->use_end(); + UI != UE; ++UI) { + LoadInst *LI = dyn_cast<LoadInst>(*UI); + if (LI == 0 || LI->isVolatile()) return false; + + // Both operands to the select need to be dereferencable, either absolutely + // (e.g. allocas) or at this point because we can see other accesses to it. 
+ if (!TDerefable && !isSafeToLoadUnconditionally(SI->getTrueValue(), LI, + LI->getAlignment(), TD)) + return false; + if (!FDerefable && !isSafeToLoadUnconditionally(SI->getFalseValue(), LI, + LI->getAlignment(), TD)) + return false; + } + + return true; +} + +/// isSafePHIToSpeculate - PHI instructions that use an alloca and are +/// subsequently loaded can be rewritten to load both input pointers in the pred +/// blocks and then PHI the results, allowing the load of the alloca to be +/// promoted. +/// From this: +/// %P2 = phi [i32* %Alloca, i32* %Other] +/// %V = load i32* %P2 +/// to: +/// %V1 = load i32* %Alloca -> will be mem2reg'd +/// ... +/// %V2 = load i32* %Other +/// ... +/// %V = phi [i32 %V1, i32 %V2] +/// +/// We can do this to a select if its only uses are loads and if the operand to +/// the select can be loaded unconditionally. +static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) { + // For now, we can only do this promotion if the load is in the same block as + // the PHI, and if there are no stores between the phi and load. + // TODO: Allow recursive phi users. + // TODO: Allow stores. + BasicBlock *BB = PN->getParent(); + unsigned MaxAlign = 0; + for (Value::use_iterator UI = PN->use_begin(), UE = PN->use_end(); + UI != UE; ++UI) { + LoadInst *LI = dyn_cast<LoadInst>(*UI); + if (LI == 0 || LI->isVolatile()) return false; + + // For now we only allow loads in the same block as the PHI. This is a + // common case that happens when instcombine merges two loads through a PHI. + if (LI->getParent() != BB) return false; + + // Ensure that there are no instructions between the PHI and the load that + // could store. + for (BasicBlock::iterator BBI = PN; &*BBI != LI; ++BBI) + if (BBI->mayWriteToMemory()) + return false; + + MaxAlign = std::max(MaxAlign, LI->getAlignment()); + } + + // Okay, we know that we have one or more loads in the same block as the PHI. + // We can transform this if it is safe to push the loads into the predecessor + // blocks. The only thing to watch out for is that we can't put a possibly + // trapping load in the predecessor if it is a critical edge. + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + BasicBlock *Pred = PN->getIncomingBlock(i); + + // If the predecessor has a single successor, then the edge isn't critical. + if (Pred->getTerminator()->getNumSuccessors() == 1) + continue; + + Value *InVal = PN->getIncomingValue(i); + + // If the InVal is an invoke in the pred, we can't put a load on the edge. + if (InvokeInst *II = dyn_cast<InvokeInst>(InVal)) + if (II->getParent() == Pred) + return false; + + // If this pointer is always safe to load, or if we can prove that there is + // already a load in the block, then we can move the load to the pred block. + if (InVal->isDereferenceablePointer() || + isSafeToLoadUnconditionally(InVal, Pred->getTerminator(), MaxAlign, TD)) + continue; + + return false; + } + + return true; +} + + +/// tryToMakeAllocaBePromotable - This returns true if the alloca only has +/// direct (non-volatile) loads and stores to it. If the alloca is close but +/// not quite there, this will transform the code to allow promotion. As such, +/// it is a non-pure predicate. 
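isSafePHIToSpeculate, above, only accepts loads that sit in the same block as the PHI and bails out if any instruction between the PHI and the load may write to memory (the BBI walk with mayWriteToMemory()). The following toy model of that scan is purely illustrative: the Instr struct and its flag are invented stand-ins for LLVM instructions, and indices stand in for the iterator walk.

#include <cassert>
#include <vector>

struct Instr { bool mayWriteToMemory; };

// True if nothing strictly between the PHI and the load may write to memory.
static bool noWritesBetween(const std::vector<Instr> &block,
                            size_t phiIdx, size_t loadIdx) {
  for (size_t i = phiIdx + 1; i < loadIdx; ++i)
    if (block[i].mayWriteToMemory)
      return false;
  return true;
}

int main() {
  std::vector<Instr> bb = {{false}, {false}, {true}, {false}};
  assert(noWritesBetween(bb, 0, 2));    // no write before the load at index 2
  assert(!noWritesBetween(bb, 0, 3));   // the write at index 2 blocks speculation
}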
+static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { + SetVector<Instruction*, SmallVector<Instruction*, 4>, + SmallPtrSet<Instruction*, 4> > InstsToRewrite; + + for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end(); + UI != UE; ++UI) { + User *U = *UI; + if (LoadInst *LI = dyn_cast<LoadInst>(U)) { + if (LI->isVolatile()) + return false; + continue; + } + + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { + if (SI->getOperand(0) == AI || SI->isVolatile()) + return false; // Don't allow a store OF the AI, only INTO the AI. + continue; + } + + if (SelectInst *SI = dyn_cast<SelectInst>(U)) { + // If the condition being selected on is a constant, fold the select, yes + // this does (rarely) happen early on. + if (ConstantInt *CI = dyn_cast<ConstantInt>(SI->getCondition())) { + Value *Result = SI->getOperand(1+CI->isZero()); + SI->replaceAllUsesWith(Result); + SI->eraseFromParent(); + + // This is very rare and we just scrambled the use list of AI, start + // over completely. + return tryToMakeAllocaBePromotable(AI, TD); + } + + // If it is safe to turn "load (select c, AI, ptr)" into a select of two + // loads, then we can transform this by rewriting the select. + if (!isSafeSelectToSpeculate(SI, TD)) + return false; + + InstsToRewrite.insert(SI); + continue; + } + + if (PHINode *PN = dyn_cast<PHINode>(U)) { + if (PN->use_empty()) { // Dead PHIs can be stripped. + InstsToRewrite.insert(PN); + continue; + } + + // If it is safe to turn "load (phi [AI, ptr, ...])" into a PHI of loads + // in the pred blocks, then we can transform this by rewriting the PHI. + if (!isSafePHIToSpeculate(PN, TD)) + return false; + + InstsToRewrite.insert(PN); + continue; + } + + return false; + } + + // If there are no instructions to rewrite, then all uses are load/stores and + // we're done! + if (InstsToRewrite.empty()) + return true; + + // If we have instructions that need to be rewritten for this to be promotable + // take care of it now. + for (unsigned i = 0, e = InstsToRewrite.size(); i != e; ++i) { + if (SelectInst *SI = dyn_cast<SelectInst>(InstsToRewrite[i])) { + // Selects in InstsToRewrite only have load uses. Rewrite each as two + // loads with a new select. + while (!SI->use_empty()) { + LoadInst *LI = cast<LoadInst>(SI->use_back()); + + IRBuilder<> Builder(LI); + LoadInst *TrueLoad = + Builder.CreateLoad(SI->getTrueValue(), LI->getName()+".t"); + LoadInst *FalseLoad = + Builder.CreateLoad(SI->getFalseValue(), LI->getName()+".t"); + + // Transfer alignment and TBAA info if present. + TrueLoad->setAlignment(LI->getAlignment()); + FalseLoad->setAlignment(LI->getAlignment()); + if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) { + TrueLoad->setMetadata(LLVMContext::MD_tbaa, Tag); + FalseLoad->setMetadata(LLVMContext::MD_tbaa, Tag); + } + + Value *V = Builder.CreateSelect(SI->getCondition(), TrueLoad, FalseLoad); + V->takeName(LI); + LI->replaceAllUsesWith(V); + LI->eraseFromParent(); + } + + // Now that all the loads are gone, the select is gone too. + SI->eraseFromParent(); + continue; + } + + // Otherwise, we have a PHI node which allows us to push the loads into the + // predecessors. + PHINode *PN = cast<PHINode>(InstsToRewrite[i]); + if (PN->use_empty()) { + PN->eraseFromParent(); + continue; + } + + const Type *LoadTy = cast<PointerType>(PN->getType())->getElementType(); + PHINode *NewPN = PHINode::Create(LoadTy, PN->getName()+".ld", PN); + + // Get the TBAA tag and alignment to use from one of the loads. 
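The select rewrite in tryToMakeAllocaBePromotable (two speculative loads plus a select of the loaded values, with alignment and TBAA copied over) is the IR form of a simple source-level identity: loading through "cond ? p : q" equals selecting between the two loaded values, provided both pointers are safe to dereference, which is exactly what isSafeSelectToSpeculate checked. A purely illustrative C++ sketch of the before/after shapes:

#include <cassert>

// Before: one load through a selected pointer.
static int loadOfSelect(bool cond, const int *p, const int *q) {
  const int *sel = cond ? p : q;
  return *sel;
}

// After: both pointers are loaded up front, then the values are selected.
// Valid only because both p and q are known dereferenceable here.
static int selectOfLoads(bool cond, const int *p, const int *q) {
  int v1 = *p;
  int v2 = *q;
  return cond ? v1 : v2;
}

int main() {
  int a = 1, b = 2;
  assert(loadOfSelect(true,  &a, &b) == selectOfLoads(true,  &a, &b));
  assert(loadOfSelect(false, &a, &b) == selectOfLoads(false, &a, &b));
}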
It doesn't + // matter which one we get and if any differ, it doesn't matter. + LoadInst *SomeLoad = cast<LoadInst>(PN->use_back()); + MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); + unsigned Align = SomeLoad->getAlignment(); + + // Rewrite all loads of the PN to use the new PHI. + while (!PN->use_empty()) { + LoadInst *LI = cast<LoadInst>(PN->use_back()); + LI->replaceAllUsesWith(NewPN); + LI->eraseFromParent(); + } + + // Inject loads into all of the pred blocks. Keep track of which blocks we + // insert them into in case we have multiple edges from the same block. + DenseMap<BasicBlock*, LoadInst*> InsertedLoads; + + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + BasicBlock *Pred = PN->getIncomingBlock(i); + LoadInst *&Load = InsertedLoads[Pred]; + if (Load == 0) { + Load = new LoadInst(PN->getIncomingValue(i), + PN->getName() + "." + Pred->getName(), + Pred->getTerminator()); + Load->setAlignment(Align); + if (TBAATag) Load->setMetadata(LLVMContext::MD_tbaa, TBAATag); + } + + NewPN->addIncoming(Load, Pred); + } + + PN->eraseFromParent(); + } + + ++NumAdjusted; + return true; +} + bool SROA::performPromotion(Function &F) { std::vector<AllocaInst*> Allocas; - DominatorTree &DT = getAnalysis<DominatorTree>(); - DominanceFrontier &DF = getAnalysis<DominanceFrontier>(); + DominatorTree *DT = 0; + if (HasDomTree) + DT = &getAnalysis<DominatorTree>(); BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function bool Changed = false; - + SmallVector<Instruction*, 64> Insts; while (1) { Allocas.clear(); @@ -795,12 +1141,27 @@ bool SROA::performPromotion(Function &F) { // the entry node for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I) if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) // Is it an alloca? - if (isAllocaPromotable(AI)) + if (tryToMakeAllocaBePromotable(AI, TD)) Allocas.push_back(AI); if (Allocas.empty()) break; - PromoteMemToReg(Allocas, DT, DF); + if (HasDomTree) + PromoteMemToReg(Allocas, *DT); + else { + SSAUpdater SSA; + for (unsigned i = 0, e = Allocas.size(); i != e; ++i) { + AllocaInst *AI = Allocas[i]; + + // Build list of instructions to promote. + for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); + UI != E; ++UI) + Insts.push_back(cast<Instruction>(*UI)); + + AllocaPromoter(Insts, SSA).run(AI, Insts); + Insts.clear(); + } + } NumPromoted += Allocas.size(); Changed = true; } @@ -842,7 +1203,7 @@ bool SROA::performScalarRepl(Function &F) { while (!WorkList.empty()) { AllocaInst *AI = WorkList.back(); WorkList.pop_back(); - + // Handle dead allocas trivially. These can be formed by SROA'ing arrays // with unused elements. if (AI->use_empty()) { @@ -854,7 +1215,7 @@ bool SROA::performScalarRepl(Function &F) { // If this alloca is impossible for us to promote, reject it early. if (AI->isArrayAllocation() || !AI->getAllocatedType()->isSized()) continue; - + // Check to see if this allocation is only modified by a memcpy/memmove from // a constant global. If this is the case, we can change all users to use // the constant global instead. This is commonly produced by the CFE by @@ -871,7 +1232,7 @@ bool SROA::performScalarRepl(Function &F) { Changed = true; continue; } - + // Check to see if we can perform the core SROA transformation. We cannot // transform the allocation instruction if it is an array allocation // (allocations OF arrays are ok though), and an allocation of a scalar @@ -880,10 +1241,10 @@ bool SROA::performScalarRepl(Function &F) { // Do not promote [0 x %struct]. 
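When the loads are pushed into the predecessor blocks above, a DenseMap from predecessor to the already-inserted load guarantees that a block feeding the PHI along several edges gets a single load. A standalone sketch of that memoization pattern; BlockId and Value are invented placeholders for BasicBlock* and LoadInst*.

#include <cassert>
#include <unordered_map>

using BlockId = int;
using Value   = int;

// Create at most one "load" per predecessor block, reusing it for repeated edges.
static Value getOrCreateLoad(std::unordered_map<BlockId, Value> &inserted,
                             BlockId pred, int &numCreated) {
  auto it = inserted.find(pred);
  if (it != inserted.end())
    return it->second;                 // an earlier edge from this block already made one
  ++numCreated;
  return inserted[pred] = numCreated;  // stand-in for "new LoadInst(...) in pred"
}

int main() {
  std::unordered_map<BlockId, Value> inserted;
  int created = 0;
  getOrCreateLoad(inserted, 7, created);
  getOrCreateLoad(inserted, 7, created);   // same block, second edge: reused
  getOrCreateLoad(inserted, 9, created);
  assert(created == 2);                    // one load per distinct predecessor
}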
if (AllocaSize == 0) continue; - + // Do not promote any struct whose size is too big. if (AllocaSize > SRThreshold) continue; - + // If the alloca looks like a good candidate for scalar replacement, and if // all its users can be transformed, then split up the aggregate into its // separate elements. @@ -906,8 +1267,8 @@ bool SROA::performScalarRepl(Function &F) { ++NumConverted; Changed = true; continue; - } - + } + // Otherwise, couldn't process this alloca. } @@ -916,14 +1277,14 @@ bool SROA::performScalarRepl(Function &F) { /// DoScalarReplacement - This alloca satisfied the isSafeAllocaToScalarRepl /// predicate, do SROA now. -void SROA::DoScalarReplacement(AllocaInst *AI, +void SROA::DoScalarReplacement(AllocaInst *AI, std::vector<AllocaInst*> &WorkList) { DEBUG(dbgs() << "Found inst to SROA: " << *AI << '\n'); SmallVector<AllocaInst*, 32> ElementAllocas; if (const StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) { ElementAllocas.reserve(ST->getNumContainedTypes()); for (unsigned i = 0, e = ST->getNumContainedTypes(); i != e; ++i) { - AllocaInst *NA = new AllocaInst(ST->getContainedType(i), 0, + AllocaInst *NA = new AllocaInst(ST->getContainedType(i), 0, AI->getAlignment(), AI->getName() + "." + Twine(i), AI); ElementAllocas.push_back(NA); @@ -971,48 +1332,106 @@ void SROA::DeleteDeadInstructions() { I->eraseFromParent(); } } - + /// isSafeForScalarRepl - Check if instruction I is a safe use with regard to /// performing scalar replacement of alloca AI. The results are flagged in /// the Info parameter. Offset indicates the position within AI that is /// referenced by this instruction. -void SROA::isSafeForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, +void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, AllocaInfo &Info) { for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E; ++UI) { Instruction *User = cast<Instruction>(*UI); if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) { - isSafeForScalarRepl(BC, AI, Offset, Info); + isSafeForScalarRepl(BC, Offset, Info); } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) { uint64_t GEPOffset = Offset; - isSafeGEP(GEPI, AI, GEPOffset, Info); + isSafeGEP(GEPI, GEPOffset, Info); if (!Info.isUnsafe) - isSafeForScalarRepl(GEPI, AI, GEPOffset, Info); + isSafeForScalarRepl(GEPI, GEPOffset, Info); } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) { ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength()); - if (Length) - isSafeMemAccess(AI, Offset, Length->getZExtValue(), 0, - UI.getOperandNo() == 0, Info); - else - MarkUnsafe(Info); + if (Length == 0) + return MarkUnsafe(Info, User); + isSafeMemAccess(Offset, Length->getZExtValue(), 0, + UI.getOperandNo() == 0, Info, MI, + true /*AllowWholeAccess*/); + } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) { + if (LI->isVolatile()) + return MarkUnsafe(Info, User); + const Type *LIType = LI->getType(); + isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType), + LIType, false, Info, LI, true /*AllowWholeAccess*/); + Info.hasALoadOrStore = true; + + } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) { + // Store is ok if storing INTO the pointer, not storing the pointer + if (SI->isVolatile() || SI->getOperand(0) == I) + return MarkUnsafe(Info, User); + + const Type *SIType = SI->getOperand(0)->getType(); + isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType), + SIType, true, Info, SI, true /*AllowWholeAccess*/); + Info.hasALoadOrStore = true; + } else if (isa<PHINode>(User) || isa<SelectInst>(User)) { + 
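DoScalarReplacement, above, replaces one aggregate alloca with one alloca per struct field (or array element) and requeues the pieces. At the source level the effect is the familiar transformation sketched below: once the aggregate is split, each piece can be promoted or deleted independently. This is only an illustration of the intent, not the pass's code.

#include <cassert>

struct Pair { int a; float b; };

// Before SROA: one aggregate local; every field access goes through it.
static int beforeSROA(int x) {
  Pair p;
  p.a = x + 1;
  p.b = 2.0f;
  return p.a;      // p.b is written but never read; the struct still occupies memory
}

// After SROA: the aggregate is gone, each field is its own scalar,
// and the dead field can simply be removed by later passes.
static int afterSROA(int x) {
  int p_a = x + 1;
  return p_a;
}

int main() { assert(beforeSROA(3) == afterSROA(3)); }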
isSafePHISelectUseForScalarRepl(User, Offset, Info); + } else { + return MarkUnsafe(Info, User); + } + if (Info.isUnsafe) return; + } +} + + +/// isSafePHIUseForScalarRepl - If we see a PHI node or select using a pointer +/// derived from the alloca, we can often still split the alloca into elements. +/// This is useful if we have a large alloca where one element is phi'd +/// together somewhere: we can SRoA and promote all the other elements even if +/// we end up not being able to promote this one. +/// +/// All we require is that the uses of the PHI do not index into other parts of +/// the alloca. The most important use case for this is single load and stores +/// that are PHI'd together, which can happen due to code sinking. +void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset, + AllocaInfo &Info) { + // If we've already checked this PHI, don't do it again. + if (PHINode *PN = dyn_cast<PHINode>(I)) + if (!Info.CheckedPHIs.insert(PN)) + return; + + for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E; ++UI) { + Instruction *User = cast<Instruction>(*UI); + + if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) { + isSafePHISelectUseForScalarRepl(BC, Offset, Info); + } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) { + // Only allow "bitcast" GEPs for simplicity. We could generalize this, + // but would have to prove that we're staying inside of an element being + // promoted. + if (!GEPI->hasAllZeroIndices()) + return MarkUnsafe(Info, User); + isSafePHISelectUseForScalarRepl(GEPI, Offset, Info); } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) { - if (!LI->isVolatile()) { - const Type *LIType = LI->getType(); - isSafeMemAccess(AI, Offset, TD->getTypeAllocSize(LIType), - LIType, false, Info); - } else - MarkUnsafe(Info); + if (LI->isVolatile()) + return MarkUnsafe(Info, User); + const Type *LIType = LI->getType(); + isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType), + LIType, false, Info, LI, false /*AllowWholeAccess*/); + Info.hasALoadOrStore = true; + } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) { // Store is ok if storing INTO the pointer, not storing the pointer - if (!SI->isVolatile() && SI->getOperand(0) != I) { - const Type *SIType = SI->getOperand(0)->getType(); - isSafeMemAccess(AI, Offset, TD->getTypeAllocSize(SIType), - SIType, true, Info); - } else - MarkUnsafe(Info); + if (SI->isVolatile() || SI->getOperand(0) == I) + return MarkUnsafe(Info, User); + + const Type *SIType = SI->getOperand(0)->getType(); + isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType), + SIType, true, Info, SI, false /*AllowWholeAccess*/); + Info.hasALoadOrStore = true; + } else if (isa<PHINode>(User) || isa<SelectInst>(User)) { + isSafePHISelectUseForScalarRepl(User, Offset, Info); } else { - DEBUG(errs() << " Transformation preventing inst: " << *User << '\n'); - MarkUnsafe(Info); + return MarkUnsafe(Info, User); } if (Info.isUnsafe) return; } @@ -1023,7 +1442,7 @@ void SROA::isSafeForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, /// references, and when the resulting offset corresponds to an element within /// the alloca type. The results are flagged in the Info parameter. Upon /// return, Offset is adjusted as specified by the GEP indices. 
-void SROA::isSafeGEP(GetElementPtrInst *GEPI, AllocaInst *AI, +void SROA::isSafeGEP(GetElementPtrInst *GEPI, uint64_t &Offset, AllocaInfo &Info) { gep_type_iterator GEPIt = gep_type_begin(GEPI), E = gep_type_end(GEPI); if (GEPIt == E) @@ -1038,7 +1457,7 @@ void SROA::isSafeGEP(GetElementPtrInst *GEPI, AllocaInst *AI, ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPIt.getOperand()); if (!IdxVal) - return MarkUnsafe(Info); + return MarkUnsafe(Info, GEPI); } // Compute the offset due to this GEP and check if the alloca has a @@ -1046,40 +1465,92 @@ void SROA::isSafeGEP(GetElementPtrInst *GEPI, AllocaInst *AI, SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end()); Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), &Indices[0], Indices.size()); - if (!TypeHasComponent(AI->getAllocatedType(), Offset, 0)) - MarkUnsafe(Info); + if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, 0)) + MarkUnsafe(Info, GEPI); +} + +/// isHomogeneousAggregate - Check if type T is a struct or array containing +/// elements of the same type (which is always true for arrays). If so, +/// return true with NumElts and EltTy set to the number of elements and the +/// element type, respectively. +static bool isHomogeneousAggregate(const Type *T, unsigned &NumElts, + const Type *&EltTy) { + if (const ArrayType *AT = dyn_cast<ArrayType>(T)) { + NumElts = AT->getNumElements(); + EltTy = (NumElts == 0 ? 0 : AT->getElementType()); + return true; + } + if (const StructType *ST = dyn_cast<StructType>(T)) { + NumElts = ST->getNumContainedTypes(); + EltTy = (NumElts == 0 ? 0 : ST->getContainedType(0)); + for (unsigned n = 1; n < NumElts; ++n) { + if (ST->getContainedType(n) != EltTy) + return false; + } + return true; + } + return false; +} + +/// isCompatibleAggregate - Check if T1 and T2 are either the same type or are +/// "homogeneous" aggregates with the same element type and number of elements. +static bool isCompatibleAggregate(const Type *T1, const Type *T2) { + if (T1 == T2) + return true; + + unsigned NumElts1, NumElts2; + const Type *EltTy1, *EltTy2; + if (isHomogeneousAggregate(T1, NumElts1, EltTy1) && + isHomogeneousAggregate(T2, NumElts2, EltTy2) && + NumElts1 == NumElts2 && + EltTy1 == EltTy2) + return true; + + return false; } /// isSafeMemAccess - Check if a load/store/memcpy operates on the entire AI /// alloca or has an offset and size that corresponds to a component element /// within it. The offset checked here may have been formed from a GEP with a /// pointer bitcasted to a different type. -void SROA::isSafeMemAccess(AllocaInst *AI, uint64_t Offset, uint64_t MemSize, +/// +/// If AllowWholeAccess is true, then this allows uses of the entire alloca as a +/// unit. If false, it only allows accesses known to be in a single element. +void SROA::isSafeMemAccess(uint64_t Offset, uint64_t MemSize, const Type *MemOpType, bool isStore, - AllocaInfo &Info) { + AllocaInfo &Info, Instruction *TheAccess, + bool AllowWholeAccess) { // Check if this is a load/store of the entire alloca. - if (Offset == 0 && MemSize == TD->getTypeAllocSize(AI->getAllocatedType())) { - bool UsesAggregateType = (MemOpType == AI->getAllocatedType()); - // This is safe for MemIntrinsics (where MemOpType is 0), integer types - // (which are essentially the same as the MemIntrinsics, especially with - // regard to copying padding between elements), or references using the - // aggregate type of the alloca. 
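isHomogeneousAggregate, above, treats an array, or a struct whose fields all share one type, as "N elements of type T"; isCompatibleAggregate then lets a load or store of any such compatible aggregate be rewritten element by element. A sketch of the same check over a toy type description; ToyType and its members are invented stand-ins for llvm::Type.

#include <cassert>
#include <string>
#include <vector>

struct ToyType {
  bool isArray;                         // arrays are homogeneous by construction
  std::vector<std::string> fieldTypes;  // element/field type names
};

static bool isHomogeneous(const ToyType &t, unsigned &numElts, std::string &eltTy) {
  numElts = t.fieldTypes.size();
  eltTy   = numElts ? t.fieldTypes[0] : std::string();
  if (t.isArray) return true;
  for (unsigned i = 1; i < numElts; ++i)
    if (t.fieldTypes[i] != eltTy) return false;   // a mixed struct is not homogeneous
  return true;
}

int main() {
  unsigned n; std::string e;
  assert(isHomogeneous({false, {"i32", "i32", "i32"}}, n, e) && n == 3 && e == "i32");
  assert(!isHomogeneous({false, {"i32", "float"}}, n, e));
}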
- if (!MemOpType || MemOpType->isIntegerTy() || UsesAggregateType) { - if (!UsesAggregateType) { - if (isStore) - Info.isMemCpyDst = true; - else - Info.isMemCpySrc = true; - } + if (Offset == 0 && AllowWholeAccess && + MemSize == TD->getTypeAllocSize(Info.AI->getAllocatedType())) { + // This can be safe for MemIntrinsics (where MemOpType is 0) and integer + // loads/stores (which are essentially the same as the MemIntrinsics with + // regard to copying padding between elements). But, if an alloca is + // flagged as both a source and destination of such operations, we'll need + // to check later for padding between elements. + if (!MemOpType || MemOpType->isIntegerTy()) { + if (isStore) + Info.isMemCpyDst = true; + else + Info.isMemCpySrc = true; + return; + } + // This is also safe for references using a type that is compatible with + // the type of the alloca, so that loads/stores can be rewritten using + // insertvalue/extractvalue. + if (isCompatibleAggregate(MemOpType, Info.AI->getAllocatedType())) { + Info.hasSubelementAccess = true; return; } } // Check if the offset/size correspond to a component within the alloca type. - const Type *T = AI->getAllocatedType(); - if (TypeHasComponent(T, Offset, MemSize)) + const Type *T = Info.AI->getAllocatedType(); + if (TypeHasComponent(T, Offset, MemSize)) { + Info.hasSubelementAccess = true; return; + } - return MarkUnsafe(Info); + return MarkUnsafe(Info, TheAccess); } /// TypeHasComponent - Return true if T has a component type with the @@ -1116,14 +1587,21 @@ bool SROA::TypeHasComponent(const Type *T, uint64_t Offset, uint64_t Size) { /// instruction. void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, SmallVector<AllocaInst*, 32> &NewElts) { - for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E; ++UI) { - Instruction *User = cast<Instruction>(*UI); + for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E;) { + Use &TheUse = UI.getUse(); + Instruction *User = cast<Instruction>(*UI++); if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) { RewriteBitCast(BC, AI, Offset, NewElts); - } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) { + continue; + } + + if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) { RewriteGEP(GEPI, AI, Offset, NewElts); - } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) { + continue; + } + + if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) { ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength()); uint64_t MemSize = Length->getZExtValue(); if (Offset == 0 && @@ -1131,9 +1609,13 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, RewriteMemIntrinUserOfAlloca(MI, I, AI, NewElts); // Otherwise the intrinsic can only touch a single element and the // address operand will be updated, so nothing else needs to be done. - } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) { + continue; + } + + if (LoadInst *LI = dyn_cast<LoadInst>(User)) { const Type *LIType = LI->getType(); - if (LIType == AI->getAllocatedType()) { + + if (isCompatibleAggregate(LIType, AI->getAllocatedType())) { // Replace: // %res = load { i32, i32 }* %alloc // with: @@ -1155,10 +1637,13 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, // If this is a load of the entire alloca to an integer, rewrite it. 
RewriteLoadUserOfWholeAlloca(LI, AI, NewElts); } - } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) { + continue; + } + + if (StoreInst *SI = dyn_cast<StoreInst>(User)) { Value *Val = SI->getOperand(0); const Type *SIType = Val->getType(); - if (SIType == AI->getAllocatedType()) { + if (isCompatibleAggregate(SIType, AI->getAllocatedType())) { // Replace: // store { i32, i32 } %val, { i32, i32 }* %alloc // with: @@ -1178,6 +1663,26 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, // If this is a store of the entire alloca from an integer, rewrite it. RewriteStoreUserOfWholeAlloca(SI, AI, NewElts); } + continue; + } + + if (isa<SelectInst>(User) || isa<PHINode>(User)) { + // If we have a PHI user of the alloca itself (as opposed to a GEP or + // bitcast) we have to rewrite it. GEP and bitcast uses will be RAUW'd to + // the new pointer. + if (!isa<AllocaInst>(I)) continue; + + assert(Offset == 0 && NewElts[0] && + "Direct alloca use should have a zero offset"); + + // If we have a use of the alloca, we know the derived uses will be + // utilizing just the first element of the scalarized result. Insert a + // bitcast of the first alloca before the user as required. + AllocaInst *NewAI = NewElts[0]; + BitCastInst *BCI = new BitCastInst(NewAI, AI->getType(), "", NewAI); + NewAI->moveBefore(BCI); + TheUse = BCI; + continue; } } } @@ -1305,7 +1810,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, // function is only called for mem intrinsics that access the whole // aggregate, so non-zero GEPs are not an issue here.) OtherPtr = OtherPtr->stripPointerCasts(); - + // Copying the alloca to itself is a no-op: just delete it. if (OtherPtr == AI || OtherPtr == NewElts[0]) { // This code will run twice for a no-op memcpy -- once for each operand. @@ -1316,28 +1821,26 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, DeadInsts.push_back(MI); return; } - + // If the pointer is not the right type, insert a bitcast to the right // type. const Type *NewTy = PointerType::get(AI->getType()->getElementType(), AddrSpace); - + if (OtherPtr->getType() != NewTy) OtherPtr = new BitCastInst(OtherPtr, NewTy, OtherPtr->getName(), MI); } - + // Process each element of the aggregate. - Value *TheFn = MI->getCalledValue(); - const Type *BytePtrTy = MI->getRawDest()->getType(); bool SROADest = MI->getRawDest() == Inst; - + Constant *Zero = Constant::getNullValue(Type::getInt32Ty(MI->getContext())); for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { // If this is a memcpy/memmove, emit a GEP of the other element address. Value *OtherElt = 0; unsigned OtherEltAlign = MemAlignment; - + if (OtherPtr) { Value *Idx[2] = { Zero, ConstantInt::get(Type::getInt32Ty(MI->getContext()), i) }; @@ -1353,7 +1856,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, const Type *EltTy = cast<SequentialType>(OtherTy)->getElementType(); EltOffset = TD->getTypeAllocSize(EltTy)*i; } - + // The alignment of the other pointer is the guaranteed alignment of the // element, which is affected by both the known alignment of the whole // mem intrinsic and the alignment of the element. If the alignment of @@ -1361,10 +1864,10 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, // known alignment is just 4 bytes. 
OtherEltAlign = (unsigned)MinAlign(OtherEltAlign, EltOffset); } - + Value *EltPtr = NewElts[i]; const Type *EltTy = cast<PointerType>(EltPtr->getType())->getElementType(); - + // If we got down to a scalar, insert a load or store as appropriate. if (EltTy->isSingleValueType()) { if (isa<MemTransferInst>(MI)) { @@ -1380,7 +1883,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, continue; } assert(isa<MemSetInst>(MI)); - + // If the stored element is zero (common case), just store a null // constant. Constant *StoreVal; @@ -1400,7 +1903,7 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, TotalVal = TotalVal.shl(8); TotalVal |= OneVal; } - + // Convert the integer value to the appropriate type. StoreVal = ConstantInt::get(CI->getContext(), TotalVal); if (ValTy->isPointerTy()) @@ -1408,12 +1911,12 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, else if (ValTy->isFloatingPointTy()) StoreVal = ConstantExpr::getBitCast(StoreVal, ValTy); assert(StoreVal->getType() == ValTy && "Type mismatch!"); - + // If the requested value was a vector constant, create it. if (EltTy != ValTy) { unsigned NumElts = cast<VectorType>(ValTy)->getNumElements(); SmallVector<Constant*, 16> Elts(NumElts, StoreVal); - StoreVal = ConstantVector::get(&Elts[0], NumElts); + StoreVal = ConstantVector::get(Elts); } } new StoreInst(StoreVal, EltPtr, MI); @@ -1422,55 +1925,24 @@ void SROA::RewriteMemIntrinUserOfAlloca(MemIntrinsic *MI, Instruction *Inst, // Otherwise, if we're storing a byte variable, use a memset call for // this element. } - - // Cast the element pointer to BytePtrTy. - if (EltPtr->getType() != BytePtrTy) - EltPtr = new BitCastInst(EltPtr, BytePtrTy, EltPtr->getName(), MI); - - // Cast the other pointer (if we have one) to BytePtrTy. - if (OtherElt && OtherElt->getType() != BytePtrTy) { - // Preserve address space of OtherElt - const PointerType* OtherPTy = cast<PointerType>(OtherElt->getType()); - const PointerType* PTy = cast<PointerType>(BytePtrTy); - if (OtherPTy->getElementType() != PTy->getElementType()) { - Type *NewOtherPTy = PointerType::get(PTy->getElementType(), - OtherPTy->getAddressSpace()); - OtherElt = new BitCastInst(OtherElt, NewOtherPTy, - OtherElt->getNameStr(), MI); - } - } - + unsigned EltSize = TD->getTypeAllocSize(EltTy); - + + IRBuilder<> Builder(MI); + // Finally, insert the meminst for this element. - if (isa<MemTransferInst>(MI)) { - Value *Ops[] = { - SROADest ? EltPtr : OtherElt, // Dest ptr - SROADest ? OtherElt : EltPtr, // Src ptr - ConstantInt::get(MI->getArgOperand(2)->getType(), EltSize), // Size - // Align - ConstantInt::get(Type::getInt32Ty(MI->getContext()), OtherEltAlign), - MI->getVolatileCst() - }; - // In case we fold the address space overloaded memcpy of A to B - // with memcpy of B to C, change the function to be a memcpy of A to C. 
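The per-element alignment above is clamped with MinAlign(OtherEltAlign, EltOffset): a pointer that sits EltOffset bytes past memory with a known alignment only keeps the largest power of two dividing both. The sketch below restates that with the usual lowest-set-bit trick; it is my own restatement for illustration, not a copy of the LLVM helper.

#include <cassert>
#include <cstdint>

// Guaranteed alignment of (aligned base + byte offset): the largest power of two
// that divides both the base alignment and the offset.
static uint64_t minAlign(uint64_t align, uint64_t offset) {
  uint64_t x = align | offset;
  return x & (~x + 1);       // isolate the lowest set bit
}

int main() {
  assert(minAlign(16, 0)  == 16);  // offset 0 keeps the full alignment
  assert(minAlign(16, 8)  == 8);   // an element 8 bytes in only keeps 8-byte alignment
  assert(minAlign(16, 12) == 4);
}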
- const Type *Tys[] = { Ops[0]->getType(), Ops[1]->getType(), - Ops[2]->getType() }; - Module *M = MI->getParent()->getParent()->getParent(); - TheFn = Intrinsic::getDeclaration(M, MI->getIntrinsicID(), Tys, 3); - CallInst::Create(TheFn, Ops, Ops + 5, "", MI); + if (isa<MemSetInst>(MI)) { + Builder.CreateMemSet(EltPtr, MI->getArgOperand(1), EltSize, + MI->isVolatile()); } else { - assert(isa<MemSetInst>(MI)); - Value *Ops[] = { - EltPtr, MI->getArgOperand(1), // Dest, Value, - ConstantInt::get(MI->getArgOperand(2)->getType(), EltSize), // Size - Zero, // Align - ConstantInt::get(Type::getInt1Ty(MI->getContext()), 0) // isVolatile - }; - const Type *Tys[] = { Ops[0]->getType(), Ops[2]->getType() }; - Module *M = MI->getParent()->getParent()->getParent(); - TheFn = Intrinsic::getDeclaration(M, Intrinsic::memset, Tys, 2); - CallInst::Create(TheFn, Ops, Ops + 5, "", MI); + assert(isa<MemTransferInst>(MI)); + Value *Dst = SROADest ? EltPtr : OtherElt; // Dest ptr + Value *Src = SROADest ? OtherElt : EltPtr; // Src ptr + + if (isa<MemCpyInst>(MI)) + Builder.CreateMemCpy(Dst, Src, EltSize, OtherEltAlign,MI->isVolatile()); + else + Builder.CreateMemMove(Dst, Src, EltSize,OtherEltAlign,MI->isVolatile()); } } DeadInsts.push_back(MI); @@ -1486,12 +1958,13 @@ void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, Value *SrcVal = SI->getOperand(0); const Type *AllocaEltTy = AI->getAllocatedType(); uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy); + + IRBuilder<> Builder(SI); // Handle tail padding by extending the operand if (TD->getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits) - SrcVal = new ZExtInst(SrcVal, - IntegerType::get(SI->getContext(), AllocaSizeBits), - "", SI); + SrcVal = Builder.CreateZExt(SrcVal, + IntegerType::get(SI->getContext(), AllocaSizeBits)); DEBUG(dbgs() << "PROMOTING STORE TO WHOLE ALLOCA: " << *AI << '\n' << *SI << '\n'); @@ -1500,47 +1973,44 @@ void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, // have different ways to compute the element offset. if (const StructType *EltSTy = dyn_cast<StructType>(AllocaEltTy)) { const StructLayout *Layout = TD->getStructLayout(EltSTy); - + for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { // Get the number of bits to shift SrcVal to get the value. const Type *FieldTy = EltSTy->getElementType(i); uint64_t Shift = Layout->getElementOffsetInBits(i); - + if (TD->isBigEndian()) Shift = AllocaSizeBits-Shift-TD->getTypeAllocSizeInBits(FieldTy); - + Value *EltVal = SrcVal; if (Shift) { Value *ShiftVal = ConstantInt::get(EltVal->getType(), Shift); - EltVal = BinaryOperator::CreateLShr(EltVal, ShiftVal, - "sroa.store.elt", SI); + EltVal = Builder.CreateLShr(EltVal, ShiftVal, "sroa.store.elt"); } - + // Truncate down to an integer of the right size. uint64_t FieldSizeBits = TD->getTypeSizeInBits(FieldTy); - + // Ignore zero sized fields like {}, they obviously contain no data. if (FieldSizeBits == 0) continue; - + if (FieldSizeBits != AllocaSizeBits) - EltVal = new TruncInst(EltVal, - IntegerType::get(SI->getContext(), FieldSizeBits), - "", SI); + EltVal = Builder.CreateTrunc(EltVal, + IntegerType::get(SI->getContext(), FieldSizeBits)); Value *DestField = NewElts[i]; if (EltVal->getType() == FieldTy) { // Storing to an integer field of this size, just do it. } else if (FieldTy->isFloatingPointTy() || FieldTy->isVectorTy()) { // Bitcast to the right element type (for fp/vector values). 
- EltVal = new BitCastInst(EltVal, FieldTy, "", SI); + EltVal = Builder.CreateBitCast(EltVal, FieldTy); } else { // Otherwise, bitcast the dest pointer (for aggregates). - DestField = new BitCastInst(DestField, - PointerType::getUnqual(EltVal->getType()), - "", SI); + DestField = Builder.CreateBitCast(DestField, + PointerType::getUnqual(EltVal->getType())); } new StoreInst(EltVal, DestField, SI); } - + } else { const ArrayType *ATy = cast<ArrayType>(AllocaEltTy); const Type *ArrayEltTy = ATy->getElementType(); @@ -1548,50 +2018,48 @@ void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, uint64_t ElementSizeBits = TD->getTypeSizeInBits(ArrayEltTy); uint64_t Shift; - + if (TD->isBigEndian()) Shift = AllocaSizeBits-ElementOffset; - else + else Shift = 0; - + for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { // Ignore zero sized fields like {}, they obviously contain no data. if (ElementSizeBits == 0) continue; - + Value *EltVal = SrcVal; if (Shift) { Value *ShiftVal = ConstantInt::get(EltVal->getType(), Shift); - EltVal = BinaryOperator::CreateLShr(EltVal, ShiftVal, - "sroa.store.elt", SI); + EltVal = Builder.CreateLShr(EltVal, ShiftVal, "sroa.store.elt"); } - + // Truncate down to an integer of the right size. if (ElementSizeBits != AllocaSizeBits) - EltVal = new TruncInst(EltVal, - IntegerType::get(SI->getContext(), - ElementSizeBits),"",SI); + EltVal = Builder.CreateTrunc(EltVal, + IntegerType::get(SI->getContext(), + ElementSizeBits)); Value *DestField = NewElts[i]; if (EltVal->getType() == ArrayEltTy) { // Storing to an integer field of this size, just do it. } else if (ArrayEltTy->isFloatingPointTy() || ArrayEltTy->isVectorTy()) { // Bitcast to the right element type (for fp/vector values). - EltVal = new BitCastInst(EltVal, ArrayEltTy, "", SI); + EltVal = Builder.CreateBitCast(EltVal, ArrayEltTy); } else { // Otherwise, bitcast the dest pointer (for aggregates). - DestField = new BitCastInst(DestField, - PointerType::getUnqual(EltVal->getType()), - "", SI); + DestField = Builder.CreateBitCast(DestField, + PointerType::getUnqual(EltVal->getType())); } new StoreInst(EltVal, DestField, SI); - + if (TD->isBigEndian()) Shift -= ElementOffset; - else + else Shift += ElementOffset; } } - + DeadInsts.push_back(SI); } @@ -1603,10 +2071,10 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, // and form the result value. const Type *AllocaEltTy = AI->getAllocatedType(); uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy); - + DEBUG(dbgs() << "PROMOTING LOAD OF WHOLE ALLOCA: " << *AI << '\n' << *LI << '\n'); - + // There are two forms here: AI could be an array or struct. Both cases // have different ways to compute the element offset. const StructLayout *Layout = 0; @@ -1616,11 +2084,11 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, } else { const Type *ArrayEltTy = cast<ArrayType>(AllocaEltTy)->getElementType(); ArrayEltBitOffset = TD->getTypeAllocSizeInBits(ArrayEltTy); - } - - Value *ResultVal = + } + + Value *ResultVal = Constant::getNullValue(IntegerType::get(LI->getContext(), AllocaSizeBits)); - + for (unsigned i = 0, e = NewElts.size(); i != e; ++i) { // Load the value from the alloca. If the NewElt is an aggregate, cast // the pointer to an integer of the same size before doing the load. 
@@ -1628,11 +2096,11 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, const Type *FieldTy = cast<PointerType>(SrcField->getType())->getElementType(); uint64_t FieldSizeBits = TD->getTypeSizeInBits(FieldTy); - + // Ignore zero sized fields like {}, they obviously contain no data. if (FieldSizeBits == 0) continue; - - const IntegerType *FieldIntTy = IntegerType::get(LI->getContext(), + + const IntegerType *FieldIntTy = IntegerType::get(LI->getContext(), FieldSizeBits); if (!FieldTy->isIntegerTy() && !FieldTy->isFloatingPointTy() && !FieldTy->isVectorTy()) @@ -1650,17 +2118,17 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, // we can shift and insert it. if (SrcField->getType() != ResultVal->getType()) SrcField = new ZExtInst(SrcField, ResultVal->getType(), "", LI); - + // Determine the number of bits to shift SrcField. uint64_t Shift; if (Layout) // Struct case. Shift = Layout->getElementOffsetInBits(i); else // Array case. Shift = i*ArrayEltBitOffset; - + if (TD->isBigEndian()) Shift = AllocaSizeBits-Shift-FieldIntTy->getBitWidth(); - + if (Shift) { Value *ShiftVal = ConstantInt::get(SrcField->getType(), Shift); SrcField = BinaryOperator::CreateShl(SrcField, ShiftVal, "", LI); @@ -1683,46 +2151,39 @@ void SROA::RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, } /// HasPadding - Return true if the specified type has any structure or -/// alignment padding, false otherwise. +/// alignment padding in between the elements that would be split apart +/// by SROA; return false otherwise. static bool HasPadding(const Type *Ty, const TargetData &TD) { - if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) - return HasPadding(ATy->getElementType(), TD); - - if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) - return HasPadding(VTy->getElementType(), TD); - - if (const StructType *STy = dyn_cast<StructType>(Ty)) { - const StructLayout *SL = TD.getStructLayout(STy); - unsigned PrevFieldBitOffset = 0; - for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { - unsigned FieldBitOffset = SL->getElementOffsetInBits(i); - - // Padding in sub-elements? - if (HasPadding(STy->getElementType(i), TD)) - return true; + if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { + Ty = ATy->getElementType(); + return TD.getTypeSizeInBits(Ty) != TD.getTypeAllocSizeInBits(Ty); + } - // Check to see if there is any padding between this element and the - // previous one. - if (i) { - unsigned PrevFieldEnd = + // SROA currently handles only Arrays and Structs. + const StructType *STy = cast<StructType>(Ty); + const StructLayout *SL = TD.getStructLayout(STy); + unsigned PrevFieldBitOffset = 0; + for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { + unsigned FieldBitOffset = SL->getElementOffsetInBits(i); + + // Check to see if there is any padding between this element and the + // previous one. + if (i) { + unsigned PrevFieldEnd = PrevFieldBitOffset+TD.getTypeSizeInBits(STy->getElementType(i-1)); - if (PrevFieldEnd < FieldBitOffset) - return true; - } - - PrevFieldBitOffset = FieldBitOffset; - } - - // Check for tail padding. - if (unsigned EltCount = STy->getNumElements()) { - unsigned PrevFieldEnd = PrevFieldBitOffset + - TD.getTypeSizeInBits(STy->getElementType(EltCount-1)); - if (PrevFieldEnd < SL->getSizeInBits()) + if (PrevFieldEnd < FieldBitOffset) return true; } + PrevFieldBitOffset = FieldBitOffset; } - - return TD.getTypeSizeInBits(Ty) != TD.getTypeAllocSizeInBits(Ty); + // Check for tail padding. 
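RewriteLoadUserOfWholeAlloca, above, rebuilds the wide value from the split elements: each element load is zero-extended, shifted to its bit position (the shift amount is mirrored as AllocaSizeBits-Shift-Width on big-endian targets), and combined into the accumulator that started as a null wide integer. A little-endian sketch with 64-bit integers; the types and names here are illustrative, and the pass itself works on arbitrary widths.

#include <cassert>
#include <cstdint>
#include <vector>

struct Field { uint64_t value; unsigned bitOffset; unsigned bitWidth; };

// Little-endian layout. For big-endian the shift would instead be
//   shift = totalBits - bitOffset - bitWidth;
static uint64_t assembleWhole(const std::vector<Field> &fields) {
  uint64_t result = 0;
  for (const Field &f : fields) {
    uint64_t mask = (f.bitWidth >= 64) ? ~0ULL : ((1ULL << f.bitWidth) - 1);
    result |= (f.value & mask) << f.bitOffset;   // ZExt, Shl, then combine
  }
  return result;
}

int main() {
  // { i16, i16, i32 } packed into one i64.
  assert(assembleWhole({{0xCCDD, 0, 16}, {0xAABB, 16, 16}, {0x11223344, 32, 32}})
         == 0x11223344AABBCCDDULL);
}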
+ if (unsigned EltCount = STy->getNumElements()) { + unsigned PrevFieldEnd = PrevFieldBitOffset + + TD.getTypeSizeInBits(STy->getElementType(EltCount-1)); + if (PrevFieldEnd < SL->getSizeInBits()) + return true; + } + return false; } /// isSafeStructAllocaToScalarRepl - Check to see if the specified allocation of @@ -1731,14 +2192,14 @@ static bool HasPadding(const Type *Ty, const TargetData &TD) { bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) { // Loop over the use list of the alloca. We can only transform it if all of // the users are safe to transform. - AllocaInfo Info; - - isSafeForScalarRepl(AI, AI, 0, Info); + AllocaInfo Info(AI); + + isSafeForScalarRepl(AI, 0, Info); if (Info.isUnsafe) { DEBUG(dbgs() << "Cannot transform: " << *AI << '\n'); return false; } - + // Okay, we know all the users are promotable. If the aggregate is a memcpy // source and destination, we have to be careful. In particular, the memcpy // could be moving around elements that live in structure padding of the LLVM @@ -1748,6 +2209,20 @@ bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) { HasPadding(AI->getAllocatedType(), *TD)) return false; + // If the alloca never has an access to just *part* of it, but is accessed + // via loads and stores, then we should use ConvertToScalarInfo to promote + // the alloca instead of promoting each piece at a time and inserting fission + // and fusion code. + if (!Info.hasSubelementAccess && Info.hasALoadOrStore) { + // If the struct/array just has one element, use basic SRoA. + if (const StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) { + if (ST->getNumElements() > 1) return false; + } else { + if (cast<ArrayType>(AI->getAllocatedType())->getNumElements() > 1) + return false; + } + } + return true; } @@ -1760,7 +2235,7 @@ static bool PointsToConstantGlobal(Value *V) { if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) return GV->isConstant(); if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) - if (CE->getOpcode() == Instruction::BitCast || + if (CE->getOpcode() == Instruction::BitCast || CE->getOpcode() == Instruction::GetElementPtr) return PointsToConstantGlobal(CE->getOperand(0)); return false; @@ -1771,18 +2246,19 @@ static bool PointsToConstantGlobal(Value *V) { /// see any stores or other unknown uses. If we see pointer arithmetic, keep /// track of whether it moves the pointer (with isOffset) but otherwise traverse /// the uses. If we see a memcpy/memmove that targets an unoffseted pointer to -/// the alloca, and if the source pointer is a pointer to a constant global, we +/// the alloca, and if the source pointer is a pointer to a constant global, we /// can optimize this. static bool isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy, bool isOffset) { for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) { User *U = cast<Instruction>(*UI); - if (LoadInst *LI = dyn_cast<LoadInst>(U)) + if (LoadInst *LI = dyn_cast<LoadInst>(U)) { // Ignore non-volatile loads, they are always ok. - if (!LI->isVolatile()) - continue; - + if (LI->isVolatile()) return false; + continue; + } + if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) { // If uses of the bitcast are ok, we are ok. if (!isOnlyCopiedFromConstantGlobal(BCI, TheCopy, isOffset)) @@ -1797,27 +2273,52 @@ static bool isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy, return false; continue; } - + + if (CallSite CS = U) { + // If this is a readonly/readnone call site, then we know it is just a + // load and we can ignore it. 
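The tightened HasPadding above only reports padding that an element-by-element rewrite could lose: gaps between consecutive struct fields and tail padding before the struct's full size; for arrays it just compares an element's value size with its allocation size. A sketch of the struct case over an explicit layout table; FieldLayout is an invented stand-in for what StructLayout provides.

#include <cassert>
#include <vector>

struct FieldLayout { unsigned offsetBits; unsigned sizeBits; };

static bool structHasPadding(const std::vector<FieldLayout> &fields,
                             unsigned structSizeBits) {
  unsigned prevEnd = 0;
  for (size_t i = 0; i < fields.size(); ++i) {
    if (i && prevEnd < fields[i].offsetBits)
      return true;                                    // gap between adjacent fields
    prevEnd = fields[i].offsetBits + fields[i].sizeBits;
  }
  return !fields.empty() && prevEnd < structSizeBits; // tail padding
}

int main() {
  // struct { char c; int i; } with 4-byte int alignment: 3 bytes of padding after c.
  assert(structHasPadding({{0, 8}, {32, 32}}, 64));
  // struct { int a; int b; }: tightly packed, no padding.
  assert(!structHasPadding({{0, 32}, {32, 32}}, 64));
}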
+ if (CS.onlyReadsMemory()) + continue; + + // If this is the function being called then we treat it like a load and + // ignore it. + if (CS.isCallee(UI)) + continue; + + // If this is being passed as a byval argument, the caller is making a + // copy, so it is only a read of the alloca. + unsigned ArgNo = CS.getArgumentNo(UI); + if (CS.paramHasAttr(ArgNo+1, Attribute::ByVal)) + continue; + } + // If this is isn't our memcpy/memmove, reject it as something we can't // handle. MemTransferInst *MI = dyn_cast<MemTransferInst>(U); if (MI == 0) return false; + // If the transfer is using the alloca as a source of the transfer, then + // ignore it since it is a load (unless the transfer is volatile). + if (UI.getOperandNo() == 1) { + if (MI->isVolatile()) return false; + continue; + } + // If we already have seen a copy, reject the second one. if (TheCopy) return false; - + // If the pointer has been offset from the start of the alloca, we can't // safely handle this. if (isOffset) return false; // If the memintrinsic isn't using the alloca as the dest, reject it. if (UI.getOperandNo() != 0) return false; - + // If the source of the memcpy/move is not a constant global, reject it. if (!PointsToConstantGlobal(MI->getSource())) return false; - + // Otherwise, the transform is safe. Remember the copy instruction. TheCopy = MI; } diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 360749c..ce5dd73 100644 --- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -42,7 +42,9 @@ STATISTIC(NumSimpl, "Number of blocks simplified"); namespace { struct CFGSimplifyPass : public FunctionPass { static char ID; // Pass identification, replacement for typeid - CFGSimplifyPass() : FunctionPass(ID) {} + CFGSimplifyPass() : FunctionPass(ID) { + initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry()); + } virtual bool runOnFunction(Function &F); }; @@ -50,7 +52,7 @@ namespace { char CFGSimplifyPass::ID = 0; INITIALIZE_PASS(CFGSimplifyPass, "simplifycfg", - "Simplify the CFG", false, false); + "Simplify the CFG", false, false) // Public interface to the CFGSimplification pass FunctionPass *llvm::createCFGSimplificationPass() { diff --git a/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp b/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp index 3ec70ec..70ff32e 100644 --- a/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyHalfPowrLibCalls.cpp @@ -32,7 +32,9 @@ namespace { const TargetData *TD; public: static char ID; // Pass identification - SimplifyHalfPowrLibCalls() : FunctionPass(ID) {} + SimplifyHalfPowrLibCalls() : FunctionPass(ID) { + initializeSimplifyHalfPowrLibCallsPass(*PassRegistry::getPassRegistry()); + } bool runOnFunction(Function &F); @@ -47,7 +49,7 @@ namespace { } // end anonymous namespace. INITIALIZE_PASS(SimplifyHalfPowrLibCalls, "simplify-libcalls-halfpowr", - "Simplify half_powr library calls", false, false); + "Simplify half_powr library calls", false, false) // Public interface to the Simplify HalfPowr LibCalls pass. 
FunctionPass *llvm::createSimplifyHalfPowrLibCallsPass() { @@ -95,7 +97,8 @@ InlineHalfPowrs(const std::vector<Instruction *> &HalfPowrs, InlineFunctionInfo IFI(0, TD); bool B = InlineFunction(Call, IFI); - assert(B && "half_powr didn't inline?"); B=B; + assert(B && "half_powr didn't inline?"); + (void)B; BasicBlock *NewBody = NewBlock->getSinglePredecessor(); assert(NewBody); diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp index d7ce53f..ec45b71 100644 --- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -123,7 +123,7 @@ struct StrCatOpt : public LibCallOptimization { // Verify the "strcat" function prototype. const FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || - FT->getReturnType() != Type::getInt8PtrTy(*Context) || + FT->getReturnType() != B.getInt8PtrTy() || FT->getParamType(0) != FT->getReturnType() || FT->getParamType(1) != FT->getReturnType()) return 0; @@ -160,9 +160,8 @@ struct StrCatOpt : public LibCallOptimization { // We have enough information to now generate the memcpy call to do the // concatenation for us. Make a memcpy to copy the nul byte with align = 1. - EmitMemCpy(CpyDst, Src, - ConstantInt::get(TD->getIntPtrType(*Context), Len+1), - 1, false, B, TD); + B.CreateMemCpy(CpyDst, Src, + ConstantInt::get(TD->getIntPtrType(*Context), Len + 1), 1); } }; @@ -174,7 +173,7 @@ struct StrNCatOpt : public StrCatOpt { // Verify the "strncat" function prototype. const FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || - FT->getReturnType() != Type::getInt8PtrTy(*Context) || + FT->getReturnType() != B.getInt8PtrTy() || FT->getParamType(0) != FT->getReturnType() || FT->getParamType(1) != FT->getReturnType() || !FT->getParamType(2)->isIntegerTy()) @@ -222,8 +221,9 @@ struct StrChrOpt : public LibCallOptimization { // Verify the "strchr" function prototype. const FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 2 || - FT->getReturnType() != Type::getInt8PtrTy(*Context) || - FT->getParamType(0) != FT->getReturnType()) + FT->getReturnType() != B.getInt8PtrTy() || + FT->getParamType(0) != FT->getReturnType() || + !FT->getParamType(1)->isIntegerTy(32)) return 0; Value *SrcStr = CI->getArgOperand(0); @@ -252,22 +252,55 @@ struct StrChrOpt : public LibCallOptimization { // strchr can find the nul character. Str += '\0'; - char CharValue = CharC->getSExtValue(); // Compute the offset. - uint64_t i = 0; - while (1) { - if (i == Str.size()) // Didn't find the char. strchr returns null. - return Constant::getNullValue(CI->getType()); - // Did we find our match? - if (Str[i] == CharValue) - break; - ++i; - } + size_t I = Str.find(CharC->getSExtValue()); + if (I == std::string::npos) // Didn't find the char. strchr returns null. + return Constant::getNullValue(CI->getType()); // strchr(s+n,c) -> gep(s+n+i,c) - Value *Idx = ConstantInt::get(Type::getInt64Ty(*Context), i); - return B.CreateGEP(SrcStr, Idx, "strchr"); + return B.CreateGEP(SrcStr, B.getInt64(I), "strchr"); + } +}; + +//===---------------------------------------===// +// 'strrchr' Optimizations + +struct StrRChrOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "strrchr" function prototype. 
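The reworked StrChrOpt above folds strchr over a known constant string by appending the terminating NUL and using std::string::find, then returning either a null pointer or a GEP at the found index. The standalone sketch below performs the same folding decision with plain pointers standing in for the IR values; it illustrates the logic rather than quoting the optimizer.

#include <cassert>
#include <cstring>
#include <string>

// Fold strchr(s, c) when the contents of s are known at compile time.
// Returns nullptr when the character is absent, otherwise s + index,
// matching the null / GEP results the optimization produces.
static const char *foldStrChr(const char *s, char c) {
  std::string str(s);
  str += '\0';                 // strchr can legitimately find the terminator
  size_t i = str.find(c);
  if (i == std::string::npos)
    return nullptr;
  return s + i;
}

int main() {
  const char *hello = "hello";
  assert(foldStrChr(hello, 'l')  == hello + 2);
  assert(foldStrChr(hello, '\0') == hello + 5);             // finds the NUL terminator
  assert(foldStrChr(hello, 'z')  == nullptr);
  assert(foldStrChr(hello, 'z')  == std::strchr(hello, 'z'));  // agrees with libc
}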
+ const FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + FT->getReturnType() != B.getInt8PtrTy() || + FT->getParamType(0) != FT->getReturnType() || + !FT->getParamType(1)->isIntegerTy(32)) + return 0; + + Value *SrcStr = CI->getArgOperand(0); + ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1)); + + // Cannot fold anything if we're not looking for a constant. + if (!CharC) + return 0; + + std::string Str; + if (!GetConstantStringInfo(SrcStr, Str)) { + // strrchr(s, 0) -> strchr(s, 0) + if (TD && CharC->isZero()) + return EmitStrChr(SrcStr, '\0', B, TD); + return 0; + } + + // strrchr can find the nul character. + Str += '\0'; + + // Compute the offset. + size_t I = Str.rfind(CharC->getSExtValue()); + if (I == std::string::npos) // Didn't find the char. Return null. + return Constant::getNullValue(CI->getType()); + + // strrchr(s+n,c) -> gep(s+n+i,c) + return B.CreateGEP(SrcStr, B.getInt64(I), "strrchr"); } }; @@ -281,7 +314,7 @@ struct StrCmpOpt : public LibCallOptimization { if (FT->getNumParams() != 2 || !FT->getReturnType()->isIntegerTy(32) || FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != Type::getInt8PtrTy(*Context)) + FT->getParamType(0) != B.getInt8PtrTy()) return 0; Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1); @@ -329,7 +362,7 @@ struct StrNCmpOpt : public LibCallOptimization { if (FT->getNumParams() != 3 || !FT->getReturnType()->isIntegerTy(32) || FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != Type::getInt8PtrTy(*Context) || + FT->getParamType(0) != B.getInt8PtrTy() || !FT->getParamType(2)->isIntegerTy()) return 0; @@ -384,7 +417,7 @@ struct StrCpyOpt : public LibCallOptimization { if (FT->getNumParams() != NumParams || FT->getReturnType() != FT->getParamType(0) || FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != Type::getInt8PtrTy(*Context)) + FT->getParamType(0) != B.getInt8PtrTy()) return 0; Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); @@ -405,9 +438,8 @@ struct StrCpyOpt : public LibCallOptimization { ConstantInt::get(TD->getIntPtrType(*Context), Len), CI->getArgOperand(2), B, TD); else - EmitMemCpy(Dst, Src, - ConstantInt::get(TD->getIntPtrType(*Context), Len), - 1, false, B, TD); + B.CreateMemCpy(Dst, Src, + ConstantInt::get(TD->getIntPtrType(*Context), Len), 1); return Dst; } }; @@ -420,7 +452,7 @@ struct StrNCpyOpt : public LibCallOptimization { const FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || FT->getParamType(0) != FT->getParamType(1) || - FT->getParamType(0) != Type::getInt8PtrTy(*Context) || + FT->getParamType(0) != B.getInt8PtrTy() || !FT->getParamType(2)->isIntegerTy()) return 0; @@ -435,8 +467,7 @@ struct StrNCpyOpt : public LibCallOptimization { if (SrcLen == 0) { // strncpy(x, "", y) -> memset(x, '\0', y, 1) - EmitMemSet(Dst, ConstantInt::get(Type::getInt8Ty(*Context), '\0'), - LenOp, false, B, TD); + B.CreateMemSet(Dst, B.getInt8('\0'), LenOp, 1); return Dst; } @@ -455,9 +486,8 @@ struct StrNCpyOpt : public LibCallOptimization { if (Len > SrcLen+1) return 0; // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant] - EmitMemCpy(Dst, Src, - ConstantInt::get(TD->getIntPtrType(*Context), Len), - 1, false, B, TD); + B.CreateMemCpy(Dst, Src, + ConstantInt::get(TD->getIntPtrType(*Context), Len), 1); return Dst; } @@ -470,7 +500,7 @@ struct StrLenOpt : public LibCallOptimization { virtual Value *CallOptimizer(Function *Callee, 
CallInst *CI, IRBuilder<> &B) { const FunctionType *FT = Callee->getFunctionType(); if (FT->getNumParams() != 1 || - FT->getParamType(0) != Type::getInt8PtrTy(*Context) || + FT->getParamType(0) != B.getInt8PtrTy() || !FT->getReturnType()->isIntegerTy()) return 0; @@ -488,6 +518,45 @@ struct StrLenOpt : public LibCallOptimization { } }; + +//===---------------------------------------===// +// 'strpbrk' Optimizations + +struct StrPBrkOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + const FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + FT->getParamType(0) != B.getInt8PtrTy() || + FT->getParamType(1) != FT->getParamType(0) || + FT->getReturnType() != FT->getParamType(0)) + return 0; + + std::string S1, S2; + bool HasS1 = GetConstantStringInfo(CI->getArgOperand(0), S1); + bool HasS2 = GetConstantStringInfo(CI->getArgOperand(1), S2); + + // strpbrk(s, "") -> NULL + // strpbrk("", s) -> NULL + if ((HasS1 && S1.empty()) || (HasS2 && S2.empty())) + return Constant::getNullValue(CI->getType()); + + // Constant folding. + if (HasS1 && HasS2) { + size_t I = S1.find_first_of(S2); + if (I == std::string::npos) // No match. + return Constant::getNullValue(CI->getType()); + + return B.CreateGEP(CI->getArgOperand(0), B.getInt64(I), "strpbrk"); + } + + // strpbrk(s, "a") -> strchr(s, 'a') + if (TD && HasS2 && S2.size() == 1) + return EmitStrChr(CI->getArgOperand(0), S2[0], B, TD); + + return 0; + } +}; + //===---------------------------------------===// // 'strto*' Optimizations. This handles strtol, strtod, strtof, strtoul, etc. @@ -501,7 +570,8 @@ struct StrToOpt : public LibCallOptimization { Value *EndPtr = CI->getArgOperand(1); if (isa<ConstantPointerNull>(EndPtr)) { - CI->setOnlyReadsMemory(); + // With a null EndPtr, this function won't capture the main argument. + // It would be readonly too, except that it still may write to errno. CI->addAttribute(1, Attribute::NoCapture); } @@ -510,6 +580,67 @@ struct StrToOpt : public LibCallOptimization { }; //===---------------------------------------===// +// 'strspn' Optimizations + +struct StrSpnOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + const FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + FT->getParamType(0) != B.getInt8PtrTy() || + FT->getParamType(1) != FT->getParamType(0) || + !FT->getReturnType()->isIntegerTy()) + return 0; + + std::string S1, S2; + bool HasS1 = GetConstantStringInfo(CI->getArgOperand(0), S1); + bool HasS2 = GetConstantStringInfo(CI->getArgOperand(1), S2); + + // strspn(s, "") -> 0 + // strspn("", s) -> 0 + if ((HasS1 && S1.empty()) || (HasS2 && S2.empty())) + return Constant::getNullValue(CI->getType()); + + // Constant folding. 
+ if (HasS1 && HasS2) + return ConstantInt::get(CI->getType(), strspn(S1.c_str(), S2.c_str())); + + return 0; + } +}; + +//===---------------------------------------===// +// 'strcspn' Optimizations + +struct StrCSpnOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + const FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != 2 || + FT->getParamType(0) != B.getInt8PtrTy() || + FT->getParamType(1) != FT->getParamType(0) || + !FT->getReturnType()->isIntegerTy()) + return 0; + + std::string S1, S2; + bool HasS1 = GetConstantStringInfo(CI->getArgOperand(0), S1); + bool HasS2 = GetConstantStringInfo(CI->getArgOperand(1), S2); + + // strcspn("", s) -> 0 + if (HasS1 && S1.empty()) + return Constant::getNullValue(CI->getType()); + + // Constant folding. + if (HasS1 && HasS2) + return ConstantInt::get(CI->getType(), strcspn(S1.c_str(), S2.c_str())); + + // strcspn(s, "") -> strlen(s) + if (TD && HasS2 && S2.empty()) + return EmitStrLen(CI->getArgOperand(0), B, TD); + + return 0; + } +}; + +//===---------------------------------------===// // 'strstr' Optimizations struct StrStrOpt : public LibCallOptimization { @@ -637,8 +768,8 @@ struct MemCpyOpt : public LibCallOptimization { return 0; // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1) - EmitMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), 1, false, B, TD); + B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), 1); return CI->getArgOperand(0); } }; @@ -659,8 +790,8 @@ struct MemMoveOpt : public LibCallOptimization { return 0; // memmove(x, y, n) -> llvm.memmove(x, y, n, 1) - EmitMemMove(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), 1, false, B, TD); + B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), 1); return CI->getArgOperand(0); } }; @@ -681,9 +812,8 @@ struct MemSetOpt : public LibCallOptimization { return 0; // memset(p, v, n) -> llvm.memset(p, v, n, 1) - Value *Val = B.CreateIntCast(CI->getArgOperand(1), - Type::getInt8Ty(*Context), false); - EmitMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), false, B, TD); + Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false); + B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1); return CI->getArgOperand(0); } }; @@ -765,12 +895,10 @@ struct Exp2Opt : public LibCallOptimization { Value *LdExpArg = 0; if (SIToFPInst *OpC = dyn_cast<SIToFPInst>(Op)) { if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32) - LdExpArg = B.CreateSExt(OpC->getOperand(0), - Type::getInt32Ty(*Context), "tmp"); + LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty(), "tmp"); } else if (UIToFPInst *OpC = dyn_cast<UIToFPInst>(Op)) { if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32) - LdExpArg = B.CreateZExt(OpC->getOperand(0), - Type::getInt32Ty(*Context), "tmp"); + LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty(), "tmp"); } if (LdExpArg) { @@ -789,7 +917,7 @@ struct Exp2Opt : public LibCallOptimization { Module *M = Caller->getParent(); Value *Callee = M->getOrInsertFunction(Name, Op->getType(), Op->getType(), - Type::getInt32Ty(*Context),NULL); + B.getInt32Ty(), NULL); CallInst *CI = B.CreateCall2(Callee, One, LdExpArg); if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts())) CI->setCallingConv(F->getCallingConv()); @@ -819,7 +947,7 @@ struct UnaryDoubleFPOpt : public LibCallOptimization { Value *V = Cast->getOperand(0); V = 
EmitUnaryFloatFnCall(V, Callee->getName().data(), B, Callee->getAttributes()); - return B.CreateFPExt(V, Type::getDoubleTy(*Context)); + return B.CreateFPExt(V, B.getDoubleTy()); } }; @@ -846,8 +974,8 @@ struct FFSOpt : public LibCallOptimization { if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) { if (CI->getValue() == 0) // ffs(0) -> 0. return Constant::getNullValue(CI->getType()); - return ConstantInt::get(Type::getInt32Ty(*Context), // ffs(c) -> cttz(c)+1 - CI->getValue().countTrailingZeros()+1); + // ffs(c) -> cttz(c)+1 + return B.getInt32(CI->getValue().countTrailingZeros() + 1); } // ffs(x) -> x != 0 ? (i32)llvm.cttz(x)+1 : 0 @@ -856,11 +984,10 @@ struct FFSOpt : public LibCallOptimization { Intrinsic::cttz, &ArgType, 1); Value *V = B.CreateCall(F, Op, "cttz"); V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1), "tmp"); - V = B.CreateIntCast(V, Type::getInt32Ty(*Context), false, "tmp"); + V = B.CreateIntCast(V, B.getInt32Ty(), false, "tmp"); Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType), "tmp"); - return B.CreateSelect(Cond, V, - ConstantInt::get(Type::getInt32Ty(*Context), 0)); + return B.CreateSelect(Cond, V, B.getInt32(0)); } }; @@ -877,10 +1004,8 @@ struct IsDigitOpt : public LibCallOptimization { // isdigit(c) -> (c-'0') <u 10 Value *Op = CI->getArgOperand(0); - Op = B.CreateSub(Op, ConstantInt::get(Type::getInt32Ty(*Context), '0'), - "isdigittmp"); - Op = B.CreateICmpULT(Op, ConstantInt::get(Type::getInt32Ty(*Context), 10), - "isdigit"); + Op = B.CreateSub(Op, B.getInt32('0'), "isdigittmp"); + Op = B.CreateICmpULT(Op, B.getInt32(10), "isdigit"); return B.CreateZExt(Op, CI->getType()); } }; @@ -898,8 +1023,7 @@ struct IsAsciiOpt : public LibCallOptimization { // isascii(c) -> c <u 128 Value *Op = CI->getArgOperand(0); - Op = B.CreateICmpULT(Op, ConstantInt::get(Type::getInt32Ty(*Context), 128), - "isascii"); + Op = B.CreateICmpULT(Op, B.getInt32(128), "isascii"); return B.CreateZExt(Op, CI->getType()); } }; @@ -917,8 +1041,7 @@ struct AbsOpt : public LibCallOptimization { // abs(x) -> x >s -1 ? x : -x Value *Op = CI->getArgOperand(0); - Value *Pos = B.CreateICmpSGT(Op, - Constant::getAllOnesValue(Op->getType()), + Value *Pos = B.CreateICmpSGT(Op, Constant::getAllOnesValue(Op->getType()), "ispos"); Value *Neg = B.CreateNeg(Op, "neg"); return B.CreateSelect(Pos, Op, Neg); @@ -969,11 +1092,15 @@ struct PrintFOpt : public LibCallOptimization { return CI->use_empty() ? (Value*)CI : ConstantInt::get(CI->getType(), 0); - // printf("x") -> putchar('x'), even for '%'. Return the result of putchar - // in case there is an error writing to stdout. + // Do not do any of the following transformations if the printf return value + // is used, in general the printf return value is not compatible with either + // putchar() or puts(). + if (!CI->use_empty()) + return 0; + + // printf("x") -> putchar('x'), even for '%'. 
if (FormatStr.size() == 1) { - Value *Res = EmitPutChar(ConstantInt::get(Type::getInt32Ty(*Context), - FormatStr[0]), B, TD); + Value *Res = EmitPutChar(B.getInt32(FormatStr[0]), B, TD); if (CI->use_empty()) return CI; return B.CreateIntCast(Res, CI->getType(), true); } @@ -1004,8 +1131,7 @@ struct PrintFOpt : public LibCallOptimization { // printf("%s\n", str) --> puts(str) if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 && - CI->getArgOperand(1)->getType()->isPointerTy() && - CI->use_empty()) { + CI->getArgOperand(1)->getType()->isPointerTy()) { EmitPutS(CI->getArgOperand(1), B, TD); return CI; } @@ -1042,9 +1168,9 @@ struct SPrintFOpt : public LibCallOptimization { if (!TD) return 0; // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1) - EmitMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), // Copy the - ConstantInt::get(TD->getIntPtrType(*Context), // nul byte. - FormatStr.size() + 1), 1, false, B, TD); + B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), + ConstantInt::get(TD->getIntPtrType(*Context), // Copy the + FormatStr.size() + 1), 1); // nul byte. return ConstantInt::get(CI->getType(), FormatStr.size()); } @@ -1058,13 +1184,11 @@ struct SPrintFOpt : public LibCallOptimization { if (FormatStr[1] == 'c') { // sprintf(dst, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0 if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0; - Value *V = B.CreateTrunc(CI->getArgOperand(2), - Type::getInt8Ty(*Context), "char"); + Value *V = B.CreateTrunc(CI->getArgOperand(2), B.getInt8Ty(), "char"); Value *Ptr = CastToCStr(CI->getArgOperand(0), B); B.CreateStore(V, Ptr); - Ptr = B.CreateGEP(Ptr, ConstantInt::get(Type::getInt32Ty(*Context), 1), - "nul"); - B.CreateStore(Constant::getNullValue(Type::getInt8Ty(*Context)), Ptr); + Ptr = B.CreateGEP(Ptr, B.getInt32(1), "nul"); + B.CreateStore(B.getInt8(0), Ptr); return ConstantInt::get(CI->getType(), 1); } @@ -1080,8 +1204,7 @@ struct SPrintFOpt : public LibCallOptimization { Value *IncLen = B.CreateAdd(Len, ConstantInt::get(Len->getType(), 1), "leninc"); - EmitMemCpy(CI->getArgOperand(0), CI->getArgOperand(2), - IncLen, 1, false, B, TD); + B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(2), IncLen, 1); // The sprintf result is the unincremented number of bytes in the string. return B.CreateIntCast(Len, CI->getType(), false); @@ -1208,6 +1331,34 @@ struct FPrintFOpt : public LibCallOptimization { } }; +//===---------------------------------------===// +// 'puts' Optimizations + +struct PutsOpt : public LibCallOptimization { + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Require one fixed pointer argument and an integer/void result. + const FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() || + !(FT->getReturnType()->isIntegerTy() || + FT->getReturnType()->isVoidTy())) + return 0; + + // Check for a constant string. + std::string Str; + if (!GetConstantStringInfo(CI->getArgOperand(0), Str)) + return 0; + + if (Str.empty() && CI->use_empty()) { + // puts("") -> putchar('\n') + Value *Res = EmitPutChar(B.getInt32('\n'), B, TD); + if (CI->use_empty()) return CI; + return B.CreateIntCast(Res, CI->getType(), true); + } + + return 0; + } +}; + } // end anonymous namespace. 
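The recurring mechanical change across the SimplifyLibCalls hunks above is the move from the free-standing Emit* helpers and explicit Type::get*Ty(*Context) spellings to the IRBuilder convenience methods (B.CreateMemCpy, B.getInt8PtrTy, B.getInt32, and so on). A minimal sketch of the new idiom, not taken from the patch; the function name and parameters here are illustrative, and it assumes the usual LLVM headers and `using namespace llvm`:

```cpp
// Sketch only -- not code from the patch.  Assumes an IRBuilder<> B positioned
// at the call being simplified, a non-null TargetData *TD, and i8* values
// Dst/Src with Len bytes to copy.
static Value *copyStringAndNulTerminate(IRBuilder<> &B, const TargetData *TD,
                                        LLVMContext &Ctx,
                                        Value *Dst, Value *Src, uint64_t Len) {
  // llvm.memcpy with align 1, via the builder instead of the old
  // EmitMemCpy(..., B, TD) helper.
  B.CreateMemCpy(Dst, Src,
                 ConstantInt::get(TD->getIntPtrType(Ctx), Len), 1);

  // Integer types and constants also come straight from the builder now,
  // replacing Type::getIntNTy(*Context)/ConstantInt::get pairs.
  Value *End = B.CreateGEP(Dst, B.getInt64(Len), "nul");
  B.CreateStore(B.getInt8(0), End);
  return Dst;
}
```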
//===----------------------------------------------------------------------===// @@ -1220,10 +1371,10 @@ namespace { class SimplifyLibCalls : public FunctionPass { StringMap<LibCallOptimization*> Optimizations; // String and Memory LibCall Optimizations - StrCatOpt StrCat; StrNCatOpt StrNCat; StrChrOpt StrChr; StrCmpOpt StrCmp; - StrNCmpOpt StrNCmp; StrCpyOpt StrCpy; StrCpyOpt StrCpyChk; - StrNCpyOpt StrNCpy; StrLenOpt StrLen; - StrToOpt StrTo; StrStrOpt StrStr; + StrCatOpt StrCat; StrNCatOpt StrNCat; StrChrOpt StrChr; StrRChrOpt StrRChr; + StrCmpOpt StrCmp; StrNCmpOpt StrNCmp; StrCpyOpt StrCpy; StrCpyOpt StrCpyChk; + StrNCpyOpt StrNCpy; StrLenOpt StrLen; StrPBrkOpt StrPBrk; + StrToOpt StrTo; StrSpnOpt StrSpn; StrCSpnOpt StrCSpn; StrStrOpt StrStr; MemCmpOpt MemCmp; MemCpyOpt MemCpy; MemMoveOpt MemMove; MemSetOpt MemSet; // Math Library Optimizations PowOpt Pow; Exp2Opt Exp2; UnaryDoubleFPOpt UnaryDoubleFP; @@ -1233,11 +1384,14 @@ namespace { // Formatting and IO Optimizations SPrintFOpt SPrintF; PrintFOpt PrintF; FWriteOpt FWrite; FPutsOpt FPuts; FPrintFOpt FPrintF; + PutsOpt Puts; bool Modified; // This is only used by doInitialization. public: static char ID; // Pass identification - SimplifyLibCalls() : FunctionPass(ID), StrCpy(false), StrCpyChk(true) {} + SimplifyLibCalls() : FunctionPass(ID), StrCpy(false), StrCpyChk(true) { + initializeSimplifyLibCallsPass(*PassRegistry::getPassRegistry()); + } void InitOptimizations(); bool runOnFunction(Function &F); @@ -1255,7 +1409,7 @@ namespace { } // end anonymous namespace. INITIALIZE_PASS(SimplifyLibCalls, "simplify-libcalls", - "Simplify well-known library calls", false, false); + "Simplify well-known library calls", false, false) // Public interface to the Simplify LibCalls pass. FunctionPass *llvm::createSimplifyLibCallsPass() { @@ -1269,11 +1423,13 @@ void SimplifyLibCalls::InitOptimizations() { Optimizations["strcat"] = &StrCat; Optimizations["strncat"] = &StrNCat; Optimizations["strchr"] = &StrChr; + Optimizations["strrchr"] = &StrRChr; Optimizations["strcmp"] = &StrCmp; Optimizations["strncmp"] = &StrNCmp; Optimizations["strcpy"] = &StrCpy; Optimizations["strncpy"] = &StrNCpy; Optimizations["strlen"] = &StrLen; + Optimizations["strpbrk"] = &StrPBrk; Optimizations["strtol"] = &StrTo; Optimizations["strtod"] = &StrTo; Optimizations["strtof"] = &StrTo; @@ -1281,6 +1437,8 @@ void SimplifyLibCalls::InitOptimizations() { Optimizations["strtoll"] = &StrTo; Optimizations["strtold"] = &StrTo; Optimizations["strtoull"] = &StrTo; + Optimizations["strspn"] = &StrSpn; + Optimizations["strcspn"] = &StrCSpn; Optimizations["strstr"] = &StrStr; Optimizations["memcmp"] = &MemCmp; Optimizations["memcpy"] = &MemCpy; @@ -1341,6 +1499,7 @@ void SimplifyLibCalls::InitOptimizations() { Optimizations["fwrite"] = &FWrite; Optimizations["fputs"] = &FPuts; Optimizations["fprintf"] = &FPrintF; + Optimizations["puts"] = &Puts; } @@ -2155,9 +2314,6 @@ bool SimplifyLibCalls::doInitialization(Module &M) { // * pow(sqrt(x),y) -> pow(x,y*0.5) // * pow(pow(x,y),z)-> pow(x,y*z) // -// puts: -// * puts("") -> putchar('\n') -// // round, roundf, roundl: // * round(cnst) -> cnst' // @@ -2173,24 +2329,6 @@ bool SimplifyLibCalls::doInitialization(Module &M) { // stpcpy: // * stpcpy(str, "literal") -> // llvm.memcpy(str,"literal",strlen("literal")+1,1) -// strrchr: -// * strrchr(s,c) -> reverse_offset_of_in(c,s) -// (if c is a constant integer and s is a constant string) -// * strrchr(s1,0) -> strchr(s1,0) -// -// strpbrk: -// * strpbrk(s,a) -> offset_in_for(s,a) -// (if s and 
a are both constant strings) -// * strpbrk(s,"") -> 0 -// * strpbrk(s,a) -> strchr(s,a[0]) (if a is constant string of length 1) -// -// strspn, strcspn: -// * strspn(s,a) -> const_int (if both args are constant) -// * strspn("",a) -> 0 -// * strspn(s,"") -> 0 -// * strcspn(s,a) -> const_int (if both args are constant) -// * strcspn("",a) -> 0 -// * strcspn(s,"") -> strlen(a) // // tan, tanf, tanl: // * tan(atan(x)) -> x diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp index 95d3ded..705f442 100644 --- a/lib/Transforms/Scalar/Sink.cpp +++ b/lib/Transforms/Scalar/Sink.cpp @@ -35,7 +35,9 @@ namespace { public: static char ID; // Pass identification - Sinking() : FunctionPass(ID) {} + Sinking() : FunctionPass(ID) { + initializeSinkingPass(*PassRegistry::getPassRegistry()); + } virtual bool runOnFunction(Function &F); @@ -56,7 +58,11 @@ namespace { } // end anonymous namespace char Sinking::ID = 0; -INITIALIZE_PASS(Sinking, "sink", "Code sinking", false, false); +INITIALIZE_PASS_BEGIN(Sinking, "sink", "Code sinking", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfo) +INITIALIZE_PASS_DEPENDENCY(DominatorTree) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_END(Sinking, "sink", "Code sinking", false, false) FunctionPass *llvm::createSinkingPass() { return new Sinking(); } @@ -150,11 +156,10 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA, if (LoadInst *L = dyn_cast<LoadInst>(Inst)) { if (L->isVolatile()) return false; - Value *Ptr = L->getPointerOperand(); - unsigned Size = AA->getTypeStoreSize(L->getType()); + AliasAnalysis::Location Loc = AA->getLocation(L); for (SmallPtrSet<Instruction *, 8>::iterator I = Stores.begin(), E = Stores.end(); I != E; ++I) - if (AA->getModRefInfo(*I, Ptr, Size) & AliasAnalysis::Mod) + if (AA->getModRefInfo(*I, Loc) & AliasAnalysis::Mod) return false; } @@ -163,7 +168,10 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA, return false; } - return Inst->isSafeToSpeculativelyExecute(); + if (isa<TerminatorInst>(Inst) || isa<PHINode>(Inst)) + return false; + + return true; } /// SinkInstruction - Determine whether it is safe to sink the specified machine diff --git a/lib/Transforms/Scalar/TailDuplication.cpp b/lib/Transforms/Scalar/TailDuplication.cpp index 2e437ac..9dd83c0 100644 --- a/lib/Transforms/Scalar/TailDuplication.cpp +++ b/lib/Transforms/Scalar/TailDuplication.cpp @@ -26,14 +26,14 @@ #include "llvm/IntrinsicInst.h" #include "llvm/Pass.h" #include "llvm/Type.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Support/CFG.h" -#include "llvm/Analysis/ConstantFolding.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/Transforms/Utils/Local.h" #include <map> using namespace llvm; @@ -49,7 +49,9 @@ namespace { bool runOnFunction(Function &F); public: static char ID; // Pass identification, replacement for typeid - TailDup() : FunctionPass(ID) {} + TailDup() : FunctionPass(ID) { + initializeTailDupPass(*PassRegistry::getPassRegistry()); + } private: inline bool shouldEliminateUnconditionalBranch(TerminatorInst *, unsigned); @@ -59,7 +61,7 @@ namespace { } char TailDup::ID = 0; -INITIALIZE_PASS(TailDup, "tailduplicate", "Tail Duplication", false, false); +INITIALIZE_PASS(TailDup, "tailduplicate", "Tail Duplication", false, false) // 
Public interface to the Tail Duplication pass FunctionPass *llvm::createTailDuplicationPass() { return new TailDup(); } @@ -360,8 +362,8 @@ void TailDup::eliminateUnconditionalBranch(BranchInst *Branch) { Instruction *Inst = BI++; if (isInstructionTriviallyDead(Inst)) Inst->eraseFromParent(); - else if (Constant *C = ConstantFoldInstruction(Inst)) { - Inst->replaceAllUsesWith(C); + else if (Value *V = SimplifyInstruction(Inst)) { + Inst->replaceAllUsesWith(V); Inst->eraseFromParent(); } } diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index 3717254..5b6bc04 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -52,31 +52,52 @@ #define DEBUG_TYPE "tailcallelim" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" #include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" #include "llvm/Pass.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/InlineCost.h" +#include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/Loads.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/CFG.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" using namespace llvm; STATISTIC(NumEliminated, "Number of tail calls removed"); +STATISTIC(NumRetDuped, "Number of return duplicated"); STATISTIC(NumAccumAdded, "Number of accumulators introduced"); namespace { struct TailCallElim : public FunctionPass { static char ID; // Pass identification, replacement for typeid - TailCallElim() : FunctionPass(ID) {} + TailCallElim() : FunctionPass(ID) { + initializeTailCallElimPass(*PassRegistry::getPassRegistry()); + } virtual bool runOnFunction(Function &F); private: + CallInst *FindTRECandidate(Instruction *I, + bool CannotTailCallElimCallsMarkedTail); + bool EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, + BasicBlock *&OldEntry, + bool &TailCallsAreMarkedTail, + SmallVector<PHINode*, 8> &ArgumentPHIs, + bool CannotTailCallElimCallsMarkedTail); + bool FoldReturnAndProcessPred(BasicBlock *BB, + ReturnInst *Ret, BasicBlock *&OldEntry, + bool &TailCallsAreMarkedTail, + SmallVector<PHINode*, 8> &ArgumentPHIs, + bool CannotTailCallElimCallsMarkedTail); bool ProcessReturningBlock(ReturnInst *RI, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail, SmallVector<PHINode*, 8> &ArgumentPHIs, @@ -88,7 +109,7 @@ namespace { char TailCallElim::ID = 0; INITIALIZE_PASS(TailCallElim, "tailcallelim", - "Tail Call Elimination", false, false); + "Tail Call Elimination", false, false) // Public interface to the TailCallElimination pass FunctionPass *llvm::createTailCallEliminationPass() { @@ -133,7 +154,6 @@ bool TailCallElim::runOnFunction(Function &F) { bool TailCallsAreMarkedTail = false; SmallVector<PHINode*, 8> ArgumentPHIs; bool MadeChange = false; - bool FunctionContainsEscapingAllocas = false; // CannotTCETailMarkedCall - If true, we cannot perform TCE on tail calls @@ -160,10 +180,17 @@ bool TailCallElim::runOnFunction(Function &F) { return false; // Second pass, change any tail calls to loops. 
- for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) - if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) - MadeChange |= ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail, + for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) { + bool Change = ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail, ArgumentPHIs,CannotTCETailMarkedCall); + if (!Change && BB->getFirstNonPHIOrDbg() == Ret) + Change = FoldReturnAndProcessPred(BB, Ret, OldEntry, + TailCallsAreMarkedTail, ArgumentPHIs, + CannotTCETailMarkedCall); + MadeChange |= Change; + } + } // If we eliminated any tail recursions, it's possible that we inserted some // silly PHI nodes which just merge an initial value (the incoming operand) @@ -175,7 +202,7 @@ bool TailCallElim::runOnFunction(Function &F) { PHINode *PN = ArgumentPHIs[i]; // If the PHI Node is a dynamic constant, replace it with the value it is. - if (Value *PNV = PN->hasConstantValue()) { + if (Value *PNV = SimplifyInstruction(PN)) { PN->replaceAllUsesWith(PNV); PN->eraseFromParent(); } @@ -322,41 +349,47 @@ Value *TailCallElim::CanTransformAccumulatorRecursion(Instruction *I, return getCommonReturnValue(cast<ReturnInst>(I->use_back()), CI); } -bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, - bool &TailCallsAreMarkedTail, - SmallVector<PHINode*, 8> &ArgumentPHIs, - bool CannotTailCallElimCallsMarkedTail) { - BasicBlock *BB = Ret->getParent(); +static Instruction *FirstNonDbg(BasicBlock::iterator I) { + while (isa<DbgInfoIntrinsic>(I)) + ++I; + return &*I; +} + +CallInst* +TailCallElim::FindTRECandidate(Instruction *TI, + bool CannotTailCallElimCallsMarkedTail) { + BasicBlock *BB = TI->getParent(); Function *F = BB->getParent(); - if (&BB->front() == Ret) // Make sure there is something before the ret... - return false; + if (&BB->front() == TI) // Make sure there is something before the terminator. + return 0; // Scan backwards from the return, checking to see if there is a tail call in // this block. If so, set CI to it. - CallInst *CI; - BasicBlock::iterator BBI = Ret; - while (1) { + CallInst *CI = 0; + BasicBlock::iterator BBI = TI; + while (true) { CI = dyn_cast<CallInst>(BBI); if (CI && CI->getCalledFunction() == F) break; if (BBI == BB->begin()) - return false; // Didn't find a potential tail call. + return 0; // Didn't find a potential tail call. --BBI; } // If this call is marked as a tail call, and if there are dynamic allocas in // the function, we cannot perform this optimization. if (CI->isTailCall() && CannotTailCallElimCallsMarkedTail) - return false; + return 0; // As a special case, detect code like this: // double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call // and disable this xform in this case, because the code generator will // lower the call to fabs into inline code. if (BB == &F->getEntryBlock() && - &BB->front() == CI && &*++BB->begin() == Ret && + FirstNonDbg(BB->front()) == CI && + FirstNonDbg(llvm::next(BB->begin())) == TI && callIsSmall(F)) { // A single-block function with just a call and a return. Check that // the arguments match. 
@@ -367,9 +400,17 @@ bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, for (; I != E && FI != FE; ++I, ++FI) if (*I != &*FI) break; if (I == E && FI == FE) - return false; + return 0; } + return CI; +} + +bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, + BasicBlock *&OldEntry, + bool &TailCallsAreMarkedTail, + SmallVector<PHINode*, 8> &ArgumentPHIs, + bool CannotTailCallElimCallsMarkedTail) { // If we are introducing accumulator recursion to eliminate operations after // the call instruction that are both associative and commutative, the initial // value for the accumulator is placed in this variable. If this value is set @@ -387,7 +428,8 @@ bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, // tail call if all of the instructions between the call and the return are // movable to above the call itself, leaving the call next to the return. // Check that this is the case now. - for (BBI = CI, ++BBI; &*BBI != Ret; ++BBI) { + BasicBlock::iterator BBI = CI; + for (++BBI; &*BBI != Ret; ++BBI) { if (CanMoveAboveCall(BBI, CI)) continue; // If we can't move the instruction above the call, it might be because it @@ -424,6 +466,9 @@ bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, return false; } + BasicBlock *BB = Ret->getParent(); + Function *F = BB->getParent(); + // OK! We can transform this tail call. If this is the first one found, // create the new entry block, allowing us to branch back to the old entry. if (OldEntry == 0) { @@ -533,3 +578,53 @@ bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, ++NumEliminated; return true; } + +bool TailCallElim::FoldReturnAndProcessPred(BasicBlock *BB, + ReturnInst *Ret, BasicBlock *&OldEntry, + bool &TailCallsAreMarkedTail, + SmallVector<PHINode*, 8> &ArgumentPHIs, + bool CannotTailCallElimCallsMarkedTail) { + bool Change = false; + + // If the return block contains nothing but the return and PHI's, + // there might be an opportunity to duplicate the return in its + // predecessors and perform TRC there. Look for predecessors that end + // in unconditional branch and recursive call(s). + SmallVector<BranchInst*, 8> UncondBranchPreds; + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { + BasicBlock *Pred = *PI; + TerminatorInst *PTI = Pred->getTerminator(); + if (BranchInst *BI = dyn_cast<BranchInst>(PTI)) + if (BI->isUnconditional()) + UncondBranchPreds.push_back(BI); + } + + while (!UncondBranchPreds.empty()) { + BranchInst *BI = UncondBranchPreds.pop_back_val(); + BasicBlock *Pred = BI->getParent(); + if (CallInst *CI = FindTRECandidate(BI, CannotTailCallElimCallsMarkedTail)){ + DEBUG(dbgs() << "FOLDING: " << *BB + << "INTO UNCOND BRANCH PRED: " << *Pred); + EliminateRecursiveTailCall(CI, FoldReturnIntoUncondBranch(Ret, BB, Pred), + OldEntry, TailCallsAreMarkedTail, ArgumentPHIs, + CannotTailCallElimCallsMarkedTail); + ++NumRetDuped; + Change = true; + } + } + + return Change; +} + +bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry, + bool &TailCallsAreMarkedTail, + SmallVector<PHINode*, 8> &ArgumentPHIs, + bool CannotTailCallElimCallsMarkedTail) { + CallInst *CI = FindTRECandidate(Ret, CannotTailCallElimCallsMarkedTail); + if (!CI) + return false; + + return EliminateRecursiveTailCall(CI, Ret, OldEntry, TailCallsAreMarkedTail, + ArgumentPHIs, + CannotTailCallElimCallsMarkedTail); +} |
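The largest behavioral change in this part of the import is in TailRecursionElimination.cpp: ProcessReturningBlock is split into FindTRECandidate and EliminateRecursiveTailCall, and the new FoldReturnAndProcessPred duplicates a return block that contains only PHIs and the return into predecessors ending in an unconditional branch (via FoldReturnIntoUncondBranch, counted by NumRetDuped), so a recursive call separated from its return by such a branch can still be turned into a loop. A hypothetical source-level function whose IR commonly takes that shape; this example is illustrative only and does not appear in the patch:

```cpp
// Illustrative source, not from the patch.  In the IR for a function like
// this, the three `return` statements often share a single return block fed
// by a PHI; the recursive call then sits in a predecessor that branches
// unconditionally to that block.  Once the return is duplicated into the
// predecessor, the call is immediately followed by a return and the updated
// pass rewrites the recursion as a loop back to the entry block.
struct Node { int Key; const Node *Next; };

static bool contains(const Node *N, int Key) {
  if (!N)
    return false;
  if (N->Key == Key)
    return true;
  return contains(N->Next, Key);
}
```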