Diffstat (limited to 'lib/Transforms')
79 files changed, 5578 insertions, 3001 deletions
diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp index e160f63..b94dd69 100644 --- a/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -245,10 +245,7 @@ static bool IsPrefix(const ArgPromotion::IndicesVector &Prefix, const ArgPromotion::IndicesVector &Longer) { if (Prefix.size() > Longer.size()) return false; - for (unsigned i = 0, e = Prefix.size(); i != e; ++i) - if (Prefix[i] != Longer[i]) - return false; - return true; + return std::equal(Prefix.begin(), Prefix.end(), Longer.begin()); } @@ -616,8 +613,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, // Recompute the parameter attributes list based on the new arguments for // the function. - NF->setAttributes(AttrListPtr::get(AttributesVec.begin(), - AttributesVec.end())); + NF->setAttributes(AttrListPtr::get(AttributesVec)); AttributesVec.clear(); F->getParent()->getFunctionList().insert(F, NF); @@ -734,13 +730,11 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F, New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(), Args, "", Call); cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv()); - cast<InvokeInst>(New)->setAttributes(AttrListPtr::get(AttributesVec.begin(), - AttributesVec.end())); + cast<InvokeInst>(New)->setAttributes(AttrListPtr::get(AttributesVec)); } else { New = CallInst::Create(NF, Args, "", Call); cast<CallInst>(New)->setCallingConv(CS.getCallingConv()); - cast<CallInst>(New)->setAttributes(AttrListPtr::get(AttributesVec.begin(), - AttributesVec.end())); + cast<CallInst>(New)->setAttributes(AttrListPtr::get(AttributesVec)); if (cast<CallInst>(Call)->isTailCall()) cast<CallInst>(New)->setTailCall(); } diff --git a/lib/Transforms/IPO/CMakeLists.txt b/lib/Transforms/IPO/CMakeLists.txt index 58b3551..3f6b1de 100644 --- a/lib/Transforms/IPO/CMakeLists.txt +++ b/lib/Transforms/IPO/CMakeLists.txt @@ -20,3 +20,5 @@ add_llvm_library(LLVMipo StripDeadPrototypes.cpp StripSymbols.cpp ) + +add_dependencies(LLVMipo intrinsics_gen) diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp index 95aef27..fd23a93 100644 --- a/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -238,7 +238,7 @@ bool DAE::DeleteDeadVarargs(Function &Fn) { AttributesVec.push_back(PAL.getSlot(i)); if (Attributes FnAttrs = PAL.getFnAttributes()) AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs)); - PAL = AttrListPtr::get(AttributesVec.begin(), AttributesVec.end()); + PAL = AttrListPtr::get(AttributesVec); } Instruction *New; @@ -753,8 +753,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs)); // Reconstruct the AttributesList based on the vector we constructed. - AttrListPtr NewPAL = AttrListPtr::get(AttributesVec.begin(), - AttributesVec.end()); + AttrListPtr NewPAL = AttrListPtr::get(AttributesVec); // Create the new function type based on the recomputed parameters. FunctionType *NFTy = FunctionType::get(NRetTy, Params, FTy->isVarArg()); @@ -816,8 +815,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) { AttributesVec.push_back(AttributeWithIndex::get(~0, FnAttrs)); // Reconstruct the AttributesList based on the vector we constructed. 
- AttrListPtr NewCallPAL = AttrListPtr::get(AttributesVec.begin(), - AttributesVec.end()); + AttrListPtr NewCallPAL = AttrListPtr::get(AttributesVec); Instruction *New; if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp index d9911bf..4c7f0ed 100644 --- a/lib/Transforms/IPO/ExtractGV.cpp +++ b/lib/Transforms/IPO/ExtractGV.cpp @@ -53,12 +53,12 @@ namespace { I != E; ++I) { if (deleteStuff == (bool)Named.count(I) && !I->isDeclaration()) { I->setInitializer(0); - } else { - if (I->hasAvailableExternallyLinkage()) - continue; - if (I->getName() == "llvm.global_ctors") - continue; - } + } else { + if (I->hasAvailableExternallyLinkage()) + continue; + if (I->getName() == "llvm.global_ctors") + continue; + } if (I->hasLocalLinkage()) I->setVisibility(GlobalValue::HiddenVisibility); @@ -69,10 +69,10 @@ namespace { for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { if (deleteStuff == (bool)Named.count(I) && !I->isDeclaration()) { I->deleteBody(); - } else { - if (I->hasAvailableExternallyLinkage()) - continue; - } + } else { + if (I->hasAvailableExternallyLinkage()) + continue; + } if (I->hasLocalLinkage()) I->setVisibility(GlobalValue::HiddenVisibility); diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp index 2b427aa..18c1c7b 100644 --- a/lib/Transforms/IPO/GlobalDCE.cpp +++ b/lib/Transforms/IPO/GlobalDCE.cpp @@ -65,7 +65,7 @@ bool GlobalDCE::runOnModule(Module &M) { for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { Changed |= RemoveUnusedGlobalValue(*I); // Functions with external linkage are needed if they have a body - if (!I->hasLocalLinkage() && !I->hasLinkOnceLinkage() && + if (!I->isDiscardableIfUnused() && !I->isDeclaration() && !I->hasAvailableExternallyLinkage()) GlobalIsNeeded(I); } @@ -75,7 +75,7 @@ bool GlobalDCE::runOnModule(Module &M) { Changed |= RemoveUnusedGlobalValue(*I); // Externally visible & appending globals are needed, if they have an // initializer. - if (!I->hasLocalLinkage() && !I->hasLinkOnceLinkage() && + if (!I->isDiscardableIfUnused() && !I->isDeclaration() && !I->hasAvailableExternallyLinkage()) GlobalIsNeeded(I); } @@ -84,7 +84,7 @@ bool GlobalDCE::runOnModule(Module &M) { I != E; ++I) { Changed |= RemoveUnusedGlobalValue(*I); // Externally visible aliases are needed. - if (!I->hasLocalLinkage() && !I->hasLinkOnceLinkage()) + if (!I->isDiscardableIfUnused()) GlobalIsNeeded(I); } diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index 1522aa4..6d950d2 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -254,6 +254,8 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS, GS.StoredType = GlobalStatus::isStored; } } + } else if (isa<BitCastInst>(I)) { + if (AnalyzeGlobal(I, GS, PHIUsers)) return true; } else if (isa<GetElementPtrInst>(I)) { if (AnalyzeGlobal(I, GS, PHIUsers)) return true; } else if (isa<SelectInst>(I)) { @@ -294,6 +296,168 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS, return false; } +/// isLeakCheckerRoot - Is this global variable possibly used by a leak checker +/// as a root? If so, we might not really want to eliminate the stores to it. +static bool isLeakCheckerRoot(GlobalVariable *GV) { + // A global variable is a root if it is a pointer, or could plausibly contain + // a pointer. There are two challenges; one is that we could have a struct + // the has an inner member which is a pointer. 
We recurse through the type to + // detect these (up to a point). The other is that we may actually be a union + // of a pointer and another type, and so our LLVM type is an integer which + // gets converted into a pointer, or our type is an [i8 x #] with a pointer + // potentially contained here. + + if (GV->hasPrivateLinkage()) + return false; + + SmallVector<Type *, 4> Types; + Types.push_back(cast<PointerType>(GV->getType())->getElementType()); + + unsigned Limit = 20; + do { + Type *Ty = Types.pop_back_val(); + switch (Ty->getTypeID()) { + default: break; + case Type::PointerTyID: return true; + case Type::ArrayTyID: + case Type::VectorTyID: { + SequentialType *STy = cast<SequentialType>(Ty); + Types.push_back(STy->getElementType()); + break; + } + case Type::StructTyID: { + StructType *STy = cast<StructType>(Ty); + if (STy->isOpaque()) return true; + for (StructType::element_iterator I = STy->element_begin(), + E = STy->element_end(); I != E; ++I) { + Type *InnerTy = *I; + if (isa<PointerType>(InnerTy)) return true; + if (isa<CompositeType>(InnerTy)) + Types.push_back(InnerTy); + } + break; + } + } + if (--Limit == 0) return true; + } while (!Types.empty()); + return false; +} + +/// Given a value that is stored to a global but never read, determine whether +/// it's safe to remove the store and the chain of computation that feeds the +/// store. +static bool IsSafeComputationToRemove(Value *V) { + do { + if (isa<Constant>(V)) + return true; + if (!V->hasOneUse()) + return false; + if (isa<LoadInst>(V) || isa<InvokeInst>(V) || isa<Argument>(V) || + isa<GlobalValue>(V)) + return false; + if (isAllocationFn(V)) + return true; + + Instruction *I = cast<Instruction>(V); + if (I->mayHaveSideEffects()) + return false; + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) { + if (!GEP->hasAllConstantIndices()) + return false; + } else if (I->getNumOperands() != 1) { + return false; + } + + V = I->getOperand(0); + } while (1); +} + +/// CleanupPointerRootUsers - This GV is a pointer root. Loop over all users +/// of the global and clean up any that obviously don't assign the global a +/// value that isn't dynamically allocated. +/// +static bool CleanupPointerRootUsers(GlobalVariable *GV) { + // A brief explanation of leak checkers. The goal is to find bugs where + // pointers are forgotten, causing an accumulating growth in memory + // usage over time. The common strategy for leak checkers is to whitelist the + // memory pointed to by globals at exit. This is popular because it also + // solves another problem where the main thread of a C++ program may shut down + // before other threads that are still expecting to use those globals. To + // handle that case, we expect the program may create a singleton and never + // destroy it. + + bool Changed = false; + + // If Dead[n].first is the only use of a malloc result, we can delete its + // chain of computation and the store to the global in Dead[n].second. + SmallVector<std::pair<Instruction *, Instruction *>, 32> Dead; + + // Constants can't be pointers to dynamically allocated memory. 
+ for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end(); + UI != E;) { + User *U = *UI++; + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { + Value *V = SI->getValueOperand(); + if (isa<Constant>(V)) { + Changed = true; + SI->eraseFromParent(); + } else if (Instruction *I = dyn_cast<Instruction>(V)) { + if (I->hasOneUse()) + Dead.push_back(std::make_pair(I, SI)); + } + } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(U)) { + if (isa<Constant>(MSI->getValue())) { + Changed = true; + MSI->eraseFromParent(); + } else if (Instruction *I = dyn_cast<Instruction>(MSI->getValue())) { + if (I->hasOneUse()) + Dead.push_back(std::make_pair(I, MSI)); + } + } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U)) { + GlobalVariable *MemSrc = dyn_cast<GlobalVariable>(MTI->getSource()); + if (MemSrc && MemSrc->isConstant()) { + Changed = true; + MTI->eraseFromParent(); + } else if (Instruction *I = dyn_cast<Instruction>(MemSrc)) { + if (I->hasOneUse()) + Dead.push_back(std::make_pair(I, MTI)); + } + } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) { + if (CE->use_empty()) { + CE->destroyConstant(); + Changed = true; + } + } else if (Constant *C = dyn_cast<Constant>(U)) { + if (SafeToDestroyConstant(C)) { + C->destroyConstant(); + // This could have invalidated UI, start over from scratch. + Dead.clear(); + CleanupPointerRootUsers(GV); + return true; + } + } + } + + for (int i = 0, e = Dead.size(); i != e; ++i) { + if (IsSafeComputationToRemove(Dead[i].first)) { + Dead[i].second->eraseFromParent(); + Instruction *I = Dead[i].first; + do { + if (isAllocationFn(I)) + break; + Instruction *J = dyn_cast<Instruction>(I->getOperand(0)); + if (!J) + break; + I->eraseFromParent(); + I = J; + } while (1); + I->eraseFromParent(); + } + } + + return Changed; +} + /// CleanupConstantGlobalUsers - We just marked GV constant. Loop over all /// users of the global, cleaning up the obvious ones. This is largely just a /// quick scan over the use list to clean up the easy and obvious cruft. This @@ -517,7 +681,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD) { GlobalVariable *NGV = new GlobalVariable(STy->getElementType(i), false, GlobalVariable::InternalLinkage, In, GV->getName()+"."+Twine(i), - GV->isThreadLocal(), + GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); Globals.insert(GV, NGV); NewGlobals.push_back(NGV); @@ -550,7 +714,7 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const TargetData &TD) { GlobalVariable *NGV = new GlobalVariable(STy->getElementType(), false, GlobalVariable::InternalLinkage, In, GV->getName()+"."+Twine(i), - GV->isThreadLocal(), + GV->getThreadLocalMode(), GV->getType()->getAddressSpace()); Globals.insert(GV, NGV); NewGlobals.push_back(NGV); @@ -810,13 +974,18 @@ static bool OptimizeAwayTrappingUsesOfLoads(GlobalVariable *GV, Constant *LV, // If we nuked all of the loads, then none of the stores are needed either, // nor is the global. 
if (AllNonStoreUsesGone) { - DEBUG(dbgs() << " *** GLOBAL NOW DEAD!\n"); - CleanupConstantGlobalUsers(GV, 0, TD, TLI); + if (isLeakCheckerRoot(GV)) { + Changed |= CleanupPointerRootUsers(GV); + } else { + Changed = true; + CleanupConstantGlobalUsers(GV, 0, TD, TLI); + } if (GV->use_empty()) { + DEBUG(dbgs() << " *** GLOBAL NOW DEAD!\n"); + Changed = true; GV->eraseFromParent(); ++NumDeleted; } - Changed = true; } return Changed; } @@ -866,7 +1035,7 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, UndefValue::get(GlobalType), GV->getName()+".body", GV, - GV->isThreadLocal()); + GV->getThreadLocalMode()); // If there are bitcast users of the malloc (which is typical, usually we have // a malloc + bitcast) then replace them with uses of the new global. Update @@ -899,7 +1068,7 @@ static GlobalVariable *OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, new GlobalVariable(Type::getInt1Ty(GV->getContext()), false, GlobalValue::InternalLinkage, ConstantInt::getFalse(GV->getContext()), - GV->getName()+".init", GV->isThreadLocal()); + GV->getName()+".init", GV->getThreadLocalMode()); bool InitBoolUsed = false; // Loop over all uses of GV, processing them in turn. @@ -1321,7 +1490,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, PFieldTy, false, GlobalValue::InternalLinkage, Constant::getNullValue(PFieldTy), GV->getName() + ".f" + Twine(FieldNo), GV, - GV->isThreadLocal()); + GV->getThreadLocalMode()); FieldGlobals.push_back(NGV); unsigned TypeSize = TD->getTypeAllocSize(FieldTy); @@ -1567,8 +1736,10 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, Instruction *Cast = new BitCastInst(Malloc, CI->getType(), "tmp", CI); CI->replaceAllUsesWith(Cast); CI->eraseFromParent(); - CI = dyn_cast<BitCastInst>(Malloc) ? - extractMallocCallFromBitCast(Malloc) : cast<CallInst>(Malloc); + if (BitCastInst *BCI = dyn_cast<BitCastInst>(Malloc)) + CI = cast<CallInst>(BCI->getOperand(0)); + else + CI = cast<CallInst>(Malloc); } GVI = PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, TD, true), TD); @@ -1645,7 +1816,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { GlobalValue::InternalLinkage, ConstantInt::getFalse(GV->getContext()), GV->getName()+".b", - GV->isThreadLocal()); + GV->getThreadLocalMode()); GV->getParent()->getGlobalList().insert(GV, NewGV); Constant *InitVal = GV->getInitializer(); @@ -1716,7 +1887,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) { /// possible. If we make a change, return true. bool GlobalOpt::ProcessGlobal(GlobalVariable *GV, Module::global_iterator &GVI) { - if (!GV->hasLocalLinkage()) + if (!GV->isDiscardableIfUnused()) return false; // Do more involved optimizations if the global is internal. @@ -1729,6 +1900,9 @@ bool GlobalOpt::ProcessGlobal(GlobalVariable *GV, return true; } + if (!GV->hasLocalLinkage()) + return false; + SmallPtrSet<const PHINode*, 16> PHIUsers; GlobalStatus GS; @@ -1787,10 +1961,15 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, if (!GS.isLoaded) { DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV); - // Delete any stores we can find to the global. We may not be able to - // make it completely dead though. - bool Changed = CleanupConstantGlobalUsers(GV, GV->getInitializer(), - TD, TLI); + bool Changed; + if (isLeakCheckerRoot(GV)) { + // Delete any constant stores to the global. + Changed = CleanupPointerRootUsers(GV); + } else { + // Delete any stores we can find to the global. 
We may not be able to + // make it completely dead though. + Changed = CleanupConstantGlobalUsers(GV, GV->getInitializer(), TD, TLI); + } // If the global is dead now, delete it. if (GV->use_empty()) { @@ -1838,7 +2017,7 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, if (GV->use_empty()) { DEBUG(dbgs() << " *** Substituting initializer allowed us to " - << "simplify all users and delete global!\n"); + << "simplify all users and delete global!\n"); GV->eraseFromParent(); ++NumDeleted; } else { @@ -1870,6 +2049,8 @@ bool GlobalOpt::ProcessInternalGlobal(GlobalVariable *GV, /// function, changing them to FastCC. static void ChangeCalleesToFastCall(Function *F) { for (Value::use_iterator UI = F->use_begin(), E = F->use_end(); UI != E;++UI){ + if (isa<BlockAddress>(*UI)) + continue; CallSite User(cast<Instruction>(*UI)); User.setCallingConv(CallingConv::Fast); } @@ -1890,6 +2071,8 @@ static AttrListPtr StripNest(const AttrListPtr &Attrs) { static void RemoveNestAttribute(Function *F) { F->setAttributes(StripNest(F->getAttributes())); for (Value::use_iterator UI = F->use_begin(), E = F->use_end(); UI != E;++UI){ + if (isa<BlockAddress>(*UI)) + continue; CallSite User(cast<Instruction>(*UI)); User.setAttributes(StripNest(User.getAttributes())); } @@ -2045,7 +2228,7 @@ static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL, // Create the new global and insert it next to the existing list. GlobalVariable *NGV = new GlobalVariable(CA->getType(), GCL->isConstant(), GCL->getLinkage(), CA, "", - GCL->isThreadLocal()); + GCL->getThreadLocalMode()); GCL->getParent()->getGlobalList().insert(GCL, NGV); NGV->takeName(GCL); @@ -2701,7 +2884,7 @@ static bool EvaluateStaticConstructor(Function *F, const TargetData *TD, << " stores.\n"); for (DenseMap<Constant*, Constant*>::const_iterator I = Eval.getMutatedMemory().begin(), E = Eval.getMutatedMemory().end(); - I != E; ++I) + I != E; ++I) CommitValueTo(I->second, I->first); for (SmallPtrSet<GlobalVariable*, 8>::const_iterator I = Eval.getInvariants().begin(), E = Eval.getInvariants().end(); diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp index dc9cbfb..712888a 100644 --- a/lib/Transforms/IPO/Inliner.cpp +++ b/lib/Transforms/IPO/Inliner.cpp @@ -36,7 +36,7 @@ STATISTIC(NumCallsDeleted, "Number of call sites deleted, not inlined"); STATISTIC(NumDeleted, "Number of functions deleted because all callers found"); STATISTIC(NumMergedAllocas, "Number of allocas merged together"); -// This weirdly named statistic tracks the number of times that, when attemting +// This weirdly named statistic tracks the number of times that, when attempting // to inline a function A into B, we analyze the callers of B in order to see // if those would be more profitable and blocked inline steps. STATISTIC(NumCallerCallersAnalyzed, "Number of caller-callers analyzed"); @@ -201,19 +201,22 @@ static bool InlineCallIfPossible(CallSite CS, InlineFunctionInfo &IFI, } unsigned Inliner::getInlineThreshold(CallSite CS) const { - int thres = InlineThreshold; + int thres = InlineThreshold; // -inline-threshold or else selected by + // overall opt level - // Listen to optsize when -inline-limit is not given. + // If -inline-threshold is not given, listen to the optsize attribute when it + // would decrease the threshold. 
Function *Caller = CS.getCaller(); - if (Caller && !Caller->isDeclaration() && - Caller->hasFnAttr(Attribute::OptimizeForSize) && - InlineLimit.getNumOccurrences() == 0) + bool OptSize = Caller && !Caller->isDeclaration() && + Caller->hasFnAttr(Attribute::OptimizeForSize); + if (!(InlineLimit.getNumOccurrences() > 0) && OptSize && OptSizeThreshold < thres) thres = OptSizeThreshold; - // Listen to inlinehint when it would increase the threshold. + // Listen to the inlinehint attribute when it would increase the threshold. Function *Callee = CS.getCalledFunction(); - if (HintThreshold > thres && Callee && !Callee->isDeclaration() && - Callee->hasFnAttr(Attribute::InlineHint)) + bool InlineHint = Callee && !Callee->isDeclaration() && + Callee->hasFnAttr(Attribute::InlineHint); + if (InlineHint && HintThreshold > thres) thres = HintThreshold; return thres; diff --git a/lib/Transforms/IPO/LoopExtractor.cpp b/lib/Transforms/IPO/LoopExtractor.cpp index 4f96afe4..97d7cdc 100644 --- a/lib/Transforms/IPO/LoopExtractor.cpp +++ b/lib/Transforms/IPO/LoopExtractor.cpp @@ -24,7 +24,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/FunctionUtils.h" +#include "llvm/Transforms/Utils/CodeExtractor.h" #include "llvm/ADT/Statistic.h" #include <fstream> #include <set> @@ -132,7 +132,8 @@ bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) { if (ShouldExtractLoop) { if (NumLoops == 0) return Changed; --NumLoops; - if (ExtractLoop(DT, L) != 0) { + CodeExtractor Extractor(DT, *L); + if (Extractor.extractCodeRegion() != 0) { Changed = true; // After extraction, the loop is replaced by a function call, so // we shouldn't try to run any more loop passes on it. @@ -296,7 +297,7 @@ bool BlockExtractorPass::runOnModule(Module &M) { if (const InvokeInst *II = dyn_cast<InvokeInst>(BlocksToExtract[i]->getTerminator())) BlocksToExtractVec.push_back(II->getUnwindDest()); - ExtractBasicBlock(BlocksToExtractVec); + CodeExtractor(BlocksToExtractVec).extractCodeRegion(); } return !BlocksToExtract.empty(); diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp index 0b01c38..9f70f66 100644 --- a/lib/Transforms/IPO/MergeFunctions.cpp +++ b/lib/Transforms/IPO/MergeFunctions.cpp @@ -45,22 +45,22 @@ #define DEBUG_TYPE "mergefunc" #include "llvm/Transforms/IPO.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/FoldingSet.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/Constants.h" +#include "llvm/IRBuilder.h" #include "llvm/InlineAsm.h" #include "llvm/Instructions.h" #include "llvm/LLVMContext.h" #include "llvm/Module.h" #include "llvm/Operator.h" #include "llvm/Pass.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/FoldingSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetData.h" @@ -389,7 +389,7 @@ bool FunctionComparator::enumerate(const Value *V1, const Value *V2) { if (!C2) return false; // TODO: constant expressions with GEP or references to F1 or F2. 
if (C1->isNullValue() && C2->isNullValue() && - isEquivalentType(C1->getType(), C2->getType())) + isEquivalentType(C1->getType(), C2->getType())) return true; // Try bitcasting C2 to C1's type. If the bitcast is legal and returns C1 // then they must have equal bit patterns. diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp index d9d1d10..9c9910b 100644 --- a/lib/Transforms/IPO/PartialInlining.cpp +++ b/lib/Transforms/IPO/PartialInlining.cpp @@ -19,7 +19,7 @@ #include "llvm/Pass.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/FunctionUtils.h" +#include "llvm/Transforms/Utils/CodeExtractor.h" #include "llvm/ADT/Statistic.h" #include "llvm/Support/CFG.h" using namespace llvm; @@ -122,7 +122,8 @@ Function* PartialInliner::unswitchFunction(Function* F) { DT.runOnFunction(*duplicateFunction); // Extract the body of the if. - Function* extractedFunction = ExtractCodeRegion(DT, toExtract); + Function* extractedFunction + = CodeExtractor(toExtract, &DT).extractCodeRegion(); InlineFunctionInfo IFI; diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp index b5caa9a..80bfc1c 100644 --- a/lib/Transforms/IPO/StripSymbols.cpp +++ b/lib/Transforms/IPO/StripSymbols.cpp @@ -22,11 +22,12 @@ #include "llvm/Transforms/IPO.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/DerivedTypes.h" #include "llvm/Instructions.h" #include "llvm/Module.h" #include "llvm/Pass.h" -#include "llvm/Analysis/DebugInfo.h" +#include "llvm/TypeFinder.h" #include "llvm/ValueSymbolTable.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/ADT/DenseMap.h" @@ -175,8 +176,8 @@ static void StripSymtab(ValueSymbolTable &ST, bool PreserveDbgInfo) { // Strip any named types of their names. 
static void StripTypeNames(Module &M, bool PreserveDbgInfo) { - std::vector<StructType*> StructTypes; - M.findUsedStructTypes(StructTypes); + TypeFinder StructTypes; + StructTypes.run(M, false); for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) { StructType *STy = StructTypes[i]; diff --git a/lib/Transforms/InstCombine/CMakeLists.txt b/lib/Transforms/InstCombine/CMakeLists.txt index d070ccc..72cfe2c 100644 --- a/lib/Transforms/InstCombine/CMakeLists.txt +++ b/lib/Transforms/InstCombine/CMakeLists.txt @@ -13,3 +13,5 @@ add_llvm_library(LLVMInstCombine InstCombineSimplifyDemanded.cpp InstCombineVectorOps.cpp ) + +add_dependencies(LLVMInstCombine intrinsics_gen) diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h index 199df51..0d5ef90 100644 --- a/lib/Transforms/InstCombine/InstCombine.h +++ b/lib/Transforms/InstCombine/InstCombine.h @@ -11,11 +11,11 @@ #define INSTCOMBINE_INSTCOMBINE_H #include "InstCombineWorklist.h" +#include "llvm/IRBuilder.h" #include "llvm/IntrinsicInst.h" #include "llvm/Operator.h" #include "llvm/Pass.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/InstVisitor.h" #include "llvm/Support/TargetFolder.h" @@ -187,7 +187,7 @@ public: Instruction *visitPHINode(PHINode &PN); Instruction *visitGetElementPtrInst(GetElementPtrInst &GEP); Instruction *visitAllocaInst(AllocaInst &AI); - Instruction *visitMalloc(Instruction &FI); + Instruction *visitAllocSite(Instruction &FI); Instruction *visitFree(CallInst &FI); Instruction *visitLoadInst(LoadInst &LI); Instruction *visitStoreInst(StoreInst &SI); diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 05e702f..99b62f8 100644 --- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -170,10 +170,11 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { // -A + B --> B - A // -A + -B --> -(A + B) if (Value *LHSV = dyn_castNegVal(LHS)) { - if (Value *RHSV = dyn_castNegVal(RHS)) { - Value *NewAdd = Builder->CreateAdd(LHSV, RHSV, "sum"); - return BinaryOperator::CreateNeg(NewAdd); - } + if (!isa<Constant>(RHS)) + if (Value *RHSV = dyn_castNegVal(RHS)) { + Value *NewAdd = Builder->CreateAdd(LHSV, RHSV, "sum"); + return BinaryOperator::CreateNeg(NewAdd); + } return BinaryOperator::CreateSub(RHS, LHSV); } @@ -329,6 +330,20 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { } } + // Check for (x & y) + (x ^ y) + { + Value *A = 0, *B = 0; + if (match(RHS, m_Xor(m_Value(A), m_Value(B))) && + (match(LHS, m_And(m_Specific(A), m_Specific(B))) || + match(LHS, m_And(m_Specific(B), m_Specific(A))))) + return BinaryOperator::CreateOr(A, B); + + if (match(LHS, m_Xor(m_Value(A), m_Value(B))) && + (match(RHS, m_And(m_Specific(A), m_Specific(B))) || + match(RHS, m_And(m_Specific(B), m_Specific(A))))) + return BinaryOperator::CreateOr(A, B); + } + return Changed ? &I : 0; } @@ -406,66 +421,6 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { } -/// EmitGEPOffset - Given a getelementptr instruction/constantexpr, emit the -/// code necessary to compute the offset from the base pointer (without adding -/// in the base pointer). Return the result as a signed integer of intptr size. 
-Value *InstCombiner::EmitGEPOffset(User *GEP) { - TargetData &TD = *getTargetData(); - gep_type_iterator GTI = gep_type_begin(GEP); - Type *IntPtrTy = TD.getIntPtrType(GEP->getContext()); - Value *Result = Constant::getNullValue(IntPtrTy); - - // If the GEP is inbounds, we know that none of the addressing operations will - // overflow in an unsigned sense. - bool isInBounds = cast<GEPOperator>(GEP)->isInBounds(); - - // Build a mask for high order bits. - unsigned IntPtrWidth = TD.getPointerSizeInBits(); - uint64_t PtrSizeMask = ~0ULL >> (64-IntPtrWidth); - - for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end(); i != e; - ++i, ++GTI) { - Value *Op = *i; - uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType()) & PtrSizeMask; - if (ConstantInt *OpC = dyn_cast<ConstantInt>(Op)) { - if (OpC->isZero()) continue; - - // Handle a struct index, which adds its field offset to the pointer. - if (StructType *STy = dyn_cast<StructType>(*GTI)) { - Size = TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue()); - - if (Size) - Result = Builder->CreateAdd(Result, ConstantInt::get(IntPtrTy, Size), - GEP->getName()+".offs"); - continue; - } - - Constant *Scale = ConstantInt::get(IntPtrTy, Size); - Constant *OC = - ConstantExpr::getIntegerCast(OpC, IntPtrTy, true /*SExt*/); - Scale = ConstantExpr::getMul(OC, Scale, isInBounds/*NUW*/); - // Emit an add instruction. - Result = Builder->CreateAdd(Result, Scale, GEP->getName()+".offs"); - continue; - } - // Convert to correct type. - if (Op->getType() != IntPtrTy) - Op = Builder->CreateIntCast(Op, IntPtrTy, true, Op->getName()+".c"); - if (Size != 1) { - // We'll let instcombine(mul) convert this to a shl if possible. - Op = Builder->CreateMul(Op, ConstantInt::get(IntPtrTy, Size), - GEP->getName()+".idx", isInBounds /*NUW*/); - } - - // Emit an add instruction. - Result = Builder->CreateAdd(Op, Result, GEP->getName()+".offs"); - } - return Result; -} - - - - /// Optimize pointer differences into the same array into a size. Consider: /// &A[10] - &A[0]: we should compile this to "10". LHS/RHS are the pointer /// operands to the ptrtoint instructions for the LHS/RHS of the subtract. @@ -589,11 +544,6 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { if (Instruction *R = FoldOpIntoSelect(I, SI)) return R; - // C - zext(bool) -> bool ? 
C - 1 : C - if (ZExtInst *ZI = dyn_cast<ZExtInst>(Op1)) - if (ZI->getSrcTy()->isIntegerTy(1)) - return SelectInst::Create(ZI->getOperand(0), SubOne(C), C); - // C-(X+C2) --> (C-C2)-X ConstantInt *C2; if (match(Op1, m_Add(m_Value(X), m_ConstantInt(C2)))) diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 0dbe11d..7d0af0d 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -986,19 +986,23 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { bool Op1Ordered; unsigned Op0Pred = getFCmpCode(Op0CC, Op0Ordered); unsigned Op1Pred = getFCmpCode(Op1CC, Op1Ordered); + // uno && ord -> false + if (Op0Pred == 0 && Op1Pred == 0 && Op0Ordered != Op1Ordered) + return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0); if (Op1Pred == 0) { std::swap(LHS, RHS); std::swap(Op0Pred, Op1Pred); std::swap(Op0Ordered, Op1Ordered); } if (Op0Pred == 0) { - // uno && ueq -> uno && (uno || eq) -> ueq + // uno && ueq -> uno && (uno || eq) -> uno // ord && olt -> ord && (ord && lt) -> olt - if (Op0Ordered == Op1Ordered) + if (!Op0Ordered && (Op0Ordered == Op1Ordered)) + return LHS; + if (Op0Ordered && (Op0Ordered == Op1Ordered)) return RHS; // uno && oeq -> uno && (ord && eq) -> false - // uno && ord -> false if (!Op0Ordered) return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0); // ord && ueq -> ord && (uno || eq) -> oeq @@ -1932,10 +1936,15 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { // A | ( A ^ B) -> A | B // A | (~A ^ B) -> A | ~B + // (A & B) | (A ^ B) if (match(Op1, m_Xor(m_Value(A), m_Value(B)))) { if (Op0 == A || Op0 == B) return BinaryOperator::CreateOr(A, B); + if (match(Op0, m_And(m_Specific(A), m_Specific(B))) || + match(Op0, m_And(m_Specific(B), m_Specific(A)))) + return BinaryOperator::CreateOr(A, B); + if (Op1->hasOneUse() && match(A, m_Not(m_Specific(Op0)))) { Value *Not = Builder->CreateNot(B, B->getName()+".not"); return BinaryOperator::CreateOr(Not, Op0); @@ -2212,7 +2221,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { if (Op0I && Op1I && Op0I->isShift() && Op0I->getOpcode() == Op1I->getOpcode() && Op0I->getOperand(1) == Op1I->getOperand(1) && - (Op1I->hasOneUse() || Op1I->hasOneUse())) { + (Op0I->hasOneUse() || Op1I->hasOneUse())) { Value *NewOp = Builder->CreateXor(Op0I->getOperand(0), Op1I->getOperand(0), Op0I->getName()); diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 77e4727..d34fab1 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -172,8 +172,6 @@ Instruction *InstCombiner::SimplifyMemSet(MemSetInst *MI) { Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (isFreeCall(&CI)) return visitFree(CI); - if (isMalloc(&CI)) - return visitMalloc(CI); // If the caller function is nounwind, mark the call as nounwind, even if the // callee isn't. @@ -246,78 +244,10 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { switch (II->getIntrinsicID()) { default: break; case Intrinsic::objectsize: { - // We need target data for just about everything so depend on it. - if (!TD) break; - - Type *ReturnTy = CI.getType(); - uint64_t DontKnow = II->getArgOperand(1) == Builder->getTrue() ? 0 : -1ULL; - - // Get to the real allocated thing and offset as fast as possible. 
- Value *Op1 = II->getArgOperand(0)->stripPointerCasts(); - - uint64_t Offset = 0; - uint64_t Size = -1ULL; - - // Try to look through constant GEPs. - if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op1)) { - if (!GEP->hasAllConstantIndices()) break; - - // Get the current byte offset into the thing. Use the original - // operand in case we're looking through a bitcast. - SmallVector<Value*, 8> Ops(GEP->idx_begin(), GEP->idx_end()); - if (!GEP->getPointerOperandType()->isPointerTy()) - return 0; - Offset = TD->getIndexedOffset(GEP->getPointerOperandType(), Ops); - - Op1 = GEP->getPointerOperand()->stripPointerCasts(); - - // Make sure we're not a constant offset from an external - // global. - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Op1)) - if (!GV->hasDefinitiveInitializer()) break; - } - - // If we've stripped down to a single global variable that we - // can know the size of then just return that. - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Op1)) { - if (GV->hasDefinitiveInitializer()) { - Constant *C = GV->getInitializer(); - Size = TD->getTypeAllocSize(C->getType()); - } else { - // Can't determine size of the GV. - Constant *RetVal = ConstantInt::get(ReturnTy, DontKnow); - return ReplaceInstUsesWith(CI, RetVal); - } - } else if (AllocaInst *AI = dyn_cast<AllocaInst>(Op1)) { - // Get alloca size. - if (AI->getAllocatedType()->isSized()) { - Size = TD->getTypeAllocSize(AI->getAllocatedType()); - if (AI->isArrayAllocation()) { - const ConstantInt *C = dyn_cast<ConstantInt>(AI->getArraySize()); - if (!C) break; - Size *= C->getZExtValue(); - } - } - } else if (CallInst *MI = extractMallocCall(Op1)) { - // Get allocation size. - Type* MallocType = getMallocAllocatedType(MI); - if (MallocType && MallocType->isSized()) - if (Value *NElems = getMallocArraySize(MI, TD, true)) - if (ConstantInt *NElements = dyn_cast<ConstantInt>(NElems)) - Size = NElements->getZExtValue() * TD->getTypeAllocSize(MallocType); - } - - // Do not return "I don't know" here. Later optimization passes could - // make it possible to evaluate objectsize to a constant. - if (Size == -1ULL) - break; - - if (Size < Offset) { - // Out of bound reference? Negative index normalized to large - // index? Just return "I don't know". - return ReplaceInstUsesWith(CI, ConstantInt::get(ReturnTy, DontKnow)); - } - return ReplaceInstUsesWith(CI, ConstantInt::get(ReturnTy, Size-Offset)); + uint64_t Size; + if (getObjectSize(II->getArgOperand(0), Size, TD)) + return ReplaceInstUsesWith(CI, ConstantInt::get(CI.getType(), Size)); + return 0; } case Intrinsic::bswap: // bswap(bswap(x)) -> x @@ -694,6 +624,57 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { break; } + case Intrinsic::arm_neon_vmulls: + case Intrinsic::arm_neon_vmullu: { + Value *Arg0 = II->getArgOperand(0); + Value *Arg1 = II->getArgOperand(1); + + // Handle mul by zero first: + if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) { + return ReplaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType())); + } + + // Check for constant LHS & RHS - in this case we just simplify. 
+ bool Zext = (II->getIntrinsicID() == Intrinsic::arm_neon_vmullu); + VectorType *NewVT = cast<VectorType>(II->getType()); + unsigned NewWidth = NewVT->getElementType()->getIntegerBitWidth(); + if (ConstantDataVector *CV0 = dyn_cast<ConstantDataVector>(Arg0)) { + if (ConstantDataVector *CV1 = dyn_cast<ConstantDataVector>(Arg1)) { + VectorType* VT = cast<VectorType>(CV0->getType()); + SmallVector<Constant*, 4> NewElems; + for (unsigned i = 0; i < VT->getNumElements(); ++i) { + APInt CV0E = + (cast<ConstantInt>(CV0->getAggregateElement(i)))->getValue(); + CV0E = Zext ? CV0E.zext(NewWidth) : CV0E.sext(NewWidth); + APInt CV1E = + (cast<ConstantInt>(CV1->getAggregateElement(i)))->getValue(); + CV1E = Zext ? CV1E.zext(NewWidth) : CV1E.sext(NewWidth); + NewElems.push_back( + ConstantInt::get(NewVT->getElementType(), CV0E * CV1E)); + } + return ReplaceInstUsesWith(CI, ConstantVector::get(NewElems)); + } + + // Couldn't simplify - cannonicalize constant to the RHS. + std::swap(Arg0, Arg1); + } + + // Handle mul by one: + if (ConstantDataVector *CV1 = dyn_cast<ConstantDataVector>(Arg1)) { + if (ConstantInt *Splat = + dyn_cast_or_null<ConstantInt>(CV1->getSplatValue())) { + if (Splat->isOne()) { + if (Zext) + return CastInst::CreateZExtOrBitCast(Arg0, II->getType()); + // else + return CastInst::CreateSExtOrBitCast(Arg0, II->getType()); + } + } + } + + break; + } + case Intrinsic::stackrestore: { // If the save is right next to the restore, remove the restore. This can // happen when variable allocas are DCE'd. @@ -711,7 +692,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { TerminatorInst *TI = II->getParent()->getTerminator(); bool CannotRemove = false; for (++BI; &*BI != TI; ++BI) { - if (isa<AllocaInst>(BI) || isMalloc(BI)) { + if (isa<AllocaInst>(BI)) { CannotRemove = true; break; } @@ -814,7 +795,7 @@ Instruction *InstCombiner::tryOptimizeCall(CallInst *CI, const TargetData *TD) { if (CI->getCalledFunction() == 0) return 0; InstCombineFortifiedLibCalls Simplifier(this); - Simplifier.fold(CI, TD); + Simplifier.fold(CI, TD, TLI); return Simplifier.NewInstruction; } @@ -898,6 +879,9 @@ static IntrinsicInst *FindInitTrampoline(Value *Callee) { // visitCallSite - Improvements for call and invoke instructions. // Instruction *InstCombiner::visitCallSite(CallSite CS) { + if (isAllocLikeFn(CS.getInstruction())) + return visitAllocSite(*CS.getInstruction()); + bool Changed = false; // If the callee is a pointer to a function, attempt to move any casts to the @@ -933,24 +917,24 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { } if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) { - // This instruction is not reachable, just remove it. We insert a store to - // undef so that we know that this code is not reachable, despite the fact - // that we can't modify the CFG here. - new StoreInst(ConstantInt::getTrue(Callee->getContext()), - UndefValue::get(Type::getInt1PtrTy(Callee->getContext())), - CS.getInstruction()); - // If CS does not return void then replaceAllUsesWith undef. // This allows ValueHandlers and custom metadata to adjust itself. if (!CS.getInstruction()->getType()->isVoidTy()) ReplaceInstUsesWith(*CS.getInstruction(), UndefValue::get(CS.getInstruction()->getType())); - if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) { - // Don't break the CFG, insert a dummy cond branch. 
- BranchInst::Create(II->getNormalDest(), II->getUnwindDest(), - ConstantInt::getTrue(Callee->getContext()), II); + if (isa<InvokeInst>(CS.getInstruction())) { + // Can't remove an invoke because we cannot change the CFG. + return 0; } + + // This instruction is not reachable, just remove it. We insert a store to + // undef so that we know that this code is not reachable, despite the fact + // that we can't modify the CFG here. + new StoreInst(ConstantInt::getTrue(Callee->getContext()), + UndefValue::get(Type::getInt1PtrTy(Callee->getContext())), + CS.getInstruction()); + return EraseInstFromFunction(*CS.getInstruction()); } @@ -1194,8 +1178,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { if (NewRetTy->isVoidTy()) Caller->setName(""); // Void type should not have a name. - const AttrListPtr &NewCallerPAL = AttrListPtr::get(attrVec.begin(), - attrVec.end()); + const AttrListPtr &NewCallerPAL = AttrListPtr::get(attrVec); Instruction *NC; if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { @@ -1367,8 +1350,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, NestF->getType() == PointerType::getUnqual(NewFTy) ? NestF : ConstantExpr::getBitCast(NestF, PointerType::getUnqual(NewFTy)); - const AttrListPtr &NewPAL = AttrListPtr::get(NewAttrs.begin(), - NewAttrs.end()); + const AttrListPtr &NewPAL = AttrListPtr::get(NewAttrs); Instruction *NewCaller; if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index 39279f4..555b442 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -34,7 +34,7 @@ static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale, if (BinaryOperator *I = dyn_cast<BinaryOperator>(Val)) { // Cannot look past anything that might overflow. OverflowingBinaryOperator *OBI = dyn_cast<OverflowingBinaryOperator>(Val); - if (OBI && !OBI->hasNoUnsignedWrap()) { + if (OBI && !OBI->hasNoUnsignedWrap() && !OBI->hasNoSignedWrap()) { Scale = 1; Offset = 0; return Val; @@ -648,10 +648,8 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) { if (!I) return false; // If the input is a truncate from the destination type, we can trivially - // eliminate it, even if it has multiple uses. - // FIXME: This is currently disabled until codegen can handle this without - // pessimizing code, PR5997. - if (0 && isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty) + // eliminate it. + if (isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty) return true; // We can't extend or shrink something that has multiple uses: doing so would @@ -992,11 +990,8 @@ static bool CanEvaluateSExtd(Value *V, Type *Ty) { Instruction *I = dyn_cast<Instruction>(V); if (!I) return false; - // If this is a truncate from the dest type, we can trivially eliminate it, - // even if it has multiple uses. - // FIXME: This is currently disabled until codegen can handle this without - // pessimizing code, PR5997. - if (0 && isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty) + // If this is a truncate from the dest type, we can trivially eliminate it. + if (isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty) return true; // We can't extend or shrink something that has multiple uses: doing so would @@ -1341,10 +1336,9 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) { // non-type-safe code. 
if (TD && GEP->hasOneUse() && isa<BitCastInst>(GEP->getOperand(0)) && GEP->hasAllConstantIndices()) { - // We are guaranteed to get a constant from EmitGEPOffset. - ConstantInt *OffsetV = cast<ConstantInt>(EmitGEPOffset(GEP)); - int64_t Offset = OffsetV->getSExtValue(); - + SmallVector<Value*, 8> Ops(GEP->idx_begin(), GEP->idx_end()); + int64_t Offset = TD->getIndexedOffset(GEP->getPointerOperandType(), Ops); + // Get the base pointer input of the bitcast, and the type it points to. Value *OrigBase = cast<BitCastInst>(GEP->getOperand(0))->getOperand(0); Type *GEPIdxTy = diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index ab2987f..bdd310e 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1035,7 +1035,7 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, if ((KnownZero|KnownOne).countLeadingOnes() >= SrcBits-DstBits) { // Pull in the high bits from known-ones set. APInt NewRHS = RHS->getValue().zext(SrcBits); - NewRHS |= KnownOne; + NewRHS |= KnownOne & APInt::getHighBitsSet(SrcBits, SrcBits-DstBits); return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0), ConstantInt::get(ICI.getContext(), NewRHS)); } @@ -2580,10 +2580,25 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { } } + // Transform (zext A) == (B & (1<<X)-1) --> A == (trunc B) + // and (B & (1<<X)-1) == (zext A) --> A == (trunc B) + ConstantInt *Cst1; + if ((Op0->hasOneUse() && + match(Op0, m_ZExt(m_Value(A))) && + match(Op1, m_And(m_Value(B), m_ConstantInt(Cst1)))) || + (Op1->hasOneUse() && + match(Op0, m_And(m_Value(B), m_ConstantInt(Cst1))) && + match(Op1, m_ZExt(m_Value(A))))) { + APInt Pow2 = Cst1->getValue() + 1; + if (Pow2.isPowerOf2() && isa<IntegerType>(A->getType()) && + Pow2.logBase2() == cast<IntegerType>(A->getType())->getBitWidth()) + return new ICmpInst(I.getPredicate(), A, + Builder->CreateTrunc(B, A->getType())); + } + // Transform "icmp eq (trunc (lshr(X, cst1)), cst" to // "icmp (and X, mask), cst" uint64_t ShAmt = 0; - ConstantInt *Cst1; if (Op0->hasOneUse() && match(Op0, m_Trunc(m_OneUse(m_LShr(m_Value(A), m_ConstantInt(ShAmt))))) && @@ -2809,7 +2824,7 @@ Instruction *InstCombiner::FoldFCmp_IntToFP_Cst(FCmpInst &I, case ICmpInst::ICMP_UGE: // (float)int >= -4.4 --> true // (float)int >= 4.4 --> int > 4 - if (!RHS.isNegative()) + if (RHS.isNegative()) return ReplaceInstUsesWith(I, ConstantInt::getTrue(I.getContext())); Pred = ICmpInst::ICMP_UGT; break; diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index b2f2e24..c485844 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -22,72 +22,6 @@ using namespace llvm; STATISTIC(NumDeadStore, "Number of dead stores eliminated"); -// Try to kill dead allocas by walking through its uses until we see some use -// that could escape. This is a conservative analysis which tries to handle -// GEPs, bitcasts, stores, and no-op intrinsics. These tend to be the things -// left after inlining and SROA finish chewing on an alloca. 
-static Instruction *removeDeadAlloca(InstCombiner &IC, AllocaInst &AI) { - SmallVector<Instruction *, 4> Worklist, DeadStores; - Worklist.push_back(&AI); - do { - Instruction *PI = Worklist.pop_back_val(); - for (Value::use_iterator UI = PI->use_begin(), UE = PI->use_end(); - UI != UE; ++UI) { - Instruction *I = cast<Instruction>(*UI); - switch (I->getOpcode()) { - default: - // Give up the moment we see something we can't handle. - return 0; - - case Instruction::GetElementPtr: - case Instruction::BitCast: - Worklist.push_back(I); - continue; - - case Instruction::Call: - // We can handle a limited subset of calls to no-op intrinsics. - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { - switch (II->getIntrinsicID()) { - case Intrinsic::dbg_declare: - case Intrinsic::dbg_value: - case Intrinsic::invariant_start: - case Intrinsic::invariant_end: - case Intrinsic::lifetime_start: - case Intrinsic::lifetime_end: - continue; - default: - return 0; - } - } - // Reject everything else. - return 0; - - case Instruction::Store: { - // Stores into the alloca are only live if the alloca is live. - StoreInst *SI = cast<StoreInst>(I); - // We can eliminate atomic stores, but not volatile. - if (SI->isVolatile()) - return 0; - // The store is only trivially safe if the poniter is the destination - // as opposed to the value. We're conservative here and don't check for - // the case where we store the address of a dead alloca into a dead - // alloca. - if (SI->getPointerOperand() != PI) - return 0; - DeadStores.push_back(I); - continue; - } - } - } - } while (!Worklist.empty()); - - // The alloca is dead. Kill off all the stores to it, and then replace it - // with undef. - while (!DeadStores.empty()) - IC.EraseInstFromFunction(*DeadStores.pop_back_val()); - return IC.ReplaceInstUsesWith(AI, UndefValue::get(AI.getType())); -} - Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { // Ensure that the alloca array size argument has type intptr_t, so that // any casting is exposed early. @@ -106,7 +40,6 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) { Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue()); - assert(isa<AllocaInst>(AI) && "Unknown type of allocation inst!"); AllocaInst *New = Builder->CreateAlloca(NewTy, 0, AI.getName()); New->setAlignment(AI.getAlignment()); @@ -135,22 +68,54 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { } } - if (TD && isa<AllocaInst>(AI) && AI.getAllocatedType()->isSized()) { - // If alloca'ing a zero byte object, replace the alloca with a null pointer. - // Note that we only do this for alloca's, because malloc should allocate - // and return a unique pointer, even for a zero byte allocation. - if (TD->getTypeAllocSize(AI.getAllocatedType()) == 0) - return ReplaceInstUsesWith(AI, Constant::getNullValue(AI.getType())); - + if (TD && AI.getAllocatedType()->isSized()) { // If the alignment is 0 (unspecified), assign it the preferred alignment. if (AI.getAlignment() == 0) AI.setAlignment(TD->getPrefTypeAlignment(AI.getAllocatedType())); + + // Move all alloca's of zero byte objects to the entry block and merge them + // together. Note that we only do this for alloca's, because malloc should + // allocate and return a unique pointer, even for a zero byte allocation. + if (TD->getTypeAllocSize(AI.getAllocatedType()) == 0) { + // For a zero sized alloca there is no point in doing an array allocation. 
+ // This is helpful if the array size is a complicated expression not used + // elsewhere. + if (AI.isArrayAllocation()) { + AI.setOperand(0, ConstantInt::get(AI.getArraySize()->getType(), 1)); + return &AI; + } + + // Get the first instruction in the entry block. + BasicBlock &EntryBlock = AI.getParent()->getParent()->getEntryBlock(); + Instruction *FirstInst = EntryBlock.getFirstNonPHIOrDbg(); + if (FirstInst != &AI) { + // If the entry block doesn't start with a zero-size alloca then move + // this one to the start of the entry block. There is no problem with + // dominance as the array size was forced to a constant earlier already. + AllocaInst *EntryAI = dyn_cast<AllocaInst>(FirstInst); + if (!EntryAI || !EntryAI->getAllocatedType()->isSized() || + TD->getTypeAllocSize(EntryAI->getAllocatedType()) != 0) { + AI.moveBefore(FirstInst); + return &AI; + } + + // Replace this zero-sized alloca with the one at the start of the entry + // block after ensuring that the address will be aligned enough for both + // types. + unsigned MaxAlign = + std::max(TD->getPrefTypeAlignment(EntryAI->getAllocatedType()), + TD->getPrefTypeAlignment(AI.getAllocatedType())); + EntryAI->setAlignment(MaxAlign); + if (AI.getType() != EntryAI->getType()) + return new BitCastInst(EntryAI, AI.getType()); + return ReplaceInstUsesWith(AI, EntryAI); + } + } } - // Try to aggressively remove allocas which are only used for GEPs, lifetime - // markers, and stores. This happens when SROA iteratively promotes stores - // out of the alloca, and we need to cleanup after it. - return removeDeadAlloca(*this, AI); + // At last, use the generic allocation site handler to aggressively remove + // unused allocas. + return visitAllocSite(AI); } diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 5168e2a..35a0bbb 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -464,9 +464,12 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) { // X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2) { const APInt *CI; Value *N; - if (match(Op1, m_Shl(m_Power2(CI), m_Value(N)))) { + if (match(Op1, m_Shl(m_Power2(CI), m_Value(N))) || + match(Op1, m_ZExt(m_Shl(m_Power2(CI), m_Value(N))))) { if (*CI != 1) N = Builder->CreateAdd(N, ConstantInt::get(I.getType(),CI->logBase2())); + if (ZExtInst *Z = dyn_cast<ZExtInst>(Op1)) + N = Builder->CreateZExt(N, Z->getDestTy()); if (I.isExact()) return BinaryOperator::CreateExactLShr(Op0, N); return BinaryOperator::CreateLShr(Op0, N); diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index e727b2c..291e800 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -129,6 +129,12 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI, if (TI->isCast()) { if (TI->getOperand(0)->getType() != FI->getOperand(0)->getType()) return 0; + // The select condition may be a vector. We may only change the operand + // type if the vector width remains the same (and matches the condition). + Type *CondTy = SI.getCondition()->getType(); + if (CondTy->isVectorTy() && CondTy->getVectorNumElements() != + FI->getOperand(0)->getType()->getVectorNumElements()) + return 0; } else { return 0; // unknown unary op. 
} @@ -498,7 +504,7 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, // NOTE: if we wanted to, this is where to detect integer MIN/MAX - if (isa<Constant>(CmpRHS)) { + if (CmpRHS != CmpLHS && isa<Constant>(CmpRHS)) { if (CmpLHS == TrueVal && Pred == ICmpInst::ICMP_EQ) { // Transform (X == C) ? X : Y -> (X == C) ? C : Y SI.setOperand(1, CmpRHS); @@ -875,12 +881,16 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { if (SelectInst *TrueSI = dyn_cast<SelectInst>(TrueVal)) { if (TrueSI->getCondition() == CondVal) { + if (SI.getTrueValue() == TrueSI->getTrueValue()) + return 0; SI.setOperand(1, TrueSI->getTrueValue()); return &SI; } } if (SelectInst *FalseSI = dyn_cast<SelectInst>(FalseVal)) { if (FalseSI->getCondition() == CondVal) { + if (SI.getFalseValue() == FalseSI->getFalseValue()) + return 0; SI.setOperand(2, FalseSI->getFalseValue()); return &SI; } @@ -893,5 +903,16 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { return &SI; } + if (VectorType* VecTy = dyn_cast<VectorType>(SI.getType())) { + unsigned VWidth = VecTy->getNumElements(); + APInt UndefElts(VWidth, 0); + APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth)); + if (Value *V = SimplifyDemandedVectorElts(&SI, AllOnesEltMask, UndefElts)) { + if (V != &SI) + return ReplaceInstUsesWith(SI, V); + return &SI; + } + } + return 0; } diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp index b31049e..4bb2403 100644 --- a/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -151,7 +151,7 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift, // We can always turn lshr(c1)+shl(c2) -> lshr(c3)+and(c4), but it isn't // profitable unless we know the and'd out bits are already zero. - if (CI->getZExtValue() > NumBits) { + if (CI->getValue().ult(TypeWidth) && CI->getZExtValue() > NumBits) { unsigned LowBits = CI->getZExtValue() - NumBits; if (MaskedValueIsZero(I->getOperand(0), APInt::getLowBitsSet(TypeWidth, NumBits) << LowBits)) @@ -529,6 +529,19 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, ShiftOp = 0; if (ShiftOp && isa<ConstantInt>(ShiftOp->getOperand(1))) { + + // This is a constant shift of a constant shift. Be careful about hiding + // shl instructions behind bit masks. They are used to represent multiplies + // by a constant, and it is important that simple arithmetic expressions + // are still recognizable by scalar evolution. + // + // The transforms applied to shl are very similar to the transforms applied + // to mul by constant. We can be more aggressive about optimizing right + // shifts. + // + // Combinations of right and left shifts will still be optimized in + // DAGCombine where scalar evolution no longer applies. + ConstantInt *ShiftAmt1C = cast<ConstantInt>(ShiftOp->getOperand(1)); uint32_t ShiftAmt1 = ShiftAmt1C->getLimitedValue(TypeBits); uint32_t ShiftAmt2 = Op1->getLimitedValue(TypeBits); @@ -554,13 +567,6 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, } if (ShiftAmt1 == ShiftAmt2) { - // If we have ((X >>? C) << C), turn this into X & (-1 << C). - if (I.getOpcode() == Instruction::Shl && - ShiftOp->getOpcode() != Instruction::Shl) { - APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt1)); - return BinaryOperator::CreateAnd(X, - ConstantInt::get(I.getContext(),Mask)); - } // If we have ((X << C) >>u C), turn this into X & (-1 >>u C). 
if (I.getOpcode() == Instruction::LShr && ShiftOp->getOpcode() == Instruction::Shl) { @@ -570,28 +576,23 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, } } else if (ShiftAmt1 < ShiftAmt2) { uint32_t ShiftDiff = ShiftAmt2-ShiftAmt1; - - // (X >>? C1) << C2 --> X << (C2-C1) & (-1 << C2) + + // (X >>?,exact C1) << C2 --> X << (C2-C1) + // The inexact version is deferred to DAGCombine so we don't hide shl + // behind a bit mask. if (I.getOpcode() == Instruction::Shl && - ShiftOp->getOpcode() != Instruction::Shl) { + ShiftOp->getOpcode() != Instruction::Shl && + ShiftOp->isExact()) { assert(ShiftOp->getOpcode() == Instruction::LShr || ShiftOp->getOpcode() == Instruction::AShr); ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff); - if (ShiftOp->isExact()) { - // (X >>?,exact C1) << C2 --> X << (C2-C1) - BinaryOperator *NewShl = BinaryOperator::Create(Instruction::Shl, - X, ShiftDiffCst); - NewShl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap()); - NewShl->setHasNoSignedWrap(I.hasNoSignedWrap()); - return NewShl; - } - Value *Shift = Builder->CreateShl(X, ShiftDiffCst); - - APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt2)); - return BinaryOperator::CreateAnd(Shift, - ConstantInt::get(I.getContext(),Mask)); + BinaryOperator *NewShl = BinaryOperator::Create(Instruction::Shl, + X, ShiftDiffCst); + NewShl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap()); + NewShl->setHasNoSignedWrap(I.hasNoSignedWrap()); + return NewShl; } - + // (X << C1) >>u C2 --> X >>u (C2-C1) & (-1 >> C2) if (I.getOpcode() == Instruction::LShr && ShiftOp->getOpcode() == Instruction::Shl) { @@ -627,24 +628,19 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1, assert(ShiftAmt2 < ShiftAmt1); uint32_t ShiftDiff = ShiftAmt1-ShiftAmt2; - // (X >>? C1) << C2 --> X >>? (C1-C2) & (-1 << C2) + // (X >>?exact C1) << C2 --> X >>?exact (C1-C2) + // The inexact version is deferred to DAGCombine so we don't hide shl + // behind a bit mask. 
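The new comments above explain why the (X >>? C1) << C2 folds are now limited to exact right shifts: an inexact right shift discards low bits, so the fold would need an extra mask that hides the shl, and that case is left to DAGCombine. A plain-C++ illustration of both the exact identity and what goes wrong without exactness (a sketch, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t c1 = 3, c2 = 7;                  // c1 < c2
  uint32_t exact = 0xABCD0000u;             // low c1 bits are zero: ">> c1" is exact
  assert(((exact >> c1) << c2) == (exact << (c2 - c1)));

  uint32_t inexact = exact | 0x5u;          // low bits set: ">> c1" throws them away
  assert(((inexact >> c1) << c2) != (inexact << (c2 - c1)));
  return 0;
}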
if (I.getOpcode() == Instruction::Shl && - ShiftOp->getOpcode() != Instruction::Shl) { + ShiftOp->getOpcode() != Instruction::Shl && + ShiftOp->isExact()) { ConstantInt *ShiftDiffCst = ConstantInt::get(Ty, ShiftDiff); - if (ShiftOp->isExact()) { - // (X >>?exact C1) << C2 --> X >>?exact (C1-C2) - BinaryOperator *NewShr = BinaryOperator::Create(ShiftOp->getOpcode(), - X, ShiftDiffCst); - NewShr->setIsExact(true); - return NewShr; - } - Value *Shift = Builder->CreateBinOp(ShiftOp->getOpcode(), - X, ShiftDiffCst); - APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt2)); - return BinaryOperator::CreateAnd(Shift, - ConstantInt::get(I.getContext(),Mask)); + BinaryOperator *NewShr = BinaryOperator::Create(ShiftOp->getOpcode(), + X, ShiftDiffCst); + NewShr->setIsExact(true); + return NewShr; } - + // (X << C1) >>u C2 --> X << (C1-C2) & (-1 >> C2) if (I.getOpcode() == Instruction::LShr && ShiftOp->getOpcode() == Instruction::Shl) { diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 125c74a..54be8ed 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -989,6 +989,29 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, } break; } + case Instruction::Select: { + APInt LeftDemanded(DemandedElts), RightDemanded(DemandedElts); + if (ConstantVector* CV = dyn_cast<ConstantVector>(I->getOperand(0))) { + for (unsigned i = 0; i < VWidth; i++) { + if (CV->getAggregateElement(i)->isNullValue()) + LeftDemanded.clearBit(i); + else + RightDemanded.clearBit(i); + } + } + + TmpV = SimplifyDemandedVectorElts(I->getOperand(1), LeftDemanded, + UndefElts, Depth+1); + if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; } + + TmpV = SimplifyDemandedVectorElts(I->getOperand(2), RightDemanded, + UndefElts2, Depth+1); + if (TmpV) { I->setOperand(2, TmpV); MadeChange = true; } + + // Output elements are undefined if both are undefined. + UndefElts &= UndefElts2; + break; + } case Instruction::BitCast: { // Vector->vector casts only. VectorType *VTy = dyn_cast<VectorType>(I->getOperand(0)->getType()); @@ -1074,6 +1097,12 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, // like undef&0. The result is known zero, not undef. UndefElts &= UndefElts2; break; + case Instruction::FPTrunc: + case Instruction::FPExt: + TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts, + UndefElts, Depth+1); + if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; } + break; case Instruction::Call: { IntrinsicInst *II = dyn_cast<IntrinsicInst>(I); diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 066b2ec..68ecd51 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -87,30 +87,34 @@ void InstCombiner::getAnalysisUsage(AnalysisUsage &AU) const { } +Value *InstCombiner::EmitGEPOffset(User *GEP) { + return llvm::EmitGEPOffset(Builder, *getTargetData(), GEP); +} + /// ShouldChangeType - Return true if it is desirable to convert a computation /// from 'From' to 'To'. We don't want to convert from a legal to an illegal /// type for example, or from a smaller to a larger illegal type. 
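Earlier in this patch, SimplifyDemandedVectorElts gains a Select case that, for a constant condition vector, only demands lanes from the arm the condition can actually pick. A scalarized plain-C++ sketch of that per-lane argument (illustrative only; the arrays are invented for the example):

#include <cassert>

int main() {
  // Per-lane view of "select <4 x i1> C, A, B" with a constant condition:
  // when C[i] is known, only one of A[i]/B[i] can reach the result, so the
  // other operand's lane is not demanded and may be simplified away.
  bool cond[4] = {true, false, true, false};
  int a[4] = {1, -1, 3, -1};     // lanes 1 and 3 are never read
  int b[4] = {-1, 2, -1, 4};     // lanes 0 and 2 are never read
  int out[4];
  for (int i = 0; i < 4; ++i)
    out[i] = cond[i] ? a[i] : b[i];
  assert(out[0] == 1 && out[1] == 2 && out[2] == 3 && out[3] == 4);
  return 0;
}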
bool InstCombiner::ShouldChangeType(Type *From, Type *To) const { assert(From->isIntegerTy() && To->isIntegerTy()); - + // If we don't have TD, we don't know if the source/dest are legal. if (!TD) return false; - + unsigned FromWidth = From->getPrimitiveSizeInBits(); unsigned ToWidth = To->getPrimitiveSizeInBits(); bool FromLegal = TD->isLegalInteger(FromWidth); bool ToLegal = TD->isLegalInteger(ToWidth); - + // If this is a legal integer from type, and the result would be an illegal // type, don't do the transformation. if (FromLegal && !ToLegal) return false; - + // Otherwise, if both are illegal, do not increase the size of the result. We // do allow things like i160 -> i64, but not i64 -> i160. if (!FromLegal && !ToLegal && ToWidth > FromWidth) return false; - + return true; } @@ -127,7 +131,7 @@ static bool MaintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) { // We reason about Add and Sub Only. Instruction::BinaryOps Opcode = I.getOpcode(); - if (Opcode != Instruction::Add && + if (Opcode != Instruction::Add && Opcode != Instruction::Sub) { return false; } @@ -203,7 +207,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { // Conservatively clear the optional flags, since they may not be // preserved by the reassociation. if (MaintainNoSignedWrap(I, B, C) && - (!Op0 || (isa<BinaryOperator>(Op0) && Op0->hasNoSignedWrap()))) { + (!Op0 || (isa<BinaryOperator>(Op0) && Op0->hasNoSignedWrap()))) { // Note: this is only valid because SimplifyBinOp doesn't look at // the operands to Op0. I.clearSubclassOptionalData(); @@ -211,7 +215,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) { } else { I.clearSubclassOptionalData(); } - + Changed = true; ++NumReassoc; continue; @@ -540,7 +544,7 @@ static Value *FoldOperationIntoSelectOperand(Instruction &I, Value *SO, Value *Op0 = SO, *Op1 = ConstOperand; if (!ConstIsRHS) std::swap(Op0, Op1); - + if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&I)) return IC->Builder->CreateBinOp(BO->getOpcode(), Op0, Op1, SO->getName()+".op"); @@ -579,7 +583,7 @@ Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) { if (SrcTy && SrcTy->getNumElements() != DestTy->getNumElements()) return 0; } - + Value *SelectTrueVal = FoldOperationIntoSelectOperand(Op, TV, this); Value *SelectFalseVal = FoldOperationIntoSelectOperand(Op, FV, this); @@ -599,7 +603,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { unsigned NumPHIValues = PN->getNumIncomingValues(); if (NumPHIValues == 0) return 0; - + // We normally only transform phis with a single use. However, if a PHI has // multiple uses and they are all the same operation, we can fold *all* of the // uses into the PHI. @@ -613,7 +617,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { } // Otherwise, we can replace *all* users with the new PHI we form. } - + // Check to see if all of the operands of the PHI are simple constants // (constantint/constantfp/undef). If there is one non-constant value, // remember the BB it is in. If there is more than one or if *it* is a PHI, @@ -627,7 +631,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { if (isa<PHINode>(InVal)) return 0; // Itself a phi. if (NonConstBB) return 0; // More than one non-const value. 
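The FoldOpIntoSelect/FoldOpIntoPhi code above (mostly a trailing-whitespace cleanup in this patch) pushes an operation into the arms of a select or phi whose incoming values are constants, so each arm folds to a constant. A scalar C++ analogy of the effect (a sketch, not the LLVM code; the function names are invented for the example):

#include <cassert>

// before(): the operation is applied after the select.
static int before(bool c) { int x = c ? 1 : 2; return x + 4; }
// after(): the operation has been folded into each arm.
static int after(bool c)  { return c ? 5 : 6; }

int main() {
  assert(before(true) == after(true));
  assert(before(false) == after(false));
  return 0;
}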
- + NonConstBB = PN->getIncomingBlock(i); // If the InVal is an invoke at the end of the pred block, then we can't @@ -635,14 +639,14 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { if (InvokeInst *II = dyn_cast<InvokeInst>(InVal)) if (II->getParent() == NonConstBB) return 0; - + // If the incoming non-constant value is in I's block, we will remove one // instruction, but insert another equivalent one, leading to infinite // instcombine. if (NonConstBB == I.getParent()) return 0; } - + // If there is exactly one non-constant value, we can insert a copy of the // operation in that block. However, if this is a critical edge, we would be // inserting the computation one some other paths (e.g. inside a loop). Only @@ -656,12 +660,12 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { PHINode *NewPN = PHINode::Create(I.getType(), PN->getNumIncomingValues()); InsertNewInstBefore(NewPN, *PN); NewPN->takeName(PN); - + // If we are going to have to insert a new computation, do so right before the // predecessors terminator. if (NonConstBB) Builder->SetInsertPoint(NonConstBB->getTerminator()); - + // Next, add all of the operands to the PHI. if (SelectInst *SI = dyn_cast<SelectInst>(&I)) { // We only currently try to fold the condition of a select when it is a phi, @@ -706,20 +710,20 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { PN->getIncomingValue(i), C, "phitmp"); NewPN->addIncoming(InV, PN->getIncomingBlock(i)); } - } else { + } else { CastInst *CI = cast<CastInst>(&I); Type *RetTy = CI->getType(); for (unsigned i = 0; i != NumPHIValues; ++i) { Value *InV; if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i))) InV = ConstantExpr::getCast(CI->getOpcode(), InC, RetTy); - else + else InV = Builder->CreateCast(CI->getOpcode(), PN->getIncomingValue(i), I.getType(), "phitmp"); NewPN->addIncoming(InV, PN->getIncomingBlock(i)); } } - + for (Value::use_iterator UI = PN->use_begin(), E = PN->use_end(); UI != E; ) { Instruction *User = cast<Instruction>(*UI++); @@ -734,11 +738,11 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { /// or not there is a sequence of GEP indices into the type that will land us at /// the specified offset. If so, fill them into NewIndices and return the /// resultant element type, otherwise return null. -Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset, +Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset, SmallVectorImpl<Value*> &NewIndices) { if (!TD) return 0; if (!Ty->isSized()) return 0; - + // Start with the index over the outer type. Note that the type size // might be zero (even if the offset isn't zero) if the indexed type // is something like [0 x {int, int}] @@ -747,7 +751,7 @@ Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset, if (int64_t TySize = TD->getTypeAllocSize(Ty)) { FirstIdx = Offset/TySize; Offset -= FirstIdx*TySize; - + // Handle hosts where % returns negative instead of values [0..TySize). if (Offset < 0) { --FirstIdx; @@ -756,24 +760,24 @@ Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset, } assert((uint64_t)Offset < (uint64_t)TySize && "Out of range offset"); } - + NewIndices.push_back(ConstantInt::get(IntPtrTy, FirstIdx)); - + // Index into the types. If we fail, set OrigBase to null. while (Offset) { // Indexing into tail padding between struct/array elements. 
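FindElementAtOffset above divides the byte offset by the type size and then fixes up hosts where integer division truncates toward zero, so the remainder always lands in [0, TySize). The same fixup, as standalone C++ with a concrete negative offset (a sketch, not part of the patch):

#include <cassert>
#include <cstdint>

// Floor-divide Offset by TySize so the remainder is always in [0, TySize),
// mirroring the FirstIdx/Offset adjustment in FindElementAtOffset.
static void split(int64_t Offset, int64_t TySize, int64_t &Idx, int64_t &Rem) {
  Idx = Offset / TySize;          // truncates toward zero in C++
  Rem = Offset - Idx * TySize;
  if (Rem < 0) {                  // e.g. -5 / 4 gives -1 rem -1; we want -2 rem 3
    --Idx;
    Rem += TySize;
  }
}

int main() {
  int64_t Idx, Rem;
  split(-5, 4, Idx, Rem);
  assert(Idx == -2 && Rem == 3);
  split(9, 4, Idx, Rem);
  assert(Idx == 2 && Rem == 1);
  return 0;
}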
if (uint64_t(Offset*8) >= TD->getTypeSizeInBits(Ty)) return 0; - + if (StructType *STy = dyn_cast<StructType>(Ty)) { const StructLayout *SL = TD->getStructLayout(STy); assert(Offset < (int64_t)SL->getSizeInBytes() && "Offset must stay within the indexed type"); - + unsigned Elt = SL->getElementContainingOffset(Offset); NewIndices.push_back(ConstantInt::get(Type::getInt32Ty(Ty->getContext()), Elt)); - + Offset -= SL->getElementOffset(Elt); Ty = STy->getElementType(Elt); } else if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) { @@ -787,7 +791,7 @@ Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset, return 0; } } - + return Ty; } @@ -948,7 +952,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { Res->setIsInBounds(GEP.isInBounds()); return Res; } - + if (ArrayType *XATy = dyn_cast<ArrayType>(StrippedPtrTy->getElementType())){ // GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ... ? @@ -981,16 +985,16 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // V and GEP are both pointer types --> BitCast return new BitCastInst(NewGEP, GEP.getType()); } - + // Transform things like: // getelementptr i8* bitcast ([100 x double]* X to i8*), i32 %tmp // (where tmp = 8*tmp2) into: // getelementptr [100 x double]* %arr, i32 0, i32 %tmp2; bitcast - + if (TD && SrcElTy->isArrayTy() && ResElTy->isIntegerTy(8)) { uint64_t ArrayEltSize = TD->getTypeAllocSize(cast<ArrayType>(SrcElTy)->getElementType()); - + // Check to see if "tmp" is a scale by a multiple of ArrayEltSize. We // allow either a mul, shift, or constant here. Value *NewIdx = 0; @@ -1015,7 +1019,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { NewIdx = Inst->getOperand(0); } } - + // If the index will be to exactly the right offset with the scale taken // out, perform the transformation. Note, we don't know whether Scale is // signed or not. We'll use unsigned version of division/modulo @@ -1054,10 +1058,9 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { !isa<BitCastInst>(BCI->getOperand(0)) && GEP.hasAllConstantIndices() && StrippedPtrTy->getAddressSpace() == GEP.getPointerAddressSpace()) { - // Determine how much the GEP moves the pointer. We are guaranteed to get - // a constant back from EmitGEPOffset. - ConstantInt *OffsetV = cast<ConstantInt>(EmitGEPOffset(&GEP)); - int64_t Offset = OffsetV->getSExtValue(); + // Determine how much the GEP moves the pointer. + SmallVector<Value*, 8> Ops(GEP.idx_begin(), GEP.idx_end()); + int64_t Offset = TD->getIndexedOffset(GEP.getPointerOperandType(), Ops); // If this GEP instruction doesn't move the pointer, just replace the GEP // with a bitcast of the real input to the dest type. @@ -1065,7 +1068,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // If the bitcast is of an allocation, and the allocation will be // converted to match the type of the cast, don't touch this. if (isa<AllocaInst>(BCI->getOperand(0)) || - isMalloc(BCI->getOperand(0))) { + isAllocationFn(BCI->getOperand(0))) { // See if the bitcast simplifies, if so, don't nuke this GEP yet. if (Instruction *I = visitBitCast(*BCI)) { if (I != BCI) { @@ -1078,7 +1081,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { } return new BitCastInst(BCI->getOperand(0), GEP.getType()); } - + // Otherwise, if the offset is non-zero, we need to find out if there is a // field at Offset in 'A's type. If so, we can pull the cast through the // GEP. 
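The visitGetElementPtrInst change above computes the byte offset of an all-constant GEP directly with TargetData::getIndexedOffset instead of materializing it through EmitGEPOffset. The quantity in question is ordinary layout arithmetic, which can be shown on the host side (a sketch, not part of the patch; the struct is invented for the example):

#include <cassert>
#include <cstddef>
#include <cstdint>

struct S { char a; int b; double c; };

int main() {
  S arr[4];
  // A GEP with constant indices [2, field b] moves the base pointer by
  // 2 * sizeof(S) + offsetof(S, b) bytes, which is what getIndexedOffset returns.
  uintptr_t base = reinterpret_cast<uintptr_t>(arr);
  uintptr_t elem = reinterpret_cast<uintptr_t>(&arr[2].b);
  assert(elem - base == 2 * sizeof(S) + offsetof(S, b));
  return 0;
}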
@@ -1089,68 +1092,103 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { Value *NGEP = GEP.isInBounds() ? Builder->CreateInBoundsGEP(BCI->getOperand(0), NewIndices) : Builder->CreateGEP(BCI->getOperand(0), NewIndices); - + if (NGEP->getType() == GEP.getType()) return ReplaceInstUsesWith(GEP, NGEP); NGEP->takeName(&GEP); return new BitCastInst(NGEP, GEP.getType()); } } - } - + } + return 0; } -static bool IsOnlyNullComparedAndFreed(Value *V, SmallVectorImpl<WeakVH> &Users, - int Depth = 0) { - if (Depth == 8) - return false; +static bool +isAllocSiteRemovable(Instruction *AI, SmallVectorImpl<WeakVH> &Users) { + SmallVector<Instruction*, 4> Worklist; + Worklist.push_back(AI); - for (Value::use_iterator UI = V->use_begin(), UE = V->use_end(); - UI != UE; ++UI) { - User *U = *UI; - if (isFreeCall(U)) { - Users.push_back(U); - continue; - } - if (ICmpInst *ICI = dyn_cast<ICmpInst>(U)) { - if (ICI->isEquality() && isa<ConstantPointerNull>(ICI->getOperand(1))) { - Users.push_back(ICI); + do { + Instruction *PI = Worklist.pop_back_val(); + for (Value::use_iterator UI = PI->use_begin(), UE = PI->use_end(); UI != UE; + ++UI) { + Instruction *I = cast<Instruction>(*UI); + switch (I->getOpcode()) { + default: + // Give up the moment we see something we can't handle. + return false; + + case Instruction::BitCast: + case Instruction::GetElementPtr: + Users.push_back(I); + Worklist.push_back(I); continue; - } - } - if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) { - if (IsOnlyNullComparedAndFreed(BCI, Users, Depth+1)) { - Users.push_back(BCI); + + case Instruction::ICmp: { + ICmpInst *ICI = cast<ICmpInst>(I); + // We can fold eq/ne comparisons with null to false/true, respectively. + if (!ICI->isEquality() || !isa<ConstantPointerNull>(ICI->getOperand(1))) + return false; + Users.push_back(I); continue; } - } - if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) { - if (IsOnlyNullComparedAndFreed(GEPI, Users, Depth+1)) { - Users.push_back(GEPI); + + case Instruction::Call: + // Ignore no-op and store intrinsics. + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + switch (II->getIntrinsicID()) { + default: + return false; + + case Intrinsic::memmove: + case Intrinsic::memcpy: + case Intrinsic::memset: { + MemIntrinsic *MI = cast<MemIntrinsic>(II); + if (MI->isVolatile() || MI->getRawDest() != PI) + return false; + } + // fall through + case Intrinsic::dbg_declare: + case Intrinsic::dbg_value: + case Intrinsic::invariant_start: + case Intrinsic::invariant_end: + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + case Intrinsic::objectsize: + Users.push_back(I); + continue; + } + } + + if (isFreeCall(I)) { + Users.push_back(I); + continue; + } + return false; + + case Instruction::Store: { + StoreInst *SI = cast<StoreInst>(I); + if (SI->isVolatile() || SI->getPointerOperand() != PI) + return false; + Users.push_back(I); continue; } - } - if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) { - if (II->getIntrinsicID() == Intrinsic::lifetime_start || - II->getIntrinsicID() == Intrinsic::lifetime_end) { - Users.push_back(II); - continue; } + llvm_unreachable("missing a return?"); } - return false; - } + } while (!Worklist.empty()); return true; } -Instruction *InstCombiner::visitMalloc(Instruction &MI) { +Instruction *InstCombiner::visitAllocSite(Instruction &MI) { // If we have a malloc call which is only used in any amount of comparisons // to null and free calls, delete the calls and replace the comparisons with // true or false as appropriate. 
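isAllocSiteRemovable above replaces the old depth-limited recursion with an explicit worklist that walks all transitive users of the allocation and gives up on the first user outside a small whitelist (bitcasts and GEPs, equality compares against null, free calls, debug/lifetime/objectsize intrinsics, non-volatile mem intrinsics and stores into the allocation). The traversal shape, reduced to a standalone sketch (illustrative only; the enum and helper are not from the patch):

#include <vector>

enum UserKind { Harmless, DerivedPointer, Unknown };

static bool allUsersRemovable(int root,
                              const std::vector<std::vector<int> > &users,
                              const std::vector<UserKind> &kind) {
  std::vector<int> worklist(1, root);
  while (!worklist.empty()) {
    int n = worklist.back();
    worklist.pop_back();
    for (size_t i = 0, e = users[n].size(); i != e; ++i) {
      int u = users[n][i];
      if (kind[u] == Unknown)
        return false;              // unhandled user: the allocation must stay
      if (kind[u] == DerivedPointer)
        worklist.push_back(u);     // bitcast/GEP: its users must be checked too
      // Harmless users (null compares, frees, lifetime markers, ...) are
      // recorded for deletion in the real code; nothing to do in the sketch.
    }
  }
  return true;
}

int main() {
  std::vector<std::vector<int> > users(3);
  std::vector<UserKind> kind(3, Harmless);
  users[0].push_back(1);           // allocation -> bitcast
  users[1].push_back(2);           // bitcast -> icmp against null
  kind[1] = DerivedPointer;
  return allUsersRemovable(0, users, kind) ? 0 : 1;
}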
SmallVector<WeakVH, 64> Users; - if (IsOnlyNullComparedAndFreed(&MI, Users)) { + if (isAllocSiteRemovable(&MI, Users)) { for (unsigned i = 0, e = Users.size(); i != e; ++i) { Instruction *I = cast_or_null<Instruction>(&*Users[i]); if (!I) continue; @@ -1161,9 +1199,23 @@ Instruction *InstCombiner::visitMalloc(Instruction &MI) { C->isFalseWhenEqual())); } else if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) { ReplaceInstUsesWith(*I, UndefValue::get(I->getType())); + } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { + if (II->getIntrinsicID() == Intrinsic::objectsize) { + ConstantInt *CI = cast<ConstantInt>(II->getArgOperand(1)); + uint64_t DontKnow = CI->isZero() ? -1ULL : 0; + ReplaceInstUsesWith(*I, ConstantInt::get(I->getType(), DontKnow)); + } } EraseInstFromFunction(*I); } + + if (InvokeInst *II = dyn_cast<InvokeInst>(&MI)) { + // Replace invoke with a NOP intrinsic to maintain the original CFG + Module *M = II->getParent()->getParent()->getParent(); + Function *F = Intrinsic::getDeclaration(M, Intrinsic::donothing); + InvokeInst::Create(F, II->getNormalDest(), II->getUnwindDest(), + ArrayRef<Value *>(), "", II->getParent()); + } return EraseInstFromFunction(MI); } return 0; @@ -1181,7 +1233,7 @@ Instruction *InstCombiner::visitFree(CallInst &FI) { UndefValue::get(Type::getInt1PtrTy(FI.getContext()))); return EraseInstFromFunction(FI); } - + // If we have 'free null' delete the instruction. This can happen in stl code // when lots of inlining happens. if (isa<ConstantPointerNull>(Op)) @@ -1207,14 +1259,14 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { // Cannonicalize fcmp_one -> fcmp_oeq FCmpInst::Predicate FPred; Value *Y; - if (match(&BI, m_Br(m_FCmp(FPred, m_Value(X), m_Value(Y)), + if (match(&BI, m_Br(m_FCmp(FPred, m_Value(X), m_Value(Y)), TrueDest, FalseDest)) && BI.getCondition()->hasOneUse()) if (FPred == FCmpInst::FCMP_ONE || FPred == FCmpInst::FCMP_OLE || FPred == FCmpInst::FCMP_OGE) { FCmpInst *Cond = cast<FCmpInst>(BI.getCondition()); Cond->setPredicate(FCmpInst::getInversePredicate(FPred)); - + // Swap Destinations and condition. BI.swapSuccessors(); Worklist.Add(Cond); @@ -1280,7 +1332,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { } return 0; // Can't handle other constants } - + if (InsertValueInst *IV = dyn_cast<InsertValueInst>(Agg)) { // We're extracting from an insertvalue instruction, compare the indices const unsigned *exti, *exte, *insi, *inse; @@ -1329,7 +1381,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { // %E = extractvalue { i32, { i32 } } %I, 1, 0 // with // %E extractvalue { i32 } { i32 42 }, 0 - return ExtractValueInst::Create(IV->getInsertedValueOperand(), + return ExtractValueInst::Create(IV->getInsertedValueOperand(), makeArrayRef(exti, exte)); } if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Agg)) { @@ -1349,7 +1401,7 @@ Instruction *InstCombiner::visitExtractValueInst(ExtractValueInst &EV) { EraseInstFromFunction(*II); return BinaryOperator::CreateAdd(LHS, RHS); } - + // If the normal result of the add is dead, and the RHS is a constant, // we can transform this into a range comparison. // overflow = uadd a, -4 --> overflow = icmp ugt a, 3 @@ -1798,7 +1850,7 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { /// many instructions are dead or constant). Additionally, if we find a branch /// whose condition is a known constant, we only visit the reachable successors. 
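The comment kept in visitExtractValueInst above notes "overflow = uadd a, -4 --> overflow = icmp ugt a, 3". The equivalence is easy to confirm with plain unsigned arithmetic (a sketch, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  // Adding 0xFFFFFFFC (i.e. -4 as u32) wraps exactly when a > 3.
  for (uint32_t a = 0; a < 8; ++a) {
    uint32_t sum = a + UINT32_C(0xFFFFFFFC);
    bool overflowed = sum < a;               // standard unsigned-wrap test
    assert(overflowed == (a > 3));
  }
  return 0;
}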
/// -static bool AddReachableCodeToWorklist(BasicBlock *BB, +static bool AddReachableCodeToWorklist(BasicBlock *BB, SmallPtrSet<BasicBlock*, 64> &Visited, InstCombiner &IC, const TargetData *TD, @@ -1812,13 +1864,13 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, do { BB = Worklist.pop_back_val(); - + // We have now visited this block! If we've already been here, ignore it. if (!Visited.insert(BB)) continue; for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) { Instruction *Inst = BBI++; - + // DCE instruction if trivially dead. if (isInstructionTriviallyDead(Inst)) { ++NumDeadInst; @@ -1826,7 +1878,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, Inst->eraseFromParent(); continue; } - + // ConstantProp instruction if trivially constant. if (!Inst->use_empty() && isa<Constant>(Inst->getOperand(0))) if (Constant *C = ConstantFoldInstruction(Inst, TD, TLI)) { @@ -1837,7 +1889,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, Inst->eraseFromParent(); continue; } - + if (TD) { // See if we can constant fold its operands. for (User::op_iterator i = Inst->op_begin(), e = Inst->op_end(); @@ -1881,17 +1933,17 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, Worklist.push_back(ReachableBB); continue; } - + // Otherwise it is the default destination. Worklist.push_back(SI->getDefaultDest()); continue; } } - + for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) Worklist.push_back(TI->getSuccessor(i)); } while (!Worklist.empty()); - + // Once we've found all of the instructions to add to instcombine's worklist, // add them in reverse order. This way instcombine will visit from the top // of the function down. This jives well with the way that it adds all uses @@ -1899,13 +1951,13 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, // some N^2 behavior in pathological cases. IC.Worklist.AddInitialGroup(&InstrsForInstCombineWorklist[0], InstrsForInstCombineWorklist.size()); - + return MadeIRChange; } bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { MadeIRChange = false; - + DEBUG(errs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on " << F.getName() << "\n"); @@ -1976,13 +2028,13 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { BasicBlock *BB = I->getParent(); Instruction *UserInst = cast<Instruction>(I->use_back()); BasicBlock *UserParent; - + // Get the block the use occurs in. if (PHINode *PN = dyn_cast<PHINode>(UserInst)) UserParent = PN->getIncomingBlock(I->use_begin().getUse()); else UserParent = UserInst->getParent(); - + if (UserParent != BB) { bool UserIsSuccessor = false; // See if the user is one of our successors. @@ -2004,7 +2056,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { // Now that we have an instruction, try combining it to simplify it. Builder->SetInsertPoint(I->getParent(), I); Builder->SetCurrentDebugLocation(I->getDebugLoc()); - + #ifndef NDEBUG std::string OrigI; #endif @@ -2069,14 +2121,14 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { bool InstCombiner::runOnFunction(Function &F) { TD = getAnalysisIfAvailable<TargetData>(); TLI = &getAnalysis<TargetLibraryInfo>(); - + /// Builder - This is an IRBuilder that automatically inserts new /// instructions into the worklist when they are created. 
- IRBuilder<true, TargetFolder, InstCombineIRInserter> + IRBuilder<true, TargetFolder, InstCombineIRInserter> TheBuilder(F.getContext(), TargetFolder(TD), InstCombineIRInserter(Worklist)); Builder = &TheBuilder; - + bool EverMadeChange = false; // Lower dbg.declare intrinsics otherwise their value may be clobbered @@ -2087,7 +2139,7 @@ bool InstCombiner::runOnFunction(Function &F) { unsigned Iteration = 0; while (DoOneIteration(F, Iteration++)) EverMadeChange = true; - + Builder = 0; return EverMadeChange; } diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index b43b9e5..bf35eac 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -16,20 +16,23 @@ #define DEBUG_TYPE "asan" #include "FunctionBlackList.h" +#include "llvm/Function.h" +#include "llvm/IRBuilder.h" +#include "llvm/InlineAsm.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/LLVMContext.h" +#include "llvm/Module.h" +#include "llvm/Type.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" -#include "llvm/Function.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/LLVMContext.h" -#include "llvm/Module.h" +#include "llvm/ADT/Triple.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/system_error.h" #include "llvm/Target/TargetData.h" @@ -37,7 +40,6 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ModuleUtils.h" -#include "llvm/Type.h" #include <string> #include <algorithm> @@ -47,6 +49,7 @@ using namespace llvm; static const uint64_t kDefaultShadowScale = 3; static const uint64_t kDefaultShadowOffset32 = 1ULL << 29; static const uint64_t kDefaultShadowOffset64 = 1ULL << 44; +static const uint64_t kDefaultShadowOffsetAndroid = 0; static const size_t kMaxStackMallocSize = 1 << 16; // 64K static const uintptr_t kCurrentStackFrameMagic = 0x41B58AB3; @@ -70,6 +73,9 @@ static const int kAsanStackMidRedzoneMagic = 0xf2; static const int kAsanStackRightRedzoneMagic = 0xf3; static const int kAsanStackPartialRedzoneMagic = 0xf4; +// Accesses sizes are powers of two: 1, 2, 4, 8, 16. +static const size_t kNumberOfAccessSizes = 5; + // Command-line flags. // This flag may need to be replaced with -f[no-]asan-reads. @@ -77,6 +83,17 @@ static cl::opt<bool> ClInstrumentReads("asan-instrument-reads", cl::desc("instrument read instructions"), cl::Hidden, cl::init(true)); static cl::opt<bool> ClInstrumentWrites("asan-instrument-writes", cl::desc("instrument write instructions"), cl::Hidden, cl::init(true)); +static cl::opt<bool> ClInstrumentAtomics("asan-instrument-atomics", + cl::desc("instrument atomic instructions (rmw, cmpxchg)"), + cl::Hidden, cl::init(true)); +// This flag limits the number of instructions to be instrumented +// in any given BB. Normally, this should be set to unlimited (INT_MAX), +// but due to http://llvm.org/bugs/show_bug.cgi?id=12652 we temporary +// set it to 10000. +static cl::opt<int> ClMaxInsnsToInstrumentPerBB("asan-max-ins-per-bb", + cl::init(10000), + cl::desc("maximal number of instructions to instrument in any given BB"), + cl::Hidden); // This flag may need to be replaced with -f[no]asan-stack. 
static cl::opt<bool> ClStack("asan-stack", cl::desc("Handle stack memory"), cl::Hidden, cl::init(true)); @@ -125,18 +142,29 @@ static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug man inst"), namespace { +/// An object of this type is created while instrumenting every function. +struct AsanFunctionContext { + AsanFunctionContext(Function &Function) : F(Function) { } + + Function &F; +}; + /// AddressSanitizer: instrument the code in module to find memory bugs. struct AddressSanitizer : public ModulePass { AddressSanitizer(); virtual const char *getPassName() const; - void instrumentMop(Instruction *I); - void instrumentAddress(Instruction *OrigIns, IRBuilder<> &IRB, + void instrumentMop(AsanFunctionContext &AFC, Instruction *I); + void instrumentAddress(AsanFunctionContext &AFC, + Instruction *OrigIns, IRBuilder<> &IRB, Value *Addr, uint32_t TypeSize, bool IsWrite); - Instruction *generateCrashCode(IRBuilder<> &IRB, Value *Addr, - bool IsWrite, uint32_t TypeSize); - bool instrumentMemIntrinsic(MemIntrinsic *MI); - void instrumentMemIntrinsicParam(Instruction *OrigIns, Value *Addr, - Value *Size, + Value *createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong, + Value *ShadowValue, uint32_t TypeSize); + Instruction *generateCrashCode(BasicBlock *BB, Value *Addr, Value *PC, + bool IsWrite, size_t AccessSizeIndex); + bool instrumentMemIntrinsic(AsanFunctionContext &AFC, MemIntrinsic *MI); + void instrumentMemIntrinsicParam(AsanFunctionContext &AFC, + Instruction *OrigIns, Value *Addr, + Value *Size, Instruction *InsertBefore, bool IsWrite); Value *memToShadow(Value *Shadow, IRBuilder<> &IRB); bool handleFunction(Module &M, Function &F); @@ -144,7 +172,6 @@ struct AddressSanitizer : public ModulePass { bool poisonStackInFunction(Module &M, Function &F); virtual bool runOnModule(Module &M); bool insertGlobalRedzones(Module &M); - BranchInst *splitBlockAndInsertIfThen(Instruction *SplitBefore, Value *Cmp); static char ID; // Pass identification, replacement for typeid private: @@ -163,11 +190,11 @@ struct AddressSanitizer : public ModulePass { return getAlignedSize(SizeInBytes); } + Function *checkInterfaceFunction(Constant *FuncOrBitcast); void PoisonStack(const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> IRB, Value *ShadowBase, bool DoPoison); bool LooksLikeCodeInBug11395(Instruction *I); - Module *CurrentModule; LLVMContext *C; TargetData *TD; uint64_t MappingOffset; @@ -180,7 +207,11 @@ struct AddressSanitizer : public ModulePass { Function *AsanInitFunction; Instruction *CtorInsertBefore; OwningPtr<FunctionBlackList> BL; + // This array is indexed by AccessIsWrite and log2(AccessSize). + Function *AsanErrorCallback[2][kNumberOfAccessSizes]; + InlineAsm *EmptyAsm; }; + } // namespace char AddressSanitizer::ID = 0; @@ -196,6 +227,12 @@ const char *AddressSanitizer::getPassName() const { return "AddressSanitizer"; } +static size_t TypeSizeToSizeIndex(uint32_t TypeSize) { + size_t Res = CountTrailingZeros_32(TypeSize / 8); + assert(Res < kNumberOfAccessSizes); + return Res; +} + // Create a constant for Str so that we can pass it to the run-time lib. static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str) { Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str); @@ -206,29 +243,32 @@ static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str) { // Split the basic block and insert an if-then code. 
// Before: // Head -// SplitBefore +// Cmp // Tail // After: // Head // if (Cmp) -// NewBasicBlock -// SplitBefore +// ThenBlock // Tail // -// Returns the NewBasicBlock's terminator. -BranchInst *AddressSanitizer::splitBlockAndInsertIfThen( - Instruction *SplitBefore, Value *Cmp) { +// If ThenBlock is zero, a new block is created and its terminator is returned. +// Otherwize 0 is returned. +static BranchInst *splitBlockAndInsertIfThen(Value *Cmp, + BasicBlock *ThenBlock = 0) { + Instruction *SplitBefore = cast<Instruction>(Cmp)->getNextNode(); BasicBlock *Head = SplitBefore->getParent(); BasicBlock *Tail = Head->splitBasicBlock(SplitBefore); TerminatorInst *HeadOldTerm = Head->getTerminator(); - BasicBlock *NewBasicBlock = - BasicBlock::Create(*C, "", Head->getParent()); - BranchInst *HeadNewTerm = BranchInst::Create(/*ifTrue*/NewBasicBlock, - /*ifFalse*/Tail, - Cmp); + BranchInst *CheckTerm = 0; + if (!ThenBlock) { + LLVMContext &C = Head->getParent()->getParent()->getContext(); + ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail); + CheckTerm = BranchInst::Create(Tail, ThenBlock); + } + BranchInst *HeadNewTerm = + BranchInst::Create(/*ifTrue*/ThenBlock, /*ifFalse*/Tail, Cmp); ReplaceInstWithInst(HeadOldTerm, HeadNewTerm); - BranchInst *CheckTerm = BranchInst::Create(Tail, NewBasicBlock); return CheckTerm; } @@ -242,12 +282,13 @@ Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) { MappingOffset)); } -void AddressSanitizer::instrumentMemIntrinsicParam(Instruction *OrigIns, +void AddressSanitizer::instrumentMemIntrinsicParam( + AsanFunctionContext &AFC, Instruction *OrigIns, Value *Addr, Value *Size, Instruction *InsertBefore, bool IsWrite) { // Check the first byte. { IRBuilder<> IRB(InsertBefore); - instrumentAddress(OrigIns, IRB, Addr, 8, IsWrite); + instrumentAddress(AFC, OrigIns, IRB, Addr, 8, IsWrite); } // Check the last byte. { @@ -257,15 +298,16 @@ void AddressSanitizer::instrumentMemIntrinsicParam(Instruction *OrigIns, SizeMinusOne = IRB.CreateIntCast(SizeMinusOne, IntptrTy, false); Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); Value *AddrPlusSizeMinisOne = IRB.CreateAdd(AddrLong, SizeMinusOne); - instrumentAddress(OrigIns, IRB, AddrPlusSizeMinisOne, 8, IsWrite); + instrumentAddress(AFC, OrigIns, IRB, AddrPlusSizeMinisOne, 8, IsWrite); } } // Instrument memset/memmove/memcpy -bool AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) { +bool AddressSanitizer::instrumentMemIntrinsic(AsanFunctionContext &AFC, + MemIntrinsic *MI) { Value *Dst = MI->getDest(); MemTransferInst *MemTran = dyn_cast<MemTransferInst>(MI); - Value *Src = MemTran ? MemTran->getSource() : NULL; + Value *Src = MemTran ? 
MemTran->getSource() : 0; Value *Length = MI->getLength(); Constant *ConstLength = dyn_cast<Constant>(Length); @@ -277,26 +319,46 @@ bool AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) { IRBuilder<> IRB(InsertBefore); Value *Cmp = IRB.CreateICmpNE(Length, - Constant::getNullValue(Length->getType())); - InsertBefore = splitBlockAndInsertIfThen(InsertBefore, Cmp); + Constant::getNullValue(Length->getType())); + InsertBefore = splitBlockAndInsertIfThen(Cmp); } - instrumentMemIntrinsicParam(MI, Dst, Length, InsertBefore, true); + instrumentMemIntrinsicParam(AFC, MI, Dst, Length, InsertBefore, true); if (Src) - instrumentMemIntrinsicParam(MI, Src, Length, InsertBefore, false); + instrumentMemIntrinsicParam(AFC, MI, Src, Length, InsertBefore, false); return true; } -static Value *getLDSTOperand(Instruction *I) { +// If I is an interesting memory access, return the PointerOperand +// and set IsWrite. Otherwise return NULL. +static Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite) { if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + if (!ClInstrumentReads) return NULL; + *IsWrite = false; return LI->getPointerOperand(); } - return cast<StoreInst>(*I).getPointerOperand(); + if (StoreInst *SI = dyn_cast<StoreInst>(I)) { + if (!ClInstrumentWrites) return NULL; + *IsWrite = true; + return SI->getPointerOperand(); + } + if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) { + if (!ClInstrumentAtomics) return NULL; + *IsWrite = true; + return RMW->getPointerOperand(); + } + if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) { + if (!ClInstrumentAtomics) return NULL; + *IsWrite = true; + return XCHG->getPointerOperand(); + } + return NULL; } -void AddressSanitizer::instrumentMop(Instruction *I) { - int IsWrite = isa<StoreInst>(*I); - Value *Addr = getLDSTOperand(I); +void AddressSanitizer::instrumentMop(AsanFunctionContext &AFC, Instruction *I) { + bool IsWrite; + Value *Addr = isInterestingMemoryAccess(I, &IsWrite); + assert(Addr); if (ClOpt && ClOptGlobals && isa<GlobalVariable>(Addr)) { // We are accessing a global scalar variable. Nothing to catch here. return; @@ -314,22 +376,57 @@ void AddressSanitizer::instrumentMop(Instruction *I) { } IRBuilder<> IRB(I); - instrumentAddress(I, IRB, Addr, TypeSize, IsWrite); + instrumentAddress(AFC, I, IRB, Addr, TypeSize, IsWrite); +} + +// Validate the result of Module::getOrInsertFunction called for an interface +// function of AddressSanitizer. If the instrumented module defines a function +// with the same name, their prototypes must match, otherwise +// getOrInsertFunction returns a bitcast. +Function *AddressSanitizer::checkInterfaceFunction(Constant *FuncOrBitcast) { + if (isa<Function>(FuncOrBitcast)) return cast<Function>(FuncOrBitcast); + FuncOrBitcast->dump(); + report_fatal_error("trying to redefine an AddressSanitizer " + "interface function"); } Instruction *AddressSanitizer::generateCrashCode( - IRBuilder<> &IRB, Value *Addr, bool IsWrite, uint32_t TypeSize) { - // IsWrite and TypeSize are encoded in the function name. - std::string FunctionName = std::string(kAsanReportErrorTemplate) + - (IsWrite ? 
"store" : "load") + itostr(TypeSize / 8); - Value *ReportWarningFunc = CurrentModule->getOrInsertFunction( - FunctionName, IRB.getVoidTy(), IntptrTy, NULL); - CallInst *Call = IRB.CreateCall(ReportWarningFunc, Addr); - Call->setDoesNotReturn(); + BasicBlock *BB, Value *Addr, Value *PC, + bool IsWrite, size_t AccessSizeIndex) { + IRBuilder<> IRB(BB->getFirstNonPHI()); + CallInst *Call; + if (PC) + Call = IRB.CreateCall2(AsanErrorCallback[IsWrite][AccessSizeIndex], + Addr, PC); + else + Call = IRB.CreateCall(AsanErrorCallback[IsWrite][AccessSizeIndex], Addr); + // We don't do Call->setDoesNotReturn() because the BB already has + // UnreachableInst at the end. + // This EmptyAsm is required to avoid callback merge. + IRB.CreateCall(EmptyAsm); return Call; } -void AddressSanitizer::instrumentAddress(Instruction *OrigIns, +Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong, + Value *ShadowValue, + uint32_t TypeSize) { + size_t Granularity = 1 << MappingScale; + // Addr & (Granularity - 1) + Value *LastAccessedByte = IRB.CreateAnd( + AddrLong, ConstantInt::get(IntptrTy, Granularity - 1)); + // (Addr & (Granularity - 1)) + size - 1 + if (TypeSize / 8 > 1) + LastAccessedByte = IRB.CreateAdd( + LastAccessedByte, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)); + // (uint8_t) ((Addr & (Granularity-1)) + size - 1) + LastAccessedByte = IRB.CreateIntCast( + LastAccessedByte, IRB.getInt8Ty(), false); + // ((uint8_t) ((Addr & (Granularity-1)) + size - 1)) >= ShadowValue + return IRB.CreateICmpSGE(LastAccessedByte, ShadowValue); +} + +void AddressSanitizer::instrumentAddress(AsanFunctionContext &AFC, + Instruction *OrigIns, IRBuilder<> &IRB, Value *Addr, uint32_t TypeSize, bool IsWrite) { Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); @@ -344,31 +441,25 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, Value *Cmp = IRB.CreateICmpNE(ShadowValue, CmpVal); - Instruction *CheckTerm = splitBlockAndInsertIfThen( - cast<Instruction>(Cmp)->getNextNode(), Cmp); - IRBuilder<> IRB2(CheckTerm); + BasicBlock *CrashBlock = BasicBlock::Create(*C, "crash_bb", &AFC.F); + new UnreachableInst(*C, CrashBlock); + size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize); + Instruction *Crash = + generateCrashCode(CrashBlock, AddrLong, 0, IsWrite, AccessSizeIndex); + Crash->setDebugLoc(OrigIns->getDebugLoc()); size_t Granularity = 1 << MappingScale; if (TypeSize < 8 * Granularity) { - // Addr & (Granularity - 1) - Value *Lower3Bits = IRB2.CreateAnd( - AddrLong, ConstantInt::get(IntptrTy, Granularity - 1)); - // (Addr & (Granularity - 1)) + size - 1 - Value *LastAccessedByte = IRB2.CreateAdd( - Lower3Bits, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)); - // (uint8_t) ((Addr & (Granularity-1)) + size - 1) - LastAccessedByte = IRB2.CreateIntCast( - LastAccessedByte, IRB.getInt8Ty(), false); - // ((uint8_t) ((Addr & (Granularity-1)) + size - 1)) >= ShadowValue - Value *Cmp2 = IRB2.CreateICmpSGE(LastAccessedByte, ShadowValue); - - CheckTerm = splitBlockAndInsertIfThen(CheckTerm, Cmp2); - } - - IRBuilder<> IRB1(CheckTerm); - Instruction *Crash = generateCrashCode(IRB1, AddrLong, IsWrite, TypeSize); - Crash->setDebugLoc(OrigIns->getDebugLoc()); - ReplaceInstWithInst(CheckTerm, new UnreachableInst(*C)); + BranchInst *CheckTerm = splitBlockAndInsertIfThen(Cmp); + assert(CheckTerm->isUnconditional()); + BasicBlock *NextBB = CheckTerm->getSuccessor(0); + IRB.SetInsertPoint(CheckTerm); + Value *Cmp2 = createSlowPathCmp(IRB, AddrLong, ShadowValue, TypeSize); + BranchInst *NewTerm = 
BranchInst::Create(CrashBlock, NextBB, Cmp2); + ReplaceInstWithInst(CheckTerm, NewTerm); + } else { + splitBlockAndInsertIfThen(Cmp, CrashBlock); + } } // This function replaces all global variables with new variables that have @@ -473,7 +564,7 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) { // Create a new global variable with enough space for a redzone. GlobalVariable *NewGlobal = new GlobalVariable( M, NewTy, G->isConstant(), G->getLinkage(), - NewInitializer, "", G, G->isThreadLocal()); + NewInitializer, "", G, G->getThreadLocalMode()); NewGlobal->copyAttributesFrom(G); NewGlobal->setAlignment(RedzoneSize); @@ -501,7 +592,7 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) { M, ArrayOfGlobalStructTy, false, GlobalVariable::PrivateLinkage, ConstantArray::get(ArrayOfGlobalStructTy, Initializers), ""); - Function *AsanRegisterGlobals = cast<Function>(M.getOrInsertFunction( + Function *AsanRegisterGlobals = checkInterfaceFunction(M.getOrInsertFunction( kAsanRegisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); AsanRegisterGlobals->setLinkage(Function::ExternalLinkage); @@ -516,8 +607,10 @@ bool AddressSanitizer::insertGlobalRedzones(Module &M) { GlobalValue::InternalLinkage, kAsanModuleDtorName, &M); BasicBlock *AsanDtorBB = BasicBlock::Create(*C, "", AsanDtorFunction); IRBuilder<> IRB_Dtor(ReturnInst::Create(*C, AsanDtorBB)); - Function *AsanUnregisterGlobals = cast<Function>(M.getOrInsertFunction( - kAsanUnregisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); + Function *AsanUnregisterGlobals = + checkInterfaceFunction(M.getOrInsertFunction( + kAsanUnregisterGlobalsName, + IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); AsanUnregisterGlobals->setLinkage(Function::ExternalLinkage); IRB_Dtor.CreateCall2(AsanUnregisterGlobals, @@ -537,7 +630,6 @@ bool AddressSanitizer::runOnModule(Module &M) { return false; BL.reset(new FunctionBlackList(ClBlackListFile)); - CurrentModule = &M; C = &(M.getContext()); LongSize = TD->getPointerSizeInBits(); IntptrTy = Type::getIntNTy(*C, LongSize); @@ -551,13 +643,33 @@ bool AddressSanitizer::runOnModule(Module &M) { // call __asan_init in the module ctor. IRBuilder<> IRB(CtorInsertBefore); - AsanInitFunction = cast<Function>( + AsanInitFunction = checkInterfaceFunction( M.getOrInsertFunction(kAsanInitName, IRB.getVoidTy(), NULL)); AsanInitFunction->setLinkage(Function::ExternalLinkage); IRB.CreateCall(AsanInitFunction); - MappingOffset = LongSize == 32 - ? kDefaultShadowOffset32 : kDefaultShadowOffset64; + // Create __asan_report* callbacks. + for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) { + for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes; + AccessSizeIndex++) { + // IsWrite and TypeSize are encoded in the function name. + std::string FunctionName = std::string(kAsanReportErrorTemplate) + + (AccessIsWrite ? "store" : "load") + itostr(1 << AccessSizeIndex); + // If we are merging crash callbacks, they have two parameters. + AsanErrorCallback[AccessIsWrite][AccessSizeIndex] = cast<Function>( + M.getOrInsertFunction(FunctionName, IRB.getVoidTy(), IntptrTy, NULL)); + } + } + // We insert an empty inline asm after __asan_report* to avoid callback merge. + EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false), + StringRef(""), StringRef(""), + /*hasSideEffects=*/true); + + llvm::Triple targetTriple(M.getTargetTriple()); + bool isAndroid = targetTriple.getEnvironment() == llvm::Triple::ANDROIDEABI; + + MappingOffset = isAndroid ? 
kDefaultShadowOffsetAndroid : + (LongSize == 32 ? kDefaultShadowOffset32 : kDefaultShadowOffset64); if (ClMappingOffsetLog >= 0) { if (ClMappingOffsetLog == 0) { // special case @@ -640,17 +752,17 @@ bool AddressSanitizer::handleFunction(Module &M, Function &F) { SmallSet<Value*, 16> TempsToInstrument; SmallVector<Instruction*, 16> ToInstrument; SmallVector<Instruction*, 8> NoReturnCalls; + bool IsWrite; // Fill the set of memory operations to instrument. for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { TempsToInstrument.clear(); + int NumInsnsPerBB = 0; for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ++BI) { if (LooksLikeCodeInBug11395(BI)) return false; - if ((isa<LoadInst>(BI) && ClInstrumentReads) || - (isa<StoreInst>(BI) && ClInstrumentWrites)) { - Value *Addr = getLDSTOperand(BI); + if (Value *Addr = isInterestingMemoryAccess(BI, &IsWrite)) { if (ClOpt && ClOptSameTemp) { if (!TempsToInstrument.insert(Addr)) continue; // We've seen this temp in the current BB. @@ -668,19 +780,24 @@ bool AddressSanitizer::handleFunction(Module &M, Function &F) { continue; } ToInstrument.push_back(BI); + NumInsnsPerBB++; + if (NumInsnsPerBB >= ClMaxInsnsToInstrumentPerBB) + break; } } + AsanFunctionContext AFC(F); + // Instrument. int NumInstrumented = 0; for (size_t i = 0, n = ToInstrument.size(); i != n; i++) { Instruction *Inst = ToInstrument[i]; if (ClDebugMin < 0 || ClDebugMax < 0 || (NumInstrumented >= ClDebugMin && NumInstrumented <= ClDebugMax)) { - if (isa<StoreInst>(Inst) || isa<LoadInst>(Inst)) - instrumentMop(Inst); + if (isInterestingMemoryAccess(Inst, &IsWrite)) + instrumentMop(AFC, Inst); else - instrumentMemIntrinsic(cast<MemIntrinsic>(Inst)); + instrumentMemIntrinsic(AFC, cast<MemIntrinsic>(Inst)); } NumInstrumented++; } diff --git a/lib/Transforms/Instrumentation/BoundsChecking.cpp b/lib/Transforms/Instrumentation/BoundsChecking.cpp new file mode 100644 index 0000000..09e0f14 --- /dev/null +++ b/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -0,0 +1,209 @@ +//===- BoundsChecking.cpp - Instrumentation for run-time bounds checking --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a pass that instruments the code to perform run-time +// bounds checking on loads, stores, and other memory intrinsics. 
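Before the new BoundsChecking pass below, it helps to spell out the check that instrumentAddress, memToShadow and createSlowPathCmp above emit. With the defaults set in this file (shadow scale 3 and, for 32-bit non-Android targets, shadow offset 1 << 29), the generated code is essentially the following plain-C++ sketch (illustrative only, not part of the patch; all names are local to the example):

#include <cassert>
#include <cstdint>

static const uintptr_t kScale  = 3;           // kDefaultShadowScale
static const uintptr_t kOffset = 1UL << 29;   // kDefaultShadowOffset32

static uintptr_t memToShadow(uintptr_t addr) { return (addr >> kScale) + kOffset; }

// Decide whether an access of accessSize bytes at addr must be reported,
// given the shadow byte loaded from memToShadow(addr).
static bool shadowCheckFails(uintptr_t addr, unsigned accessSize, int8_t shadow) {
  if (shadow == 0)
    return false;                             // whole 8-byte granule is addressable
  uintptr_t granularity = 1UL << kScale;
  int8_t lastByte = (int8_t)((addr & (granularity - 1)) + accessSize - 1);
  return lastByte >= shadow;                  // slow path: compare against shadow byte
}

int main() {
  assert(memToShadow(0x20000000u) == 0x24000000u);
  assert(!shadowCheckFails(0x1000, 4, 4));    // bytes 0..3 of a granule with 4 valid bytes
  assert(shadowCheckFails(0x1001, 4, 4));     // bytes 1..4 would touch the poisoned byte 4
  return 0;
}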
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "bounds-checking" +#include "llvm/IRBuilder.h" +#include "llvm/Intrinsics.h" +#include "llvm/Pass.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/InstIterator.h" +#include "llvm/Support/TargetFolder.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Instrumentation.h" +using namespace llvm; + +static cl::opt<bool> SingleTrapBB("bounds-checking-single-trap", + cl::desc("Use one trap block per function")); + +STATISTIC(ChecksAdded, "Bounds checks added"); +STATISTIC(ChecksSkipped, "Bounds checks skipped"); +STATISTIC(ChecksUnable, "Bounds checks unable to add"); + +typedef IRBuilder<true, TargetFolder> BuilderTy; + +namespace { + struct BoundsChecking : public FunctionPass { + static char ID; + + BoundsChecking(unsigned _Penalty = 5) : FunctionPass(ID), Penalty(_Penalty){ + initializeBoundsCheckingPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnFunction(Function &F); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<TargetData>(); + } + + private: + const TargetData *TD; + ObjectSizeOffsetEvaluator *ObjSizeEval; + BuilderTy *Builder; + Instruction *Inst; + BasicBlock *TrapBB; + unsigned Penalty; + + BasicBlock *getTrapBB(); + void emitBranchToTrap(Value *Cmp = 0); + bool computeAllocSize(Value *Ptr, APInt &Offset, Value* &OffsetValue, + APInt &Size, Value* &SizeValue); + bool instrument(Value *Ptr, Value *Val); + }; +} + +char BoundsChecking::ID = 0; +INITIALIZE_PASS(BoundsChecking, "bounds-checking", "Run-time bounds checking", + false, false) + + +/// getTrapBB - create a basic block that traps. All overflowing conditions +/// branch to this block. There's only one trap block per function. +BasicBlock *BoundsChecking::getTrapBB() { + if (TrapBB && SingleTrapBB) + return TrapBB; + + Function *Fn = Inst->getParent()->getParent(); + BasicBlock::iterator PrevInsertPoint = Builder->GetInsertPoint(); + TrapBB = BasicBlock::Create(Fn->getContext(), "trap", Fn); + Builder->SetInsertPoint(TrapBB); + + llvm::Value *F = Intrinsic::getDeclaration(Fn->getParent(), Intrinsic::trap); + CallInst *TrapCall = Builder->CreateCall(F); + TrapCall->setDoesNotReturn(); + TrapCall->setDoesNotThrow(); + TrapCall->setDebugLoc(Inst->getDebugLoc()); + Builder->CreateUnreachable(); + + Builder->SetInsertPoint(PrevInsertPoint); + return TrapBB; +} + + +/// emitBranchToTrap - emit a branch instruction to a trap block. +/// If Cmp is non-null, perform a jump only if its value evaluates to true. +void BoundsChecking::emitBranchToTrap(Value *Cmp) { + // check if the comparison is always false + ConstantInt *C = dyn_cast_or_null<ConstantInt>(Cmp); + if (C) { + ++ChecksSkipped; + if (!C->getZExtValue()) + return; + else + Cmp = 0; // unconditional branch + } + + Instruction *Inst = Builder->GetInsertPoint(); + BasicBlock *OldBB = Inst->getParent(); + BasicBlock *Cont = OldBB->splitBasicBlock(Inst); + OldBB->getTerminator()->eraseFromParent(); + + if (Cmp) + BranchInst::Create(getTrapBB(), Cont, Cmp, OldBB); + else + BranchInst::Create(getTrapBB(), OldBB); +} + + +/// instrument - adds run-time bounds checks to memory accessing instructions. +/// Ptr is the pointer that will be read/written, and InstVal is either the +/// result from the load or the value being stored. 
It is used to determine the +/// size of memory block that is touched. +/// Returns true if any change was made to the IR, false otherwise. +bool BoundsChecking::instrument(Value *Ptr, Value *InstVal) { + uint64_t NeededSize = TD->getTypeStoreSize(InstVal->getType()); + DEBUG(dbgs() << "Instrument " << *Ptr << " for " << Twine(NeededSize) + << " bytes\n"); + + SizeOffsetEvalType SizeOffset = ObjSizeEval->compute(Ptr); + + if (!ObjSizeEval->bothKnown(SizeOffset)) { + ++ChecksUnable; + return false; + } + + Value *Size = SizeOffset.first; + Value *Offset = SizeOffset.second; + ConstantInt *SizeCI = dyn_cast<ConstantInt>(Size); + + IntegerType *IntTy = TD->getIntPtrType(Inst->getContext()); + Value *NeededSizeVal = ConstantInt::get(IntTy, NeededSize); + + // three checks are required to ensure safety: + // . Offset >= 0 (since the offset is given from the base ptr) + // . Size >= Offset (unsigned) + // . Size - Offset >= NeededSize (unsigned) + // + // optimization: if Size >= 0 (signed), skip 1st check + // FIXME: add NSW/NUW here? -- we dont care if the subtraction overflows + Value *ObjSize = Builder->CreateSub(Size, Offset); + Value *Cmp2 = Builder->CreateICmpULT(Size, Offset); + Value *Cmp3 = Builder->CreateICmpULT(ObjSize, NeededSizeVal); + Value *Or = Builder->CreateOr(Cmp2, Cmp3); + if (!SizeCI || SizeCI->getValue().slt(0)) { + Value *Cmp1 = Builder->CreateICmpSLT(Offset, ConstantInt::get(IntTy, 0)); + Or = Builder->CreateOr(Cmp1, Or); + } + emitBranchToTrap(Or); + + ++ChecksAdded; + return true; +} + +bool BoundsChecking::runOnFunction(Function &F) { + TD = &getAnalysis<TargetData>(); + + TrapBB = 0; + BuilderTy TheBuilder(F.getContext(), TargetFolder(TD)); + Builder = &TheBuilder; + ObjectSizeOffsetEvaluator TheObjSizeEval(TD, F.getContext()); + ObjSizeEval = &TheObjSizeEval; + + // check HANDLE_MEMORY_INST in include/llvm/Instruction.def for memory + // touching instructions + std::vector<Instruction*> WorkList; + for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) { + Instruction *I = &*i; + if (isa<LoadInst>(I) || isa<StoreInst>(I) || isa<AtomicCmpXchgInst>(I) || + isa<AtomicRMWInst>(I)) + WorkList.push_back(I); + } + + bool MadeChange = false; + for (std::vector<Instruction*>::iterator i = WorkList.begin(), + e = WorkList.end(); i != e; ++i) { + Inst = *i; + + Builder->SetInsertPoint(Inst); + if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { + MadeChange |= instrument(LI->getPointerOperand(), LI); + } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + MadeChange |= instrument(SI->getPointerOperand(), SI->getValueOperand()); + } else if (AtomicCmpXchgInst *AI = dyn_cast<AtomicCmpXchgInst>(Inst)) { + MadeChange |= instrument(AI->getPointerOperand(),AI->getCompareOperand()); + } else if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst)) { + MadeChange |= instrument(AI->getPointerOperand(), AI->getValOperand()); + } else { + llvm_unreachable("unknown Instruction type"); + } + } + return MadeChange; +} + +FunctionPass *llvm::createBoundsCheckingPass(unsigned Penalty) { + return new BoundsChecking(Penalty); +} diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt index e4c8cf1..00de882 100644 --- a/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/lib/Transforms/Instrumentation/CMakeLists.txt @@ -1,5 +1,6 @@ add_llvm_library(LLVMInstrumentation AddressSanitizer.cpp + BoundsChecking.cpp EdgeProfiling.cpp FunctionBlackList.cpp GCOVProfiling.cpp @@ -9,3 +10,5 @@ add_llvm_library(LLVMInstrumentation 
ProfilingUtils.cpp ThreadSanitizer.cpp ) + +add_dependencies(LLVMInstrumentation intrinsics_gen) diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 96e5d5b..264a6a6 100644 --- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -18,22 +18,23 @@ #include "ProfilingUtils.h" #include "llvm/Transforms/Instrumentation.h" -#include "llvm/Analysis/DebugInfo.h" +#include "llvm/DebugInfo.h" +#include "llvm/IRBuilder.h" +#include "llvm/Instructions.h" #include "llvm/Module.h" #include "llvm/Pass.h" -#include "llvm/Instructions.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/DebugLoc.h" -#include "llvm/Support/InstIterator.h" -#include "llvm/Support/IRBuilder.h" -#include "llvm/Support/PathV2.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/Statistic.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/UniqueVector.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/DebugLoc.h" +#include "llvm/Support/InstIterator.h" +#include "llvm/Support/PathV2.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" #include <string> #include <utility> using namespace llvm; @@ -57,7 +58,6 @@ namespace { virtual const char *getPassName() const { return "GCOV Profiler"; } - private: bool runOnModule(Module &M); @@ -90,6 +90,7 @@ namespace { // list. void insertCounterWriteout(SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> &); + void insertIndirectCounterIncrement(); std::string mangleName(DICompileUnit CU, std::string NewStem); @@ -421,6 +422,7 @@ bool GCOVProfiler::emitProfileArcs() { if (!CU_Nodes) return false; bool Result = false; + bool InsertIndCounterIncrCode = false; for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) { DICompileUnit CU(CU_Nodes->getOperand(i)); DIArray SPs = CU.getSubprograms(); @@ -446,7 +448,7 @@ bool GCOVProfiler::emitProfileArcs() { new GlobalVariable(*M, CounterTy, false, GlobalValue::InternalLinkage, Constant::getNullValue(CounterTy), - "__llvm_gcov_ctr", 0, false, 0); + "__llvm_gcov_ctr"); CountersBySP.push_back(std::make_pair(Counters, (MDNode*)SP)); UniqueVector<BasicBlock *> ComplexEdgePreds; @@ -507,15 +509,21 @@ bool GCOVProfiler::emitProfileArcs() { Value *CounterPtrArray = Builder.CreateConstInBoundsGEP2_64(EdgeTable, 0, i * ComplexEdgePreds.size()); + + // Build code to increment the counter. 
+ InsertIndCounterIncrCode = true; Builder.CreateCall2(getIncrementIndirectCounterFunc(), EdgeState, CounterPtrArray); - // clear the predecessor number - Builder.CreateStore(ConstantInt::get(Int32Ty, 0xffffffff), EdgeState); } } } + insertCounterWriteout(CountersBySP); } + + if (InsertIndCounterIncrCode) + insertIndirectCounterIncrement(); + return Result; } @@ -574,13 +582,14 @@ Constant *GCOVProfiler::getStartFileFunc() { } Constant *GCOVProfiler::getIncrementIndirectCounterFunc() { + Type *Int32Ty = Type::getInt32Ty(*Ctx); + Type *Int64Ty = Type::getInt64Ty(*Ctx); Type *Args[] = { - Type::getInt32PtrTy(*Ctx), // uint32_t *predecessor - Type::getInt64PtrTy(*Ctx)->getPointerTo(), // uint64_t **state_table_row + Int32Ty->getPointerTo(), // uint32_t *predecessor + Int64Ty->getPointerTo()->getPointerTo() // uint64_t **counters }; - FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), - Args, false); - return M->getOrInsertFunction("llvm_gcda_increment_indirect_counter", FTy); + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false); + return M->getOrInsertFunction("__llvm_gcov_indirect_counter_increment", FTy); } Constant *GCOVProfiler::getEmitFunctionFunc() { @@ -588,8 +597,7 @@ Constant *GCOVProfiler::getEmitFunctionFunc() { Type::getInt32Ty(*Ctx), // uint32_t ident Type::getInt8PtrTy(*Ctx), // const char *function_name }; - FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), - Args, false); + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false); return M->getOrInsertFunction("llvm_gcda_emit_function", FTy); } @@ -665,5 +673,75 @@ void GCOVProfiler::insertCounterWriteout( } Builder.CreateRetVoid(); - InsertProfilingShutdownCall(WriteoutF, M); + // Create a small bit of code that registers the "__llvm_gcov_writeout" + // function to be executed at exit. + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); + Function *F = Function::Create(FTy, GlobalValue::InternalLinkage, + "__llvm_gcov_init", M); + F->setUnnamedAddr(true); + F->setLinkage(GlobalValue::InternalLinkage); + F->addFnAttr(Attribute::NoInline); + + BB = BasicBlock::Create(*Ctx, "entry", F); + Builder.SetInsertPoint(BB); + + FTy = FunctionType::get(Type::getInt32Ty(*Ctx), + PointerType::get(FTy, 0), false); + Constant *AtExitFn = M->getOrInsertFunction("atexit", FTy); + Builder.CreateCall(AtExitFn, WriteoutF); + Builder.CreateRetVoid(); + + appendToGlobalCtors(*M, F, 0); +} + +void GCOVProfiler::insertIndirectCounterIncrement() { + Function *Fn = + cast<Function>(GCOVProfiler::getIncrementIndirectCounterFunc()); + Fn->setUnnamedAddr(true); + Fn->setLinkage(GlobalValue::InternalLinkage); + Fn->addFnAttr(Attribute::NoInline); + + Type *Int32Ty = Type::getInt32Ty(*Ctx); + Type *Int64Ty = Type::getInt64Ty(*Ctx); + Constant *NegOne = ConstantInt::get(Int32Ty, 0xffffffff); + + // Create basic blocks for function. 
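The IR that insertIndirectCounterIncrement builds below behaves roughly like the following C++ (a sketch: the pass emits this with IRBuilder rather than compiling source, and __llvm_gcov_writeout stands for the writeout function whose registration insertCounterWriteout now handles through atexit and llvm.global_ctors):

#include <cstdint>
#include <cstdlib>

extern "C" void __llvm_gcov_writeout(void);  // emitted separately by the pass

// Equivalent of the emitted __llvm_gcov_indirect_counter_increment.
extern "C" void __llvm_gcov_indirect_counter_increment(uint32_t *predecessor,
                                                       uint64_t **counters) {
  uint32_t pred = *predecessor;
  if (pred == 0xffffffffu)
    return;                          // no predecessor recorded for this edge
  uint64_t *counter = counters[pred];
  if (!counter)
    return;                          // no counter slot in this row
  ++*counter;
}

// Equivalent of the emitted __llvm_gcov_init, appended to llvm.global_ctors.
extern "C" void __llvm_gcov_init(void) {
  std::atexit(__llvm_gcov_writeout);
}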
+ BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", Fn); + IRBuilder<> Builder(BB); + + BasicBlock *PredNotNegOne = BasicBlock::Create(*Ctx, "", Fn); + BasicBlock *CounterEnd = BasicBlock::Create(*Ctx, "", Fn); + BasicBlock *Exit = BasicBlock::Create(*Ctx, "exit", Fn); + + // uint32_t pred = *predecessor; + // if (pred == 0xffffffff) return; + Argument *Arg = Fn->arg_begin(); + Arg->setName("predecessor"); + Value *Pred = Builder.CreateLoad(Arg, "pred"); + Value *Cond = Builder.CreateICmpEQ(Pred, NegOne); + BranchInst::Create(Exit, PredNotNegOne, Cond, BB); + + Builder.SetInsertPoint(PredNotNegOne); + + // uint64_t *counter = counters[pred]; + // if (!counter) return; + Value *ZExtPred = Builder.CreateZExt(Pred, Int64Ty); + Arg = llvm::next(Fn->arg_begin()); + Arg->setName("counters"); + Value *GEP = Builder.CreateGEP(Arg, ZExtPred); + Value *Counter = Builder.CreateLoad(GEP, "counter"); + Cond = Builder.CreateICmpEQ(Counter, + Constant::getNullValue(Int64Ty->getPointerTo())); + Builder.CreateCondBr(Cond, Exit, CounterEnd); + + // ++*counter; + Builder.SetInsertPoint(CounterEnd); + Value *Add = Builder.CreateAdd(Builder.CreateLoad(Counter), + ConstantInt::get(Int64Ty, 1)); + Builder.CreateStore(Add, Counter); + Builder.CreateBr(Exit); + + // Fill in the exit block. + Builder.SetInsertPoint(Exit); + Builder.CreateRetVoid(); } diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp index c7266e2..1e0b4a3 100644 --- a/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -20,11 +20,12 @@ using namespace llvm; /// initializeInstrumentation - Initialize all passes in the TransformUtils /// library. void llvm::initializeInstrumentation(PassRegistry &Registry) { + initializeAddressSanitizerPass(Registry); + initializeBoundsCheckingPass(Registry); initializeEdgeProfilerPass(Registry); + initializeGCOVProfilerPass(Registry); initializeOptimalEdgeProfilerPass(Registry); initializePathProfilerPass(Registry); - initializeGCOVProfilerPass(Registry); - initializeAddressSanitizerPass(Registry); initializeThreadSanitizerPass(Registry); } diff --git a/lib/Transforms/Instrumentation/PathProfiling.cpp b/lib/Transforms/Instrumentation/PathProfiling.cpp index b214796..cc27146 100644 --- a/lib/Transforms/Instrumentation/PathProfiling.cpp +++ b/lib/Transforms/Instrumentation/PathProfiling.cpp @@ -55,11 +55,11 @@ #include "llvm/LLVMContext.h" #include "llvm/Module.h" #include "llvm/Pass.h" +#include "llvm/TypeBuilder.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/CFG.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/TypeBuilder.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Instrumentation.h" diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 8bb337e..dc0fa71 100644 --- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -22,73 +22,73 @@ #define DEBUG_TYPE "tsan" #include "FunctionBlackList.h" -#include "llvm/ADT/SmallSet.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/Intrinsics.h" #include "llvm/Function.h" +#include "llvm/IRBuilder.h" +#include "llvm/Intrinsics.h" #include "llvm/LLVMContext.h" #include "llvm/Metadata.h" #include "llvm/Module.h" +#include 
"llvm/Type.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetData.h" #include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/ModuleUtils.h" -#include "llvm/Type.h" using namespace llvm; static cl::opt<std::string> ClBlackListFile("tsan-blacklist", cl::desc("Blacklist file"), cl::Hidden); -static cl::opt<bool> ClPrintStats("tsan-print-stats", - cl::desc("Print ThreadSanitizer instrumentation stats"), cl::Hidden); +STATISTIC(NumInstrumentedReads, "Number of instrumented reads"); +STATISTIC(NumInstrumentedWrites, "Number of instrumented writes"); +STATISTIC(NumOmittedReadsBeforeWrite, + "Number of reads ignored due to following writes"); +STATISTIC(NumAccessesWithBadSize, "Number of accesses with bad size"); +STATISTIC(NumInstrumentedVtableWrites, "Number of vtable ptr writes"); +STATISTIC(NumOmittedReadsFromConstantGlobals, + "Number of reads from constant globals"); +STATISTIC(NumOmittedReadsFromVtable, "Number of vtable reads"); namespace { -// Stats counters for ThreadSanitizer instrumentation. -struct ThreadSanitizerStats { - size_t NumInstrumentedReads; - size_t NumInstrumentedWrites; - size_t NumOmittedReadsBeforeWrite; - size_t NumAccessesWithBadSize; - size_t NumInstrumentedVtableWrites; - size_t NumOmittedReadsFromConstantGlobals; - size_t NumOmittedReadsFromVtable; -}; - /// ThreadSanitizer: instrument the code in module to find races. struct ThreadSanitizer : public FunctionPass { ThreadSanitizer(); + const char *getPassName() const; bool runOnFunction(Function &F); bool doInitialization(Module &M); - bool doFinalization(Module &M); - bool instrumentLoadOrStore(Instruction *I); static char ID; // Pass identification, replacement for typeid. private: - void choseInstructionsToInstrument(SmallVectorImpl<Instruction*> &Local, - SmallVectorImpl<Instruction*> &All); + bool instrumentLoadOrStore(Instruction *I); + bool instrumentAtomic(Instruction *I); + void chooseInstructionsToInstrument(SmallVectorImpl<Instruction*> &Local, + SmallVectorImpl<Instruction*> &All); bool addrPointsToConstantData(Value *Addr); + int getMemoryAccessFuncIndex(Value *Addr); TargetData *TD; OwningPtr<FunctionBlackList> BL; + IntegerType *OrdTy; // Callbacks to run-time library are computed in doInitialization. - Value *TsanFuncEntry; - Value *TsanFuncExit; + Function *TsanFuncEntry; + Function *TsanFuncExit; // Accesses sizes are powers of two: 1, 2, 4, 8, 16. static const size_t kNumberOfAccessSizes = 5; - Value *TsanRead[kNumberOfAccessSizes]; - Value *TsanWrite[kNumberOfAccessSizes]; - Value *TsanVptrUpdate; - - // Stats are modified w/o synchronization. 
- ThreadSanitizerStats stats; + Function *TsanRead[kNumberOfAccessSizes]; + Function *TsanWrite[kNumberOfAccessSizes]; + Function *TsanAtomicLoad[kNumberOfAccessSizes]; + Function *TsanAtomicStore[kNumberOfAccessSizes]; + Function *TsanVptrUpdate; }; } // namespace @@ -97,6 +97,10 @@ INITIALIZE_PASS(ThreadSanitizer, "tsan", "ThreadSanitizer: detects data races.", false, false) +const char *ThreadSanitizer::getPassName() const { + return "ThreadSanitizer"; +} + ThreadSanitizer::ThreadSanitizer() : FunctionPass(ID), TD(NULL) { @@ -106,12 +110,18 @@ FunctionPass *llvm::createThreadSanitizerPass() { return new ThreadSanitizer(); } +static Function *checkInterfaceFunction(Constant *FuncOrBitcast) { + if (Function *F = dyn_cast<Function>(FuncOrBitcast)) + return F; + FuncOrBitcast->dump(); + report_fatal_error("ThreadSanitizer interface function redefined"); +} + bool ThreadSanitizer::doInitialization(Module &M) { TD = getAnalysisIfAvailable<TargetData>(); if (!TD) return false; BL.reset(new FunctionBlackList(ClBlackListFile)); - memset(&stats, 0, sizeof(stats)); // Always insert a call to __tsan_init into the module's CTORs. IRBuilder<> IRB(M.getContext()); @@ -120,38 +130,38 @@ bool ThreadSanitizer::doInitialization(Module &M) { appendToGlobalCtors(M, cast<Function>(TsanInit), 0); // Initialize the callbacks. - TsanFuncEntry = M.getOrInsertFunction("__tsan_func_entry", IRB.getVoidTy(), - IRB.getInt8PtrTy(), NULL); - TsanFuncExit = M.getOrInsertFunction("__tsan_func_exit", IRB.getVoidTy(), - NULL); + TsanFuncEntry = checkInterfaceFunction(M.getOrInsertFunction( + "__tsan_func_entry", IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL)); + TsanFuncExit = checkInterfaceFunction(M.getOrInsertFunction( + "__tsan_func_exit", IRB.getVoidTy(), NULL)); + OrdTy = IRB.getInt32Ty(); for (size_t i = 0; i < kNumberOfAccessSizes; ++i) { - SmallString<32> ReadName("__tsan_read"); - ReadName += itostr(1 << i); - TsanRead[i] = M.getOrInsertFunction(ReadName, IRB.getVoidTy(), - IRB.getInt8PtrTy(), NULL); - SmallString<32> WriteName("__tsan_write"); - WriteName += itostr(1 << i); - TsanWrite[i] = M.getOrInsertFunction(WriteName, IRB.getVoidTy(), - IRB.getInt8PtrTy(), NULL); - } - TsanVptrUpdate = M.getOrInsertFunction("__tsan_vptr_update", IRB.getVoidTy(), - IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), - NULL); - return true; -} + const size_t ByteSize = 1 << i; + const size_t BitSize = ByteSize * 8; + SmallString<32> ReadName("__tsan_read" + itostr(ByteSize)); + TsanRead[i] = checkInterfaceFunction(M.getOrInsertFunction( + ReadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL)); -bool ThreadSanitizer::doFinalization(Module &M) { - if (ClPrintStats) { - errs() << "ThreadSanitizerStats " << M.getModuleIdentifier() - << ": wr " << stats.NumInstrumentedWrites - << "; rd " << stats.NumInstrumentedReads - << "; vt " << stats.NumInstrumentedVtableWrites - << "; bs " << stats.NumAccessesWithBadSize - << "; rbw " << stats.NumOmittedReadsBeforeWrite - << "; rcg " << stats.NumOmittedReadsFromConstantGlobals - << "; rvt " << stats.NumOmittedReadsFromVtable - << "\n"; + SmallString<32> WriteName("__tsan_write" + itostr(ByteSize)); + TsanWrite[i] = checkInterfaceFunction(M.getOrInsertFunction( + WriteName, IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL)); + + Type *Ty = Type::getIntNTy(M.getContext(), BitSize); + Type *PtrTy = Ty->getPointerTo(); + SmallString<32> AtomicLoadName("__tsan_atomic" + itostr(BitSize) + + "_load"); + TsanAtomicLoad[i] = checkInterfaceFunction(M.getOrInsertFunction( + AtomicLoadName, Ty, PtrTy, OrdTy, NULL)); + + 
SmallString<32> AtomicStoreName("__tsan_atomic" + itostr(BitSize) + + "_store"); + TsanAtomicStore[i] = checkInterfaceFunction(M.getOrInsertFunction( + AtomicStoreName, IRB.getVoidTy(), PtrTy, Ty, OrdTy, + NULL)); } + TsanVptrUpdate = checkInterfaceFunction(M.getOrInsertFunction( + "__tsan_vptr_update", IRB.getVoidTy(), IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), NULL)); return true; } @@ -173,13 +183,13 @@ bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) { if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) { if (GV->isConstant()) { // Reads from constant globals can not race with any writes. - stats.NumOmittedReadsFromConstantGlobals++; + NumOmittedReadsFromConstantGlobals++; return true; } } else if(LoadInst *L = dyn_cast<LoadInst>(Addr)) { if (isVtableAccess(L)) { // Reads from a vtable pointer can not race with any writes. - stats.NumOmittedReadsFromVtable++; + NumOmittedReadsFromVtable++; return true; } } @@ -197,7 +207,7 @@ bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) { // // 'Local' is a vector of insns within the same BB (no calls between). // 'All' is a vector of insns that will be instrumented. -void ThreadSanitizer::choseInstructionsToInstrument( +void ThreadSanitizer::chooseInstructionsToInstrument( SmallVectorImpl<Instruction*> &Local, SmallVectorImpl<Instruction*> &All) { SmallSet<Value*, 8> WriteTargets; @@ -212,7 +222,7 @@ void ThreadSanitizer::choseInstructionsToInstrument( Value *Addr = Load->getPointerOperand(); if (WriteTargets.count(Addr)) { // We will write to this temp, so no reason to analyze the read. - stats.NumOmittedReadsBeforeWrite++; + NumOmittedReadsBeforeWrite++; continue; } if (addrPointsToConstantData(Addr)) { @@ -225,12 +235,27 @@ void ThreadSanitizer::choseInstructionsToInstrument( Local.clear(); } +static bool isAtomic(Instruction *I) { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) + return LI->isAtomic() && LI->getSynchScope() == CrossThread; + if (StoreInst *SI = dyn_cast<StoreInst>(I)) + return SI->isAtomic() && SI->getSynchScope() == CrossThread; + if (isa<AtomicRMWInst>(I)) + return true; + if (isa<AtomicCmpXchgInst>(I)) + return true; + if (FenceInst *FI = dyn_cast<FenceInst>(I)) + return FI->getSynchScope() == CrossThread; + return false; +} + bool ThreadSanitizer::runOnFunction(Function &F) { if (!TD) return false; if (BL->isIn(F)) return false; SmallVector<Instruction*, 8> RetVec; SmallVector<Instruction*, 8> AllLoadsAndStores; SmallVector<Instruction*, 8> LocalLoadsAndStores; + SmallVector<Instruction*, 8> AtomicAccesses; bool Res = false; bool HasCalls = false; @@ -240,16 +265,18 @@ bool ThreadSanitizer::runOnFunction(Function &F) { BasicBlock &BB = *FI; for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE; ++BI) { - if (isa<LoadInst>(BI) || isa<StoreInst>(BI)) + if (isAtomic(BI)) + AtomicAccesses.push_back(BI); + else if (isa<LoadInst>(BI) || isa<StoreInst>(BI)) LocalLoadsAndStores.push_back(BI); else if (isa<ReturnInst>(BI)) RetVec.push_back(BI); else if (isa<CallInst>(BI) || isa<InvokeInst>(BI)) { HasCalls = true; - choseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores); + chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores); } } - choseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores); + chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores); } // We have collected all loads and stores. 
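Taken together, the getOrInsertFunction calls in doInitialization above declare a run-time interface whose C shape is roughly the following (a sketch: parameter names are illustrative, the 4-byte / 32-bit entries stand in for the whole family of sizes 1, 2, 4, 8 and 16 bytes, and the last integer argument carries the encoded memory order):

#include <cstdint>

extern "C" {
void __tsan_func_entry(void *pc);
void __tsan_func_exit(void);

void __tsan_read4(void *addr);     // __tsan_read1/2/8/16 declared the same way
void __tsan_write4(void *addr);    // __tsan_write1/2/8/16 declared the same way

int32_t __tsan_atomic32_load(int32_t *addr, int32_t order);
void    __tsan_atomic32_store(int32_t *addr, int32_t value, int32_t order);

void __tsan_vptr_update(void *vptr, void *new_value);
}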
@@ -261,6 +288,11 @@ bool ThreadSanitizer::runOnFunction(Function &F) { Res |= instrumentLoadOrStore(AllLoadsAndStores[i]); } + // Instrument atomic memory accesses. + for (size_t i = 0, n = AtomicAccesses.size(); i < n; ++i) { + Res |= instrumentAtomic(AtomicAccesses[i]); + } + // Instrument function entry/exit points if there were instrumented accesses. if (Res || HasCalls) { IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI()); @@ -283,29 +315,98 @@ bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I) { Value *Addr = IsWrite ? cast<StoreInst>(I)->getPointerOperand() : cast<LoadInst>(I)->getPointerOperand(); - Type *OrigPtrTy = Addr->getType(); - Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType(); - assert(OrigTy->isSized()); - uint32_t TypeSize = TD->getTypeStoreSizeInBits(OrigTy); - if (TypeSize != 8 && TypeSize != 16 && - TypeSize != 32 && TypeSize != 64 && TypeSize != 128) { - stats.NumAccessesWithBadSize++; - // Ignore all unusual sizes. + int Idx = getMemoryAccessFuncIndex(Addr); + if (Idx < 0) return false; - } if (IsWrite && isVtableAccess(I)) { + DEBUG(dbgs() << " VPTR : " << *I << "\n"); Value *StoredValue = cast<StoreInst>(I)->getValueOperand(); + // StoredValue does not necessary have a pointer type. + if (isa<IntegerType>(StoredValue->getType())) + StoredValue = IRB.CreateIntToPtr(StoredValue, IRB.getInt8PtrTy()); + // Call TsanVptrUpdate. IRB.CreateCall2(TsanVptrUpdate, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), IRB.CreatePointerCast(StoredValue, IRB.getInt8PtrTy())); - stats.NumInstrumentedVtableWrites++; + NumInstrumentedVtableWrites++; return true; } - size_t Idx = CountTrailingZeros_32(TypeSize / 8); - assert(Idx < kNumberOfAccessSizes); Value *OnAccessFunc = IsWrite ? TsanWrite[Idx] : TsanRead[Idx]; IRB.CreateCall(OnAccessFunc, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy())); - if (IsWrite) stats.NumInstrumentedWrites++; - else stats.NumInstrumentedReads++; + if (IsWrite) NumInstrumentedWrites++; + else NumInstrumentedReads++; + return true; +} + +static ConstantInt *createOrdering(IRBuilder<> *IRB, AtomicOrdering ord) { + uint32_t v = 0; + switch (ord) { + case NotAtomic: assert(false); + case Unordered: // Fall-through. + case Monotonic: v = 1 << 0; break; + // case Consume: v = 1 << 1; break; // Not specified yet. 
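For a concrete picture of the rewrite that instrumentAtomic (further below) performs with this encoding, here is the shape of an instrumented acquire load written as C++ (a sketch; the function name is illustrative, 1 << 2 is the Acquire value from the cases that follow, and the 32 in the callback name is what getMemoryAccessFuncIndex derives from the access width: 32 bits -> 4 bytes -> slot 2):

#include <cstdint>

extern "C" int32_t __tsan_atomic32_load(int32_t *addr, int32_t order);

int32_t loadAcquire(int32_t *p) {
  // was:  %v = load atomic i32* %p acquire, align 4
  // now:  a call, installed with ReplaceInstWithInst
  return __tsan_atomic32_load(p, 1 << 2);
}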
+ case Acquire: v = 1 << 2; break; + case Release: v = 1 << 3; break; + case AcquireRelease: v = 1 << 4; break; + case SequentiallyConsistent: v = 1 << 5; break; + } + return IRB->getInt32(v); +} + +bool ThreadSanitizer::instrumentAtomic(Instruction *I) { + IRBuilder<> IRB(I); + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + Value *Addr = LI->getPointerOperand(); + int Idx = getMemoryAccessFuncIndex(Addr); + if (Idx < 0) + return false; + const size_t ByteSize = 1 << Idx; + const size_t BitSize = ByteSize * 8; + Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize); + Type *PtrTy = Ty->getPointerTo(); + Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), + createOrdering(&IRB, LI->getOrdering())}; + CallInst *C = CallInst::Create(TsanAtomicLoad[Idx], + ArrayRef<Value*>(Args)); + ReplaceInstWithInst(I, C); + + } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) { + Value *Addr = SI->getPointerOperand(); + int Idx = getMemoryAccessFuncIndex(Addr); + if (Idx < 0) + return false; + const size_t ByteSize = 1 << Idx; + const size_t BitSize = ByteSize * 8; + Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize); + Type *PtrTy = Ty->getPointerTo(); + Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy), + IRB.CreateIntCast(SI->getValueOperand(), Ty, false), + createOrdering(&IRB, SI->getOrdering())}; + CallInst *C = CallInst::Create(TsanAtomicStore[Idx], + ArrayRef<Value*>(Args)); + ReplaceInstWithInst(I, C); + } else if (isa<AtomicRMWInst>(I)) { + // FIXME: Not yet supported. + } else if (isa<AtomicCmpXchgInst>(I)) { + // FIXME: Not yet supported. + } else if (isa<FenceInst>(I)) { + // FIXME: Not yet supported. + } return true; } + +int ThreadSanitizer::getMemoryAccessFuncIndex(Value *Addr) { + Type *OrigPtrTy = Addr->getType(); + Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType(); + assert(OrigTy->isSized()); + uint32_t TypeSize = TD->getTypeStoreSizeInBits(OrigTy); + if (TypeSize != 8 && TypeSize != 16 && + TypeSize != 32 && TypeSize != 64 && TypeSize != 128) { + NumAccessesWithBadSize++; + // Ignore all unusual sizes. + return -1; + } + size_t Idx = CountTrailingZeros_32(TypeSize / 8); + assert(Idx < kNumberOfAccessSizes); + return Idx; +} diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp index ba214d1..b344952 100644 --- a/lib/Transforms/Scalar/ADCE.cpp +++ b/lib/Transforms/Scalar/ADCE.cpp @@ -9,7 +9,7 @@ // // This file implements the Aggressive Dead Code Elimination pass. This pass // optimistically assumes that all instructions are dead until proven otherwise, -// allowing it to eliminate dead computations that other DCE passes do not +// allowing it to eliminate dead computations that other DCE passes do not // catch, particularly involving loop computations. // //===----------------------------------------------------------------------===// @@ -36,13 +36,13 @@ namespace { ADCE() : FunctionPass(ID) { initializeADCEPass(*PassRegistry::getPassRegistry()); } - + virtual bool runOnFunction(Function& F); - + virtual void getAnalysisUsage(AnalysisUsage& AU) const { AU.setPreservesCFG(); } - + }; } @@ -52,7 +52,7 @@ INITIALIZE_PASS(ADCE, "adce", "Aggressive Dead Code Elimination", false, false) bool ADCE::runOnFunction(Function& F) { SmallPtrSet<Instruction*, 128> alive; SmallVector<Instruction*, 128> worklist; - + // Collect the set of "root" instructions that are known live. 
for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) if (isa<TerminatorInst>(I.getInstructionIterator()) || @@ -62,7 +62,7 @@ bool ADCE::runOnFunction(Function& F) { alive.insert(I.getInstructionIterator()); worklist.push_back(I.getInstructionIterator()); } - + // Propagate liveness backwards to operands. while (!worklist.empty()) { Instruction* curr = worklist.pop_back_val(); @@ -72,7 +72,7 @@ bool ADCE::runOnFunction(Function& F) { if (alive.insert(Inst)) worklist.push_back(Inst); } - + // The inverse of the live set is the dead set. These are those instructions // which have no side effects and do not influence the control flow or return // value of the function, and may therefore be deleted safely. @@ -82,7 +82,7 @@ bool ADCE::runOnFunction(Function& F) { worklist.push_back(I.getInstructionIterator()); I->dropAllReferences(); } - + for (SmallVector<Instruction*, 1024>::iterator I = worklist.begin(), E = worklist.end(); I != E; ++I) { ++NumRemoved; diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index d660c72..a01e066 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -32,3 +32,5 @@ add_llvm_library(LLVMScalarOpts Sink.cpp TailRecursionElimination.cpp ) + +add_dependencies(LLVMScalarOpts intrinsics_gen) diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp index 9a5423f..bc87106 100644 --- a/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -18,32 +18,32 @@ #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" +#include "llvm/IRBuilder.h" #include "llvm/InlineAsm.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/Pass.h" -#include "llvm/Analysis/Dominators.h" -#include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/ProfileInfo.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/Target/TargetLowering.h" -#include "llvm/Transforms/Utils/AddrModeMatcher.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/ProfileInfo.h" #include "llvm/Assembly/Writer.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GetElementPtrTypeIterator.h" #include "llvm/Support/PatternMatch.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/ValueHandle.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Transforms/Utils/AddrModeMatcher.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/BuildLibCalls.h" +#include "llvm/Transforms/Utils/Local.h" using namespace llvm; using namespace llvm::PatternMatch; @@ -60,6 +60,7 @@ STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads"); STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized"); STATISTIC(NumRetsDup, "Number of return instructions duplicated"); STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved"); 
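For reference, the kind of code the ADCE walk above is aimed at (a sketch; names are illustrative): only instructions that reach a root stay live, so a loop-carried value that feeds nothing but itself is deleted even though every one of its instructions has a use.

int sumLive(const int *a, int n) {
  int sum = 0;
  int junk = 0;
  for (int i = 0; i < n; ++i) {
    sum += a[i];               // reaches the return (a root), stays live
    junk = junk * 3 + a[i];    // only feeds itself around the loop: removed
  }
  return sum;
}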
+STATISTIC(NumSelectsExpanded, "Number of selects turned into branches"); static cl::opt<bool> DisableBranchOpts( "disable-cgp-branch-opts", cl::Hidden, cl::init(false), @@ -70,6 +71,10 @@ static cl::opt<bool> DisableDeleteDeadBlocks( "disable-cgp-delete-dead-blocks", cl::Hidden, cl::init(false), cl::desc("Disable deleting dead blocks in CodeGenPrepare")); +static cl::opt<bool> DisableSelectToBranch( + "disable-cgp-select2branch", cl::Hidden, cl::init(false), + cl::desc("Disable select to branch conversion.")); + namespace { class CodeGenPrepare : public FunctionPass { /// TLI - Keep a pointer of a TargetLowering to consult for determining @@ -78,7 +83,7 @@ namespace { const TargetLibraryInfo *TLInfo; DominatorTree *DT; ProfileInfo *PFI; - + /// CurInstIterator - As we scan instructions optimizing them, this is the /// next instruction to optimize. Xforms that can invalidate this should /// update it. @@ -93,6 +98,9 @@ namespace { /// be updated. bool ModifiedDT; + /// OptSize - True if optimizing for size. + bool OptSize; + public: static char ID; // Pass identification, replacement for typeid explicit CodeGenPrepare(const TargetLowering *tli = 0) @@ -108,6 +116,7 @@ namespace { } private: + bool EliminateFallThrough(Function &F); bool EliminateMostlyEmptyBlocks(Function &F); bool CanMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const; void EliminateMostlyEmptyBlock(BasicBlock *BB); @@ -118,6 +127,7 @@ namespace { bool OptimizeCallInst(CallInst *CI); bool MoveExtToFormExtLoad(Instruction *I); bool OptimizeExtUses(Instruction *I); + bool OptimizeSelectInst(SelectInst *SI); bool DupRetToEnableTailCallOpts(ReturnInst *RI); bool PlaceDbgValues(Function &F); }; @@ -141,13 +151,14 @@ bool CodeGenPrepare::runOnFunction(Function &F) { TLInfo = &getAnalysis<TargetLibraryInfo>(); DT = getAnalysisIfAvailable<DominatorTree>(); PFI = getAnalysisIfAvailable<ProfileInfo>(); + OptSize = F.hasFnAttr(Attribute::OptimizeForSize); // First pass, eliminate blocks that contain only PHI nodes and an // unconditional branch. EverMadeChange |= EliminateMostlyEmptyBlocks(F); // llvm.dbg.value is far away from the value then iSel may not be able - // handle it properly. iSel will drop llvm.dbg.value if it can not + // handle it properly. iSel will drop llvm.dbg.value if it can not // find a node corresponding to the value. EverMadeChange |= PlaceDbgValues(F); @@ -182,6 +193,11 @@ bool CodeGenPrepare::runOnFunction(Function &F) { I = WorkList.begin(), E = WorkList.end(); I != E; ++I) DeleteDeadBlock(*I); + // Merge pairs of basic blocks with unconditional branches, connected by + // a single edge. + if (EverMadeChange || MadeChange) + MadeChange |= EliminateFallThrough(F); + if (MadeChange) ModifiedDT = true; EverMadeChange |= MadeChange; @@ -193,6 +209,39 @@ bool CodeGenPrepare::runOnFunction(Function &F) { return EverMadeChange; } +/// EliminateFallThrough - Merge basic blocks which are connected +/// by a single edge, where one of the basic blocks has a single successor +/// pointing to the other basic block, which has a single predecessor. +bool CodeGenPrepare::EliminateFallThrough(Function &F) { + bool Changed = false; + // Scan all of the blocks in the function, except for the entry block. + for (Function::iterator I = ++F.begin(), E = F.end(); I != E; ) { + BasicBlock *BB = I++; + // If the destination block has a single pred, then this is a trivial + // edge, just collapse it. 
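Pulled out on its own, the condition EliminateFallThrough checks in the loop below is (a sketch using the same LLVM API; the helper name is illustrative):

#include "llvm/BasicBlock.h"
#include "llvm/Instructions.h"
using namespace llvm;

// BB can be folded into its predecessor only when it has exactly one
// predecessor and that predecessor ends in an unconditional branch.
static bool isTrivialFallThrough(BasicBlock *BB) {
  BasicBlock *SinglePred = BB->getSinglePredecessor();
  if (!SinglePred || SinglePred == BB)
    return false;
  BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator());
  return Term && !Term->isConditional();
}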
+ BasicBlock *SinglePred = BB->getSinglePredecessor(); + + if (!SinglePred || SinglePred == BB) continue; + + BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator()); + if (Term && !Term->isConditional()) { + Changed = true; + // Remember if SinglePred was the entry block of the function. + // If so, we will need to move BB back to the entry position. + bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock(); + MergeBasicBlockIntoOnlyPred(BB, this); + + if (isEntry && BB != &BB->getParent()->getEntryBlock()) + BB->moveBefore(&BB->getParent()->getEntryBlock()); + + // We have erased a block. Update the iterator. + I = BB; + DEBUG(dbgs() << "Merged:\n"<< *SinglePred << "\n\n\n"); + } + } + return Changed; +} + /// EliminateMostlyEmptyBlocks - eliminate blocks that contain only PHI nodes, /// debug info directives, and an unconditional branch. Passes before isel /// (e.g. LSR/loopsimplify) often split edges in ways that are non-optimal for @@ -326,7 +375,7 @@ void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) { if (isEntry && BB != &BB->getParent()->getEntryBlock()) BB->moveBefore(&BB->getParent()->getEntryBlock()); - + DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n"); return; } @@ -537,7 +586,7 @@ protected: bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { BasicBlock *BB = CI->getParent(); - + // Lower inline assembly if we can. // If we found an inline asm expession, and if the target knows how to // lower it to normal LLVM code, do so now. @@ -554,19 +603,19 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { if (OptimizeInlineAsmInst(CI)) return true; } - + // Lower all uses of llvm.objectsize.* IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI); if (II && II->getIntrinsicID() == Intrinsic::objectsize) { bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1); Type *ReturnTy = CI->getType(); - Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL); - + Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL); + // Substituting this can cause recursive simplifications, which can // invalidate our iterator. Use a WeakVH to hold onto it in case this // happens. WeakVH IterHandle(CurInstIterator); - + replaceAndRecursivelySimplify(CI, RetVal, TLI ? TLI->getTargetData() : 0, TLInfo, ModifiedDT ? 0 : DT); @@ -594,13 +643,13 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) { // We'll need TargetData from here on out. const TargetData *TD = TLI ? TLI->getTargetData() : 0; if (!TD) return false; - + // Lower all default uses of _chk calls. This is very similar // to what InstCombineCalls does, but here we are only lowering calls // that have the default "don't know" as the objectsize. Anything else // should be left alone. CodeGenPrepareFortifiedLibCalls Simplifier; - return Simplifier.fold(CI, TD); + return Simplifier.fold(CI, TD, TLInfo); } /// DupRetToEnableTailCallOpts - Look for opportunities to duplicate return @@ -635,10 +684,18 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) { if (!TLI) return false; + PHINode *PN = 0; + BitCastInst *BCI = 0; Value *V = RI->getReturnValue(); - PHINode *PN = V ? 
dyn_cast<PHINode>(V) : NULL; - if (V && !PN) - return false; + if (V) { + BCI = dyn_cast<BitCastInst>(V); + if (BCI) + V = BCI->getOperand(0); + + PN = dyn_cast<PHINode>(V); + if (!PN) + return false; + } BasicBlock *BB = RI->getParent(); if (PN && PN->getParent() != BB) @@ -656,6 +713,9 @@ bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) { if (PN) { BasicBlock::iterator BI = BB->begin(); do { ++BI; } while (isa<DbgInfoIntrinsic>(BI)); + if (&*BI == BCI) + // Also skip over the bitcast. + ++BI; if (&*BI != RI) return false; } else { @@ -750,13 +810,13 @@ static bool IsNonLocalValue(Value *V, BasicBlock *BB) { bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, Type *AccessTy) { Value *Repl = Addr; - - // Try to collapse single-value PHI nodes. This is necessary to undo + + // Try to collapse single-value PHI nodes. This is necessary to undo // unprofitable PRE transformations. SmallVector<Value*, 8> worklist; SmallPtrSet<Value*, 16> Visited; worklist.push_back(Addr); - + // Use a worklist to iteratively look through PHI nodes, and ensure that // the addressing mode obtained from the non-PHI roots of the graph // are equivalent. @@ -768,20 +828,20 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, while (!worklist.empty()) { Value *V = worklist.back(); worklist.pop_back(); - + // Break use-def graph loops. if (!Visited.insert(V)) { Consensus = 0; break; } - + // For a PHI node, push all of its incoming values. if (PHINode *P = dyn_cast<PHINode>(V)) { for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i) worklist.push_back(P->getIncomingValue(i)); continue; } - + // For non-PHIs, determine the addressing mode being computed. SmallVector<Instruction*, 16> NewAddrModeInsts; ExtAddrMode NewAddrMode = @@ -816,15 +876,15 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, } continue; } - + Consensus = 0; break; } - + // If the addressing mode couldn't be determined, or if multiple different // ones were determined, bail out now. if (!Consensus) return false; - + // Check to see if any of the instructions supersumed by this addr mode are // non-local to I's BB. bool AnyNonLocal = false; @@ -933,7 +993,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // Use a WeakVH to hold onto it in case this happens. WeakVH IterHandle(CurInstIterator); BasicBlock *BB = CurInstIterator->getParent(); - + RecursivelyDeleteTriviallyDeadInstructions(Repl); if (IterHandle != CurInstIterator) { @@ -945,7 +1005,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // This address is now available for reassignment, so erase the table // entry; we don't want to match some completely different instruction. SunkAddrs[Addr] = 0; - } + } } ++NumMemoryInsts; return true; @@ -957,12 +1017,12 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, bool CodeGenPrepare::OptimizeInlineAsmInst(CallInst *CS) { bool MadeChange = false; - TargetLowering::AsmOperandInfoVector + TargetLowering::AsmOperandInfoVector TargetConstraints = TLI->ParseConstraints(CS); unsigned ArgNo = 0; for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) { TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i]; - + // Compute the constraint code and ConstraintType to use. 
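The return pattern that DupRetToEnableTailCallOpts now also recognizes, shown at the source level (a sketch; the types and functions are illustrative): the call results merge in a PHI that reaches the lone return through a pointer bitcast, and the pass skips that bitcast (along with leading debug intrinsics) before duplicating the return into each predecessor so the calls can become tail calls.

struct Base;
struct Derived;
extern "C" Derived *makeA(int);
extern "C" Derived *makeB(int);

Base *pick(bool flag, int v) {
  Derived *d = flag ? makeA(v) : makeB(v);   // PHI of two call results
  return reinterpret_cast<Base *>(d);        // bitcast feeding the return
}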
TLI->ComputeConstraintToUse(OpInfo, SDValue()); @@ -1091,6 +1151,79 @@ bool CodeGenPrepare::OptimizeExtUses(Instruction *I) { return MadeChange; } +/// isFormingBranchFromSelectProfitable - Returns true if a SelectInst should be +/// turned into an explicit branch. +static bool isFormingBranchFromSelectProfitable(SelectInst *SI) { + // FIXME: This should use the same heuristics as IfConversion to determine + // whether a select is better represented as a branch. This requires that + // branch probability metadata is preserved for the select, which is not the + // case currently. + + CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition()); + + // If the branch is predicted right, an out of order CPU can avoid blocking on + // the compare. Emit cmovs on compares with a memory operand as branches to + // avoid stalls on the load from memory. If the compare has more than one use + // there's probably another cmov or setcc around so it's not worth emitting a + // branch. + if (!Cmp) + return false; + + Value *CmpOp0 = Cmp->getOperand(0); + Value *CmpOp1 = Cmp->getOperand(1); + + // We check that the memory operand has one use to avoid uses of the loaded + // value directly after the compare, making branches unprofitable. + return Cmp->hasOneUse() && + ((isa<LoadInst>(CmpOp0) && CmpOp0->hasOneUse()) || + (isa<LoadInst>(CmpOp1) && CmpOp1->hasOneUse())); +} + + +bool CodeGenPrepare::OptimizeSelectInst(SelectInst *SI) { + // If we have a SelectInst that will likely profit from branch prediction, + // turn it into a branch. + if (DisableSelectToBranch || OptSize || !TLI || + !TLI->isPredictableSelectExpensive()) + return false; + + if (!SI->getCondition()->getType()->isIntegerTy(1) || + !isFormingBranchFromSelectProfitable(SI)) + return false; + + ModifiedDT = true; + + // First, we split the block containing the select into 2 blocks. + BasicBlock *StartBlock = SI->getParent(); + BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(SI)); + BasicBlock *NextBlock = StartBlock->splitBasicBlock(SplitPt, "select.end"); + + // Create a new block serving as the landing pad for the branch. + BasicBlock *SmallBlock = BasicBlock::Create(SI->getContext(), "select.mid", + NextBlock->getParent(), NextBlock); + + // Move the unconditional branch from the block with the select in it into our + // landing pad block. + StartBlock->getTerminator()->eraseFromParent(); + BranchInst::Create(NextBlock, SmallBlock); + + // Insert the real conditional branch based on the original condition. + BranchInst::Create(NextBlock, SmallBlock, SI->getCondition(), SI); + + // The select itself is replaced with a PHI Node. + PHINode *PN = PHINode::Create(SI->getType(), 2, "", NextBlock->begin()); + PN->takeName(SI); + PN->addIncoming(SI->getTrueValue(), StartBlock); + PN->addIncoming(SI->getFalseValue(), SmallBlock); + SI->replaceAllUsesWith(PN); + SI->eraseFromParent(); + + // Instruct OptimizeBlock to skip to the next block. + CurInstIterator = StartBlock->end(); + ++NumSelectsExpanded; + return true; +} + bool CodeGenPrepare::OptimizeInst(Instruction *I) { if (PHINode *P = dyn_cast<PHINode>(I)) { // It is possible for very late stage optimizations (such as SimplifyCFG) @@ -1104,7 +1237,7 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) { } return false; } - + if (CastInst *CI = dyn_cast<CastInst>(I)) { // If the source of the cast is a constant, then this should have // already been constant folded. 
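A source-level view of the new select-to-branch conversion above (a sketch; names are illustrative). The heuristic fires when the target reports selects as expensive, the condition is a one-use compare, and one compare operand is a one-use load; OptimizeSelectInst then splits the block and lets the two values meet again in a PHI at select.end.

// Before: the select typically becomes a cmov that stalls on the load.
int pickBefore(const int *p, int x, int a, int b) {
  return *p < x ? a : b;      // select i1 %cmp, i32 %a, i32 %b
}

// After the conversion the IR is shaped like this source: an explicit branch
// the predictor can speculate past, with the join PHI in select.end.
int pickAfter(const int *p, int x, int a, int b) {
  if (*p < x)
    return a;
  return b;
}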
The only reason NOT to constant fold @@ -1124,23 +1257,23 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) { } return false; } - + if (CmpInst *CI = dyn_cast<CmpInst>(I)) return OptimizeCmpExpression(CI); - + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { if (TLI) return OptimizeMemoryInst(I, I->getOperand(0), LI->getType()); return false; } - + if (StoreInst *SI = dyn_cast<StoreInst>(I)) { if (TLI) return OptimizeMemoryInst(I, SI->getOperand(1), SI->getOperand(0)->getType()); return false; } - + if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) { if (GEPI->hasAllZeroIndices()) { /// The GEP operand must be a pointer, so must its result -> BitCast @@ -1154,13 +1287,16 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) { } return false; } - + if (CallInst *CI = dyn_cast<CallInst>(I)) return OptimizeCallInst(CI); if (ReturnInst *RI = dyn_cast<ReturnInst>(I)) return DupRetToEnableTailCallOpts(RI); + if (SelectInst *SI = dyn_cast<SelectInst>(I)) + return OptimizeSelectInst(SI); + return false; } @@ -1179,7 +1315,7 @@ bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) { } // llvm.dbg.value is far away from the value then iSel may not be able -// handle it properly. iSel will drop llvm.dbg.value if it can not +// handle it properly. iSel will drop llvm.dbg.value if it can not // find a node corresponding to the value. bool CodeGenPrepare::PlaceDbgValues(Function &F) { bool MadeChange = false; diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index c8c5360..8b1283f 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -32,7 +32,7 @@ #include "llvm/Target/TargetData.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Support/Debug.h" -#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/STLExtras.h" using namespace llvm; @@ -71,7 +71,7 @@ namespace { bool HandleFree(CallInst *F); bool handleEndBlock(BasicBlock &BB); void RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, - SmallPtrSet<Value*, 16> &DeadStackObjects); + SmallSetVector<Value*, 16> &DeadStackObjects); virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); @@ -106,7 +106,7 @@ FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); } /// static void DeleteDeadInstruction(Instruction *I, MemoryDependenceAnalysis &MD, - SmallPtrSet<Value*, 16> *ValueSet = 0) { + SmallSetVector<Value*, 16> *ValueSet = 0) { SmallVector<Instruction*, 32> NowDeadInsts; NowDeadInsts.push_back(I); @@ -136,7 +136,7 @@ static void DeleteDeadInstruction(Instruction *I, DeadInst->eraseFromParent(); - if (ValueSet) ValueSet->erase(DeadInst); + if (ValueSet) ValueSet->remove(DeadInst); } while (!NowDeadInsts.empty()); } @@ -248,7 +248,7 @@ static bool isShortenable(Instruction *I) { // Don't shorten stores for now if (isa<StoreInst>(I)) return false; - + IntrinsicInst *II = cast<IntrinsicInst>(I); switch (II->getIntrinsicID()) { default: return false; @@ -275,33 +275,9 @@ static Value *getStoredPointerOperand(Instruction *I) { } static uint64_t getPointerSize(const Value *V, AliasAnalysis &AA) { - const TargetData *TD = AA.getTargetData(); - - if (const CallInst *CI = extractMallocCall(V)) { - if (const ConstantInt *C = dyn_cast<ConstantInt>(CI->getArgOperand(0))) - return C->getZExtValue(); - } - - if (TD == 0) - return AliasAnalysis::UnknownSize; - - if (const AllocaInst *A = dyn_cast<AllocaInst>(V)) { - // 
Get size information for the alloca - if (const ConstantInt *C = dyn_cast<ConstantInt>(A->getArraySize())) - return C->getZExtValue() * TD->getTypeAllocSize(A->getAllocatedType()); - } - - if (const Argument *A = dyn_cast<Argument>(V)) { - if (A->hasByValAttr()) - if (PointerType *PT = dyn_cast<PointerType>(A->getType())) - return TD->getTypeAllocSize(PT->getElementType()); - } - - if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) { - if (!GV->mayBeOverridden()) - return TD->getTypeAllocSize(GV->getType()->getElementType()); - } - + uint64_t Size; + if (getObjectSize(V, Size, AA.getTargetData())) + return Size; return AliasAnalysis::UnknownSize; } @@ -316,7 +292,7 @@ namespace { /// isOverwrite - Return 'OverwriteComplete' if a store to the 'Later' location /// completely overwrites a store to the 'Earlier' location. -/// 'OverwriteEnd' if the end of the 'Earlier' location is completely +/// 'OverwriteEnd' if the end of the 'Earlier' location is completely /// overwritten by 'Later', or 'OverwriteUnknown' if nothing can be determined static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, const AliasAnalysis::Location &Earlier, @@ -339,7 +315,7 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, if (AA.getTargetData() == 0 && Later.Ptr->getType() == Earlier.Ptr->getType()) return OverwriteComplete; - + return OverwriteUnknown; } @@ -402,10 +378,10 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, // // We have to be careful here as *Off is signed while *.Size is unsigned. if (EarlierOff >= LaterOff && - Later.Size > Earlier.Size && + Later.Size >= Earlier.Size && uint64_t(EarlierOff - LaterOff) + Earlier.Size <= Later.Size) return OverwriteComplete; - + // The other interesting case is if the later store overwrites the end of // the earlier store // @@ -544,11 +520,11 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { // If we find a write that is a) removable (i.e., non-volatile), b) is // completely obliterated by the store to 'Loc', and c) which we know that // 'Inst' doesn't load from, then we can remove it. - if (isRemovable(DepWrite) && + if (isRemovable(DepWrite) && !isPossibleSelfRead(Inst, Loc, DepWrite, *AA)) { - int64_t InstWriteOffset, DepWriteOffset; - OverwriteResult OR = isOverwrite(Loc, DepLoc, *AA, - DepWriteOffset, InstWriteOffset); + int64_t InstWriteOffset, DepWriteOffset; + OverwriteResult OR = isOverwrite(Loc, DepLoc, *AA, + DepWriteOffset, InstWriteOffset); if (OR == OverwriteComplete) { DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite << "\n KILLER: " << *Inst << '\n'); @@ -557,7 +533,7 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { DeleteDeadInstruction(DepWrite, *MD); ++NumFastStores; MadeChange = true; - + // DeleteDeadInstruction can delete the current instruction in loop // cases, reset BBI. 
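The offset form of the complete-overwrite test above, as a standalone predicate with a worked example (a sketch; the helper name is illustrative). The change from > to >= lets a later store of exactly the same size count as covering the earlier one.

#include <cstdint>

// The later store covers [LaterOff, LaterOff + LaterSize) and kills the
// earlier store at [EarlierOff, EarlierOff + EarlierSize) when it encloses it.
static bool laterCoversEarlier(int64_t EarlierOff, uint64_t EarlierSize,
                               int64_t LaterOff, uint64_t LaterSize) {
  return EarlierOff >= LaterOff &&
         LaterSize >= EarlierSize &&
         uint64_t(EarlierOff - LaterOff) + EarlierSize <= LaterSize;
}
// e.g. laterCoversEarlier(/*EarlierOff=*/4, /*EarlierSize=*/4,
//                         /*LaterOff=*/0,  /*LaterSize=*/8) == true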
BBI = Inst; @@ -575,16 +551,16 @@ bool DSE::runOnBasicBlock(BasicBlock &BB) { unsigned DepWriteAlign = DepIntrinsic->getAlignment(); if (llvm::isPowerOf2_64(InstWriteOffset) || ((DepWriteAlign != 0) && InstWriteOffset % DepWriteAlign == 0)) { - + DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW END: " - << *DepWrite << "\n KILLER (offset " - << InstWriteOffset << ", " + << *DepWrite << "\n KILLER (offset " + << InstWriteOffset << ", " << DepLoc.Size << ")" << *Inst << '\n'); - + Value* DepWriteLength = DepIntrinsic->getLength(); Value* TrimmedLength = ConstantInt::get(DepWriteLength->getType(), - InstWriteOffset - + InstWriteOffset - DepWriteOffset); DepIntrinsic->setLength(TrimmedLength); MadeChange = true; @@ -694,19 +670,18 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // Keep track of all of the stack objects that are dead at the end of the // function. - SmallPtrSet<Value*, 16> DeadStackObjects; + SmallSetVector<Value*, 16> DeadStackObjects; // Find all of the alloca'd pointers in the entry block. BasicBlock *Entry = BB.getParent()->begin(); for (BasicBlock::iterator I = Entry->begin(), E = Entry->end(); I != E; ++I) { - if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) - DeadStackObjects.insert(AI); + if (isa<AllocaInst>(I)) + DeadStackObjects.insert(I); // Okay, so these are dead heap objects, but if the pointer never escapes // then it's leaked by this function anyways. - if (CallInst *CI = extractMallocCall(I)) - if (!PointerMayBeCaptured(CI, true, true)) - DeadStackObjects.insert(CI); + else if (isAllocLikeFn(I) && !PointerMayBeCaptured(I, true, true)) + DeadStackObjects.insert(I); } // Treat byval arguments the same, stores to them are dead at the end of the @@ -723,14 +698,30 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // If we find a store, check to see if it points into a dead stack value. if (hasMemoryWrite(BBI) && isRemovable(BBI)) { // See through pointer-to-pointer bitcasts - Value *Pointer = GetUnderlyingObject(getStoredPointerOperand(BBI)); + SmallVector<Value *, 4> Pointers; + GetUnderlyingObjects(getStoredPointerOperand(BBI), Pointers); // Stores to stack values are valid candidates for removal. - if (DeadStackObjects.count(Pointer)) { + bool AllDead = true; + for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(), + E = Pointers.end(); I != E; ++I) + if (!DeadStackObjects.count(*I)) { + AllDead = false; + break; + } + + if (AllDead) { Instruction *Dead = BBI++; DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: " - << *Dead << "\n Object: " << *Pointer << '\n'); + << *Dead << "\n Objects: "; + for (SmallVectorImpl<Value *>::iterator I = Pointers.begin(), + E = Pointers.end(); I != E; ++I) { + dbgs() << **I; + if (llvm::next(I) != E) + dbgs() << ", "; + } + dbgs() << '\n'); // DCE instructions only used to calculate that store. DeleteDeadInstruction(Dead, *MD, &DeadStackObjects); @@ -749,17 +740,19 @@ bool DSE::handleEndBlock(BasicBlock &BB) { continue; } - if (AllocaInst *A = dyn_cast<AllocaInst>(BBI)) { - DeadStackObjects.erase(A); - continue; - } - - if (CallInst *CI = extractMallocCall(BBI)) { - DeadStackObjects.erase(CI); + if (isa<AllocaInst>(BBI)) { + // Remove allocas from the list of dead stack objects; there can't be + // any references before the definition. + DeadStackObjects.remove(BBI); continue; } if (CallSite CS = cast<Value>(BBI)) { + // Remove allocation function calls from the list of dead stack objects; + // there can't be any references before the definition. 
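The end-of-function case handleEndBlock removes, at the source level (a sketch; names are illustrative, and it assumes the alloca is still in memory form when DSE runs). With this patch, non-escaping allocation calls found by isAllocLikeFn are treated the same way as allocas.

int lastWriteIsDead(int x) {
  int local;           // alloca, tracked in DeadStackObjects
  local = x * 2;       // read below, so this store stays
  int r = local + 1;
  local = r * 3;       // nothing reads 'local' again and it never escapes:
                       // dead at the end of the function, removed
  return r;
}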
+ if (isAllocLikeFn(BBI)) + DeadStackObjects.remove(BBI); + // If this call does not access memory, it can't be loading any of our // pointers. if (AA->doesNotAccessMemory(CS)) @@ -768,7 +761,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { // If the call might load from any of our allocas, then any store above // the call is live. SmallVector<Value*, 8> LiveAllocas; - for (SmallPtrSet<Value*, 16>::iterator I = DeadStackObjects.begin(), + for (SmallSetVector<Value*, 16>::iterator I = DeadStackObjects.begin(), E = DeadStackObjects.end(); I != E; ++I) { // See if the call site touches it. AliasAnalysis::ModRefResult A = @@ -780,12 +773,12 @@ bool DSE::handleEndBlock(BasicBlock &BB) { for (SmallVector<Value*, 8>::iterator I = LiveAllocas.begin(), E = LiveAllocas.end(); I != E; ++I) - DeadStackObjects.erase(*I); + DeadStackObjects.remove(*I); // If all of the allocas were clobbered by the call then we're not going // to find anything else to process. if (DeadStackObjects.empty()) - return MadeChange; + break; continue; } @@ -827,7 +820,7 @@ bool DSE::handleEndBlock(BasicBlock &BB) { /// of the stack objects in the DeadStackObjects set. If so, they become live /// because the location is being loaded. void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, - SmallPtrSet<Value*, 16> &DeadStackObjects) { + SmallSetVector<Value*, 16> &DeadStackObjects) { const Value *UnderlyingPointer = GetUnderlyingObject(LoadedLoc.Ptr); // A constant can't be in the dead pointer set. @@ -837,12 +830,12 @@ void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, // If the kill pointer can be easily reduced to an alloca, don't bother doing // extraneous AA queries. if (isa<AllocaInst>(UnderlyingPointer) || isa<Argument>(UnderlyingPointer)) { - DeadStackObjects.erase(const_cast<Value*>(UnderlyingPointer)); + DeadStackObjects.remove(const_cast<Value*>(UnderlyingPointer)); return; } SmallVector<Value*, 16> NowLive; - for (SmallPtrSet<Value*, 16>::iterator I = DeadStackObjects.begin(), + for (SmallSetVector<Value*, 16>::iterator I = DeadStackObjects.begin(), E = DeadStackObjects.end(); I != E; ++I) { // See if the loaded location could alias the stack location. AliasAnalysis::Location StackLoc(*I, getPointerSize(*I, *AA)); @@ -852,5 +845,5 @@ void DSE::RemoveAccessedObjects(const AliasAnalysis::Location &LoadedLoc, for (SmallVector<Value*, 16>::iterator I = NowLive.begin(), E = NowLive.end(); I != E; ++I) - DeadStackObjects.erase(*I); + DeadStackObjects.remove(*I); } diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp index f3c92d6..9759549 100644 --- a/lib/Transforms/Scalar/EarlyCSE.cpp +++ b/lib/Transforms/Scalar/EarlyCSE.cpp @@ -39,7 +39,7 @@ static unsigned getHash(const void *V) { } //===----------------------------------------------------------------------===// -// SimpleValue +// SimpleValue //===----------------------------------------------------------------------===// namespace { @@ -47,16 +47,16 @@ namespace { /// scoped hash table. struct SimpleValue { Instruction *Inst; - + SimpleValue(Instruction *I) : Inst(I) { assert((isSentinel() || canHandle(I)) && "Inst can't be handled!"); } - + bool isSentinel() const { return Inst == DenseMapInfo<Instruction*>::getEmptyKey() || Inst == DenseMapInfo<Instruction*>::getTombstoneKey(); } - + static bool canHandle(Instruction *Inst) { // This can only handle non-void readnone functions. 
if (CallInst *CI = dyn_cast<CallInst>(Inst)) @@ -90,7 +90,7 @@ template<> struct DenseMapInfo<SimpleValue> { unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) { Instruction *Inst = Val.Inst; - + // Hash in all of the operands as pointers. unsigned Res = 0; for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i) @@ -126,13 +126,13 @@ bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) { if (LHS.isSentinel() || RHS.isSentinel()) return LHSI == RHSI; - + if (LHSI->getOpcode() != RHSI->getOpcode()) return false; return LHSI->isIdenticalTo(RHSI); } //===----------------------------------------------------------------------===// -// CallValue +// CallValue //===----------------------------------------------------------------------===// namespace { @@ -140,21 +140,21 @@ namespace { /// the scoped hash table. struct CallValue { Instruction *Inst; - + CallValue(Instruction *I) : Inst(I) { assert((isSentinel() || canHandle(I)) && "Inst can't be handled!"); } - + bool isSentinel() const { return Inst == DenseMapInfo<Instruction*>::getEmptyKey() || Inst == DenseMapInfo<Instruction*>::getTombstoneKey(); } - + static bool canHandle(Instruction *Inst) { // Don't value number anything that returns void. if (Inst->getType()->isVoidTy()) return false; - + CallInst *CI = dyn_cast<CallInst>(Inst); if (CI == 0 || !CI->onlyReadsMemory()) return false; @@ -168,7 +168,7 @@ namespace llvm { template<> struct isPodLike<CallValue> { static const bool value = true; }; - + template<> struct DenseMapInfo<CallValue> { static inline CallValue getEmptyKey() { return DenseMapInfo<Instruction*>::getEmptyKey(); @@ -189,7 +189,7 @@ unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) { "Cannot value number calls with metadata operands"); Res ^= getHash(Inst->getOperand(i)) << (i & 0xF); } - + // Mix in the opcode. return (Res << 1) ^ Inst->getOpcode(); } @@ -203,11 +203,11 @@ bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) { //===----------------------------------------------------------------------===// -// EarlyCSE pass. +// EarlyCSE pass. //===----------------------------------------------------------------------===// namespace { - + /// EarlyCSE - This pass does a simple depth-first walk over the dominator /// tree, eliminating trivially redundant instructions and using instsimplify /// to canonicalize things as it goes. It is intended to be fast and catch @@ -223,14 +223,14 @@ public: ScopedHashTableVal<SimpleValue, Value*> > AllocatorTy; typedef ScopedHashTable<SimpleValue, Value*, DenseMapInfo<SimpleValue>, AllocatorTy> ScopedHTType; - + /// AvailableValues - This scoped hash table contains the current values of /// all of our simple scalar expressions. As we walk down the domtree, we /// look to see if instructions are in this: if so, we replace them with what /// we find, otherwise we insert them so that dominated values can succeed in /// their lookup. ScopedHTType *AvailableValues; - + /// AvailableLoads - This scoped hash table contains the current values /// of loads. This allows us to get efficient access to dominating loads when /// we have a fully redundant load. In addition to the most recent load, we @@ -243,15 +243,15 @@ public: typedef ScopedHashTable<Value*, std::pair<Value*, unsigned>, DenseMapInfo<Value*>, LoadMapAllocator> LoadHTType; LoadHTType *AvailableLoads; - + /// AvailableCalls - This scoped hash table contains the current values /// of read-only call values. It uses the same generation count as loads. 
typedef ScopedHashTable<CallValue, std::pair<Value*, unsigned> > CallHTType; CallHTType *AvailableCalls; - + /// CurrentGeneration - This is the current generation of the memory value. unsigned CurrentGeneration; - + static char ID; explicit EarlyCSE() : FunctionPass(ID) { initializeEarlyCSEPass(*PassRegistry::getPassRegistry()); @@ -326,7 +326,7 @@ private: }; bool processNode(DomTreeNode *Node); - + // This transformation requires dominator postdominator info virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<DominatorTree>(); @@ -350,7 +350,7 @@ INITIALIZE_PASS_END(EarlyCSE, "early-cse", "Early CSE", false, false) bool EarlyCSE::processNode(DomTreeNode *Node) { BasicBlock *BB = Node->getBlock(); - + // If this block has a single predecessor, then the predecessor is the parent // of the domtree node and all of the live out memory values are still current // in this block. If this block has multiple predecessors, then they could @@ -359,20 +359,20 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // predecessors. if (BB->getSinglePredecessor() == 0) ++CurrentGeneration; - + /// LastStore - Keep track of the last non-volatile store that we saw... for /// as long as there in no instruction that reads memory. If we see a store /// to the same location, we delete the dead store. This zaps trivial dead /// stores which can occur in bitfield code among other things. StoreInst *LastStore = 0; - + bool Changed = false; // See if any instructions in the block can be eliminated. If so, do it. If // not, add them to AvailableValues. for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) { Instruction *Inst = I++; - + // Dead instructions should just be removed. if (isInstructionTriviallyDead(Inst)) { DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n'); @@ -381,7 +381,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { ++NumSimplify; continue; } - + // If the instruction can be simplified (e.g. X+0 = X) then replace it with // its simpler value. if (Value *V = SimplifyInstruction(Inst, TD, TLI, DT)) { @@ -392,7 +392,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { ++NumSimplify; continue; } - + // If this is a simple instruction that we can value number, process it. if (SimpleValue::canHandle(Inst)) { // See if the instruction has an available value. If so, use it. @@ -404,12 +404,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { ++NumCSE; continue; } - + // Otherwise, just remember that this value is available. AvailableValues->insert(Inst, Inst); continue; } - + // If this is a non-volatile load, process it. if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { // Ignore volatile loads. @@ -417,7 +417,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { LastStore = 0; continue; } - + // If we have an available version of this load, and if it is the right // generation, replace this instruction. std::pair<Value*, unsigned> InVal = @@ -431,18 +431,18 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { ++NumCSELoad; continue; } - + // Otherwise, remember that we have this instruction. AvailableLoads->insert(Inst->getOperand(0), std::pair<Value*, unsigned>(Inst, CurrentGeneration)); LastStore = 0; continue; } - + // If this instruction may read from memory, forget LastStore. if (Inst->mayReadFromMemory()) LastStore = 0; - + // If this is a read-only call, process it. 
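A compact picture of what one processNode walk over a single block eliminates (a sketch; names are illustrative, and it assumes the loads and stores are simple):

int accumulate(int *p, int a, int b) {
  int x = a + b;   // hashed as a SimpleValue
  int z = a + b;   // replaced by x
  int y = *p;      // recorded in AvailableLoads with the current generation
  int w = *p;      // replaced by y: same pointer, same generation
  *p = 1;          // remembered as LastStore
  *p = 2;          // same location, no intervening load: the store of 1 is
                   // deleted by the store handling further down in processNode
  int v = *p;      // forwarded from the store, i.e. replaced by 2
  return x + y + z + w + v;
}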
if (CallValue::canHandle(Inst)) { // If we have an available version of this call, and if it is the right @@ -457,19 +457,19 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { ++NumCSECall; continue; } - + // Otherwise, remember that we have this instruction. AvailableCalls->insert(Inst, std::pair<Value*, unsigned>(Inst, CurrentGeneration)); continue; } - + // Okay, this isn't something we can CSE at all. Check to see if it is // something that could modify memory. If so, our available memory values // cannot be used so bump the generation count. if (Inst->mayWriteToMemory()) { ++CurrentGeneration; - + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { // We do a trivial form of DSE if there are two stores to the same // location with no intervening loads. Delete the earlier store. @@ -483,7 +483,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { LastStore = 0; continue; } - + // Okay, we just invalidated anything we knew about loaded values. Try // to salvage *something* by remembering that the stored value is a live // version of the pointer. It is safe to forward from volatile stores @@ -491,7 +491,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) { // the store. AvailableLoads->insert(SI->getPointerOperand(), std::pair<Value*, unsigned>(SI->getValueOperand(), CurrentGeneration)); - + // Remember that this was the last store we saw for DSE. if (SI->isSimple()) LastStore = SI; @@ -509,7 +509,7 @@ bool EarlyCSE::runOnFunction(Function &F) { TD = getAnalysisIfAvailable<TargetData>(); TLI = &getAnalysis<TargetLibraryInfo>(); DT = &getAnalysis<DominatorTree>(); - + // Tables that the pass uses when walking the domtree. ScopedHTType AVTable; AvailableValues = &AVTable; @@ -517,7 +517,7 @@ bool EarlyCSE::runOnFunction(Function &F) { AvailableLoads = &LoadTable; CallHTType CallTable; AvailableCalls = &CallTable; - + CurrentGeneration = 0; bool Changed = false; diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index fb733ad..120175d 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -18,8 +18,15 @@ #define DEBUG_TYPE "gvn" #include "llvm/Transforms/Scalar.h" #include "llvm/GlobalVariable.h" +#include "llvm/IRBuilder.h" #include "llvm/IntrinsicInst.h" #include "llvm/LLVMContext.h" +#include "llvm/Metadata.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/Dominators.h" @@ -30,20 +37,14 @@ #include "llvm/Analysis/PHITransAddr.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Assembly/Writer.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/Hashing.h" -#include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/Statistic.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/PatternMatch.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; using namespace PatternMatch; @@ -59,6 +60,11 @@ static cl::opt<bool> 
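The store handling above performs a trivial form of DSE: the last simple store is remembered for as long as nothing reads memory, and a second store to the same pointer makes the earlier one dead. A sketch of that rule over a flat list of abstract operations; Op is a made-up stand-in, and volatile/atomic accesses are ignored here:

#include <optional>
#include <vector>

// One abstract operation per instruction: what it does to memory and, for
// loads/stores, which pointer it touches.
struct Op {
  enum Kind { Load, Store, OtherRead, OtherWrite } K;
  int Ptr;
  int Id;
};

// Returns the ids of stores made dead by a later store to the same pointer
// with no intervening read of memory.
std::vector<int> trivialDSE(const std::vector<Op> &Block) {
  std::vector<int> Dead;
  std::optional<Op> LastStore;
  for (const Op &I : Block) {
    if (I.K == Op::Load || I.K == Op::OtherRead) {
      LastStore.reset();               // a read may observe the last store
      continue;
    }
    if (I.K == Op::Store) {
      if (LastStore && LastStore->Ptr == I.Ptr)
        Dead.push_back(LastStore->Id); // overwritten before anyone read it
      LastStore = I;
    }
    // Op::OtherWrite: like in EarlyCSE, only memory reads clear LastStore.
  }
  return Dead;
}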
EnablePRE("enable-pre", cl::init(true), cl::Hidden); static cl::opt<bool> EnableLoadPRE("enable-load-pre", cl::init(true)); +// Maximum allowed recursion depth. +static cl::opt<uint32_t> +MaxRecurseDepth("max-recurse-depth", cl::Hidden, cl::init(1000), cl::ZeroOrMore, + cl::desc("Max recurse depth (default = 1000)")); + //===----------------------------------------------------------------------===// // ValueTable Class //===----------------------------------------------------------------------===// @@ -167,7 +173,7 @@ Expression ValueTable::create_expression(Instruction *I) { if (e.varargs[0] > e.varargs[1]) std::swap(e.varargs[0], e.varargs[1]); } - + if (CmpInst *C = dyn_cast<CmpInst>(I)) { // Sort the operand value numbers so x<y and y>x get the same value number. CmpInst::Predicate Predicate = C->getPredicate(); @@ -181,7 +187,7 @@ Expression ValueTable::create_expression(Instruction *I) { II != IE; ++II) e.varargs.push_back(*II); } - + return e; } @@ -385,7 +391,7 @@ uint32_t ValueTable::lookup_or_add(Value *V) { valueNumbering[V] = nextValueNumber; return nextValueNumber++; } - + Instruction* I = cast<Instruction>(V); Expression exp; switch (I->getOpcode()) { @@ -501,17 +507,17 @@ namespace { const TargetLibraryInfo *TLI; ValueTable VN; - + /// LeaderTable - A mapping from value numbers to lists of Value*'s that /// have that value number. Use findLeader to query it. struct LeaderTableEntry { Value *Val; - BasicBlock *BB; + const BasicBlock *BB; LeaderTableEntry *Next; }; DenseMap<uint32_t, LeaderTableEntry> LeaderTable; BumpPtrAllocator TableAllocator; - + SmallVector<Instruction*, 8> InstrsToErase; public: static char ID; // Pass identification, replacement for typeid @@ -521,14 +527,14 @@ namespace { } bool runOnFunction(Function &F); - + /// markInstructionForDeletion - This removes the specified instruction from /// our various maps and marks it for deletion. void markInstructionForDeletion(Instruction *I) { VN.erase(I); InstrsToErase.push_back(I); } - + const TargetData *getTargetData() const { return TD; } DominatorTree &getDominatorTree() const { return *DT; } AliasAnalysis *getAliasAnalysis() const { return VN.getAliasAnalysis(); } @@ -536,32 +542,32 @@ namespace { private: /// addToLeaderTable - Push a new Value to the LeaderTable onto the list for /// its value number. - void addToLeaderTable(uint32_t N, Value *V, BasicBlock *BB) { + void addToLeaderTable(uint32_t N, Value *V, const BasicBlock *BB) { LeaderTableEntry &Curr = LeaderTable[N]; if (!Curr.Val) { Curr.Val = V; Curr.BB = BB; return; } - + LeaderTableEntry *Node = TableAllocator.Allocate<LeaderTableEntry>(); Node->Val = V; Node->BB = BB; Node->Next = Curr.Next; Curr.Next = Node; } - + /// removeFromLeaderTable - Scan the list of values corresponding to a given - /// value number, and remove the given value if encountered. - void removeFromLeaderTable(uint32_t N, Value *V, BasicBlock *BB) { + /// value number, and remove the given instruction if encountered. 
+ void removeFromLeaderTable(uint32_t N, Instruction *I, BasicBlock *BB) { LeaderTableEntry* Prev = 0; LeaderTableEntry* Curr = &LeaderTable[N]; - while (Curr->Val != V || Curr->BB != BB) { + while (Curr->Val != I || Curr->BB != BB) { Prev = Curr; Curr = Curr->Next; } - + if (Prev) { Prev->Next = Curr->Next; } else { @@ -591,7 +597,7 @@ namespace { AU.addPreserved<DominatorTree>(); AU.addPreserved<AliasAnalysis>(); } - + // Helper fuctions // FIXME: eliminate or document these better @@ -602,13 +608,13 @@ namespace { void dump(DenseMap<uint32_t, Value*> &d); bool iterateOnFunction(Function &F); bool performPRE(Function &F); - Value *findLeader(BasicBlock *BB, uint32_t num); + Value *findLeader(const BasicBlock *BB, uint32_t num); void cleanupGlobalSets(); void verifyRemoved(const Instruction *I) const; bool splitCriticalEdges(); unsigned replaceAllDominatedUsesWith(Value *From, Value *To, - BasicBlock *Root); - bool propagateEquality(Value *LHS, Value *RHS, BasicBlock *Root); + const BasicBlock *Root); + bool propagateEquality(Value *LHS, Value *RHS, const BasicBlock *Root); }; char GVN::ID = 0; @@ -647,7 +653,11 @@ void GVN::dump(DenseMap<uint32_t, Value*>& d) { /// 3) we are speculating for this block and have used that to speculate for /// other blocks. static bool IsValueFullyAvailableInBlock(BasicBlock *BB, - DenseMap<BasicBlock*, char> &FullyAvailableBlocks) { + DenseMap<BasicBlock*, char> &FullyAvailableBlocks, + uint32_t RecurseDepth) { + if (RecurseDepth > MaxRecurseDepth) + return false; + // Optimistically assume that the block is fully available and check to see // if we already know about this block in one lookup. std::pair<DenseMap<BasicBlock*, char>::iterator, char> IV = @@ -673,7 +683,7 @@ static bool IsValueFullyAvailableInBlock(BasicBlock *BB, // If the value isn't fully available in one of our predecessors, then it // isn't fully available in this block either. Undo our previous // optimistic assumption and bail out. - if (!IsValueFullyAvailableInBlock(*PI, FullyAvailableBlocks)) + if (!IsValueFullyAvailableInBlock(*PI, FullyAvailableBlocks,RecurseDepth+1)) goto SpeculationFailure; return true; @@ -725,15 +735,15 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, StoredVal->getType()->isStructTy() || StoredVal->getType()->isArrayTy()) return false; - + // The store has to be at least as big as the load. if (TD.getTypeSizeInBits(StoredVal->getType()) < TD.getTypeSizeInBits(LoadTy)) return false; - + return true; } - + /// CoerceAvailableValueToLoadType - If we saw a store of a value to memory, and /// then a load from a must-aliased pointer of a different type, try to coerce @@ -741,80 +751,80 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, /// InsertPt is the place to insert new instructions. /// /// If we can't do it, return null. -static Value *CoerceAvailableValueToLoadType(Value *StoredVal, +static Value *CoerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy, Instruction *InsertPt, const TargetData &TD) { if (!CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, TD)) return 0; - + // If this is already the right type, just return it. Type *StoredValTy = StoredVal->getType(); - + uint64_t StoreSize = TD.getTypeSizeInBits(StoredValTy); uint64_t LoadSize = TD.getTypeSizeInBits(LoadedTy); - + // If the store and reload are the same size, we can always reuse it. if (StoreSize == LoadSize) { // Pointer to Pointer -> use bitcast. 
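The new RecurseDepth parameter above bounds the predecessor walk in IsValueFullyAvailableInBlock so a pathological CFG cannot overflow the stack. A simplified standalone model of that walk, with a made-up Block type, optimistic memoization, and a depth cap; the real code's speculation-failure fixup pass is left out:

#include <map>
#include <vector>

struct Block {
  std::vector<Block *> Preds;
  bool HasValue = false;     // the value is known to be available here
};

static const unsigned MaxDepth = 1000;

// Optimistically assume availability, then check all predecessors; give up
// once the recursion gets too deep.
bool fullyAvailable(Block *BB, std::map<Block *, bool> &Memo, unsigned Depth) {
  if (Depth > MaxDepth)
    return false;
  std::map<Block *, bool>::iterator It = Memo.find(BB);
  if (It != Memo.end())
    return It->second;
  Memo[BB] = true;                      // optimistic assumption
  if (BB->HasValue)
    return true;
  if (BB->Preds.empty())
    return Memo[BB] = false;            // reached a block with no definition
  for (Block *P : BB->Preds)
    if (!fullyAvailable(P, Memo, Depth + 1))
      return Memo[BB] = false;
  return true;
}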
if (StoredValTy->isPointerTy() && LoadedTy->isPointerTy()) return new BitCastInst(StoredVal, LoadedTy, "", InsertPt); - + // Convert source pointers to integers, which can be bitcast. if (StoredValTy->isPointerTy()) { StoredValTy = TD.getIntPtrType(StoredValTy->getContext()); StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt); } - + Type *TypeToCastTo = LoadedTy; if (TypeToCastTo->isPointerTy()) TypeToCastTo = TD.getIntPtrType(StoredValTy->getContext()); - + if (StoredValTy != TypeToCastTo) StoredVal = new BitCastInst(StoredVal, TypeToCastTo, "", InsertPt); - + // Cast to pointer if the load needs a pointer type. if (LoadedTy->isPointerTy()) StoredVal = new IntToPtrInst(StoredVal, LoadedTy, "", InsertPt); - + return StoredVal; } - + // If the loaded value is smaller than the available value, then we can // extract out a piece from it. If the available value is too small, then we // can't do anything. assert(StoreSize >= LoadSize && "CanCoerceMustAliasedValueToLoad fail"); - + // Convert source pointers to integers, which can be manipulated. if (StoredValTy->isPointerTy()) { StoredValTy = TD.getIntPtrType(StoredValTy->getContext()); StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt); } - + // Convert vectors and fp to integer, which can be manipulated. if (!StoredValTy->isIntegerTy()) { StoredValTy = IntegerType::get(StoredValTy->getContext(), StoreSize); StoredVal = new BitCastInst(StoredVal, StoredValTy, "", InsertPt); } - + // If this is a big-endian system, we need to shift the value down to the low // bits so that a truncate will work. if (TD.isBigEndian()) { Constant *Val = ConstantInt::get(StoredVal->getType(), StoreSize-LoadSize); StoredVal = BinaryOperator::CreateLShr(StoredVal, Val, "tmp", InsertPt); } - + // Truncate the integer to the right size now. Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadSize); StoredVal = new TruncInst(StoredVal, NewIntTy, "trunc", InsertPt); - + if (LoadedTy == NewIntTy) return StoredVal; - + // If the result is a pointer, inttoptr. if (LoadedTy->isPointerTy()) return new IntToPtrInst(StoredVal, LoadedTy, "inttoptr", InsertPt); - + // Otherwise, bitcast. return new BitCastInst(StoredVal, LoadedTy, "bitcast", InsertPt); } @@ -835,13 +845,13 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, // to transform them. We need to be able to bitcast to integer. if (LoadTy->isStructTy() || LoadTy->isArrayTy()) return -1; - + int64_t StoreOffset = 0, LoadOffset = 0; Value *StoreBase = GetPointerBaseWithConstantOffset(WritePtr, StoreOffset,TD); Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, TD); if (StoreBase != LoadBase) return -1; - + // If the load and store are to the exact same address, they should have been // a must alias. AA must have gotten confused. // FIXME: Study to see if/when this happens. One case is forwarding a memset @@ -856,18 +866,18 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, abort(); } #endif - + // If the load and store don't overlap at all, the store doesn't provide // anything to the load. In this case, they really don't alias at all, AA // must have gotten confused. uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy); - + if ((WriteSizeInBits & 7) | (LoadSize & 7)) return -1; uint64_t StoreSize = WriteSizeInBits >> 3; // Convert to bytes. 
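CoerceAvailableValueToLoadType above reuses a wider stored value for a narrower load by shifting on big-endian targets and then truncating. The same arithmetic on a plain 64-bit integer, as an illustrative sketch rather than the IR-level code:

#include <cassert>
#include <cstdint>

// A narrower load reads the low-order part of the stored integer, so on a
// big-endian target the value is shifted down before truncation.
uint64_t coerceToNarrower(uint64_t Stored, unsigned StoreBits,
                          unsigned LoadBits, bool BigEndian) {
  assert(LoadBits > 0 && LoadBits <= StoreBits && StoreBits <= 64);
  if (BigEndian)
    Stored >>= (StoreBits - LoadBits);
  if (LoadBits == 64)                                 // nothing to truncate
    return Stored;
  return Stored & ((uint64_t(1) << LoadBits) - 1);    // "truncate" to LoadBits
}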
LoadSize >>= 3; - - + + bool isAAFailure = false; if (StoreOffset < LoadOffset) isAAFailure = StoreOffset+int64_t(StoreSize) <= LoadOffset; @@ -885,7 +895,7 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, #endif return -1; } - + // If the Load isn't completely contained within the stored bits, we don't // have all the bits to feed it. We could do something crazy in the future // (issue a smaller load then merge the bits in) but this seems unlikely to be @@ -893,11 +903,11 @@ static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr, if (StoreOffset > LoadOffset || StoreOffset+StoreSize < LoadOffset+LoadSize) return -1; - + // Okay, we can do this transformation. Return the number of bytes into the // store that the load is. return LoadOffset-StoreOffset; -} +} /// AnalyzeLoadFromClobberingStore - This function is called when we have a /// memdep query of a load that ends up being a clobbering store. @@ -923,23 +933,23 @@ static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, // Cannot handle reading from store of first-class aggregate yet. if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy()) return -1; - + Value *DepPtr = DepLI->getPointerOperand(); uint64_t DepSize = TD.getTypeSizeInBits(DepLI->getType()); int R = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, TD); if (R != -1) return R; - + // If we have a load/load clobber an DepLI can be widened to cover this load, // then we should widen it! int64_t LoadOffs = 0; const Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, TD); unsigned LoadSize = TD.getTypeStoreSize(LoadTy); - + unsigned Size = MemoryDependenceAnalysis:: getLoadLoadClobberFullWidthSize(LoadBase, LoadOffs, LoadSize, DepLI, TD); if (Size == 0) return -1; - + return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size*8, TD); } @@ -958,29 +968,29 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, if (MI->getIntrinsicID() == Intrinsic::memset) return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(), MemSizeInBits, TD); - + // If we have a memcpy/memmove, the only case we can handle is if this is a // copy from constant memory. In that case, we can read directly from the // constant memory. MemTransferInst *MTI = cast<MemTransferInst>(MI); - + Constant *Src = dyn_cast<Constant>(MTI->getSource()); if (Src == 0) return -1; - + GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, &TD)); if (GV == 0 || !GV->isConstant()) return -1; - + // See if the access is within the bounds of the transfer. int Offset = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(), MemSizeInBits, TD); if (Offset == -1) return Offset; - + // Otherwise, see if we can constant fold a load from the constant with the // offset applied as appropriate. Src = ConstantExpr::getBitCast(Src, llvm::Type::getInt8PtrTy(Src->getContext())); - Constant *OffsetCst = + Constant *OffsetCst = ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); Src = ConstantExpr::getGetElementPtr(Src, OffsetCst); Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy)); @@ -988,7 +998,7 @@ static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr, return Offset; return -1; } - + /// GetStoreValueForLoad - This function is called when we have a /// memdep query of a load that ends up being a clobbering store. 
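AnalyzeLoadFromClobberingWrite above reduces the question to byte arithmetic on a shared base: the store must fully cover the load, and the answer is the load's byte offset inside the stored bytes. A standalone model of just that arithmetic, with an illustrative function name:

#include <cstdint>

// Store of StoreSize bytes at StoreOffset, load of LoadSize bytes at
// LoadOffset, both relative to the same base pointer. Returns the byte offset
// of the load inside the stored bytes, or -1 when the store does not fully
// cover the load.
int loadOffsetInStore(int64_t StoreOffset, uint64_t StoreSize,
                      int64_t LoadOffset, uint64_t LoadSize) {
  // No overlap at all: nothing can be forwarded.
  if (StoreOffset + int64_t(StoreSize) <= LoadOffset ||
      LoadOffset + int64_t(LoadSize) <= StoreOffset)
    return -1;
  // The load must be completely contained within the stored bytes.
  if (StoreOffset > LoadOffset ||
      StoreOffset + int64_t(StoreSize) < LoadOffset + int64_t(LoadSize))
    return -1;
  return int(LoadOffset - StoreOffset);
}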
This means @@ -999,32 +1009,32 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy, Instruction *InsertPt, const TargetData &TD){ LLVMContext &Ctx = SrcVal->getType()->getContext(); - + uint64_t StoreSize = (TD.getTypeSizeInBits(SrcVal->getType()) + 7) / 8; uint64_t LoadSize = (TD.getTypeSizeInBits(LoadTy) + 7) / 8; - + IRBuilder<> Builder(InsertPt->getParent(), InsertPt); - + // Compute which bits of the stored value are being used by the load. Convert // to an integer type to start with. if (SrcVal->getType()->isPointerTy()) SrcVal = Builder.CreatePtrToInt(SrcVal, TD.getIntPtrType(Ctx)); if (!SrcVal->getType()->isIntegerTy()) SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8)); - + // Shift the bits to the least significant depending on endianness. unsigned ShiftAmt; if (TD.isLittleEndian()) ShiftAmt = Offset*8; else ShiftAmt = (StoreSize-LoadSize-Offset)*8; - + if (ShiftAmt) SrcVal = Builder.CreateLShr(SrcVal, ShiftAmt); - + if (LoadSize != StoreSize) SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8)); - + return CoerceAvailableValueToLoadType(SrcVal, LoadTy, InsertPt, TD); } @@ -1051,14 +1061,14 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, NewLoadSize = NextPowerOf2(NewLoadSize); Value *PtrVal = SrcVal->getPointerOperand(); - + // Insert the new load after the old load. This ensures that subsequent // memdep queries will find the new load. We can't easily remove the old // load completely because it is already in the value numbering table. IRBuilder<> Builder(SrcVal->getParent(), ++BasicBlock::iterator(SrcVal)); - Type *DestPTy = + Type *DestPTy = IntegerType::get(LoadTy->getContext(), NewLoadSize*8); - DestPTy = PointerType::get(DestPTy, + DestPTy = PointerType::get(DestPTy, cast<PointerType>(PtrVal->getType())->getAddressSpace()); Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc()); PtrVal = Builder.CreateBitCast(PtrVal, DestPTy); @@ -1068,7 +1078,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n"); DEBUG(dbgs() << "TO: " << *NewLoad << "\n"); - + // Replace uses of the original load with the wider load. On a big endian // system, we need to shift down to get the relevant bits. Value *RV = NewLoad; @@ -1077,7 +1087,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, NewLoadSize*8-SrcVal->getType()->getPrimitiveSizeInBits()); RV = Builder.CreateTrunc(RV, SrcVal->getType()); SrcVal->replaceAllUsesWith(RV); - + // We would like to use gvn.markInstructionForDeletion here, but we can't // because the load is already memoized into the leader map table that GVN // tracks. It is potentially possible to remove the load from the table, @@ -1086,7 +1096,7 @@ static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, gvn.getMemDep().removeInstruction(SrcVal); SrcVal = NewLoad; } - + return GetStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, TD); } @@ -1100,7 +1110,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, uint64_t LoadSize = TD.getTypeSizeInBits(LoadTy)/8; IRBuilder<> Builder(InsertPt->getParent(), InsertPt); - + // We know that this method is only called when the mem transfer fully // provides the bits for the load. 
if (MemSetInst *MSI = dyn_cast<MemSetInst>(SrcInst)) { @@ -1109,9 +1119,9 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, Value *Val = MSI->getValue(); if (LoadSize != 1) Val = Builder.CreateZExt(Val, IntegerType::get(Ctx, LoadSize*8)); - + Value *OneElt = Val; - + // Splat the value out to the right number of bits. for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize; ) { // If we can double the number of bytes set, do it. @@ -1121,16 +1131,16 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, NumBytesSet <<= 1; continue; } - + // Otherwise insert one byte at a time. Value *ShVal = Builder.CreateShl(Val, 1*8); Val = Builder.CreateOr(OneElt, ShVal); ++NumBytesSet; } - + return CoerceAvailableValueToLoadType(Val, LoadTy, InsertPt, TD); } - + // Otherwise, this is a memcpy/memmove from a constant global. MemTransferInst *MTI = cast<MemTransferInst>(SrcInst); Constant *Src = cast<Constant>(MTI->getSource()); @@ -1139,7 +1149,7 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, // offset applied as appropriate. Src = ConstantExpr::getBitCast(Src, llvm::Type::getInt8PtrTy(Src->getContext())); - Constant *OffsetCst = + Constant *OffsetCst = ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset); Src = ConstantExpr::getGetElementPtr(Src, OffsetCst); Src = ConstantExpr::getBitCast(Src, PointerType::getUnqual(LoadTy)); @@ -1156,13 +1166,13 @@ struct AvailableValueInBlock { LoadVal, // A value produced by a load. MemIntrin // A memory intrinsic which is loaded from. }; - + /// V - The value that is live out of the block. PointerIntPair<Value *, 2, ValType> Val; - + /// Offset - The byte offset in Val that is interesting for the load query. unsigned Offset; - + static AvailableValueInBlock get(BasicBlock *BB, Value *V, unsigned Offset = 0) { AvailableValueInBlock Res; @@ -1182,7 +1192,7 @@ struct AvailableValueInBlock { Res.Offset = Offset; return Res; } - + static AvailableValueInBlock getLoad(BasicBlock *BB, LoadInst *LI, unsigned Offset = 0) { AvailableValueInBlock Res; @@ -1201,17 +1211,17 @@ struct AvailableValueInBlock { assert(isSimpleValue() && "Wrong accessor"); return Val.getPointer(); } - + LoadInst *getCoercedLoadValue() const { assert(isCoercedLoadValue() && "Wrong accessor"); return cast<LoadInst>(Val.getPointer()); } - + MemIntrinsic *getMemIntrinValue() const { assert(isMemIntrinValue() && "Wrong accessor"); return cast<MemIntrinsic>(Val.getPointer()); } - + /// MaterializeAdjustedValue - Emit code into this block to adjust the value /// defined here to the specified type. This handles various coercion cases. Value *MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const { @@ -1223,7 +1233,7 @@ struct AvailableValueInBlock { assert(TD && "Need target data to handle type mismatch case"); Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(), *TD); - + DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " " << *getSimpleValue() << '\n' << *Res << '\n' << "\n\n\n"); @@ -1235,7 +1245,7 @@ struct AvailableValueInBlock { } else { Res = GetLoadValueForLoad(Load, Offset, LoadTy, BB->getTerminator(), gvn); - + DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " " << *getCoercedLoadValue() << '\n' << *Res << '\n' << "\n\n\n"); @@ -1258,12 +1268,12 @@ struct AvailableValueInBlock { /// ConstructSSAForLoadSet - Given a set of loads specified by ValuesPerBlock, /// construct SSA form, allowing us to eliminate LI. 
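The memset path of GetMemInstValueForLoad above splats the set byte across the loaded width by repeatedly doubling the bytes already produced, then topping up one byte at a time. The same loop on a plain integer, assuming a load of at most eight bytes (splatByte is an illustrative name):

#include <cassert>
#include <cstdint>

// Replicate one memset byte across LoadSize bytes: double the bytes already
// set while that still fits, then append one byte at a time.
uint64_t splatByte(uint8_t Byte, unsigned LoadSize) {
  assert(LoadSize >= 1 && LoadSize <= 8);
  uint64_t Val = Byte;
  uint64_t OneElt = Byte;
  for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize;) {
    if (NumBytesSet * 2 <= LoadSize) {         // double what we have so far
      Val = (Val << (NumBytesSet * 8)) | Val;
      NumBytesSet <<= 1;
      continue;
    }
    Val = (Val << 8) | OneElt;                 // otherwise add a single byte
    ++NumBytesSet;
  }
  return Val;
}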
This returns the value /// that should be used at LI's definition site. -static Value *ConstructSSAForLoadSet(LoadInst *LI, +static Value *ConstructSSAForLoadSet(LoadInst *LI, SmallVectorImpl<AvailableValueInBlock> &ValuesPerBlock, GVN &gvn) { // Check for the fully redundant, dominating load case. In this case, we can // just use the dominating value directly. - if (ValuesPerBlock.size() == 1 && + if (ValuesPerBlock.size() == 1 && gvn.getDominatorTree().properlyDominates(ValuesPerBlock[0].BB, LI->getParent())) return ValuesPerBlock[0].MaterializeAdjustedValue(LI->getType(), gvn); @@ -1272,29 +1282,29 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, SmallVector<PHINode*, 8> NewPHIs; SSAUpdater SSAUpdate(&NewPHIs); SSAUpdate.Initialize(LI->getType(), LI->getName()); - + Type *LoadTy = LI->getType(); - + for (unsigned i = 0, e = ValuesPerBlock.size(); i != e; ++i) { const AvailableValueInBlock &AV = ValuesPerBlock[i]; BasicBlock *BB = AV.BB; - + if (SSAUpdate.HasValueForBlock(BB)) continue; SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LoadTy, gvn)); } - + // Perform PHI construction. Value *V = SSAUpdate.GetValueInMiddleOfBlock(LI->getParent()); - + // If new PHI nodes were created, notify alias analysis. if (V->getType()->isPointerTy()) { AliasAnalysis *AA = gvn.getAliasAnalysis(); - + for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) AA->copyValue(LI, NewPHIs[i]); - + // Now that we've copied information to the new PHIs, scan through // them again and inform alias analysis that we've added potentially // escaping uses to any values that are operands to these PHIs. @@ -1366,7 +1376,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { // the pointer operand of the load if PHI translation occurs. Make sure // to consider the right address. Value *Address = Deps[i].getAddress(); - + // If the dependence is to a store that writes to a superset of the bits // read by the load, we can extract the bits we need for the load from the // stored value. @@ -1382,7 +1392,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { } } } - + // Check to see if we have something like this: // load i32* P // load i8* (P+1) @@ -1394,7 +1404,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { int Offset = AnalyzeLoadFromClobberingLoad(LI->getType(), LI->getPointerOperand(), DepLI, *TD); - + if (Offset != -1) { ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB,DepLI, Offset)); @@ -1413,10 +1423,10 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { ValuesPerBlock.push_back(AvailableValueInBlock::getMI(DepBB, DepMI, Offset)); continue; - } + } } } - + UnavailableBlocks.push_back(DepBB); continue; } @@ -1426,14 +1436,14 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { Instruction *DepInst = DepInfo.getInst(); // Loading the allocation -> undef. - if (isa<AllocaInst>(DepInst) || isMalloc(DepInst) || + if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst) || // Loading immediately after lifetime begin -> undef. isLifetimeStart(DepInst)) { ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB, UndefValue::get(LI->getType()))); continue; } - + if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) { // Reject loads and stores that are to the same address but are of // different types if we have to. @@ -1451,7 +1461,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { S->getValueOperand())); continue; } - + if (LoadInst *LD = dyn_cast<LoadInst>(DepInst)) { // If the types mismatch and we can't handle it, reject reuse of the load. 
if (LD->getType() != LI->getType()) { @@ -1460,12 +1470,12 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { if (TD == 0 || !CanCoerceMustAliasedValueToLoad(LD, LI->getType(),*TD)){ UnavailableBlocks.push_back(DepBB); continue; - } + } } ValuesPerBlock.push_back(AvailableValueInBlock::getLoad(DepBB, LD)); continue; } - + UnavailableBlocks.push_back(DepBB); continue; } @@ -1479,7 +1489,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { // its value. Insert PHIs and remove the fully redundant value now. if (UnavailableBlocks.empty()) { DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n'); - + // Perform PHI construction. Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this); LI->replaceAllUsesWith(V); @@ -1522,10 +1532,10 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return false; if (Blockers.count(TmpBB)) return false; - + // If any of these blocks has more than one successor (i.e. if the edge we - // just traversed was critical), then there are other paths through this - // block along which the load may not be anticipated. Hoisting the load + // just traversed was critical), then there are other paths through this + // block along which the load may not be anticipated. Hoisting the load // above this block would be adding the load to execution paths along // which it was not previously executed. if (TmpBB->getTerminator()->getNumSuccessors() != 1) @@ -1570,7 +1580,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { for (pred_iterator PI = pred_begin(LoadBB), E = pred_end(LoadBB); PI != E; ++PI) { BasicBlock *Pred = *PI; - if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks)) { + if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0)) { continue; } PredLoads[Pred] = 0; @@ -1603,7 +1613,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { unsigned NumUnavailablePreds = PredLoads.size(); assert(NumUnavailablePreds != 0 && "Fully available value should be eliminated above!"); - + // If this load is unavailable in multiple predecessors, reject it. // FIXME: If we could restructure the CFG, we could make a common pred with // all the preds that don't have an available LI and insert a new load into @@ -1680,10 +1690,10 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { DEBUG(if (!NewInsts.empty()) dbgs() << "INSERTED " << NewInsts.size() << " INSTS: " << *NewInsts.back() << '\n'); - + // Assign value numbers to the new instructions. for (unsigned i = 0, e = NewInsts.size(); i != e; ++i) { - // FIXME: We really _ought_ to insert these value numbers into their + // FIXME: We really _ought_ to insert these value numbers into their // parent's availability map. However, in doing so, we risk getting into // ordering issues. If a block hasn't been processed yet, we would be // marking a value as AVAIL-IN, which isn't what we intend. @@ -1725,6 +1735,53 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return true; } +static void patchReplacementInstruction(Value *Repl, Instruction *I) { + // Patch the replacement so that it is not more restrictive than the value + // being replaced. 
+ BinaryOperator *Op = dyn_cast<BinaryOperator>(I); + BinaryOperator *ReplOp = dyn_cast<BinaryOperator>(Repl); + if (Op && ReplOp && isa<OverflowingBinaryOperator>(Op) && + isa<OverflowingBinaryOperator>(ReplOp)) { + if (ReplOp->hasNoSignedWrap() && !Op->hasNoSignedWrap()) + ReplOp->setHasNoSignedWrap(false); + if (ReplOp->hasNoUnsignedWrap() && !Op->hasNoUnsignedWrap()) + ReplOp->setHasNoUnsignedWrap(false); + } + if (Instruction *ReplInst = dyn_cast<Instruction>(Repl)) { + SmallVector<std::pair<unsigned, MDNode*>, 4> Metadata; + ReplInst->getAllMetadataOtherThanDebugLoc(Metadata); + for (int i = 0, n = Metadata.size(); i < n; ++i) { + unsigned Kind = Metadata[i].first; + MDNode *IMD = I->getMetadata(Kind); + MDNode *ReplMD = Metadata[i].second; + switch(Kind) { + default: + ReplInst->setMetadata(Kind, NULL); // Remove unknown metadata + break; + case LLVMContext::MD_dbg: + llvm_unreachable("getAllMetadataOtherThanDebugLoc returned a MD_dbg"); + case LLVMContext::MD_tbaa: + ReplInst->setMetadata(Kind, MDNode::getMostGenericTBAA(IMD, ReplMD)); + break; + case LLVMContext::MD_range: + ReplInst->setMetadata(Kind, MDNode::getMostGenericRange(IMD, ReplMD)); + break; + case LLVMContext::MD_prof: + llvm_unreachable("MD_prof in a non terminator instruction"); + break; + case LLVMContext::MD_fpmath: + ReplInst->setMetadata(Kind, MDNode::getMostGenericFPMath(IMD, ReplMD)); + break; + } + } + } +} + +static void patchAndReplaceAllUsesWith(Value *Repl, Instruction *I) { + patchReplacementInstruction(Repl, I); + I->replaceAllUsesWith(Repl); +} + /// processLoad - Attempt to eliminate a load, first by eliminating it /// locally, and then attempting non-local elimination if that fails. bool GVN::processLoad(LoadInst *L) { @@ -1738,7 +1795,7 @@ bool GVN::processLoad(LoadInst *L) { markInstructionForDeletion(L); return true; } - + // ... to a pointer that has been loaded from before... MemDepResult Dep = MD->getDependency(L); @@ -1764,7 +1821,7 @@ bool GVN::processLoad(LoadInst *L) { AvailVal = GetStoreValueForLoad(DepSI->getValueOperand(), Offset, L->getType(), L, *TD); } - + // Check to see if we have something like this: // load i32* P // load i8* (P+1) @@ -1774,14 +1831,14 @@ bool GVN::processLoad(LoadInst *L) { // we have the first instruction in the entry block. if (DepLI == L) return false; - + int Offset = AnalyzeLoadFromClobberingLoad(L->getType(), L->getPointerOperand(), DepLI, *TD); if (Offset != -1) AvailVal = GetLoadValueForLoad(DepLI, Offset, L->getType(), L, *this); } - + // If the clobbering value is a memset/memcpy/memmove, see if we can forward // a value on from it. if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(Dep.getInst())) { @@ -1791,11 +1848,11 @@ bool GVN::processLoad(LoadInst *L) { if (Offset != -1) AvailVal = GetMemInstValueForLoad(DepMI, Offset, L->getType(), L, *TD); } - + if (AvailVal) { DEBUG(dbgs() << "GVN COERCED INST:\n" << *Dep.getInst() << '\n' << *AvailVal << '\n' << *L << "\n\n\n"); - + // Replace the load! L->replaceAllUsesWith(AvailVal); if (AvailVal->getType()->isPointerTy()) @@ -1805,7 +1862,7 @@ bool GVN::processLoad(LoadInst *L) { return true; } } - + // If the value isn't available, don't do anything! if (Dep.isClobber()) { DEBUG( @@ -1835,7 +1892,7 @@ bool GVN::processLoad(LoadInst *L) { Instruction *DepInst = Dep.getInst(); if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) { Value *StoredVal = DepSI->getValueOperand(); - + // The store and load are to a must-aliased pointer, but they may not // actually have the same type. 
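The new patchReplacementInstruction above makes sure the surviving instruction is no more permissive than the one it replaces: nsw/nuw are cleared unless both had them, and metadata kinds are either merged (tbaa, range, fpmath) or dropped. A simplified model with plain structs rather than IR instructions; here a metadata kind is kept only when both values agree exactly, standing in for the getMostGeneric* merges:

#include <map>
#include <string>

// A replacement must not claim stricter guarantees than the instruction whose
// uses it takes over.
struct InstInfo {
  bool NSW = false, NUW = false;                // no-signed/unsigned-wrap
  std::map<std::string, std::string> Metadata;  // kind -> payload
};

void patchReplacement(InstInfo &Repl, const InstInfo &I) {
  Repl.NSW = Repl.NSW && I.NSW;
  Repl.NUW = Repl.NUW && I.NUW;
  for (std::map<std::string, std::string>::iterator It = Repl.Metadata.begin();
       It != Repl.Metadata.end();) {
    std::map<std::string, std::string>::const_iterator Other =
        I.Metadata.find(It->first);
    // Keep a kind only when both values agree; the real pass instead merges
    // tbaa/range/fpmath and drops only the kinds it cannot merge.
    if (Other == I.Metadata.end() || Other->second != It->second)
      It = Repl.Metadata.erase(It);
    else
      ++It;
  }
}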
See if we know how to reuse the stored // value (depending on its type). @@ -1845,11 +1902,11 @@ bool GVN::processLoad(LoadInst *L) { L, *TD); if (StoredVal == 0) return false; - + DEBUG(dbgs() << "GVN COERCED STORE:\n" << *DepSI << '\n' << *StoredVal << '\n' << *L << "\n\n\n"); } - else + else return false; } @@ -1864,7 +1921,7 @@ bool GVN::processLoad(LoadInst *L) { if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) { Value *AvailableVal = DepLI; - + // The loads are of a must-aliased pointer, but they may not actually have // the same type. See if we know how to reuse the previously loaded value // (depending on its type). @@ -1874,16 +1931,16 @@ bool GVN::processLoad(LoadInst *L) { L, *TD); if (AvailableVal == 0) return false; - + DEBUG(dbgs() << "GVN COERCED LOAD:\n" << *DepLI << "\n" << *AvailableVal << "\n" << *L << "\n\n\n"); } - else + else return false; } - + // Remove it! - L->replaceAllUsesWith(AvailableVal); + patchAndReplaceAllUsesWith(AvailableVal, L); if (DepLI->getType()->isPointerTy()) MD->invalidateCachedPointerInfo(DepLI); markInstructionForDeletion(L); @@ -1894,13 +1951,13 @@ bool GVN::processLoad(LoadInst *L) { // If this load really doesn't depend on anything, then we must be loading an // undef value. This can happen when loading for a fresh allocation with no // intervening stores, for example. - if (isa<AllocaInst>(DepInst) || isMalloc(DepInst)) { + if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst)) { L->replaceAllUsesWith(UndefValue::get(L->getType())); markInstructionForDeletion(L); ++NumGVNLoad; return true; } - + // If this load occurs either right after a lifetime begin, // then the loaded value is undefined. if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(DepInst)) { @@ -1915,28 +1972,28 @@ bool GVN::processLoad(LoadInst *L) { return false; } -// findLeader - In order to find a leader for a given value number at a +// findLeader - In order to find a leader for a given value number at a // specific basic block, we first obtain the list of all Values for that number, -// and then scan the list to find one whose block dominates the block in +// and then scan the list to find one whose block dominates the block in // question. This is fast because dominator tree queries consist of only // a few comparisons of DFS numbers. -Value *GVN::findLeader(BasicBlock *BB, uint32_t num) { +Value *GVN::findLeader(const BasicBlock *BB, uint32_t num) { LeaderTableEntry Vals = LeaderTable[num]; if (!Vals.Val) return 0; - + Value *Val = 0; if (DT->dominates(Vals.BB, BB)) { Val = Vals.Val; if (isa<Constant>(Val)) return Val; } - + LeaderTableEntry* Next = Vals.Next; while (Next) { if (DT->dominates(Next->BB, BB)) { if (isa<Constant>(Next->Val)) return Next->Val; if (!Val) Val = Next->Val; } - + Next = Next->Next; } @@ -1947,7 +2004,7 @@ Value *GVN::findLeader(BasicBlock *BB, uint32_t num) { /// use is dominated by the given basic block. Returns the number of uses that /// were replaced. unsigned GVN::replaceAllDominatedUsesWith(Value *From, Value *To, - BasicBlock *Root) { + const BasicBlock *Root) { unsigned Count = 0; for (Value::use_iterator UI = From->use_begin(), UE = From->use_end(); UI != UE; ) { @@ -1973,7 +2030,7 @@ unsigned GVN::replaceAllDominatedUsesWith(Value *From, Value *To, /// propagateEquality - The given values are known to be equal in every block /// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with /// 'RHS' everywhere in the scope. Returns whether a change was made. 
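GVN::findLeader above scans the per-value-number leader list for a constant, or failing that the first entry whose defining block dominates the query block. A standalone model in which dominance is an abstract predicate rather than the DominatorTree API, and Leader is an illustrative type:

#include <vector>

// Per-value-number leader list: (value id, defining block, constant?).
struct Leader {
  int Value;
  int Block;
  bool IsConstant;
};

// Pick a leader usable in QueryBlock: any dominating constant wins outright,
// otherwise the first dominating entry. Returns -1 if none dominates.
int findLeader(const std::vector<Leader> &List, int QueryBlock,
               bool (*dominates)(int DefBlock, int UseBlock)) {
  int Found = -1;
  for (const Leader &L : List) {
    if (!dominates(L.Block, QueryBlock))
      continue;
    if (L.IsConstant)
      return L.Value;
    if (Found == -1)
      Found = L.Value;
  }
  return Found;
}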
-bool GVN::propagateEquality(Value *LHS, Value *RHS, BasicBlock *Root) { +bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlock *Root) { SmallVector<std::pair<Value*, Value*>, 4> Worklist; Worklist.push_back(std::make_pair(LHS, RHS)); bool Changed = false; @@ -2012,9 +2069,15 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, BasicBlock *Root) { DT->properlyDominates(cast<Instruction>(RHS)->getParent(), Root)) && "Instruction doesn't dominate scope!"); - // If value numbering later deduces that an instruction in the scope is equal - // to 'LHS' then ensure it will be turned into 'RHS'. - addToLeaderTable(LVN, RHS, Root); + // If value numbering later sees that an instruction in the scope is equal + // to 'LHS' then ensure it will be turned into 'RHS'. In order to preserve + // the invariant that instructions only occur in the leader table for their + // own value number (this is used by removeFromLeaderTable), do not do this + // if RHS is an instruction (if an instruction in the scope is morphed into + // LHS then it will be turned into RHS by the next GVN iteration anyway, so + // using the leader table is about compiling faster, not optimizing better). + if (!isa<Instruction>(RHS)) + addToLeaderTable(LVN, RHS, Root); // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope. As // LHS always has at least one use that is not dominated by Root, this will @@ -2180,7 +2243,7 @@ bool GVN::processInstruction(Instruction *I) { // Instructions with void type don't return a value, so there's // no point in trying to find redundancies in them. if (I->getType()->isVoidTy()) return false; - + uint32_t NextNum = VN.getNextUnusedValueNumber(); unsigned Num = VN.lookup_or_add(I); @@ -2198,7 +2261,7 @@ bool GVN::processInstruction(Instruction *I) { addToLeaderTable(Num, I, I->getParent()); return false; } - + // Perform fast-path value-number based elimination of values inherited from // dominators. Value *repl = findLeader(I->getParent(), Num); @@ -2207,9 +2270,9 @@ bool GVN::processInstruction(Instruction *I) { addToLeaderTable(Num, I, I->getParent()); return false; } - + // Remove it! - I->replaceAllUsesWith(repl); + patchAndReplaceAllUsesWith(repl, I); if (MD && repl->getType()->isPointerTy()) MD->invalidateCachedPointerInfo(repl); markInstructionForDeletion(I); @@ -2234,7 +2297,7 @@ bool GVN::runOnFunction(Function& F) { // optimization opportunities. for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) { BasicBlock *BB = FI++; - + bool removedBlock = MergeBlockIntoPredecessor(BB, this); if (removedBlock) ++NumGVNBlocks; @@ -2391,7 +2454,7 @@ bool GVN::performPRE(Function &F) { // we would need to insert instructions in more than one pred. if (NumWithout != 1 || NumWith == 0) continue; - + // Don't do PRE across indirect branch. 
if (isa<IndirectBrInst>(PREPred->getTerminator())) continue; @@ -2467,7 +2530,7 @@ bool GVN::performPRE(Function &F) { unsigned jj = PHINode::getOperandNumForIncomingValue(ii); VN.getAliasAnalysis()->addEscapingUse(Phi->getOperandUse(jj)); } - + if (MD) MD->invalidateCachedPointerInfo(Phi); } @@ -2504,7 +2567,7 @@ bool GVN::splitCriticalEdges() { /// iterateOnFunction - Executes one iteration of GVN bool GVN::iterateOnFunction(Function &F) { cleanupGlobalSets(); - + // Top-down walk of the dominator tree bool Changed = false; #if 0 @@ -2539,7 +2602,7 @@ void GVN::verifyRemoved(const Instruction *Inst) const { I = LeaderTable.begin(), E = LeaderTable.end(); I != E; ++I) { const LeaderTableEntry *Node = &I->second; assert(Node->Val != Inst && "Inst still in value numbering scope!"); - + while (Node->Next) { Node = Node->Next; assert(Node->Val != Inst && "Inst still in value numbering scope!"); diff --git a/lib/Transforms/Scalar/GlobalMerge.cpp b/lib/Transforms/Scalar/GlobalMerge.cpp index c2bd6e6..b36a3cb 100644 --- a/lib/Transforms/Scalar/GlobalMerge.cpp +++ b/lib/Transforms/Scalar/GlobalMerge.cpp @@ -12,7 +12,7 @@ // global). Such a transformation can significantly reduce the register pressure // when many globals are involved. // -// For example, consider the code which touches several global variables at +// For example, consider the code which touches several global variables at // once: // // static int foo[N], bar[N], baz[N]; @@ -208,8 +208,8 @@ bool GlobalMerge::doInitialization(Module &M) { if (BSSGlobals.size() > 1) Changed |= doMerge(BSSGlobals, M, false); - // FIXME: This currently breaks the EH processing due to way how the - // typeinfo detection works. We might want to detect the TIs and ignore + // FIXME: This currently breaks the EH processing due to way how the + // typeinfo detection works. We might want to detect the TIs and ignore // them in the future. // if (ConstGlobals.size() > 1) // Changed |= doMerge(ConstGlobals, M, true); diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index a9ba657..37f8bdf 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -1215,21 +1215,26 @@ static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L, DominatorTree *DT) { return 0; } -/// needsLFTR - LinearFunctionTestReplace policy. Return true unless we can show -/// that the current exit test is already sufficiently canonical. -static bool needsLFTR(Loop *L, DominatorTree *DT) { +/// Return the compare guarding the loop latch, or NULL for unrecognized tests. +static ICmpInst *getLoopTest(Loop *L) { assert(L->getExitingBlock() && "expected loop exit"); BasicBlock *LatchBlock = L->getLoopLatch(); // Don't bother with LFTR if the loop is not properly simplified. if (!LatchBlock) - return false; + return 0; BranchInst *BI = dyn_cast<BranchInst>(L->getExitingBlock()->getTerminator()); assert(BI && "expected exit branch"); + return dyn_cast<ICmpInst>(BI->getCondition()); +} + +/// needsLFTR - LinearFunctionTestReplace policy. Return true unless we can show +/// that the current exit test is already sufficiently canonical. +static bool needsLFTR(Loop *L, DominatorTree *DT) { // Do LFTR to simplify the exit condition to an ICMP. 
- ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition()); + ICmpInst *Cond = getLoopTest(L); if (!Cond) return true; @@ -1259,6 +1264,48 @@ static bool needsLFTR(Loop *L, DominatorTree *DT) { return Phi != getLoopPhiForCounter(IncV, L, DT); } +/// Recursive helper for hasConcreteDef(). Unfortunately, this currently boils +/// down to checking that all operands are constant and listing instructions +/// that may hide undef. +static bool hasConcreteDefImpl(Value *V, SmallPtrSet<Value*, 8> &Visited, + unsigned Depth) { + if (isa<Constant>(V)) + return !isa<UndefValue>(V); + + if (Depth >= 6) + return false; + + // Conservatively handle non-constant non-instructions. For example, Arguments + // may be undef. + Instruction *I = dyn_cast<Instruction>(V); + if (!I) + return false; + + // Load and return values may be undef. + if(I->mayReadFromMemory() || isa<CallInst>(I) || isa<InvokeInst>(I)) + return false; + + // Optimistically handle other instructions. + for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) { + if (!Visited.insert(*OI)) + continue; + if (!hasConcreteDefImpl(*OI, Visited, Depth+1)) + return false; + } + return true; +} + +/// Return true if the given value is concrete. We must prove that undef can +/// never reach it. +/// +/// TODO: If we decide that this is a good approach to checking for undef, we +/// may factor it into a common location. +static bool hasConcreteDef(Value *V) { + SmallPtrSet<Value*, 8> Visited; + Visited.insert(V); + return hasConcreteDefImpl(V, Visited, 0); +} + /// AlmostDeadIV - Return true if this IV has any uses other than the (soon to /// be rewritten) loop exit test. static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) { @@ -1283,6 +1330,8 @@ static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) { /// valid count without scaling the address stride, so it remains a pointer /// expression as far as SCEV is concerned. /// +/// Currently only valid for LFTR. See the comments on hasConcreteDef below. +/// /// FIXME: Accept -1 stride and set IVLimit = IVInit - BECount /// /// FIXME: Accept non-unit stride as long as SCEV can reduce BECount * Stride. @@ -1331,6 +1380,19 @@ FindLoopCounter(Loop *L, const SCEV *BECount, if (getLoopPhiForCounter(IncV, L, DT) != Phi) continue; + // Avoid reusing a potentially undef value to compute other values that may + // have originally had a concrete definition. + if (!hasConcreteDef(Phi)) { + // We explicitly allow unknown phis as long as they are already used by + // the loop test. In this case we assume that performing LFTR could not + // increase the number of undef users. + if (ICmpInst *Cond = getLoopTest(L)) { + if (Phi != getLoopPhiForCounter(Cond->getOperand(0), L, DT) + && Phi != getLoopPhiForCounter(Cond->getOperand(1), L, DT)) { + continue; + } + } + } const SCEV *Init = AR->getStart(); if (BestPhi && !AlmostDeadIV(BestPhi, LatchBlock, Cond)) { @@ -1347,7 +1409,7 @@ FindLoopCounter(Loop *L, const SCEV *BECount, // If two IVs both count from zero or both count from nonzero then the // narrower is likely a dead phi that has been widened. Use the wider phi // to allow the other to be eliminated. 
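The new hasConcreteDef/hasConcreteDefImpl above try to prove that undef can never reach a value: constants are concrete unless they are undef, loads and calls may hide undef, and everything else is checked operand by operand under a small depth limit. A standalone sketch over a made-up Node type standing in for llvm::Value:

#include <set>
#include <vector>

// A constant, an undef, something that may hide undef (load, call), or an
// arithmetic node with operands.
struct Node {
  enum Kind { ConstVal, UndefVal, Load, Call, Arith } K;
  std::vector<const Node *> Ops;
};

static bool hasConcreteDefImpl(const Node *V, std::set<const Node *> &Visited,
                               unsigned Depth) {
  if (V->K == Node::ConstVal)
    return true;
  if (V->K == Node::UndefVal)
    return false;
  if (Depth >= 6)
    return false;                       // stay cheap: give up when too deep
  if (V->K == Node::Load || V->K == Node::Call)
    return false;                       // loads and call results may be undef
  for (const Node *Op : V->Ops) {
    if (!Visited.insert(Op).second)
      continue;                         // already checked this operand
    if (!hasConcreteDefImpl(Op, Visited, Depth + 1))
      return false;
  }
  return true;
}

bool hasConcreteDef(const Node *V) {
  std::set<const Node *> Visited;
  Visited.insert(V);
  return hasConcreteDefImpl(V, Visited, 0);
}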
- if (PhiWidth <= SE->getTypeSizeInBits(BestPhi->getType())) + else if (PhiWidth <= SE->getTypeSizeInBits(BestPhi->getType())) continue; } BestPhi = Phi; diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index 429b61b..dd42c59 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -670,6 +670,8 @@ bool JumpThreading::ProcessBlock(BasicBlock *BB) { } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Terminator)) { Condition = SI->getCondition(); } else if (IndirectBrInst *IB = dyn_cast<IndirectBrInst>(Terminator)) { + // Can't thread indirect branch with no successors. + if (IB->getNumSuccessors() == 0) return false; Condition = IB->getAddress()->stripPointerCasts(); Preference = WantBlockAddress; } else { @@ -859,7 +861,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { // If all of the loads and stores that feed the value have the same TBAA tag, // then we can propagate it onto any newly inserted loads. - MDNode *TBAATag = LI->getMetadata(LLVMContext::MD_tbaa); + MDNode *TBAATag = LI->getMetadata(LLVMContext::MD_tbaa); SmallPtrSet<BasicBlock*, 8> PredsScanned; typedef SmallVector<std::pair<BasicBlock*, Value*>, 8> AvailablePredsTy; @@ -885,7 +887,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { OneUnavailablePred = PredBB; continue; } - + // If tbaa tags disagree or are not present, forget about them. if (TBAATag != ThisTBAATag) TBAATag = 0; @@ -949,7 +951,7 @@ bool JumpThreading::SimplifyPartiallyRedundantLoad(LoadInst *LI) { NewVal->setDebugLoc(LI->getDebugLoc()); if (TBAATag) NewVal->setMetadata(LLVMContext::MD_tbaa, TBAATag); - + AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal)); } diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 8795cd8..0192e92 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -175,7 +175,9 @@ namespace { bool canSinkOrHoistInst(Instruction &I); bool isNotUsedInLoop(Instruction &I); - void PromoteAliasSet(AliasSet &AS); + void PromoteAliasSet(AliasSet &AS, + SmallVectorImpl<BasicBlock*> &ExitBlocks, + SmallVectorImpl<Instruction*> &InsertPts); }; } @@ -256,10 +258,13 @@ bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) { // Now that all loop invariants have been removed from the loop, promote any // memory references to scalars that we can. if (!DisablePromotion && Preheader && L->hasDedicatedExits()) { + SmallVector<BasicBlock *, 8> ExitBlocks; + SmallVector<Instruction *, 8> InsertPts; + // Loop over all of the alias sets in the tracker object. for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end(); I != E; ++I) - PromoteAliasSet(*I); + PromoteAliasSet(*I, ExitBlocks, InsertPts); } // Clear out loops state information for the next iteration @@ -618,6 +623,11 @@ bool LICM::isGuaranteedToExecute(Instruction &Inst) { if (!DT->dominates(Inst.getParent(), ExitBlocks[i])) return false; + // As a degenerate case, if the loop is statically infinite then we haven't + // proven anything since there are no exit blocks. + if (ExitBlocks.empty()) + return false; + return true; } @@ -626,6 +636,7 @@ namespace { Value *SomePtr; // Designated pointer to store to. 
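The TBAA handling in SimplifyPartiallyRedundantLoad above keeps a tag for the newly inserted loads only when every load and store feeding the value carries the very same tag. A sketch of that agreement check, with strings standing in for MDNode pointers and commonTBAATag an illustrative name:

#include <string>
#include <vector>

// One entry per load/store feeding the value; an empty string means the
// access carried no tag. The result is the tag to put on new loads, or ""
// if the tags disagree or any access had none.
std::string commonTBAATag(const std::vector<std::string> &Tags) {
  if (Tags.empty())
    return std::string();
  std::string Tag = Tags.front();
  for (const std::string &T : Tags)
    if (T.empty() || T != Tag)
      return std::string();
  return Tag;
}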
SmallPtrSet<Value*, 4> &PointerMustAliases; SmallVectorImpl<BasicBlock*> &LoopExitBlocks; + SmallVectorImpl<Instruction*> &LoopInsertPts; AliasSetTracker &AST; DebugLoc DL; int Alignment; @@ -633,11 +644,12 @@ namespace { LoopPromoter(Value *SP, const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, SmallPtrSet<Value*, 4> &PMA, - SmallVectorImpl<BasicBlock*> &LEB, AliasSetTracker &ast, - DebugLoc dl, int alignment) + SmallVectorImpl<BasicBlock*> &LEB, + SmallVectorImpl<Instruction*> &LIP, + AliasSetTracker &ast, DebugLoc dl, int alignment) : LoadAndStorePromoter(Insts, S), SomePtr(SP), - PointerMustAliases(PMA), LoopExitBlocks(LEB), AST(ast), DL(dl), - Alignment(alignment) {} + PointerMustAliases(PMA), LoopExitBlocks(LEB), LoopInsertPts(LIP), + AST(ast), DL(dl), Alignment(alignment) {} virtual bool isInstInList(Instruction *I, const SmallVectorImpl<Instruction*> &) const { @@ -657,7 +669,7 @@ namespace { for (unsigned i = 0, e = LoopExitBlocks.size(); i != e; ++i) { BasicBlock *ExitBlock = LoopExitBlocks[i]; Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock); - Instruction *InsertPos = ExitBlock->getFirstInsertionPt(); + Instruction *InsertPos = LoopInsertPts[i]; StoreInst *NewSI = new StoreInst(LiveInValue, SomePtr, InsertPos); NewSI->setAlignment(Alignment); NewSI->setDebugLoc(DL); @@ -679,7 +691,9 @@ namespace { /// looping over the stores in the loop, looking for stores to Must pointers /// which are loop invariant. /// -void LICM::PromoteAliasSet(AliasSet &AS) { +void LICM::PromoteAliasSet(AliasSet &AS, + SmallVectorImpl<BasicBlock*> &ExitBlocks, + SmallVectorImpl<Instruction*> &InsertPts) { // We can promote this alias set if it has a store, if it is a "Must" alias // set, if the pointer is loop invariant, and if we are not eliminating any // volatile loads or stores. @@ -789,14 +803,20 @@ void LICM::PromoteAliasSet(AliasSet &AS) { // location is better than none. DebugLoc DL = LoopUses[0]->getDebugLoc(); - SmallVector<BasicBlock*, 8> ExitBlocks; - CurLoop->getUniqueExitBlocks(ExitBlocks); + // Figure out the loop exits and their insertion points, if this is the + // first promotion. + if (ExitBlocks.empty()) { + CurLoop->getUniqueExitBlocks(ExitBlocks); + InsertPts.resize(ExitBlocks.size()); + for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) + InsertPts[i] = ExitBlocks[i]->getFirstInsertionPt(); + } // We use the SSAUpdater interface to insert phi nodes as required. SmallVector<PHINode*, 16> NewPHIs; SSAUpdater SSA(&NewPHIs); LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks, - *CurAST, DL, Alignment); + InsertPts, *CurAST, DL, Alignment); // Set up the preheader to have a definition of the value. It is the live-out // value from the preheader that uses in the loop will use. diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp index f7f3298..3771f5a 100644 --- a/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/lib/Transforms/Scalar/LoopDeletion.cpp @@ -32,10 +32,10 @@ namespace { LoopDeletion() : LoopPass(ID) { initializeLoopDeletionPass(*PassRegistry::getPassRegistry()); } - + // Possibly eliminate loop L if it is dead. 
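The LICM change above computes the loop's exit blocks and their insertion points once, on the first promotion, and hands the same vectors to every later PromoteAliasSet call for that loop. A sketch of that lazy caching, with integer ids and a caller-supplied computeExits standing in for getUniqueExitBlocks and getFirstInsertionPt:

#include <vector>

// Exit blocks and matching insertion points, filled on first use and then
// shared by every alias set promoted out of the same loop.
struct ExitCache {
  std::vector<int> ExitBlocks;   // block ids
  std::vector<int> InsertPts;    // one insertion point per exit block
};

const std::vector<int> &
exitInsertPoints(ExitCache &Cache,
                 void (*computeExits)(std::vector<int> &Blocks,
                                      std::vector<int> &Pts)) {
  if (Cache.ExitBlocks.empty())  // first promotion in this loop
    computeExits(Cache.ExitBlocks, Cache.InsertPts);
  return Cache.InsertPts;
}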
bool runOnLoop(Loop* L, LPPassManager& LPM); - + bool IsLoopDead(Loop* L, SmallVector<BasicBlock*, 4>& exitingBlocks, SmallVector<BasicBlock*, 4>& exitBlocks, bool &Changed, BasicBlock *Preheader); @@ -46,7 +46,7 @@ namespace { AU.addRequired<ScalarEvolution>(); AU.addRequiredID(LoopSimplifyID); AU.addRequiredID(LCSSAID); - + AU.addPreserved<ScalarEvolution>(); AU.addPreserved<DominatorTree>(); AU.addPreserved<LoopInfo>(); @@ -55,7 +55,7 @@ namespace { } }; } - + char LoopDeletion::ID = 0; INITIALIZE_PASS_BEGIN(LoopDeletion, "loop-deletion", "Delete dead loops", false, false) @@ -79,7 +79,7 @@ bool LoopDeletion::IsLoopDead(Loop* L, SmallVector<BasicBlock*, 4>& exitBlocks, bool &Changed, BasicBlock *Preheader) { BasicBlock* exitBlock = exitBlocks[0]; - + // Make sure that all PHI entries coming from the loop are loop invariant. // Because the code is in LCSSA form, any values used outside of the loop // must pass through a PHI in the exit block, meaning that this check is @@ -97,14 +97,14 @@ bool LoopDeletion::IsLoopDead(Loop* L, if (incoming != P->getIncomingValueForBlock(exitingBlocks[i])) return false; } - + if (Instruction* I = dyn_cast<Instruction>(incoming)) if (!L->makeLoopInvariant(I, Changed, Preheader->getTerminator())) return false; ++BI; } - + // Make sure that no instructions in the block have potential side-effects. // This includes instructions that could write to memory, and loads that are // marked volatile. This could be made more aggressive by using aliasing @@ -117,23 +117,23 @@ bool LoopDeletion::IsLoopDead(Loop* L, return false; } } - + return true; } /// runOnLoop - Remove dead loops, by which we mean loops that do not impact the -/// observable behavior of the program other than finite running time. Note +/// observable behavior of the program other than finite running time. Note /// we do ensure that this never remove a loop that might be infinite, as doing /// so could change the halting/non-halting nature of a program. /// NOTE: This entire process relies pretty heavily on LoopSimplify and LCSSA /// in order to make various safety checks work. bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { - // We can only remove the loop if there is a preheader that we can + // We can only remove the loop if there is a preheader that we can // branch from after removing it. BasicBlock* preheader = L->getLoopPreheader(); if (!preheader) return false; - + // If LoopSimplify form is not available, stay out of trouble. if (!L->hasDedicatedExits()) return false; @@ -142,36 +142,36 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { // they would already have been removed in earlier executions of this pass. if (L->begin() != L->end()) return false; - + SmallVector<BasicBlock*, 4> exitingBlocks; L->getExitingBlocks(exitingBlocks); - + SmallVector<BasicBlock*, 4> exitBlocks; L->getUniqueExitBlocks(exitBlocks); - + // We require that the loop only have a single exit block. Otherwise, we'd // be in the situation of needing to be able to solve statically which exit // block will be branched to, or trying to preserve the branching logic in // a loop invariant manner. if (exitBlocks.size() != 1) return false; - + // Finally, we have to check that the loop really is dead. bool Changed = false; if (!IsLoopDead(L, exitingBlocks, exitBlocks, Changed, preheader)) return Changed; - + // Don't remove loops for which we can't solve the trip count. // They could be infinite, in which case we'd be changing program behavior. 
ScalarEvolution& SE = getAnalysis<ScalarEvolution>(); const SCEV *S = SE.getMaxBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(S)) return Changed; - + // Now that we know the removal is safe, remove the loop by changing the - // branch from the preheader to go to the single exit block. + // branch from the preheader to go to the single exit block. BasicBlock* exitBlock = exitBlocks[0]; - + // Because we're deleting a large chunk of code at once, the sequence in which // we remove things is very important to avoid invalidation issues. Don't // mess with this unless you have good reason and know what you're doing. @@ -197,7 +197,7 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { P->removeIncomingValue(exitingBlocks[i]); ++BI; } - + // Update the dominator tree and remove the instructions and blocks that will // be deleted from the reference counting scheme. DominatorTree& DT = getAnalysis<DominatorTree>(); @@ -211,7 +211,7 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { DE = ChildNodes.end(); DI != DE; ++DI) { DT.changeImmediateDominator(*DI, DT[preheader]); } - + ChildNodes.clear(); DT.eraseNode(*LI); @@ -219,7 +219,7 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { // delete it freely later. (*LI)->dropAllReferences(); } - + // Erase the instructions and the blocks without having to worry // about ordering because we already dropped the references. // NOTE: This iteration is safe because erasing the block does not remove its @@ -236,13 +236,13 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { for (SmallPtrSet<BasicBlock*,8>::iterator I = blocks.begin(), E = blocks.end(); I != E; ++I) loopInfo.removeBlock(*I); - + // The last step is to inform the loop pass manager that we've // eliminated this loop. LPM.deleteLoopFromQueue(L); Changed = true; - + ++NumDeleted; - + return Changed; } diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index ad15cbb..ac1082c 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -43,20 +43,20 @@ #define DEBUG_TYPE "loop-idiom" #include "llvm/Transforms/Scalar.h" +#include "llvm/IRBuilder.h" #include "llvm/IntrinsicInst.h" #include "llvm/Module.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/IRBuilder.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/Statistic.h" using namespace llvm; STATISTIC(NumMemSet, "Number of memset's formed from loop stores"); @@ -173,7 +173,7 @@ static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE) { bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) { CurLoop = L; - // Disable loop idiom recognition if the function's name is a common idiom. + // Disable loop idiom recognition if the function's name is a common idiom. 
StringRef Name = L->getHeader()->getParent()->getName(); if (Name == "memset" || Name == "memcpy") return false; diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp index f0f05e6..982400c 100644 --- a/lib/Transforms/Scalar/LoopInstSimplify.cpp +++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp @@ -48,7 +48,7 @@ namespace { } }; } - + char LoopInstSimplify::ID = 0; INITIALIZE_PASS_BEGIN(LoopInstSimplify, "loop-instsimplify", "Simplify instructions in loops", false, false) diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index 59aace9..7eeb152 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -418,12 +418,13 @@ bool LoopRotate::rotateLoop(Loop *L) { } // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and - // thus is not a preheader anymore. Split the edge to form a real preheader. + // thus is not a preheader anymore. + // Split the edge to form a real preheader. BasicBlock *NewPH = SplitCriticalEdge(OrigPreheader, NewHeader, this); NewPH->setName(NewHeader->getName() + ".lr.ph"); - // Preserve canonical loop form, which means that 'Exit' should have only one - // predecessor. + // Preserve canonical loop form, which means that 'Exit' should have only + // one predecessor. BasicBlock *ExitSplit = SplitCriticalEdge(L->getLoopLatch(), Exit, this); ExitSplit->moveBefore(Exit); } else { diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index b085b00..b14a713 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -1308,8 +1308,8 @@ static bool isLegalUse(const TargetLowering::AddrMode &AM, return !AM.BaseGV && AM.Scale == 0 && AM.BaseOffs == 0; case LSRUse::Special: - // Only handle -1 scales, or no scale. - return AM.Scale == 0 || AM.Scale == -1; + // Special case Basic to handle -1 scales. + return !AM.BaseGV && (AM.Scale == 0 || AM.Scale == -1) && AM.BaseOffs == 0; } llvm_unreachable("Invalid LSRUse Kind!"); @@ -1439,7 +1439,41 @@ struct IVInc { // IVChain - The list of IV increments in program order. // We typically add the head of a chain without finding subsequent links. -typedef SmallVector<IVInc,1> IVChain; +struct IVChain { + SmallVector<IVInc,1> Incs; + const SCEV *ExprBase; + + IVChain() : ExprBase(0) {} + + IVChain(const IVInc &Head, const SCEV *Base) + : Incs(1, Head), ExprBase(Base) {} + + typedef SmallVectorImpl<IVInc>::const_iterator const_iterator; + + // begin - return the first increment in the chain. + const_iterator begin() const { + assert(!Incs.empty()); + return llvm::next(Incs.begin()); + } + const_iterator end() const { + return Incs.end(); + } + + // hasIncs - Returns true if this chain contains any increments. + bool hasIncs() const { return Incs.size() >= 2; } + + // add - Add an IVInc to the end of this chain. + void add(const IVInc &X) { Incs.push_back(X); } + + // tailUserInst - Returns the last UserInst in the chain. + Instruction *tailUserInst() const { return Incs.back().UserInst; } + + // isProfitableIncrement - Returns true if IncExpr can be profitably added to + // this chain. + bool isProfitableIncrement(const SCEV *OperExpr, + const SCEV *IncExpr, + ScalarEvolution&); +}; /// ChainUsers - Helper for CollectChains to track multiple IV increment uses. 
/// Distinguish between FarUsers that definitely cross IV increments and @@ -2160,7 +2194,7 @@ LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF, return &LU; // This is the formula where all the registers and symbols matched; // there aren't going to be any others. Since we declined it, we - // can skip the rest of the formulae and procede to the next LSRUse. + // can skip the rest of the formulae and proceed to the next LSRUse. break; } } @@ -2319,41 +2353,23 @@ static const SCEV *getExprBase(const SCEV *S) { /// increment will be an offset relative to the same base. We allow such offsets /// to potentially be used as chain increment as long as it's not obviously /// expensive to expand using real instructions. -static const SCEV * -getProfitableChainIncrement(Value *NextIV, Value *PrevIV, - const IVChain &Chain, Loop *L, - ScalarEvolution &SE, const TargetLowering *TLI) { - // Prune the solution space aggressively by checking that both IV operands - // are expressions that operate on the same unscaled SCEVUnknown. This - // "base" will be canceled by the subsequent getMinusSCEV call. Checking first - // avoids creating extra SCEV expressions. - const SCEV *OperExpr = SE.getSCEV(NextIV); - const SCEV *PrevExpr = SE.getSCEV(PrevIV); - if (getExprBase(OperExpr) != getExprBase(PrevExpr) && !StressIVChain) - return 0; - - const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr); - if (!SE.isLoopInvariant(IncExpr, L)) - return 0; - - // We are not able to expand an increment unless it is loop invariant, - // however, the following checks are purely for profitability. +bool IVChain::isProfitableIncrement(const SCEV *OperExpr, + const SCEV *IncExpr, + ScalarEvolution &SE) { + // Aggressively form chains when -stress-ivchain. if (StressIVChain) - return IncExpr; + return true; // Do not replace a constant offset from IV head with a nonconstant IV // increment. if (!isa<SCEVConstant>(IncExpr)) { - const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Chain[0].IVOperand)); + const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand)); if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr))) return 0; } SmallPtrSet<const SCEV*, 8> Processed; - if (isHighCostExpansion(IncExpr, Processed, SE)) - return 0; - - return IncExpr; + return !isHighCostExpansion(IncExpr, Processed, SE); } /// Return true if the number of registers needed for the chain is estimated to @@ -2372,18 +2388,18 @@ isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users, if (StressIVChain) return true; - if (Chain.size() <= 2) + if (!Chain.hasIncs()) return false; if (!Users.empty()) { - DEBUG(dbgs() << "Chain: " << *Chain[0].UserInst << " users:\n"; + DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n"; for (SmallPtrSet<Instruction*, 4>::const_iterator I = Users.begin(), E = Users.end(); I != E; ++I) { dbgs() << " " << **I << "\n"; }); return false; } - assert(!Chain.empty() && "empty IV chains are not allowed"); + assert(!Chain.Incs.empty() && "empty IV chains are not allowed"); // The chain itself may require a register, so intialize cost to 1. int cost = 1; @@ -2391,15 +2407,15 @@ isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users, // A complete chain likely eliminates the need for keeping the original IV in // a register. LSR does not currently know how to form a complete chain unless // the header phi already exists. 
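A rough source-level picture of the kind of chain this cost model is weighing; the example is illustrative and assumes 4-byte ints.

    int sum4(const int *a, int n) {
      int s = 0;
      for (int i = 0; i + 4 <= n; i += 4) {
        s += a[i];        // chain head
        s += a[i + 1];    // increment: +4 bytes from the previous address
        s += a[i + 2];
        s += a[i + 3];
      }
      return s;
    }
    // Rewriting each address as "previous + 4" keeps one live pointer instead of
    // recomputing base + i*4 per access; if the chain ends in the loop's header
    // phi, the original induction variable may not need its own register at all.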
- if (isa<PHINode>(Chain.back().UserInst) - && SE.getSCEV(Chain.back().UserInst) == Chain[0].IncExpr) { + if (isa<PHINode>(Chain.tailUserInst()) + && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) { --cost; } const SCEV *LastIncExpr = 0; unsigned NumConstIncrements = 0; unsigned NumVarIncrements = 0; unsigned NumReusedIncrements = 0; - for (IVChain::const_iterator I = llvm::next(Chain.begin()), E = Chain.end(); + for (IVChain::const_iterator I = Chain.begin(), E = Chain.end(); I != E; ++I) { if (I->IncExpr->isZero()) @@ -2435,7 +2451,8 @@ isProfitableChain(IVChain &Chain, SmallPtrSet<Instruction*, 4> &Users, // the stride. cost -= NumReusedIncrements; - DEBUG(dbgs() << "Chain: " << *Chain[0].UserInst << " Cost: " << cost << "\n"); + DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost + << "\n"); return cost < 0; } @@ -2446,25 +2463,39 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, SmallVectorImpl<ChainUsers> &ChainUsersVec) { // When IVs are used as types of varying widths, they are generally converted // to a wider type with some uses remaining narrow under a (free) trunc. - Value *NextIV = getWideOperand(IVOper); + Value *const NextIV = getWideOperand(IVOper); + const SCEV *const OperExpr = SE.getSCEV(NextIV); + const SCEV *const OperExprBase = getExprBase(OperExpr); // Visit all existing chains. Check if its IVOper can be computed as a // profitable loop invariant increment from the last link in the Chain. unsigned ChainIdx = 0, NChains = IVChainVec.size(); const SCEV *LastIncExpr = 0; for (; ChainIdx < NChains; ++ChainIdx) { - Value *PrevIV = getWideOperand(IVChainVec[ChainIdx].back().IVOperand); + IVChain &Chain = IVChainVec[ChainIdx]; + + // Prune the solution space aggressively by checking that both IV operands + // are expressions that operate on the same unscaled SCEVUnknown. This + // "base" will be canceled by the subsequent getMinusSCEV call. Checking + // first avoids creating extra SCEV expressions. + if (!StressIVChain && Chain.ExprBase != OperExprBase) + continue; + + Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand); if (!isCompatibleIVType(PrevIV, NextIV)) continue; // A phi node terminates a chain. - if (isa<PHINode>(UserInst) - && isa<PHINode>(IVChainVec[ChainIdx].back().UserInst)) + if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst())) + continue; + + // The increment must be loop-invariant so it can be kept in a register. + const SCEV *PrevExpr = SE.getSCEV(PrevIV); + const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr); + if (!SE.isLoopInvariant(IncExpr, L)) continue; - if (const SCEV *IncExpr = - getProfitableChainIncrement(NextIV, PrevIV, IVChainVec[ChainIdx], - L, SE, TLI)) { + if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) { LastIncExpr = IncExpr; break; } @@ -2478,24 +2509,24 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, DEBUG(dbgs() << "IV Chain Limit\n"); return; } - LastIncExpr = SE.getSCEV(NextIV); + LastIncExpr = OperExpr; // IVUsers may have skipped over sign/zero extensions. We don't currently // attempt to form chains involving extensions unless they can be hoisted // into this loop's AddRec. 
if (!isa<SCEVAddRecExpr>(LastIncExpr)) return; ++NChains; - IVChainVec.resize(NChains); + IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr), + OperExprBase)); ChainUsersVec.resize(NChains); - DEBUG(dbgs() << "IV Head: (" << *UserInst << ") IV=" << *LastIncExpr - << "\n"); + DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst + << ") IV=" << *LastIncExpr << "\n"); + } else { + DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst + << ") IV+" << *LastIncExpr << "\n"); + // Add this IV user to the end of the chain. + IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr)); } - else - DEBUG(dbgs() << "IV Inc: (" << *UserInst << ") IV+" << *LastIncExpr - << "\n"); - - // Add this IV user to the end of the chain. - IVChainVec[ChainIdx].push_back(IVInc(UserInst, IVOper, LastIncExpr)); SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers; // This chain's NearUsers become FarUsers. @@ -2551,6 +2582,7 @@ void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper, /// loop latch. This will discover chains on side paths, but requires /// maintaining multiple copies of the Chains state. void LSRInstance::CollectChains() { + DEBUG(dbgs() << "Collecting IV Chains.\n"); SmallVector<ChainUsers, 8> ChainUsersVec; SmallVector<BasicBlock *,8> LatchPath; @@ -2622,10 +2654,10 @@ void LSRInstance::CollectChains() { } void LSRInstance::FinalizeChain(IVChain &Chain) { - assert(!Chain.empty() && "empty IV chains are not allowed"); - DEBUG(dbgs() << "Final Chain: " << *Chain[0].UserInst << "\n"); + assert(!Chain.Incs.empty() && "empty IV chains are not allowed"); + DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n"); - for (IVChain::const_iterator I = llvm::next(Chain.begin()), E = Chain.end(); + for (IVChain::const_iterator I = Chain.begin(), E = Chain.end(); I != E; ++I) { DEBUG(dbgs() << " Inc: " << *I->UserInst << "\n"); User::op_iterator UseI = @@ -2659,7 +2691,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, SmallVectorImpl<WeakVH> &DeadInsts) { // Find the new IVOperand for the head of the chain. It may have been replaced // by LSR. - const IVInc &Head = Chain[0]; + const IVInc &Head = Chain.Incs[0]; User::op_iterator IVOpEnd = Head.UserInst->op_end(); User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(), IVOpEnd, L, SE); @@ -2691,7 +2723,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, Type *IVTy = IVSrc->getType(); Type *IntTy = SE.getEffectiveSCEVType(IVTy); const SCEV *LeftOverExpr = 0; - for (IVChain::const_iterator IncI = llvm::next(Chain.begin()), + for (IVChain::const_iterator IncI = Chain.begin(), IncE = Chain.end(); IncI != IncE; ++IncI) { Instruction *InsertPt = IncI->UserInst; @@ -2736,7 +2768,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, } // If LSR created a new, wider phi, we may also replace its postinc. We only // do this if we also found a wide value for the head of the chain. 
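As a compressed sketch of how the new IVChain type is driven: the member functions are the ones added above, while UserInst, IVOper and the SCEV values stand in for whatever the caller has on hand.

    IVChain Chain(IVInc(UserInst, IVOper, OperExpr), getExprBase(OperExpr));
    Chain.add(IVInc(NextUserInst, NextIVOper, IncExpr)); // append one increment
    if (Chain.hasIncs()) {                               // head plus >= 1 increment
      for (IVChain::const_iterator I = Chain.begin(),    // begin() skips the head
             E = Chain.end(); I != E; ++I) {
        // rewrite I->UserInst relative to the previous link
      }
      Instruction *Tail = Chain.tailUserInst();          // last user, often a phi
      (void)Tail;
    }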
- if (isa<PHINode>(Chain.back().UserInst)) { + if (isa<PHINode>(Chain.tailUserInst())) { for (BasicBlock::iterator I = L->getHeader()->begin(); PHINode *Phi = dyn_cast<PHINode>(I); ++I) { if (!isCompatibleIVType(Phi, IVSrc)) @@ -2804,7 +2836,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { // x == y --> x - y == 0 const SCEV *N = SE.getSCEV(NV); - if (SE.isLoopInvariant(N, L)) { + if (SE.isLoopInvariant(N, L) && isSafeToExpand(N)) { // S is normalized, so normalize N before folding it into S // to keep the result normalized. N = TransformForPostIncUse(Normalize, N, CI, 0, @@ -2974,42 +3006,64 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { /// CollectSubexprs - Split S into subexpressions which can be pulled out into /// separate registers. If C is non-null, multiply each subexpression by C. -static void CollectSubexprs(const SCEV *S, const SCEVConstant *C, - SmallVectorImpl<const SCEV *> &Ops, - const Loop *L, - ScalarEvolution &SE) { +/// +/// Return remainder expression after factoring the subexpressions captured by +/// Ops. If Ops is complete, return NULL. +static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C, + SmallVectorImpl<const SCEV *> &Ops, + const Loop *L, + ScalarEvolution &SE, + unsigned Depth = 0) { + // Arbitrarily cap recursion to protect compile time. + if (Depth >= 3) + return S; + if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { // Break out add operands. for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); - I != E; ++I) - CollectSubexprs(*I, C, Ops, L, SE); - return; + I != E; ++I) { + const SCEV *Remainder = CollectSubexprs(*I, C, Ops, L, SE, Depth+1); + if (Remainder) + Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder); + } + return NULL; } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { // Split a non-zero base out of an addrec. - if (!AR->getStart()->isZero()) { - CollectSubexprs(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0), - AR->getStepRecurrence(SE), - AR->getLoop(), - //FIXME: AR->getNoWrapFlags(SCEV::FlagNW) - SCEV::FlagAnyWrap), - C, Ops, L, SE); - CollectSubexprs(AR->getStart(), C, Ops, L, SE); - return; + if (AR->getStart()->isZero()) + return S; + + const SCEV *Remainder = CollectSubexprs(AR->getStart(), + C, Ops, L, SE, Depth+1); + // Split the non-zero AddRec unless it is part of a nested recurrence that + // does not pertain to this loop. + if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) { + Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder); + Remainder = NULL; + } + if (Remainder != AR->getStart()) { + if (!Remainder) + Remainder = SE.getConstant(AR->getType(), 0); + return SE.getAddRecExpr(Remainder, + AR->getStepRecurrence(SE), + AR->getLoop(), + //FIXME: AR->getNoWrapFlags(SCEV::FlagNW) + SCEV::FlagAnyWrap); } } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) { // Break (C * (a + b + c)) into C*a + C*b + C*c. - if (Mul->getNumOperands() == 2) - if (const SCEVConstant *Op0 = - dyn_cast<SCEVConstant>(Mul->getOperand(0))) { - CollectSubexprs(Mul->getOperand(1), - C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0, - Ops, L, SE); - return; - } + if (Mul->getNumOperands() != 2) + return S; + if (const SCEVConstant *Op0 = + dyn_cast<SCEVConstant>(Mul->getOperand(0))) { + C = C ? 
cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0; + const SCEV *Remainder = + CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1); + if (Remainder) + Ops.push_back(SE.getMulExpr(C, Remainder)); + return NULL; + } } - - // Otherwise use the value itself, optionally with a scale applied. - Ops.push_back(C ? SE.getMulExpr(C, S) : S); + return S; } /// GenerateReassociations - Split out subexpressions from adds and the bases of @@ -3024,7 +3078,9 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx, const SCEV *BaseReg = Base.BaseRegs[i]; SmallVector<const SCEV *, 8> AddOps; - CollectSubexprs(BaseReg, 0, AddOps, L, SE); + const SCEV *Remainder = CollectSubexprs(BaseReg, 0, AddOps, L, SE); + if (Remainder) + AddOps.push_back(Remainder); if (AddOps.size() == 1) continue; @@ -4236,13 +4292,6 @@ Value *LSRInstance::Expand(const LSRFixup &LF, Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP))); } - // Flush the operand list to suppress SCEVExpander hoisting. - if (!Ops.empty()) { - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); - Ops.clear(); - Ops.push_back(SE.getUnknown(FullV)); - } - // Expand the ScaledReg portion. Value *ICmpScaledV = 0; if (F.AM.Scale != 0) { @@ -4264,23 +4313,34 @@ Value *LSRInstance::Expand(const LSRFixup &LF, } else { // Otherwise just expand the scaled register and an explicit scale, // which is expected to be matched as part of the address. + + // Flush the operand list to suppress SCEVExpander hoisting address modes. + if (!Ops.empty() && LU.Kind == LSRUse::Address) { + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); + Ops.clear(); + Ops.push_back(SE.getUnknown(FullV)); + } ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP)); ScaledS = SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.AM.Scale)); Ops.push_back(ScaledS); - - // Flush the operand list to suppress SCEVExpander hoisting. - Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); - Ops.clear(); - Ops.push_back(SE.getUnknown(FullV)); } } // Expand the GV portion. if (F.AM.BaseGV) { + // Flush the operand list to suppress SCEVExpander hoisting. + if (!Ops.empty()) { + Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); + Ops.clear(); + Ops.push_back(SE.getUnknown(FullV)); + } Ops.push_back(SE.getUnknown(F.AM.BaseGV)); + } - // Flush the operand list to suppress SCEVExpander hoisting. + // Flush the operand list to suppress SCEVExpander hoisting of both folded and + // unfolded offsets. LSR assumes they both live next to their uses. + if (!Ops.empty()) { Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty, IP); Ops.clear(); Ops.push_back(SE.getUnknown(FullV)); @@ -4485,7 +4545,7 @@ LSRInstance::ImplementSolution(const SmallVectorImpl<const Formula *> &Solution, // Mark phi nodes that terminate chains so the expander tries to reuse them. 
for (SmallVectorImpl<IVChain>::const_iterator ChainI = IVChainVec.begin(), ChainE = IVChainVec.end(); ChainI != ChainE; ++ChainI) { - if (PHINode *PN = dyn_cast<PHINode>(ChainI->back().UserInst)) + if (PHINode *PN = dyn_cast<PHINode>(ChainI->tailUserInst())) Rewriter.setChainedPhi(PN); } diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 930980f..58f7739 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -1214,8 +1214,8 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) { // See if instruction simplification can hack this up. This is common for // things like "select false, X, Y" after unswitching made the condition be - // 'false'. - if (Value *V = SimplifyInstruction(I, 0, 0, DT)) + // 'false'. TODO: update the domtree properly so we can pass it here. + if (Value *V = SimplifyInstruction(I)) if (LI->replacementPreservesLCSSAForm(I, V)) { ReplaceUsesOfWith(I, V, Worklist, L, LPM); continue; diff --git a/lib/Transforms/Scalar/LowerAtomic.cpp b/lib/Transforms/Scalar/LowerAtomic.cpp index 689bbe9..7419a65 100644 --- a/lib/Transforms/Scalar/LowerAtomic.cpp +++ b/lib/Transforms/Scalar/LowerAtomic.cpp @@ -15,9 +15,9 @@ #define DEBUG_TYPE "loweratomic" #include "llvm/Transforms/Scalar.h" #include "llvm/Function.h" +#include "llvm/IRBuilder.h" #include "llvm/IntrinsicInst.h" #include "llvm/Pass.h" -#include "llvm/Support/IRBuilder.h" using namespace llvm; static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { @@ -25,12 +25,12 @@ static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) { Value *Ptr = CXI->getPointerOperand(); Value *Cmp = CXI->getCompareOperand(); Value *Val = CXI->getNewValOperand(); - + LoadInst *Orig = Builder.CreateLoad(Ptr); Value *Equal = Builder.CreateICmpEQ(Orig, Cmp); Value *Res = Builder.CreateSelect(Equal, Val, Orig); Builder.CreateStore(Res, Ptr); - + CXI->replaceAllUsesWith(Orig); CXI->eraseFromParent(); return true; diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index a87cce3..2a5ee33 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -15,21 +15,21 @@ #define DEBUG_TYPE "memcpyopt" #include "llvm/Transforms/Scalar.h" #include "llvm/GlobalVariable.h" -#include "llvm/IntrinsicInst.h" +#include "llvm/IRBuilder.h" #include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/MemoryDependenceAnalysis.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Transforms/Utils/Local.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GetElementPtrTypeIterator.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Utils/Local.h" #include <list> using namespace llvm; @@ -44,7 +44,7 @@ static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx, gep_type_iterator GTI = gep_type_begin(GEP); for (unsigned i = 1; i != Idx; ++i, ++GTI) /*skip along*/; - + // Compute the offset implied by the rest of the indices. 
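A worked example of the offset computation that follows, assuming 4-byte ints and a shared base pointer; the values are made up.

    int a[16];
    int *p = &a[3];   // GEP with trailing constant index 3
    int *q = &a[7];   // GEP with trailing constant index 7
    // IsPointerOffset reports Offset = (7 - 3) * sizeof(int) = 16 bytes; a
    // variable index that the two GEPs do not share makes it return false.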
int64_t Offset = 0; for (unsigned i = Idx, e = GEP->getNumOperands(); i != e; ++i, ++GTI) { @@ -58,7 +58,7 @@ static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx, Offset += TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue()); continue; } - + // Otherwise, we have a sequential type like an array or vector. Multiply // the index by the ElementSize. uint64_t Size = TD.getTypeAllocSize(GTI.getIndexedType()); @@ -77,7 +77,7 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, Ptr2 = Ptr2->stripPointerCasts(); GetElementPtrInst *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1); GetElementPtrInst *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2); - + bool VariableIdxFound = false; // If one pointer is a GEP and the other isn't, then see if the GEP is a @@ -91,7 +91,7 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, Offset = GetOffsetFromIndex(GEP2, 1, VariableIdxFound, TD); return !VariableIdxFound; } - + // Right now we handle the case when Ptr1/Ptr2 are both GEPs with an identical // base. After that base, they may have some number of common (and // potentially variable) indices. After that they handle some constant @@ -99,7 +99,7 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, // handle no other case. if (!GEP1 || !GEP2 || GEP1->getOperand(0) != GEP2->getOperand(0)) return false; - + // Skip any common indices and track the GEP types. unsigned Idx = 1; for (; Idx != GEP1->getNumOperands() && Idx != GEP2->getNumOperands(); ++Idx) @@ -109,7 +109,7 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, int64_t Offset1 = GetOffsetFromIndex(GEP1, Idx, VariableIdxFound, TD); int64_t Offset2 = GetOffsetFromIndex(GEP2, Idx, VariableIdxFound, TD); if (VariableIdxFound) return false; - + Offset = Offset2-Offset1; return true; } @@ -128,19 +128,19 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset, namespace { struct MemsetRange { // Start/End - A semi range that describes the span that this range covers. - // The range is closed at the start and open at the end: [Start, End). + // The range is closed at the start and open at the end: [Start, End). int64_t Start, End; /// StartPtr - The getelementptr instruction that points to the start of the /// range. Value *StartPtr; - + /// Alignment - The known alignment of the first store. unsigned Alignment; - + /// TheStores - The actual stores that make up this range. SmallVector<Instruction*, 16> TheStores; - + bool isProfitableToUseMemset(const TargetData &TD) const; }; @@ -152,17 +152,17 @@ bool MemsetRange::isProfitableToUseMemset(const TargetData &TD) const { // If there is nothing to merge, don't do anything. if (TheStores.size() < 2) return false; - + // If any of the stores are a memset, then it is always good to extend the // memset. for (unsigned i = 0, e = TheStores.size(); i != e; ++i) if (!isa<StoreInst>(TheStores[i])) return true; - + // Assume that the code generator is capable of merging pairs of stores // together if it wants to. if (TheStores.size() == 2) return false; - + // If we have fewer than 8 stores, it can still be worthwhile to do this. // For example, merging 4 i8 stores into an i32 store is useful almost always. // However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the @@ -175,15 +175,15 @@ bool MemsetRange::isProfitableToUseMemset(const TargetData &TD) const { // actually reducing the number of stores used. 
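To make the heuristic below concrete, two hypothetical ranges on a target with a 4-byte pointer:

    // Sixteen i8 stores covering [0,16):
    //   Bytes = 16, NumPointerStores = 16/4 = 4, NumByteStores = 0
    //   TheStores.size() = 16 > 4 + 0, so the range is lowered to one memset.
    //
    // Four i32 stores covering [0,16):
    //   Bytes = 16, NumPointerStores = 4, NumByteStores = 0
    //   TheStores.size() = 4 > 4 + 0 is false, so the original stores are kept.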
unsigned Bytes = unsigned(End-Start); unsigned NumPointerStores = Bytes/TD.getPointerSize(); - + // Assume the remaining bytes if any are done a byte at a time. unsigned NumByteStores = Bytes - NumPointerStores*TD.getPointerSize(); - + // If we will reduce the # stores (according to this heuristic), do the // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32 // etc. return TheStores.size() > NumPointerStores+NumByteStores; -} +} namespace { @@ -195,12 +195,12 @@ class MemsetRanges { const TargetData &TD; public: MemsetRanges(const TargetData &td) : TD(td) {} - + typedef std::list<MemsetRange>::const_iterator const_iterator; const_iterator begin() const { return Ranges.begin(); } const_iterator end() const { return Ranges.end(); } bool empty() const { return Ranges.empty(); } - + void addInst(int64_t OffsetFromFirst, Instruction *Inst) { if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) addStore(OffsetFromFirst, SI); @@ -210,21 +210,21 @@ public: void addStore(int64_t OffsetFromFirst, StoreInst *SI) { int64_t StoreSize = TD.getTypeStoreSize(SI->getOperand(0)->getType()); - + addRange(OffsetFromFirst, StoreSize, SI->getPointerOperand(), SI->getAlignment(), SI); } - + void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) { int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue(); addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getAlignment(), MSI); } - + void addRange(int64_t Start, int64_t Size, Value *Ptr, unsigned Alignment, Instruction *Inst); }; - + } // end anon namespace @@ -240,10 +240,10 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr, unsigned Alignment, Instruction *Inst) { int64_t End = Start+Size; range_iterator I = Ranges.begin(), E = Ranges.end(); - + while (I != E && Start > I->End) ++I; - + // We now know that I == E, in which case we didn't find anything to merge // with, or that Start <= I->End. If End < I->Start or I == E, then we need // to insert a new range. Handle this now. @@ -256,18 +256,18 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr, R.TheStores.push_back(Inst); return; } - + // This store overlaps with I, add it. I->TheStores.push_back(Inst); - + // At this point, we may have an interval that completely contains our store. // If so, just add it to the interval and return. if (I->Start <= Start && I->End >= End) return; - + // Now we know that Start <= I->End and End >= I->Start so the range overlaps // but is not entirely contained within the range. - + // See if the range extends the start of the range. In this case, it couldn't // possibly cause it to join the prior range, because otherwise we would have // stopped on *it*. @@ -276,7 +276,7 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr, I->StartPtr = Ptr; I->Alignment = Alignment; } - + // Now we know that Start <= I->End and Start >= I->Start (so the startpoint // is in or right at the end of I), and that End >= I->Start. Extend I out to // End. @@ -325,7 +325,7 @@ namespace { AU.addPreserved<AliasAnalysis>(); AU.addPreserved<MemoryDependenceAnalysis>(); } - + // Helper fuctions bool processStore(StoreInst *SI, BasicBlock::iterator &BBI); bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI); @@ -341,7 +341,7 @@ namespace { bool iterateOnFunction(Function &F); }; - + char MemCpyOpt::ID = 0; } @@ -361,16 +361,16 @@ INITIALIZE_PASS_END(MemCpyOpt, "memcpyopt", "MemCpy Optimization", /// some other patterns to fold away. In particular, this looks for stores to /// neighboring locations of memory. 
If it sees enough consecutive ones, it /// attempts to merge them together into a memcpy/memset. -Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, +Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, Value *StartPtr, Value *ByteVal) { if (TD == 0) return 0; - + // Okay, so we now have a single store that can be splatable. Scan to find // all subsequent stores of the same value to offset from the same pointer. // Join these together into ranges, so we can decide whether contiguous blocks // are stored. MemsetRanges Ranges(*TD); - + BasicBlock::iterator BI = StartInst; for (++BI; !isa<TerminatorInst>(BI); ++BI) { if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) { @@ -381,43 +381,43 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, break; continue; } - + if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) { // If this is a store, see if we can merge it in. if (!NextStore->isSimple()) break; - + // Check to see if this stored value is of the same byte-splattable value. if (ByteVal != isBytewiseValue(NextStore->getOperand(0))) break; - + // Check to see if this store is to a constant offset from the start ptr. int64_t Offset; if (!IsPointerOffset(StartPtr, NextStore->getPointerOperand(), Offset, *TD)) break; - + Ranges.addStore(Offset, NextStore); } else { MemSetInst *MSI = cast<MemSetInst>(BI); - + if (MSI->isVolatile() || ByteVal != MSI->getValue() || !isa<ConstantInt>(MSI->getLength())) break; - + // Check to see if this store is to a constant offset from the start ptr. int64_t Offset; if (!IsPointerOffset(StartPtr, MSI->getDest(), Offset, *TD)) break; - + Ranges.addMemSet(Offset, MSI); } } - + // If we have no ranges, then we just had a single store with nothing that // could be merged in. This is a very common case of course. if (Ranges.empty()) return 0; - + // If we had at least one store that could be merged in, add the starting // store as well. We try to avoid this unless there is at least something // interesting as a small compile-time optimization. @@ -434,28 +434,28 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, for (MemsetRanges::const_iterator I = Ranges.begin(), E = Ranges.end(); I != E; ++I) { const MemsetRange &Range = *I; - + if (Range.TheStores.size() == 1) continue; - + // If it is profitable to lower this range to memset, do so now. if (!Range.isProfitableToUseMemset(*TD)) continue; - + // Otherwise, we do want to transform this! Create a new memset. // Get the starting pointer of the block. 
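At the source level, the rewrite being prepared here looks roughly like the following; the functions are illustrative, memset is the real libc call.

    #include <string.h>

    void zero4(unsigned char *p) {
      p[0] = 0; p[1] = 0; p[2] = 0; p[3] = 0;  // collected into one range [0,4)
    }

    // ...behaves, after the transformation, as if it had been written:
    void zero4_after(unsigned char *p) {
      memset(p, 0, 4);
    }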
StartPtr = Range.StartPtr; - + // Determine alignment unsigned Alignment = Range.Alignment; if (Alignment == 0) { - Type *EltType = + Type *EltType = cast<PointerType>(StartPtr->getType())->getElementType(); Alignment = TD->getABITypeAlignment(EltType); } - - AMemSet = + + AMemSet = Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment); - + DEBUG(dbgs() << "Replace stores:\n"; for (unsigned i = 0, e = Range.TheStores.size(); i != e; ++i) dbgs() << *Range.TheStores[i] << '\n'; @@ -473,14 +473,14 @@ Instruction *MemCpyOpt::tryMergingIntoMemset(Instruction *StartInst, } ++NumMemSetInfer; } - + return AMemSet; } bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (!SI->isSimple()) return false; - + if (TD == 0) return false; // Detect cases where we're performing call slot forwarding, but @@ -510,7 +510,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { if (C) { bool changed = performCallSlotOptzn(LI, - SI->getPointerOperand()->stripPointerCasts(), + SI->getPointerOperand()->stripPointerCasts(), LI->getPointerOperand()->stripPointerCasts(), TD->getTypeStoreSize(SI->getOperand(0)->getType()), C); if (changed) { @@ -524,10 +524,10 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { } } } - + // There are two cases that are interesting for this code to handle: memcpy // and memset. Right now we only handle memset. - + // Ensure that the value being stored is something that can be memset'able a // byte at a time like "0" or "-1" or any width, as well as things like // 0xA0A0A0A0 and 0.0. @@ -537,7 +537,7 @@ bool MemCpyOpt::processStore(StoreInst *SI, BasicBlock::iterator &BBI) { BBI = I; // Don't invalidate iterator. return true; } - + return false; } @@ -662,7 +662,11 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, // the use analysis, we also need to know that it does not sneakily // access dest. We rely on AA to figure this out for us. AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); - if (AA.getModRefInfo(C, cpyDest, srcSize) != AliasAnalysis::NoModRef) + AliasAnalysis::ModRefResult MR = AA.getModRefInfo(C, cpyDest, srcSize); + // If necessary, perform additional analysis. + if (MR != AliasAnalysis::NoModRef) + MR = AA.callCapturesBefore(C, cpyDest, srcSize, &DT); + if (MR != AliasAnalysis::NoModRef) return false; // All the checks have passed, so do the transformation. @@ -676,7 +680,7 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, if (CS.getArgument(i)->getType() == cpyDest->getType()) CS.setArgument(i, cpyDest); else - CS.setArgument(i, CastInst::CreatePointerCast(cpyDest, + CS.setArgument(i, CastInst::CreatePointerCast(cpyDest, CS.getArgument(i)->getType(), cpyDest->getName(), C)); } @@ -697,14 +701,14 @@ bool MemCpyOpt::performCallSlotOptzn(Instruction *cpy, /// processMemCpyMemCpyDependence - We've found that the (upward scanning) /// memory dependence of memcpy 'M' is the memcpy 'MDep'. Try to simplify M to /// copy from MDep's input if we can. MSize is the size of M's copy. -/// +/// bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, uint64_t MSize) { // We can only transforms memcpy's where the dest of one is the source of the // other. if (M->getSource() != MDep->getDest() || MDep->isVolatile()) return false; - + // If dep instruction is reading from our current input, then it is a noop // transfer and substituting the input won't change this instruction. Just // ignore the input and let someone else zap MDep. 
This handles cases like: @@ -712,14 +716,14 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, // memcpy(b <- a) if (M->getSource() == MDep->getSource()) return false; - + // Second, the length of the memcpy's must be the same, or the preceding one // must be larger than the following one. ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength()); ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength()); if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue()) return false; - + AliasAnalysis &AA = getAnalysis<AliasAnalysis>(); // Verify that the copied-from memory doesn't change in between the two @@ -739,23 +743,23 @@ bool MemCpyOpt::processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep, false, M, M->getParent()); if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) return false; - + // If the dest of the second might alias the source of the first, then the // source and dest might overlap. We still want to eliminate the intermediate // value, but we have to generate a memmove instead of memcpy. bool UseMemMove = false; if (!AA.isNoAlias(AA.getLocationForDest(M), AA.getLocationForSource(MDep))) UseMemMove = true; - + // If all checks passed, then we can transform M. - + // Make sure to use the lesser of the alignment of the source and the dest // since we're changing where we're reading from, but don't want to increase // the alignment past what can be read from or written to. // TODO: Is this worth it if we're creating a less aligned memcpy? For // example we could be moving from movaps -> movq on x86. unsigned Align = std::min(MDep->getAlignment(), M->getAlignment()); - + IRBuilder<> Builder(M); if (UseMemMove) Builder.CreateMemMove(M->getRawDest(), MDep->getRawSource(), M->getLength(), @@ -835,13 +839,13 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) { if (!TLI->has(LibFunc::memmove)) return false; - + // See if the pointers alias. if (!AA.isNoAlias(AA.getLocationForDest(M), AA.getLocationForSource(M))) return false; - + DEBUG(dbgs() << "MemCpyOpt: Optimizing memmove -> memcpy: " << *M << "\n"); - + // If not, then we know we can transform this. Module *Mod = M->getParent()->getParent()->getParent(); Type *ArgTys[3] = { M->getRawDest()->getType(), @@ -857,7 +861,7 @@ bool MemCpyOpt::processMemMove(MemMoveInst *M) { ++NumMoveToCpy; return true; } - + /// processByValArgument - This is called on every byval argument in call sites. bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { if (TD == 0) return false; @@ -880,7 +884,7 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { if (MDep == 0 || MDep->isVolatile() || ByValArg->stripPointerCasts() != MDep->getDest()) return false; - + // The length of the memcpy must be larger or equal to the size of the byval. ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength()); if (C1 == 0 || C1->getValue().getZExtValue() < ByValSize) @@ -890,13 +894,13 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { // then it is some target specific value that we can't know. unsigned ByValAlign = CS.getParamAlignment(ArgNo+1); if (ByValAlign == 0) return false; - + // If it is greater than the memcpy, then we check to see if we can force the // source of the memcpy to the alignment we need. If we fail, we bail out. 
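The overall rewrite this routine is building toward, sketched in C with hypothetical names; byval is how such an aggregate argument is passed in the IR.

    struct S { int v[8]; };
    void callee(struct S s);                 // aggregate passed by value (byval)

    void caller(const struct S *src) {
      struct S tmp = *src;                   // the memcpy the pass finds
      callee(tmp);                           // byval copy of the temporary
      // Provided the copy covers the whole byval object, alignment is adequate,
      // and *src is not modified before the call, the argument is rewritten to
      // copy straight from *src, as if the call had been callee(*src).
    }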
if (MDep->getAlignment() < ByValAlign && getOrEnforceKnownAlignment(MDep->getSource(),ByValAlign, TD) < ByValAlign) return false; - + // Verify that the copied-from memory doesn't change in between the memcpy and // the byval call. // memcpy(a <- b) @@ -911,16 +915,16 @@ bool MemCpyOpt::processByValArgument(CallSite CS, unsigned ArgNo) { false, CS.getInstruction(), MDep->getParent()); if (!SourceDep.isClobber() || SourceDep.getInst() != MDep) return false; - + Value *TmpCast = MDep->getSource(); if (MDep->getSource()->getType() != ByValArg->getType()) TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(), "tmpcast", CS.getInstruction()); - + DEBUG(dbgs() << "MemCpyOpt: Forwarding memcpy to byval:\n" << " " << *MDep << "\n" << " " << *CS.getInstruction() << "\n"); - + // Otherwise we're good! Update the byval argument. CS.setArgument(ArgNo, TmpCast); ++NumMemCpyInstr; @@ -936,9 +940,9 @@ bool MemCpyOpt::iterateOnFunction(Function &F) { for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) { // Avoid invalidating the iterator. Instruction *I = BI++; - + bool RepeatInstruction = false; - + if (StoreInst *SI = dyn_cast<StoreInst>(I)) MadeChange |= processStore(SI, BI); else if (MemSetInst *M = dyn_cast<MemSetInst>(I)) @@ -960,7 +964,7 @@ bool MemCpyOpt::iterateOnFunction(Function &F) { } } } - + return MadeChange; } @@ -972,19 +976,19 @@ bool MemCpyOpt::runOnFunction(Function &F) { MD = &getAnalysis<MemoryDependenceAnalysis>(); TD = getAnalysisIfAvailable<TargetData>(); TLI = &getAnalysis<TargetLibraryInfo>(); - + // If we don't have at least memset and memcpy, there is little point of doing // anything here. These are required by a freestanding implementation, so if // even they are disabled, there is no point in trying hard. if (!TLI->has(LibFunc::memset) || !TLI->has(LibFunc::memcpy)) return false; - + while (1) { if (!iterateOnFunction(F)) break; MadeChange = true; } - + MD = 0; return MadeChange; } diff --git a/lib/Transforms/Scalar/ObjCARC.cpp b/lib/Transforms/Scalar/ObjCARC.cpp index 7e3e69b..3222f20 100644 --- a/lib/Transforms/Scalar/ObjCARC.cpp +++ b/lib/Transforms/Scalar/ObjCARC.cpp @@ -20,7 +20,7 @@ // This file also defines a simple ARC-aware AliasAnalysis. // // WARNING: This file knows about certain library functions. It recognizes them -// by name, and hardwires knowedge of their semantics. +// by name, and hardwires knowledge of their semantics. // // WARNING: This file knows about how certain Objective-C library functions are // used. Naive LLVM IR transformations which would otherwise be @@ -29,18 +29,8 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "objc-arc" -#include "llvm/Function.h" -#include "llvm/Intrinsics.h" -#include "llvm/GlobalVariable.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Module.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Support/CallSite.h" #include "llvm/Support/CommandLine.h" -#include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/STLExtras.h" using namespace llvm; // A handy option to enable/disable all optimizations in this file. @@ -141,6 +131,13 @@ namespace { // ARC Utilities. 
//===----------------------------------------------------------------------===// +#include "llvm/Intrinsics.h" +#include "llvm/Module.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Support/CallSite.h" +#include "llvm/ADT/StringSwitch.h" + namespace { /// InstructionClass - A simple classification for instructions. enum InstructionClass { @@ -299,22 +296,23 @@ static InstructionClass GetInstructionClass(const Value *V) { // None of the intrinsic functions do objc_release. For intrinsics, the // only question is whether or not they may be users. switch (F->getIntrinsicID()) { - case 0: break; - case Intrinsic::bswap: case Intrinsic::ctpop: - case Intrinsic::ctlz: case Intrinsic::cttz: case Intrinsic::returnaddress: case Intrinsic::frameaddress: case Intrinsic::stacksave: case Intrinsic::stackrestore: case Intrinsic::vastart: case Intrinsic::vacopy: case Intrinsic::vaend: + case Intrinsic::objectsize: case Intrinsic::prefetch: + case Intrinsic::stackprotector: + case Intrinsic::eh_return_i32: case Intrinsic::eh_return_i64: + case Intrinsic::eh_typeid_for: case Intrinsic::eh_dwarf_cfa: + case Intrinsic::eh_sjlj_lsda: case Intrinsic::eh_sjlj_functioncontext: + case Intrinsic::init_trampoline: case Intrinsic::adjust_trampoline: + case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: + case Intrinsic::invariant_start: case Intrinsic::invariant_end: // Don't let dbg info affect our results. case Intrinsic::dbg_declare: case Intrinsic::dbg_value: // Short cut: Some intrinsics obviously don't use ObjC pointers. return IC_None; default: - for (Function::const_arg_iterator AI = F->arg_begin(), - AE = F->arg_end(); AI != AE; ++AI) - if (IsPotentialUse(AI)) - return IC_User; - return IC_None; + break; } } return GetCallSiteClass(CI); @@ -382,14 +380,14 @@ static InstructionClass GetBasicInstructionClass(const Value *V) { return isa<InvokeInst>(V) ? IC_CallOrUser : IC_User; } -/// IsRetain - Test if the the given class is objc_retain or +/// IsRetain - Test if the given class is objc_retain or /// equivalent. static bool IsRetain(InstructionClass Class) { return Class == IC_Retain || Class == IC_RetainRV; } -/// IsAutorelease - Test if the the given class is objc_autorelease or +/// IsAutorelease - Test if the given class is objc_autorelease or /// equivalent. static bool IsAutorelease(InstructionClass Class) { return Class == IC_Autorelease || @@ -444,7 +442,7 @@ static bool IsNoThrow(InstructionClass Class) { Class == IC_AutoreleasepoolPop; } -/// EraseInstruction - Erase the given instruction. ObjC calls return their +/// EraseInstruction - Erase the given instruction. Many ObjC calls return their /// argument verbatim, so if it's such a call and the return value has users, /// replace them with the argument value. static void EraseInstruction(Instruction *CI) { @@ -565,9 +563,8 @@ static const Value *FindSingleUseIdentifiedObject(const Value *Arg) { return Arg; } - // If we found an identifiable object but it has multiple uses, but they - // are trivial uses, we can still consider this to be a single-use - // value. + // If we found an identifiable object but it has multiple uses, but they are + // trivial uses, we can still consider this to be a single-use value. if (IsObjCIdentifiedObject(Arg)) { for (Value::const_use_iterator UI = Arg->use_begin(), UE = Arg->use_end(); UI != UE; ++UI) { @@ -692,7 +689,7 @@ namespace { /// specified pass info. 
virtual void *getAdjustedAnalysisPointer(const void *PI) { if (PI == &AliasAnalysis::ID) - return (AliasAnalysis*)this; + return static_cast<AliasAnalysis *>(this); return this; } @@ -815,7 +812,7 @@ ObjCARCAliasAnalysis::getModRefInfo(ImmutableCallSite CS, const Location &Loc) { case IC_FusedRetainAutorelease: case IC_FusedRetainAutoreleaseRV: // These functions don't access any memory visible to the compiler. - // Note that this doesn't include objc_retainBlock, becuase it updates + // Note that this doesn't include objc_retainBlock, because it updates // pointers when it copies block data. return NoModRef; default: @@ -915,6 +912,7 @@ bool ObjCARCExpand::runOnFunction(Function &F) { //===----------------------------------------------------------------------===// #include "llvm/Constants.h" +#include "llvm/ADT/STLExtras.h" namespace { /// ObjCARCAPElim - Autorelease pool elimination. @@ -922,8 +920,8 @@ namespace { virtual void getAnalysisUsage(AnalysisUsage &AU) const; virtual bool runOnModule(Module &M); - bool MayAutorelease(CallSite CS, unsigned Depth = 0); - bool OptimizeBB(BasicBlock *BB); + static bool MayAutorelease(ImmutableCallSite CS, unsigned Depth = 0); + static bool OptimizeBB(BasicBlock *BB); public: static char ID; @@ -949,15 +947,16 @@ void ObjCARCAPElim::getAnalysisUsage(AnalysisUsage &AU) const { /// MayAutorelease - Interprocedurally determine if calls made by the /// given call site can possibly produce autoreleases. -bool ObjCARCAPElim::MayAutorelease(CallSite CS, unsigned Depth) { - if (Function *Callee = CS.getCalledFunction()) { +bool ObjCARCAPElim::MayAutorelease(ImmutableCallSite CS, unsigned Depth) { + if (const Function *Callee = CS.getCalledFunction()) { if (Callee->isDeclaration() || Callee->mayBeOverridden()) return true; - for (Function::iterator I = Callee->begin(), E = Callee->end(); + for (Function::const_iterator I = Callee->begin(), E = Callee->end(); I != E; ++I) { - BasicBlock *BB = I; - for (BasicBlock::iterator J = BB->begin(), F = BB->end(); J != F; ++J) - if (CallSite JCS = CallSite(J)) + const BasicBlock *BB = I; + for (BasicBlock::const_iterator J = BB->begin(), F = BB->end(); + J != F; ++J) + if (ImmutableCallSite JCS = ImmutableCallSite(J)) // This recursion depth limit is arbitrary. It's just great // enough to cover known interesting testcases. if (Depth < 3 && @@ -992,7 +991,7 @@ bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) { Push = 0; break; case IC_CallOrUser: - if (MayAutorelease(CallSite(Inst))) + if (MayAutorelease(ImmutableCallSite(Inst))) Push = 0; break; default: @@ -1033,7 +1032,11 @@ bool ObjCARCAPElim::runOnModule(Module &M) { Value *Op = *OI; // llvm.global_ctors is an array of pairs where the second members // are constructor functions. - Function *F = cast<Function>(cast<ConstantStruct>(Op)->getOperand(1)); + Function *F = dyn_cast<Function>(cast<ConstantStruct>(Op)->getOperand(1)); + // If the user used a constructor function with the wrong signature and + // it got bitcasted or whatever, look the other way. + if (!F) + continue; // Only look at function definitions. if (F->isDeclaration()) continue; @@ -1089,14 +1092,10 @@ bool ObjCARCAPElim::runOnModule(Module &M) { // TODO: Delete release+retain pairs (rare). 
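A minimal sketch of what ObjCARCAPElim removes; the two runtime entry points are real, the surrounding function is hypothetical.

    extern void *objc_autoreleasePoolPush(void);
    extern void objc_autoreleasePoolPop(void *pool);

    void empty_scope(void) {
      void *pool = objc_autoreleasePoolPush();
      /* only code that provably cannot autorelease */
      objc_autoreleasePoolPop(pool);
      // Both calls are deleted: a pool that can never receive an object is
      // pure overhead.
    }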
-#include "llvm/GlobalAlias.h" -#include "llvm/Constants.h" #include "llvm/LLVMContext.h" -#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/CFG.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/DenseSet.h" STATISTIC(NumNoops, "Number of no-op objc calls eliminated"); STATISTIC(NumPartialNoops, "Number of partially no-op objc calls eliminated"); @@ -1144,22 +1143,13 @@ bool ProvenanceAnalysis::relatedSelect(const SelectInst *A, const Value *B) { // If the values are Selects with the same condition, we can do a more precise // check: just check for relations between the values on corresponding arms. if (const SelectInst *SB = dyn_cast<SelectInst>(B)) - if (A->getCondition() == SB->getCondition()) { - if (related(A->getTrueValue(), SB->getTrueValue())) - return true; - if (related(A->getFalseValue(), SB->getFalseValue())) - return true; - return false; - } + if (A->getCondition() == SB->getCondition()) + return related(A->getTrueValue(), SB->getTrueValue()) || + related(A->getFalseValue(), SB->getFalseValue()); // Check both arms of the Select node individually. - if (related(A->getTrueValue(), B)) - return true; - if (related(A->getFalseValue(), B)) - return true; - - // The arms both checked out. - return false; + return related(A->getTrueValue(), B) || + related(A->getFalseValue(), B); } bool ProvenanceAnalysis::relatedPHI(const PHINode *A, const Value *B) { @@ -1357,12 +1347,6 @@ namespace { /// with the "tail" keyword. bool IsTailCallRelease; - /// Partial - True of we've seen an opportunity for partial RR elimination, - /// such as pushing calls into a CFG triangle or into one side of a - /// CFG diamond. - /// TODO: Consider moving this to PtrState. - bool Partial; - /// ReleaseMetadata - If the Calls are objc_release calls and they all have /// a clang.imprecise_release tag, this is the metadata tag. MDNode *ReleaseMetadata; @@ -1377,7 +1361,7 @@ namespace { RRInfo() : KnownSafe(false), IsRetainBlock(false), - IsTailCallRelease(false), Partial(false), + IsTailCallRelease(false), ReleaseMetadata(0) {} void clear(); @@ -1388,7 +1372,6 @@ void RRInfo::clear() { KnownSafe = false; IsRetainBlock = false; IsTailCallRelease = false; - Partial = false; ReleaseMetadata = 0; Calls.clear(); ReverseInsertPts.clear(); @@ -1398,36 +1381,39 @@ namespace { /// PtrState - This class summarizes several per-pointer runtime properties /// which are propogated through the flow graph. class PtrState { - /// RefCount - The known minimum number of reference count increments. - unsigned RefCount; - /// NestCount - The known minimum level of retain+release nesting. unsigned NestCount; + /// KnownPositiveRefCount - True if the reference count is known to + /// be incremented. + bool KnownPositiveRefCount; + + /// Partial - True of we've seen an opportunity for partial RR elimination, + /// such as pushing calls into a CFG triangle or into one side of a + /// CFG diamond. + bool Partial; + /// Seq - The current position in the sequence. - Sequence Seq; + Sequence Seq : 8; public: /// RRI - Unidirectional information about the current sequence. /// TODO: Encapsulate this better. 
RRInfo RRI; - PtrState() : RefCount(0), NestCount(0), Seq(S_None) {} - - void SetAtLeastOneRefCount() { - if (RefCount == 0) RefCount = 1; - } + PtrState() : NestCount(0), KnownPositiveRefCount(false), Partial(false), + Seq(S_None) {} - void IncrementRefCount() { - if (RefCount != UINT_MAX) ++RefCount; + void SetKnownPositiveRefCount() { + KnownPositiveRefCount = true; } - void DecrementRefCount() { - if (RefCount != 0) --RefCount; + void ClearRefCount() { + KnownPositiveRefCount = false; } bool IsKnownIncremented() const { - return RefCount > 0; + return KnownPositiveRefCount; } void IncrementNestCount() { @@ -1451,7 +1437,12 @@ namespace { } void ClearSequenceProgress() { - Seq = S_None; + ResetSequenceProgress(S_None); + } + + void ResetSequenceProgress(Sequence NewSeq) { + Seq = NewSeq; + Partial = false; RRI.clear(); } @@ -1462,7 +1453,7 @@ namespace { void PtrState::Merge(const PtrState &Other, bool TopDown) { Seq = MergeSeqs(Seq, Other.Seq, TopDown); - RefCount = std::min(RefCount, Other.RefCount); + KnownPositiveRefCount = KnownPositiveRefCount && Other.KnownPositiveRefCount; NestCount = std::min(NestCount, Other.NestCount); // We can't merge a plain objc_retain with an objc_retainBlock. @@ -1471,31 +1462,31 @@ PtrState::Merge(const PtrState &Other, bool TopDown) { // If we're not in a sequence (anymore), drop all associated state. if (Seq == S_None) { + Partial = false; RRI.clear(); - } else if (RRI.Partial || Other.RRI.Partial) { + } else if (Partial || Other.Partial) { // If we're doing a merge on a path that's previously seen a partial // merge, conservatively drop the sequence, to avoid doing partial // RR elimination. If the branch predicates for the two merge differ, // mixing them is unsafe. - Seq = S_None; - RRI.clear(); + ClearSequenceProgress(); } else { // Conservatively merge the ReleaseMetadata information. if (RRI.ReleaseMetadata != Other.RRI.ReleaseMetadata) RRI.ReleaseMetadata = 0; RRI.KnownSafe = RRI.KnownSafe && Other.RRI.KnownSafe; - RRI.IsTailCallRelease = RRI.IsTailCallRelease && Other.RRI.IsTailCallRelease; + RRI.IsTailCallRelease = RRI.IsTailCallRelease && + Other.RRI.IsTailCallRelease; RRI.Calls.insert(Other.RRI.Calls.begin(), Other.RRI.Calls.end()); // Merge the insert point sets. If there are any differences, // that makes this a partial merge. - RRI.Partial = RRI.ReverseInsertPts.size() != - Other.RRI.ReverseInsertPts.size(); + Partial = RRI.ReverseInsertPts.size() != Other.RRI.ReverseInsertPts.size(); for (SmallPtrSet<Instruction *, 2>::const_iterator I = Other.RRI.ReverseInsertPts.begin(), E = Other.RRI.ReverseInsertPts.end(); I != E; ++I) - RRI.Partial |= RRI.ReverseInsertPts.insert(*I); + Partial |= RRI.ReverseInsertPts.insert(*I); } } @@ -1521,6 +1512,11 @@ namespace { /// known about a pointer at the top of each block. MapTy PerPtrBottomUp; + /// Preds, Succs - Effective successors and predecessors of the current + /// block (this ignores ignorable edges and ignored backedges). + SmallVector<BasicBlock *, 2> Preds; + SmallVector<BasicBlock *, 2> Succs; + public: BBState() : TopDownPathCount(0), BottomUpPathCount(0) {} @@ -1578,14 +1574,22 @@ namespace { /// entry to an exit which pass through this block. This is only valid /// after both the top-down and bottom-up traversals are complete. 
unsigned GetAllPathCount() const { + assert(TopDownPathCount != 0); + assert(BottomUpPathCount != 0); return TopDownPathCount * BottomUpPathCount; } - /// IsVisitedTopDown - Test whether the block for this BBState has been - /// visited by the top-down portion of the algorithm. - bool isVisitedTopDown() const { - return TopDownPathCount != 0; - } + // Specialized CFG utilities. + typedef SmallVectorImpl<BasicBlock *>::const_iterator edge_iterator; + edge_iterator pred_begin() { return Preds.begin(); } + edge_iterator pred_end() { return Preds.end(); } + edge_iterator succ_begin() { return Succs.begin(); } + edge_iterator succ_end() { return Succs.end(); } + + void addSucc(BasicBlock *Succ) { Succs.push_back(Succ); } + void addPred(BasicBlock *Pred) { Preds.push_back(Pred); } + + bool isExit() const { return Succs.empty(); } }; } @@ -1783,12 +1787,9 @@ Constant *ObjCARCOpt::getRetainRVCallee(Module *M) { if (!RetainRVCallee) { LLVMContext &C = M->getContext(); Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - std::vector<Type *> Params; - Params.push_back(I8X); - FunctionType *FTy = - FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttrListPtr Attributes; - Attributes.addAttr(~0u, Attribute::NoUnwind); + Type *Params[] = { I8X }; + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); RetainRVCallee = M->getOrInsertFunction("objc_retainAutoreleasedReturnValue", FTy, Attributes); @@ -1800,12 +1801,9 @@ Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) { if (!AutoreleaseRVCallee) { LLVMContext &C = M->getContext(); Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - std::vector<Type *> Params; - Params.push_back(I8X); - FunctionType *FTy = - FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttrListPtr Attributes; - Attributes.addAttr(~0u, Attribute::NoUnwind); + Type *Params[] = { I8X }; + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); AutoreleaseRVCallee = M->getOrInsertFunction("objc_autoreleaseReturnValue", FTy, Attributes); @@ -1816,10 +1814,8 @@ Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) { Constant *ObjCARCOpt::getReleaseCallee(Module *M) { if (!ReleaseCallee) { LLVMContext &C = M->getContext(); - std::vector<Type *> Params; - Params.push_back(PointerType::getUnqual(Type::getInt8Ty(C))); - AttrListPtr Attributes; - Attributes.addAttr(~0u, Attribute::NoUnwind); + Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); ReleaseCallee = M->getOrInsertFunction( "objc_release", @@ -1832,10 +1828,8 @@ Constant *ObjCARCOpt::getReleaseCallee(Module *M) { Constant *ObjCARCOpt::getRetainCallee(Module *M) { if (!RetainCallee) { LLVMContext &C = M->getContext(); - std::vector<Type *> Params; - Params.push_back(PointerType::getUnqual(Type::getInt8Ty(C))); - AttrListPtr Attributes; - Attributes.addAttr(~0u, Attribute::NoUnwind); + Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); RetainCallee = M->getOrInsertFunction( "objc_retain", @@ -1848,16 +1842,14 @@ Constant *ObjCARCOpt::getRetainCallee(Module *M) { Constant *ObjCARCOpt::getRetainBlockCallee(Module *M) { if (!RetainBlockCallee) { LLVMContext &C = M->getContext(); - std::vector<Type *> Params; - 
Params.push_back(PointerType::getUnqual(Type::getInt8Ty(C))); - AttrListPtr Attributes; + Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; // objc_retainBlock is not nounwind because it calls user copy constructors // which could theoretically throw. RetainBlockCallee = M->getOrInsertFunction( "objc_retainBlock", FunctionType::get(Params[0], Params, /*isVarArg=*/false), - Attributes); + AttrListPtr()); } return RetainBlockCallee; } @@ -1865,10 +1857,8 @@ Constant *ObjCARCOpt::getRetainBlockCallee(Module *M) { Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) { if (!AutoreleaseCallee) { LLVMContext &C = M->getContext(); - std::vector<Type *> Params; - Params.push_back(PointerType::getUnqual(Type::getInt8Ty(C))); - AttrListPtr Attributes; - Attributes.addAttr(~0u, Attribute::NoUnwind); + Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) }; + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); AutoreleaseCallee = M->getOrInsertFunction( "objc_autorelease", @@ -2153,13 +2143,13 @@ static bool isNoopInstruction(const Instruction *I) { /// objc_retainAutoreleasedReturnValue if the operand is a return value. void ObjCARCOpt::OptimizeRetainCall(Function &F, Instruction *Retain) { - CallSite CS(GetObjCArg(Retain)); - Instruction *Call = CS.getInstruction(); + ImmutableCallSite CS(GetObjCArg(Retain)); + const Instruction *Call = CS.getInstruction(); if (!Call) return; if (Call->getParent() != Retain->getParent()) return; // Check that the call is next to the retain. - BasicBlock::iterator I = Call; + BasicBlock::const_iterator I = Call; ++I; while (isNoopInstruction(I)) ++I; if (&*I != Retain) @@ -2172,25 +2162,24 @@ ObjCARCOpt::OptimizeRetainCall(Function &F, Instruction *Retain) { } /// OptimizeRetainRVCall - Turn objc_retainAutoreleasedReturnValue into -/// objc_retain if the operand is not a return value. Or, if it can be -/// paired with an objc_autoreleaseReturnValue, delete the pair and -/// return true. +/// objc_retain if the operand is not a return value. Or, if it can be paired +/// with an objc_autoreleaseReturnValue, delete the pair and return true. bool ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { // Check for the argument being from an immediately preceding call or invoke. - Value *Arg = GetObjCArg(RetainRV); - CallSite CS(Arg); - if (Instruction *Call = CS.getInstruction()) { + const Value *Arg = GetObjCArg(RetainRV); + ImmutableCallSite CS(Arg); + if (const Instruction *Call = CS.getInstruction()) { if (Call->getParent() == RetainRV->getParent()) { - BasicBlock::iterator I = Call; + BasicBlock::const_iterator I = Call; ++I; while (isNoopInstruction(I)) ++I; if (&*I == RetainRV) return false; - } else if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) { + } else if (const InvokeInst *II = dyn_cast<InvokeInst>(Call)) { BasicBlock *RetainRVParent = RetainRV->getParent(); if (II->getNormalDest() == RetainRVParent) { - BasicBlock::iterator I = RetainRVParent->begin(); + BasicBlock::const_iterator I = RetainRVParent->begin(); while (isNoopInstruction(I)) ++I; if (&*I == RetainRV) return false; @@ -2418,7 +2407,8 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { // These can always be moved up. break; case IC_Release: - // These can't be moved across things that care about the retain count. + // These can't be moved across things that care about the retain + // count. 
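// Illustrative sketch, not part of the patch: the declaration pattern the hunks
// above converge on -- a stack array of parameter types instead of a temporary
// std::vector, and an AttrListPtr built by chaining addAttr() instead of
// mutating a local. The header paths are assumed to match this LLVM revision,
// and the callee name "objc_example" is invented purely for the example.
#include "llvm/Attributes.h"
#include "llvm/DerivedTypes.h"
#include "llvm/LLVMContext.h"
#include "llvm/Module.h"
using namespace llvm;

static Constant *getExampleCallee(Module *M) {
  LLVMContext &C = M->getContext();
  Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
  Type *Params[] = { I8X };                 // converts implicitly to ArrayRef<Type*>
  FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
  AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind);
  return M->getOrInsertFunction("objc_example", FTy, Attributes);
}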
FindDependencies(NeedsPositiveRetainCount, Arg, Inst->getParent(), Inst, DependingInstructions, Visited, PA); @@ -2500,13 +2490,14 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, for (; SI != SE; ++SI) { Sequence SuccSSeq = S_None; bool SuccSRRIKnownSafe = false; - // If VisitBottomUp has visited this successor, take what we know about it. - DenseMap<const BasicBlock *, BBState>::iterator BBI = BBStates.find(*SI); - if (BBI != BBStates.end()) { - const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg); - SuccSSeq = SuccS.GetSeq(); - SuccSRRIKnownSafe = SuccS.RRI.KnownSafe; - } + // If VisitBottomUp has pointer information for this successor, take + // what we know about it. + DenseMap<const BasicBlock *, BBState>::iterator BBI = + BBStates.find(*SI); + assert(BBI != BBStates.end()); + const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg); + SuccSSeq = SuccS.GetSeq(); + SuccSRRIKnownSafe = SuccS.RRI.KnownSafe; switch (SuccSSeq) { case S_None: case S_CanRelease: { @@ -2553,13 +2544,14 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, for (; SI != SE; ++SI) { Sequence SuccSSeq = S_None; bool SuccSRRIKnownSafe = false; - // If VisitBottomUp has visited this successor, take what we know about it. - DenseMap<const BasicBlock *, BBState>::iterator BBI = BBStates.find(*SI); - if (BBI != BBStates.end()) { - const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg); - SuccSSeq = SuccS.GetSeq(); - SuccSRRIKnownSafe = SuccS.RRI.KnownSafe; - } + // If VisitBottomUp has pointer information for this successor, take + // what we know about it. + DenseMap<const BasicBlock *, BBState>::iterator BBI = + BBStates.find(*SI); + assert(BBI != BBStates.end()); + const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg); + SuccSSeq = SuccS.GetSeq(); + SuccSRRIKnownSafe = SuccS.RRI.KnownSafe; switch (SuccSSeq) { case S_None: { if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) { @@ -2617,16 +2609,13 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, if (S.GetSeq() == S_Release || S.GetSeq() == S_MovableRelease) NestingDetected = true; - S.RRI.clear(); - MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind); - S.SetSeq(ReleaseMetadata ? S_MovableRelease : S_Release); + S.ResetSequenceProgress(ReleaseMetadata ? S_MovableRelease : S_Release); S.RRI.ReleaseMetadata = ReleaseMetadata; S.RRI.KnownSafe = S.IsKnownNested() || S.IsKnownIncremented(); S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall(); S.RRI.Calls.insert(Inst); - S.IncrementRefCount(); S.IncrementNestCount(); break; } @@ -2641,8 +2630,7 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, Arg = GetObjCArg(Inst); PtrState &S = MyStates.getPtrBottomUpState(Arg); - S.DecrementRefCount(); - S.SetAtLeastOneRefCount(); + S.SetKnownPositiveRefCount(); S.DecrementNestCount(); switch (S.GetSeq()) { @@ -2692,7 +2680,7 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, // Check for possible releases. if (CanAlterRefCount(Inst, Ptr, PA, Class)) { - S.DecrementRefCount(); + S.ClearRefCount(); switch (Seq) { case S_Use: S.SetSeq(S_CanRelease); @@ -2759,37 +2747,20 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, // Merge the states from each successor to compute the initial state // for the current block. 
- const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); - succ_const_iterator SI(TI), SE(TI, false); - if (SI == SE) - MyStates.SetAsExit(); - else { - // If the terminator is an invoke marked with the - // clang.arc.no_objc_arc_exceptions metadata, the unwind edge can be - // ignored, for ARC purposes. - if (isa<InvokeInst>(TI) && TI->getMetadata(NoObjCARCExceptionsMDKind)) - --SE; - - do { - const BasicBlock *Succ = *SI++; - if (Succ == BB) - continue; - DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ); - // If we haven't seen this node yet, then we've found a CFG cycle. - // Be optimistic here; it's CheckForCFGHazards' job detect trouble. - if (I == BBStates.end()) - continue; - MyStates.InitFromSucc(I->second); - while (SI != SE) { - Succ = *SI++; - if (Succ != BB) { - I = BBStates.find(Succ); - if (I != BBStates.end()) - MyStates.MergeSucc(I->second); - } - } - break; - } while (SI != SE); + for (BBState::edge_iterator SI(MyStates.succ_begin()), + SE(MyStates.succ_end()); SI != SE; ++SI) { + const BasicBlock *Succ = *SI; + DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ); + assert(I != BBStates.end()); + MyStates.InitFromSucc(I->second); + ++SI; + for (; SI != SE; ++SI) { + Succ = *SI; + I = BBStates.find(Succ); + assert(I != BBStates.end()); + MyStates.MergeSucc(I->second); + } + break; } // Visit all the instructions, bottom-up. @@ -2803,15 +2774,14 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, NestingDetected |= VisitInstructionBottomUp(Inst, BB, Retains, MyStates); } - // If there's a predecessor with an invoke, visit the invoke as - // if it were part of this block, since we can't insert code after - // an invoke in its own block, and we don't want to split critical - // edges. - for (pred_iterator PI(BB), PE(BB, false); PI != PE; ++PI) { + // If there's a predecessor with an invoke, visit the invoke as if it were + // part of this block, since we can't insert code after an invoke in its own + // block, and we don't want to split critical edges. + for (BBState::edge_iterator PI(MyStates.pred_begin()), + PE(MyStates.pred_end()); PI != PE; ++PI) { BasicBlock *Pred = *PI; - TerminatorInst *PredTI = cast<TerminatorInst>(&Pred->back()); - if (isa<InvokeInst>(PredTI)) - NestingDetected |= VisitInstructionBottomUp(PredTI, BB, Retains, MyStates); + if (InvokeInst *II = dyn_cast<InvokeInst>(&Pred->back())) + NestingDetected |= VisitInstructionBottomUp(II, BB, Retains, MyStates); } return NestingDetected; @@ -2851,25 +2821,23 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, if (S.GetSeq() == S_Retain) NestingDetected = true; - S.SetSeq(S_Retain); - S.RRI.clear(); + S.ResetSequenceProgress(S_Retain); S.RRI.IsRetainBlock = Class == IC_RetainBlock; - // Don't check S.IsKnownIncremented() here because it's not - // sufficient. + // Don't check S.IsKnownIncremented() here because it's not sufficient. S.RRI.KnownSafe = S.IsKnownNested(); S.RRI.Calls.insert(Inst); } - S.SetAtLeastOneRefCount(); - S.IncrementRefCount(); S.IncrementNestCount(); - return NestingDetected; + + // A retain can be a potential use; procede to the generic checking + // code below. + break; } case IC_Release: { Arg = GetObjCArg(Inst); PtrState &S = MyStates.getPtrTopDownState(Arg); - S.DecrementRefCount(); S.DecrementNestCount(); switch (S.GetSeq()) { @@ -2916,7 +2884,7 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, // Check for possible releases. 
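// Illustrative sketch, not part of the patch: the "seed from the first
// successor, then merge in the rest" idiom used by the new loop above, written
// against a plain std::map keyed by block pointer. The assert mirrors the
// patch's new assumption that every effective successor already has a state.
#include <cassert>
#include <cstddef>
#include <map>
#include <vector>

struct ToyBlock;                            // opaque stand-in for BasicBlock

struct ToyState {
  int LowestCount;
  void initFrom(const ToyState &O) { LowestCount = O.LowestCount; }
  void mergeWith(const ToyState &O) {
    if (O.LowestCount < LowestCount) LowestCount = O.LowestCount;
  }
};

static void seedFromSuccessors(const std::vector<const ToyBlock *> &Succs,
                               const std::map<const ToyBlock *, ToyState> &States,
                               ToyState &My) {
  for (std::size_t i = 0, e = Succs.size(); i != e; ++i) {
    std::map<const ToyBlock *, ToyState>::const_iterator I =
        States.find(Succs[i]);
    assert(I != States.end() && "effective successor not visited yet?");
    if (i == 0)
      My.initFrom(I->second);               // first successor seeds the state
    else
      My.mergeWith(I->second);              // the rest are merged conservatively
  }
}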
if (CanAlterRefCount(Inst, Ptr, PA, Class)) { - S.DecrementRefCount(); + S.ClearRefCount(); switch (Seq) { case S_Retain: S.SetSeq(S_CanRelease); @@ -2967,41 +2935,21 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, // Merge the states from each predecessor to compute the initial state // for the current block. - const_pred_iterator PI(BB), PE(BB, false); - if (PI == PE) - MyStates.SetAsEntry(); - else - do { - unsigned OperandNo = PI.getOperandNo(); - const Use &Us = PI.getUse(); - ++PI; - - // Skip invoke unwind edges on invoke instructions marked with - // clang.arc.no_objc_arc_exceptions. - if (const InvokeInst *II = dyn_cast<InvokeInst>(Us.getUser())) - if (OperandNo == II->getNumArgOperands() + 2 && - II->getMetadata(NoObjCARCExceptionsMDKind)) - continue; - - const BasicBlock *Pred = cast<TerminatorInst>(Us.getUser())->getParent(); - if (Pred == BB) - continue; - DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred); - // If we haven't seen this node yet, then we've found a CFG cycle. - // Be optimistic here; it's CheckForCFGHazards' job detect trouble. - if (I == BBStates.end() || !I->second.isVisitedTopDown()) - continue; - MyStates.InitFromPred(I->second); - while (PI != PE) { - Pred = *PI++; - if (Pred != BB) { - I = BBStates.find(Pred); - if (I != BBStates.end() && I->second.isVisitedTopDown()) - MyStates.MergePred(I->second); - } - } - break; - } while (PI != PE); + for (BBState::edge_iterator PI(MyStates.pred_begin()), + PE(MyStates.pred_end()); PI != PE; ++PI) { + const BasicBlock *Pred = *PI; + DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred); + assert(I != BBStates.end()); + MyStates.InitFromPred(I->second); + ++PI; + for (; PI != PE; ++PI) { + Pred = *PI; + I = BBStates.find(Pred); + assert(I != BBStates.end()); + MyStates.MergePred(I->second); + } + break; + } // Visit all the instructions, top-down. for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { @@ -3016,73 +2964,82 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, static void ComputePostOrders(Function &F, SmallVectorImpl<BasicBlock *> &PostOrder, - SmallVectorImpl<BasicBlock *> &ReverseCFGPostOrder) { - /// Backedges - Backedges detected in the DFS. These edges will be - /// ignored in the reverse-CFG DFS, so that loops with multiple exits will be - /// traversed in the desired order. - DenseSet<std::pair<BasicBlock *, BasicBlock *> > Backedges; - + SmallVectorImpl<BasicBlock *> &ReverseCFGPostOrder, + unsigned NoObjCARCExceptionsMDKind, + DenseMap<const BasicBlock *, BBState> &BBStates) { /// Visited - The visited set, for doing DFS walks. SmallPtrSet<BasicBlock *, 16> Visited; // Do DFS, computing the PostOrder. SmallPtrSet<BasicBlock *, 16> OnStack; SmallVector<std::pair<BasicBlock *, succ_iterator>, 16> SuccStack; + + // Functions always have exactly one entry block, and we don't have + // any other block that we treat like an entry block. 
BasicBlock *EntryBB = &F.getEntryBlock(); - SuccStack.push_back(std::make_pair(EntryBB, succ_begin(EntryBB))); + BBState &MyStates = BBStates[EntryBB]; + MyStates.SetAsEntry(); + TerminatorInst *EntryTI = cast<TerminatorInst>(&EntryBB->back()); + SuccStack.push_back(std::make_pair(EntryBB, succ_iterator(EntryTI))); Visited.insert(EntryBB); OnStack.insert(EntryBB); do { dfs_next_succ: - TerminatorInst *TI = cast<TerminatorInst>(&SuccStack.back().first->back()); - succ_iterator End = succ_iterator(TI, true); - while (SuccStack.back().second != End) { - BasicBlock *BB = *SuccStack.back().second++; - if (Visited.insert(BB)) { - SuccStack.push_back(std::make_pair(BB, succ_begin(BB))); - OnStack.insert(BB); + BasicBlock *CurrBB = SuccStack.back().first; + TerminatorInst *TI = cast<TerminatorInst>(&CurrBB->back()); + succ_iterator SE(TI, false); + + // If the terminator is an invoke marked with the + // clang.arc.no_objc_arc_exceptions metadata, the unwind edge can be + // ignored, for ARC purposes. + if (isa<InvokeInst>(TI) && TI->getMetadata(NoObjCARCExceptionsMDKind)) + --SE; + + while (SuccStack.back().second != SE) { + BasicBlock *SuccBB = *SuccStack.back().second++; + if (Visited.insert(SuccBB)) { + TerminatorInst *TI = cast<TerminatorInst>(&SuccBB->back()); + SuccStack.push_back(std::make_pair(SuccBB, succ_iterator(TI))); + BBStates[CurrBB].addSucc(SuccBB); + BBState &SuccStates = BBStates[SuccBB]; + SuccStates.addPred(CurrBB); + OnStack.insert(SuccBB); goto dfs_next_succ; } - if (OnStack.count(BB)) - Backedges.insert(std::make_pair(SuccStack.back().first, BB)); + + if (!OnStack.count(SuccBB)) { + BBStates[CurrBB].addSucc(SuccBB); + BBStates[SuccBB].addPred(CurrBB); + } } - OnStack.erase(SuccStack.back().first); - PostOrder.push_back(SuccStack.pop_back_val().first); + OnStack.erase(CurrBB); + PostOrder.push_back(CurrBB); + SuccStack.pop_back(); } while (!SuccStack.empty()); Visited.clear(); - // Compute the exits, which are the starting points for reverse-CFG DFS. - // This includes blocks where all the successors are backedges that - // we're skipping. - SmallVector<BasicBlock *, 4> Exits; + // Do reverse-CFG DFS, computing the reverse-CFG PostOrder. + // Functions may have many exits, and there also blocks which we treat + // as exits due to ignored edges. + SmallVector<std::pair<BasicBlock *, BBState::edge_iterator>, 16> PredStack; for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { - BasicBlock *BB = I; - TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); - for (succ_iterator SI(TI), SE(TI, true); SI != SE; ++SI) - if (!Backedges.count(std::make_pair(BB, *SI))) - goto HasNonBackedgeSucc; - Exits.push_back(BB); - HasNonBackedgeSucc:; - } + BasicBlock *ExitBB = I; + BBState &MyStates = BBStates[ExitBB]; + if (!MyStates.isExit()) + continue; - // Do reverse-CFG DFS, computing the reverse-CFG PostOrder. 
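// Illustrative sketch, not part of the patch: an iterative DFS of the same
// shape as the rewritten ComputePostOrders above -- an explicit stack of
// (block, next-successor) pairs, effective predecessor lists recorded as edges
// are walked, and backedges (successors still on the DFS stack) left out so a
// later reverse walk will not follow them. Blocks are small integer ids and
// the invoke/unwind special case is omitted.
#include <cstddef>
#include <utility>
#include <vector>

static void postOrderWithEffectiveEdges(
    const std::vector<std::vector<int> > &Succ, int Entry,
    std::vector<int> &PostOrder,
    std::vector<std::vector<int> > &EffPreds) {
  std::vector<bool> Visited(Succ.size(), false), OnStack(Succ.size(), false);
  EffPreds.assign(Succ.size(), std::vector<int>());
  std::vector<std::pair<int, std::size_t> > Stack;
  Stack.push_back(std::make_pair(Entry, std::size_t(0)));
  Visited[Entry] = OnStack[Entry] = true;
  while (!Stack.empty()) {
    int BB = Stack.back().first;
    std::size_t Next = Stack.back().second;
    if (Next < Succ[BB].size()) {
      Stack.back().second = Next + 1;
      int S = Succ[BB][Next];
      if (!Visited[S]) {
        Visited[S] = OnStack[S] = true;
        EffPreds[S].push_back(BB);          // tree edge: record and descend
        Stack.push_back(std::make_pair(S, std::size_t(0)));
      } else if (!OnStack[S]) {
        EffPreds[S].push_back(BB);          // forward/cross edge: keep it
      }                                     // backedge: ignored, as in the patch
      continue;
    }
    OnStack[BB] = false;                    // all successors done
    PostOrder.push_back(BB);
    Stack.pop_back();
  }
}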
- SmallVector<std::pair<BasicBlock *, pred_iterator>, 16> PredStack; - for (SmallVectorImpl<BasicBlock *>::iterator I = Exits.begin(), E = Exits.end(); - I != E; ++I) { - BasicBlock *ExitBB = *I; - PredStack.push_back(std::make_pair(ExitBB, pred_begin(ExitBB))); + MyStates.SetAsExit(); + + PredStack.push_back(std::make_pair(ExitBB, MyStates.pred_begin())); Visited.insert(ExitBB); while (!PredStack.empty()) { reverse_dfs_next_succ: - pred_iterator End = pred_end(PredStack.back().first); - while (PredStack.back().second != End) { + BBState::edge_iterator PE = BBStates[PredStack.back().first].pred_end(); + while (PredStack.back().second != PE) { BasicBlock *BB = *PredStack.back().second++; - // Skip backedges detected in the forward-CFG DFS. - if (Backedges.count(std::make_pair(BB, PredStack.back().first))) - continue; if (Visited.insert(BB)) { - PredStack.push_back(std::make_pair(BB, pred_begin(BB))); + PredStack.push_back(std::make_pair(BB, BBStates[BB].pred_begin())); goto reverse_dfs_next_succ; } } @@ -3105,7 +3062,9 @@ ObjCARCOpt::Visit(Function &F, // function exit point, and we want to ignore selected cycle edges. SmallVector<BasicBlock *, 16> PostOrder; SmallVector<BasicBlock *, 16> ReverseCFGPostOrder; - ComputePostOrders(F, PostOrder, ReverseCFGPostOrder); + ComputePostOrders(F, PostOrder, ReverseCFGPostOrder, + NoObjCARCExceptionsMDKind, + BBStates); // Use reverse-postorder on the reverse CFG for bottom-up. bool BottomUpNestingDetected = false; @@ -3214,7 +3173,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> // not being managed by ObjC reference counting, so we can delete pairs // regardless of what possible decrements or uses lie between them. bool KnownSafe = isa<Constant>(Arg) || isa<AllocaInst>(Arg); - + // A constant pointer can't be pointing to an object on the heap. It may // be reference-counted, but it won't be deleted. if (const LoadInst *LI = dyn_cast<LoadInst>(Arg)) @@ -3375,6 +3334,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> // Ok, everything checks out and we're all set. Let's move some code! Changed = true; + assert(OldCount != 0 && "Unreachable code?"); AnyPairsCompletelyEliminated = NewCount == 0; NumRRs += OldCount - NewCount; MoveCalls(Arg, RetainsToMove, ReleasesToMove, @@ -3515,7 +3475,7 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { if (AllocaInst *Alloca = dyn_cast<AllocaInst>(Arg)) { for (Value::use_iterator UI = Alloca->use_begin(), UE = Alloca->use_end(); UI != UE; ++UI) { - Instruction *UserInst = cast<Instruction>(*UI); + const Instruction *UserInst = cast<Instruction>(*UI); switch (GetBasicInstructionClass(UserInst)) { case IC_InitWeak: case IC_StoreWeak: @@ -3529,8 +3489,18 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { for (Value::use_iterator UI = Alloca->use_begin(), UE = Alloca->use_end(); UI != UE; ) { CallInst *UserInst = cast<CallInst>(*UI++); - if (!UserInst->use_empty()) - UserInst->replaceAllUsesWith(UserInst->getArgOperand(0)); + switch (GetBasicInstructionClass(UserInst)) { + case IC_InitWeak: + case IC_StoreWeak: + // These functions return their second argument. + UserInst->replaceAllUsesWith(UserInst->getArgOperand(1)); + break; + case IC_DestroyWeak: + // No return value. 
+ break; + default: + llvm_unreachable("alloca really is used!"); + } UserInst->eraseFromParent(); } Alloca->eraseFromParent(); @@ -3598,8 +3568,7 @@ void ObjCARCOpt::OptimizeReturns(Function &F) { dyn_cast_or_null<CallInst>(*DependingInstructions.begin()); if (!Autorelease) goto next_block; - InstructionClass AutoreleaseClass = - GetBasicInstructionClass(Autorelease); + InstructionClass AutoreleaseClass = GetBasicInstructionClass(Autorelease); if (!IsAutorelease(AutoreleaseClass)) goto next_block; if (GetObjCArg(Autorelease) != Arg) @@ -3690,7 +3659,7 @@ bool ObjCARCOpt::doInitialization(Module &M) { // Intuitively, objc_retain and others are nocapture, however in practice // they are not, because they return their argument value. And objc_release - // calls finalizers. + // calls finalizers which can have arbitrary side effects. // These are initialized lazily. RetainRVCallee = 0; @@ -3742,8 +3711,8 @@ bool ObjCARCOpt::runOnFunction(Function &F) { while (OptimizeSequences(F)) {} // Optimizations if objc_autorelease is used. - if (UsedInThisFunction & - ((1 << IC_Autorelease) | (1 << IC_AutoreleaseRV))) + if (UsedInThisFunction & ((1 << IC_Autorelease) | + (1 << IC_AutoreleaseRV))) OptimizeReturns(F); return Changed; @@ -3791,7 +3760,7 @@ namespace { /// StoreStrongCalls - The set of inserted objc_storeStrong calls. If /// at the end of walking the function we have found no alloca /// instructions, these calls can be marked "tail". - DenseSet<CallInst *> StoreStrongCalls; + SmallPtrSet<CallInst *, 8> StoreStrongCalls; Constant *getStoreStrongCallee(Module *M); Constant *getRetainAutoreleaseCallee(Module *M); @@ -3842,13 +3811,11 @@ Constant *ObjCARCContract::getStoreStrongCallee(Module *M) { LLVMContext &C = M->getContext(); Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); Type *I8XX = PointerType::getUnqual(I8X); - std::vector<Type *> Params; - Params.push_back(I8XX); - Params.push_back(I8X); + Type *Params[] = { I8XX, I8X }; - AttrListPtr Attributes; - Attributes.addAttr(~0u, Attribute::NoUnwind); - Attributes.addAttr(1, Attribute::NoCapture); + AttrListPtr Attributes = AttrListPtr() + .addAttr(~0u, Attribute::NoUnwind) + .addAttr(1, Attribute::NoCapture); StoreStrongCallee = M->getOrInsertFunction( @@ -3863,12 +3830,9 @@ Constant *ObjCARCContract::getRetainAutoreleaseCallee(Module *M) { if (!RetainAutoreleaseCallee) { LLVMContext &C = M->getContext(); Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - std::vector<Type *> Params; - Params.push_back(I8X); - FunctionType *FTy = - FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttrListPtr Attributes; - Attributes.addAttr(~0u, Attribute::NoUnwind); + Type *Params[] = { I8X }; + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); RetainAutoreleaseCallee = M->getOrInsertFunction("objc_retainAutorelease", FTy, Attributes); } @@ -3879,12 +3843,9 @@ Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) { if (!RetainAutoreleaseRVCallee) { LLVMContext &C = M->getContext(); Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - std::vector<Type *> Params; - Params.push_back(I8X); - FunctionType *FTy = - FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttrListPtr Attributes; - Attributes.addAttr(~0u, Attribute::NoUnwind); + Type *Params[] = { I8X }; + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttrListPtr Attributes = AttrListPtr().addAttr(~0u, Attribute::NoUnwind); 
RetainAutoreleaseRVCallee = M->getOrInsertFunction("objc_retainAutoreleaseReturnValue", FTy, Attributes); @@ -3892,8 +3853,7 @@ Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) { return RetainAutoreleaseRVCallee; } -/// ContractAutorelease - Merge an autorelease with a retain into a fused -/// call. +/// ContractAutorelease - Merge an autorelease with a retain into a fused call. bool ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease, InstructionClass Class, @@ -3954,18 +3914,41 @@ void ObjCARCContract::ContractRelease(Instruction *Release, BasicBlock *BB = Release->getParent(); if (Load->getParent() != BB) return; - // Walk down to find the store. + // Walk down to find the store and the release, which may be in either order. BasicBlock::iterator I = Load, End = BB->end(); ++I; AliasAnalysis::Location Loc = AA->getLocation(Load); - while (I != End && - (&*I == Release || - IsRetain(GetBasicInstructionClass(I)) || - !(AA->getModRefInfo(I, Loc) & AliasAnalysis::Mod))) - ++I; - StoreInst *Store = dyn_cast<StoreInst>(I); - if (!Store || !Store->isSimple()) return; - if (Store->getPointerOperand() != Loc.Ptr) return; + StoreInst *Store = 0; + bool SawRelease = false; + for (; !Store || !SawRelease; ++I) { + if (I == End) + return; + + Instruction *Inst = I; + if (Inst == Release) { + SawRelease = true; + continue; + } + + InstructionClass Class = GetBasicInstructionClass(Inst); + + // Unrelated retains are harmless. + if (IsRetain(Class)) + continue; + + if (Store) { + // The store is the point where we're going to put the objc_storeStrong, + // so make sure there are no uses after it. + if (CanUse(Inst, Load, PA, Class)) + return; + } else if (AA->getModRefInfo(Inst, Loc) & AliasAnalysis::Mod) { + // We are moving the load down to the store, so check for anything + // else which writes to the memory between the load and the store. + Store = dyn_cast<StoreInst>(Inst); + if (!Store || !Store->isSimple()) return; + if (Store->getPointerOperand() != Loc.Ptr) return; + } + } Value *New = StripPointerCastsAndObjCCalls(Store->getValueOperand()); @@ -4053,7 +4036,8 @@ bool ObjCARCContract::runOnFunction(Function &F) { // It seems that functions which "return twice" are also unsafe for the // "tail" argument, because they are setjmp, which could need to // return to an earlier stack state. - bool TailOkForStoreStrongs = !F.isVarArg() && !F.callsFunctionThatReturnsTwice(); + bool TailOkForStoreStrongs = !F.isVarArg() && + !F.callsFunctionThatReturnsTwice(); // For ObjC library calls which return their argument, replace uses of the // argument with uses of the call return value, if it dominates the use. This @@ -4083,8 +4067,22 @@ bool ObjCARCContract::runOnFunction(Function &F) { if (!RetainRVMarker) break; BasicBlock::iterator BBI = Inst; - --BBI; - while (isNoopInstruction(BBI)) --BBI; + BasicBlock *InstParent = Inst->getParent(); + + // Step up to see if the call immediately precedes the RetainRV call. + // If it's an invoke, we have to cross a block boundary. And we have + // to carefully dodge no-op instructions. 
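// Illustrative sketch, not part of the patch: the shape of the rewritten
// ContractRelease scan above, run over a toy instruction stream. "ToyKind"
// stands in for InstructionClass, and the WritesLoc / UsesLoadedVal flags stand
// in for the AliasAnalysis Mod query and the CanUse check; all of these are
// assumptions made so the snippet stands alone.
#include <cstddef>
#include <vector>

enum ToyKind { ToyOther, ToyRetain, ToyStore };

struct ToyInst {
  ToyKind Kind;
  bool WritesLoc;       // would AA report Mod on the loaded location?
  bool UsesLoadedVal;   // would CanUse fire on the loaded value?
};

// Walk forward from just past the load, looking for the store and the release
// in either order. Returns the index of the store, or -1 to decline.
static int findStoreAndRelease(const std::vector<ToyInst> &Insts,
                               std::size_t LoadIdx, std::size_t ReleaseIdx) {
  int Store = -1;
  bool SawRelease = false;
  for (std::size_t I = LoadIdx + 1; Store == -1 || !SawRelease; ++I) {
    if (I == Insts.size())
      return -1;                                // ran off the end of the block
    if (I == ReleaseIdx) {
      SawRelease = true;
      continue;
    }
    if (Insts[I].Kind == ToyRetain)
      continue;                                 // unrelated retains are harmless
    if (Store != -1) {
      if (Insts[I].UsesLoadedVal)
        return -1;                              // nothing may use the value past the store
    } else if (Insts[I].WritesLoc) {
      if (Insts[I].Kind != ToyStore)
        return -1;                              // a clobber that is not the store we want
      Store = static_cast<int>(I);
    }
  }
  return Store;
}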
+ do { + if (&*BBI == InstParent->begin()) { + BasicBlock *Pred = InstParent->getSinglePredecessor(); + if (!Pred) + goto decline_rv_optimization; + BBI = Pred->getTerminator(); + break; + } + --BBI; + } while (isNoopInstruction(BBI)); + if (&*BBI == GetObjCArg(Inst)) { Changed = true; InlineAsm *IA = @@ -4094,6 +4092,7 @@ bool ObjCARCContract::runOnFunction(Function &F) { /*Constraints=*/"", /*hasSideEffects=*/true); CallInst::Create(IA, "", Inst); } + decline_rv_optimization: break; } case IC_InitWeak: { @@ -4143,25 +4142,21 @@ bool ObjCARCContract::runOnFunction(Function &F) { // trivially dominate itself, which would lead us to rewriting its // argument in terms of its return value, which would lead to // infinite loops in GetObjCArg. - if (DT->isReachableFromEntry(U) && - DT->dominates(Inst, U)) { + if (DT->isReachableFromEntry(U) && DT->dominates(Inst, U)) { Changed = true; Instruction *Replacement = Inst; Type *UseTy = U.get()->getType(); if (PHINode *PHI = dyn_cast<PHINode>(U.getUser())) { // For PHI nodes, insert the bitcast in the predecessor block. - unsigned ValNo = - PHINode::getIncomingValueNumForOperand(OperandNo); - BasicBlock *BB = - PHI->getIncomingBlock(ValNo); + unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo); + BasicBlock *BB = PHI->getIncomingBlock(ValNo); if (Replacement->getType() != UseTy) Replacement = new BitCastInst(Replacement, UseTy, "", &BB->back()); // While we're here, rewrite all edges for this PHI, rather // than just one use at a time, to minimize the number of // bitcasts we emit. - for (unsigned i = 0, e = PHI->getNumIncomingValues(); - i != e; ++i) + for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i) if (PHI->getIncomingBlock(i) == BB) { // Keep the UI iterator valid. if (&PHI->getOperandUse( @@ -4179,8 +4174,7 @@ bool ObjCARCContract::runOnFunction(Function &F) { } } - // If Arg is a no-op casted pointer, strip one level of casts and - // iterate. + // If Arg is a no-op casted pointer, strip one level of casts and iterate. if (const BitCastInst *BI = dyn_cast<BitCastInst>(Arg)) Arg = BI->getOperand(0); else if (isa<GEPOperator>(Arg) && @@ -4197,7 +4191,7 @@ bool ObjCARCContract::runOnFunction(Function &F) { // If this function has no escaping allocas or suspicious vararg usage, // objc_storeStrong calls can be marked with the "tail" keyword. 
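// Illustrative sketch, not part of the patch: the backwards walk performed by
// the new do/while above, on a toy block type. Starting just before the
// retainRV call, skip no-op instructions; if the start of the block is reached,
// the walk may only continue to the terminator of a unique predecessor (the
// invoke case), otherwise the optimization is declined.
#include <cstddef>
#include <vector>

struct ToyBB {
  std::vector<int> Insts;          // instruction ids; negative ids are no-ops
  const ToyBB *SinglePred;         // null when there is no unique predecessor
};

// Returns the id of the candidate instruction immediately preceding position
// Pos (dodging no-ops), or -1 if the walk has to be declined.
static int stepUpOverNoops(const ToyBB *BB, std::size_t Pos) {
  do {
    if (Pos == 0) {
      // Hit the start of the block: cross into the unique predecessor and
      // take its terminator, mirroring the invoke case in the patch.
      if (!BB->SinglePred || BB->SinglePred->Insts.empty())
        return -1;
      return BB->SinglePred->Insts.back();
    }
    --Pos;
  } while (BB->Insts[Pos] < 0);    // keep stepping over no-ops
  return BB->Insts[Pos];
}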
if (TailOkForStoreStrongs) - for (DenseSet<CallInst *>::iterator I = StoreStrongCalls.begin(), + for (SmallPtrSet<CallInst *, 8>::iterator I = StoreStrongCalls.begin(), E = StoreStrongCalls.end(); I != E; ++I) (*I)->setTailCall(); StoreStrongCalls.clear(); diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index 5de00d1..09687d8 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -26,21 +26,23 @@ #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" +#include "llvm/IRBuilder.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/Pass.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Assembly/Writer.h" #include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/PostOrderIterator.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/DenseMap.h" #include <algorithm> using namespace llvm; -STATISTIC(NumLinear , "Number of insts linearized"); STATISTIC(NumChanged, "Number of insts reassociated"); STATISTIC(NumAnnihil, "Number of expr tree annihilated"); STATISTIC(NumFactor , "Number of multiplies factored"); @@ -70,13 +72,51 @@ static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) { } } #endif - + +namespace { + /// \brief Utility class representing a base and exponent pair which form one + /// factor of some product. + struct Factor { + Value *Base; + unsigned Power; + + Factor(Value *Base, unsigned Power) : Base(Base), Power(Power) {} + + /// \brief Sort factors by their Base. + struct BaseSorter { + bool operator()(const Factor &LHS, const Factor &RHS) { + return LHS.Base < RHS.Base; + } + }; + + /// \brief Compare factors for equal bases. + struct BaseEqual { + bool operator()(const Factor &LHS, const Factor &RHS) { + return LHS.Base == RHS.Base; + } + }; + + /// \brief Sort factors in descending order by their power. + struct PowerDescendingSorter { + bool operator()(const Factor &LHS, const Factor &RHS) { + return LHS.Power > RHS.Power; + } + }; + + /// \brief Compare factors for equal powers. 
+ struct PowerEqual { + bool operator()(const Factor &LHS, const Factor &RHS) { + return LHS.Power == RHS.Power; + } + }; + }; +} + namespace { class Reassociate : public FunctionPass { DenseMap<BasicBlock*, unsigned> RankMap; DenseMap<AssertingVH<Value>, unsigned> ValueRankMap; - SmallVector<WeakVH, 8> RedoInsts; - SmallVector<WeakVH, 8> DeadInsts; + SetVector<AssertingVH<Instruction> > RedoInsts; bool MadeChange; public: static char ID; // Pass identification, replacement for typeid @@ -92,18 +132,19 @@ namespace { private: void BuildRankMap(Function &F); unsigned getRank(Value *V); - Value *ReassociateExpression(BinaryOperator *I); - void RewriteExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops, - unsigned Idx = 0); + void ReassociateExpression(BinaryOperator *I); + void RewriteExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops); Value *OptimizeExpression(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops); Value *OptimizeAdd(Instruction *I, SmallVectorImpl<ValueEntry> &Ops); - void LinearizeExprTree(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops); - void LinearizeExpr(BinaryOperator *I); + bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops, + SmallVectorImpl<Factor> &Factors); + Value *buildMinimalMultiplyDAG(IRBuilder<> &Builder, + SmallVectorImpl<Factor> &Factors); + Value *OptimizeMul(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops); Value *RemoveFactorFromExpression(Value *V, Value *Factor); - void ReassociateInst(BasicBlock::iterator &BBI); - - void RemoveDeadBinaryOp(Value *V); + void EraseInst(Instruction *I); + void OptimizeInst(Instruction *I); }; } @@ -114,28 +155,24 @@ INITIALIZE_PASS(Reassociate, "reassociate", // Public interface to the Reassociate pass FunctionPass *llvm::createReassociatePass() { return new Reassociate(); } -void Reassociate::RemoveDeadBinaryOp(Value *V) { - Instruction *Op = dyn_cast<Instruction>(V); - if (!Op || !isa<BinaryOperator>(Op)) - return; - - Value *LHS = Op->getOperand(0), *RHS = Op->getOperand(1); - - ValueRankMap.erase(Op); - DeadInsts.push_back(Op); - RemoveDeadBinaryOp(LHS); - RemoveDeadBinaryOp(RHS); +/// isReassociableOp - Return true if V is an instruction of the specified +/// opcode and if it only has one use. +static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) { + if (V->hasOneUse() && isa<Instruction>(V) && + cast<Instruction>(V)->getOpcode() == Opcode) + return cast<BinaryOperator>(V); + return 0; } - static bool isUnmovableInstruction(Instruction *I) { if (I->getOpcode() == Instruction::PHI || + I->getOpcode() == Instruction::LandingPad || I->getOpcode() == Instruction::Alloca || I->getOpcode() == Instruction::Load || I->getOpcode() == Instruction::Invoke || (I->getOpcode() == Instruction::Call && !isa<DbgInfoIntrinsic>(I)) || - I->getOpcode() == Instruction::UDiv || + I->getOpcode() == Instruction::UDiv || I->getOpcode() == Instruction::SDiv || I->getOpcode() == Instruction::FDiv || I->getOpcode() == Instruction::URem || @@ -198,211 +235,572 @@ unsigned Reassociate::getRank(Value *V) { return ValueRankMap[I] = Rank; } -/// isReassociableOp - Return true if V is an instruction of the specified -/// opcode and if it only has one use. -static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) { - if ((V->hasOneUse() || V->use_empty()) && isa<Instruction>(V) && - cast<Instruction>(V)->getOpcode() == Opcode) - return cast<BinaryOperator>(V); - return 0; -} - /// LowerNegateToMultiply - Replace 0-X with X*-1. 
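// Illustrative sketch, not part of the patch: one plausible way the comparator
// structs in the new Factor class above get used -- sort by base so equal bases
// become adjacent and can be folded into a single factor, then order the result
// by descending power. The Value* bases are replaced by plain ints so the
// snippet stands alone; the folding step itself is an assumption made for the
// example rather than a transcription of OptimizeMul.
#include <algorithm>
#include <cstddef>
#include <vector>

struct ToyFactor {
  int Base;                        // stands in for Value *Base
  unsigned Power;

  struct BaseSorter {
    bool operator()(const ToyFactor &L, const ToyFactor &R) const {
      return L.Base < R.Base;
    }
  };
  struct PowerDescendingSorter {
    bool operator()(const ToyFactor &L, const ToyFactor &R) const {
      return L.Power > R.Power;
    }
  };
};

static void canonicalizeFactors(std::vector<ToyFactor> &Factors) {
  std::sort(Factors.begin(), Factors.end(), ToyFactor::BaseSorter());
  std::vector<ToyFactor> Merged;
  for (std::size_t i = 0; i != Factors.size(); ++i) {
    // Fold x^a * x^b into x^(a+b) now that equal bases are adjacent.
    if (!Merged.empty() && Merged.back().Base == Factors[i].Base)
      Merged.back().Power += Factors[i].Power;
    else
      Merged.push_back(Factors[i]);
  }
  std::sort(Merged.begin(), Merged.end(), ToyFactor::PowerDescendingSorter());
  Factors.swap(Merged);
}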
/// -static Instruction *LowerNegateToMultiply(Instruction *Neg, - DenseMap<AssertingVH<Value>, unsigned> &ValueRankMap) { +static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) { Constant *Cst = Constant::getAllOnesValue(Neg->getType()); - Instruction *Res = BinaryOperator::CreateMul(Neg->getOperand(1), Cst, "",Neg); - ValueRankMap.erase(Neg); + BinaryOperator *Res = + BinaryOperator::CreateMul(Neg->getOperand(1), Cst, "",Neg); + Neg->setOperand(1, Constant::getNullValue(Neg->getType())); // Drop use of op. Res->takeName(Neg); Neg->replaceAllUsesWith(Res); Res->setDebugLoc(Neg->getDebugLoc()); - Neg->eraseFromParent(); return Res; } -// Given an expression of the form '(A+B)+(D+C)', turn it into '(((A+B)+C)+D)'. -// Note that if D is also part of the expression tree that we recurse to -// linearize it as well. Besides that case, this does not recurse into A,B, or -// C. -void Reassociate::LinearizeExpr(BinaryOperator *I) { - BinaryOperator *LHS = cast<BinaryOperator>(I->getOperand(0)); - BinaryOperator *RHS = cast<BinaryOperator>(I->getOperand(1)); - assert(isReassociableOp(LHS, I->getOpcode()) && - isReassociableOp(RHS, I->getOpcode()) && - "Not an expression that needs linearization?"); - - DEBUG(dbgs() << "Linear" << *LHS << '\n' << *RHS << '\n' << *I << '\n'); - - // Move the RHS instruction to live immediately before I, avoiding breaking - // dominator properties. - RHS->moveBefore(I); - - // Move operands around to do the linearization. - I->setOperand(1, RHS->getOperand(0)); - RHS->setOperand(0, LHS); - I->setOperand(0, RHS); - - // Conservatively clear all the optional flags, which may not hold - // after the reassociation. - I->clearSubclassOptionalData(); - LHS->clearSubclassOptionalData(); - RHS->clearSubclassOptionalData(); - - ++NumLinear; - MadeChange = true; - DEBUG(dbgs() << "Linearized: " << *I << '\n'); - - // If D is part of this expression tree, tail recurse. - if (isReassociableOp(I->getOperand(1), I->getOpcode())) - LinearizeExpr(I); +/// CarmichaelShift - Returns k such that lambda(2^Bitwidth) = 2^k, where lambda +/// is the Carmichael function. This means that x^(2^k) === 1 mod 2^Bitwidth for +/// every odd x, i.e. x^(2^k) = 1 for every odd x in Bitwidth-bit arithmetic. +/// Note that 0 <= k < Bitwidth, and if Bitwidth > 3 then x^(2^k) = 0 for every +/// even x in Bitwidth-bit arithmetic. +static unsigned CarmichaelShift(unsigned Bitwidth) { + if (Bitwidth < 3) + return Bitwidth - 1; + return Bitwidth - 2; +} + +/// IncorporateWeight - Add the extra weight 'RHS' to the existing weight 'LHS', +/// reducing the combined weight using any special properties of the operation. +/// The existing weight LHS represents the computation X op X op ... op X where +/// X occurs LHS times. The combined weight represents X op X op ... op X with +/// X occurring LHS + RHS times. If op is "Xor" for example then the combined +/// operation is equivalent to X if LHS + RHS is odd, or 0 if LHS + RHS is even; +/// the routine returns 1 in LHS in the first case, and 0 in LHS in the second. +static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) { + // If we were working with infinite precision arithmetic then the combined + // weight would be LHS + RHS. 
But we are using finite precision arithmetic, + // and the APInt sum LHS + RHS may not be correct if it wraps (it is correct + // for nilpotent operations and addition, but not for idempotent operations + // and multiplication), so it is important to correctly reduce the combined + // weight back into range if wrapping would be wrong. + + // If RHS is zero then the weight didn't change. + if (RHS.isMinValue()) + return; + // If LHS is zero then the combined weight is RHS. + if (LHS.isMinValue()) { + LHS = RHS; + return; + } + // From this point on we know that neither LHS nor RHS is zero. + + if (Instruction::isIdempotent(Opcode)) { + // Idempotent means X op X === X, so any non-zero weight is equivalent to a + // weight of 1. Keeping weights at zero or one also means that wrapping is + // not a problem. + assert(LHS == 1 && RHS == 1 && "Weights not reduced!"); + return; // Return a weight of 1. + } + if (Instruction::isNilpotent(Opcode)) { + // Nilpotent means X op X === 0, so reduce weights modulo 2. + assert(LHS == 1 && RHS == 1 && "Weights not reduced!"); + LHS = 0; // 1 + 1 === 0 modulo 2. + return; + } + if (Opcode == Instruction::Add) { + // TODO: Reduce the weight by exploiting nsw/nuw? + LHS += RHS; + return; + } + + assert(Opcode == Instruction::Mul && "Unknown associative operation!"); + unsigned Bitwidth = LHS.getBitWidth(); + // If CM is the Carmichael number then a weight W satisfying W >= CM+Bitwidth + // can be replaced with W-CM. That's because x^W=x^(W-CM) for every Bitwidth + // bit number x, since either x is odd in which case x^CM = 1, or x is even in + // which case both x^W and x^(W - CM) are zero. By subtracting off multiples + // of CM like this weights can always be reduced to the range [0, CM+Bitwidth) + // which by a happy accident means that they can always be represented using + // Bitwidth bits. + // TODO: Reduce the weight by exploiting nsw/nuw? (Could do much better than + // the Carmichael number). + if (Bitwidth > 3) { + /// CM - The value of Carmichael's lambda function. + APInt CM = APInt::getOneBitSet(Bitwidth, CarmichaelShift(Bitwidth)); + // Any weight W >= Threshold can be replaced with W - CM. + APInt Threshold = CM + Bitwidth; + assert(LHS.ult(Threshold) && RHS.ult(Threshold) && "Weights not reduced!"); + // For Bitwidth 4 or more the following sum does not overflow. + LHS += RHS; + while (LHS.uge(Threshold)) + LHS -= CM; + } else { + // To avoid problems with overflow do everything the same as above but using + // a larger type. + unsigned CM = 1U << CarmichaelShift(Bitwidth); + unsigned Threshold = CM + Bitwidth; + assert(LHS.getZExtValue() < Threshold && RHS.getZExtValue() < Threshold && + "Weights not reduced!"); + unsigned Total = LHS.getZExtValue() + RHS.getZExtValue(); + while (Total >= Threshold) + Total -= CM; + LHS = Total; + } } +/// EvaluateRepeatedConstant - Compute C op C op ... op C where the constant C +/// is repeated Weight times. +static Constant *EvaluateRepeatedConstant(unsigned Opcode, Constant *C, + APInt Weight) { + // For addition the result can be efficiently computed as the product of the + // constant and the weight. + if (Opcode == Instruction::Add) + return ConstantExpr::getMul(C, ConstantInt::get(C->getContext(), Weight)); + + // The weight might be huge, so compute by repeated squaring to ensure that + // compile time is proportional to the logarithm of the weight. + Constant *Result = 0; + Constant *Power = C; // Successively C, C op C, (C op C) op (C op C) etc. + // Visit the bits in Weight. 
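// Illustrative sketch, not part of the patch: the weight arithmetic described
// above, specialised to multiplication and to ordinary unsigned integers so it
// can be checked by hand. For Bitwidth >= 3, lambda(2^Bitwidth) = 2^(Bitwidth-2)
// = CM, and any weight W >= CM + Bitwidth can be replaced by W - CM.
#include <cassert>

static unsigned carmichaelShift(unsigned Bitwidth) {
  return Bitwidth < 3 ? Bitwidth - 1 : Bitwidth - 2;
}

// Combine two multiplication weights for a Bitwidth-bit type, keeping the
// result inside the representable range [0, CM + Bitwidth).
static unsigned combineMulWeights(unsigned LHS, unsigned RHS, unsigned Bitwidth) {
  unsigned CM = 1u << carmichaelShift(Bitwidth);
  unsigned Threshold = CM + Bitwidth;
  unsigned Total = LHS + RHS;
  while (Total >= Threshold)
    Total -= CM;
  return Total;
}

// For 8-bit values CM = 64 and the threshold is 72, so weights 50 and 30
// combine to 80, which reduces to 16: x^80 == x^16 for every 8-bit x (odd x
// because x^64 == 1 mod 256, even x because both exponents are at least 8 and
// so both powers are 0 mod 256).
static void exampleMulWeights() {
  assert(combineMulWeights(50, 30, 8) == 16);
}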
+ while (Weight != 0) { + // If the current bit in Weight is non-zero do Result = Result op Power. + if (Weight[0]) + Result = Result ? ConstantExpr::get(Opcode, Result, Power) : Power; + // Move on to the next bit if any more are non-zero. + Weight = Weight.lshr(1); + if (Weight.isMinValue()) + break; + // Square the power. + Power = ConstantExpr::get(Opcode, Power, Power); + } + + assert(Result && "Only positive weights supported!"); + return Result; +} -/// LinearizeExprTree - Given an associative binary expression tree, traverse -/// all of the uses putting it into canonical form. This forces a left-linear -/// form of the expression (((a+b)+c)+d), and collects information about the -/// rank of the non-tree operands. +typedef std::pair<Value*, APInt> RepeatedValue; + +/// LinearizeExprTree - Given an associative binary expression, return the leaf +/// nodes in Ops along with their weights (how many times the leaf occurs). The +/// original expression is the same as +/// (Ops[0].first op Ops[0].first op ... Ops[0].first) <- Ops[0].second times +/// op +/// (Ops[1].first op Ops[1].first op ... Ops[1].first) <- Ops[1].second times +/// op +/// ... +/// op +/// (Ops[N].first op Ops[N].first op ... Ops[N].first) <- Ops[N].second times +/// +/// Note that the values Ops[0].first, ..., Ops[N].first are all distinct, and +/// they are all non-constant except possibly for the last one, which if it is +/// constant will have weight one (Ops[N].second === 1). +/// +/// This routine may modify the function, in which case it returns 'true'. The +/// changes it makes may well be destructive, changing the value computed by 'I' +/// to something completely different. Thus if the routine returns 'true' then +/// you MUST either replace I with a new expression computed from the Ops array, +/// or use RewriteExprTree to put the values back in. +/// +/// A leaf node is either not a binary operation of the same kind as the root +/// node 'I' (i.e. is not a binary operator at all, or is, but with a different +/// opcode), or is the same kind of binary operator but has a use which either +/// does not belong to the expression, or does belong to the expression but is +/// a leaf node. Every leaf node has at least one use that is a non-leaf node +/// of the expression, while for non-leaf nodes (except for the root 'I') every +/// use is a non-leaf node of the expression. +/// +/// For example: +/// expression graph node names +/// +/// + | I +/// / \ | +/// + + | A, B +/// / \ / \ | +/// * + * | C, D, E +/// / \ / \ / \ | +/// + * | F, G +/// +/// The leaf nodes are C, E, F and G. The Ops array will contain (maybe not in +/// that order) (C, 1), (E, 1), (F, 2), (G, 2). /// -/// NOTE: These intentionally destroys the expression tree operands (turning -/// them into undef values) to reduce #uses of the values. This means that the -/// caller MUST use something like RewriteExprTree to put the values back in. +/// The expression is maximal: if some instruction is a binary operator of the +/// same kind as 'I', and all of its uses are non-leaf nodes of the expression, +/// then the instruction also belongs to the expression, is not a leaf node of +/// it, and its operands also belong to the expression (but may be leaf nodes). 
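// Illustrative sketch, not part of the patch: the repeated-squaring loop used
// by EvaluateRepeatedConstant above, written for plain 64-bit unsigned
// arithmetic (which, like iN constant folding, wraps modulo a power of two).
// Only O(log Weight) multiplications are needed even for huge weights.
#include <cassert>
#include <stdint.h>

static uint64_t repeatedMul(uint64_t C, uint64_t Weight) {
  assert(Weight != 0 && "Only positive weights supported!");
  uint64_t Result = 0;
  bool HaveResult = false;
  uint64_t Power = C;                 // successively C, C*C, (C*C)*(C*C), ...
  while (Weight != 0) {
    if (Weight & 1) {                 // this bit of the weight contributes Power
      Result = HaveResult ? Result * Power : Power;
      HaveResult = true;
    }
    Weight >>= 1;
    if (Weight == 0)
      break;
    Power *= Power;                   // square for the next bit
  }
  return Result;
}

// repeatedMul(3, 5) multiplies only three times yet equals 3*3*3*3*3 == 243.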
/// -void Reassociate::LinearizeExprTree(BinaryOperator *I, - SmallVectorImpl<ValueEntry> &Ops) { - Value *LHS = I->getOperand(0), *RHS = I->getOperand(1); +/// NOTE: This routine will set operands of non-leaf non-root nodes to undef in +/// order to ensure that every non-root node in the expression has *exactly one* +/// use by a non-leaf node of the expression. This destruction means that the +/// caller MUST either replace 'I' with a new expression or use something like +/// RewriteExprTree to put the values back in if the routine indicates that it +/// made a change by returning 'true'. +/// +/// In the above example either the right operand of A or the left operand of B +/// will be replaced by undef. If it is B's operand then this gives: +/// +/// + | I +/// / \ | +/// + + | A, B - operand of B replaced with undef +/// / \ \ | +/// * + * | C, D, E +/// / \ / \ / \ | +/// + * | F, G +/// +/// Note that such undef operands can only be reached by passing through 'I'. +/// For example, if you visit operands recursively starting from a leaf node +/// then you will never see such an undef operand unless you get back to 'I', +/// which requires passing through a phi node. +/// +/// Note that this routine may also mutate binary operators of the wrong type +/// that have all uses inside the expression (i.e. only used by non-leaf nodes +/// of the expression) if it can turn them into binary operators of the right +/// type and thus make the expression bigger. + +static bool LinearizeExprTree(BinaryOperator *I, + SmallVectorImpl<RepeatedValue> &Ops) { + DEBUG(dbgs() << "LINEARIZE: " << *I << '\n'); + unsigned Bitwidth = I->getType()->getScalarType()->getPrimitiveSizeInBits(); unsigned Opcode = I->getOpcode(); + assert(Instruction::isAssociative(Opcode) && + Instruction::isCommutative(Opcode) && + "Expected an associative and commutative operation!"); + // If we see an absorbing element then the entire expression must be equal to + // it. For example, if this is a multiplication expression and zero occurs as + // an operand somewhere in it then the result of the expression must be zero. + Constant *Absorber = ConstantExpr::getBinOpAbsorber(Opcode, I->getType()); + + // Visit all operands of the expression, keeping track of their weight (the + // number of paths from the expression root to the operand, or if you like + // the number of times that operand occurs in the linearized expression). + // For example, if I = X + A, where X = A + B, then I, X and B have weight 1 + // while A has weight two. + + // Worklist of non-leaf nodes (their operands are in the expression too) along + // with their weights, representing a certain number of paths to the operator. + // If an operator occurs in the worklist multiple times then we found multiple + // ways to get to it. + SmallVector<std::pair<BinaryOperator*, APInt>, 8> Worklist; // (Op, Weight) + Worklist.push_back(std::make_pair(I, APInt(Bitwidth, 1))); + bool MadeChange = false; + + // Leaves of the expression are values that either aren't the right kind of + // operation (eg: a constant, or a multiply in an add tree), or are, but have + // some uses that are not inside the expression. For example, in I = X + X, + // X = A + B, the value X has two uses (by I) that are in the expression. If + // X has any other uses, for example in a return instruction, then we consider + // X to be a leaf, and won't analyze it further. When we first visit a value, + // if it has more than one use then at first we conservatively consider it to + // be a leaf. 
Later, as the expression is explored, we may discover some more + // uses of the value from inside the expression. If all uses turn out to be + // from within the expression (and the value is a binary operator of the right + // kind) then the value is no longer considered to be a leaf, and its operands + // are explored. + + // Leaves - Keeps track of the set of putative leaves as well as the number of + // paths to each leaf seen so far. + typedef DenseMap<Value*, APInt> LeafMap; + LeafMap Leaves; // Leaf -> Total weight so far. + SmallVector<Value*, 8> LeafOrder; // Ensure deterministic leaf output order. - // First step, linearize the expression if it is in ((A+B)+(C+D)) form. - BinaryOperator *LHSBO = isReassociableOp(LHS, Opcode); - BinaryOperator *RHSBO = isReassociableOp(RHS, Opcode); +#ifndef NDEBUG + SmallPtrSet<Value*, 8> Visited; // For sanity checking the iteration scheme. +#endif + while (!Worklist.empty()) { + std::pair<BinaryOperator*, APInt> P = Worklist.pop_back_val(); + I = P.first; // We examine the operands of this binary operator. + + for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) { // Visit operands. + Value *Op = I->getOperand(OpIdx); + APInt Weight = P.second; // Number of paths to this operand. + DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n"); + assert(!Op->use_empty() && "No uses, so how did we get to it?!"); + + // If the expression contains an absorbing element then there is no need + // to analyze it further: it must evaluate to the absorbing element. + if (Op == Absorber && !Weight.isMinValue()) { + Ops.push_back(std::make_pair(Absorber, APInt(Bitwidth, 1))); + return MadeChange; + } - // If this is a multiply expression tree and it contains internal negations, - // transform them into multiplies by -1 so they can be reassociated. - if (I->getOpcode() == Instruction::Mul) { - if (!LHSBO && LHS->hasOneUse() && BinaryOperator::isNeg(LHS)) { - LHS = LowerNegateToMultiply(cast<Instruction>(LHS), ValueRankMap); - LHSBO = isReassociableOp(LHS, Opcode); - } - if (!RHSBO && RHS->hasOneUse() && BinaryOperator::isNeg(RHS)) { - RHS = LowerNegateToMultiply(cast<Instruction>(RHS), ValueRankMap); - RHSBO = isReassociableOp(RHS, Opcode); + // If this is a binary operation of the right kind with only one use then + // add its operands to the expression. + if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) { + assert(Visited.insert(Op) && "Not first visit!"); + DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n"); + Worklist.push_back(std::make_pair(BO, Weight)); + continue; + } + + // Appears to be a leaf. Is the operand already in the set of leaves? + LeafMap::iterator It = Leaves.find(Op); + if (It == Leaves.end()) { + // Not in the leaf map. Must be the first time we saw this operand. + assert(Visited.insert(Op) && "Not first visit!"); + if (!Op->hasOneUse()) { + // This value has uses not accounted for by the expression, so it is + // not safe to modify. Mark it as being a leaf. + DEBUG(dbgs() << "ADD USES LEAF: " << *Op << " (" << Weight << ")\n"); + LeafOrder.push_back(Op); + Leaves[Op] = Weight; + continue; + } + // No uses outside the expression, try morphing it. + } else if (It != Leaves.end()) { + // Already in the leaf map. + assert(Visited.count(Op) && "In leaf map but not visited!"); + + // Update the number of paths to the leaf. + IncorporateWeight(It->second, Weight, Opcode); + +#if 0 // TODO: Re-enable once PR13021 is fixed. + // The leaf already has one use from inside the expression. 
As we want + // exactly one such use, drop this new use of the leaf. + assert(!Op->hasOneUse() && "Only one use, but we got here twice!"); + I->setOperand(OpIdx, UndefValue::get(I->getType())); + MadeChange = true; + + // If the leaf is a binary operation of the right kind and we now see + // that its multiple original uses were in fact all by nodes belonging + // to the expression, then no longer consider it to be a leaf and add + // its operands to the expression. + if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) { + DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n"); + Worklist.push_back(std::make_pair(BO, It->second)); + Leaves.erase(It); + continue; + } +#endif + + // If we still have uses that are not accounted for by the expression + // then it is not safe to modify the value. + if (!Op->hasOneUse()) + continue; + + // No uses outside the expression, try morphing it. + Weight = It->second; + Leaves.erase(It); // Since the value may be morphed below. + } + + // At this point we have a value which, first of all, is not a binary + // expression of the right kind, and secondly, is only used inside the + // expression. This means that it can safely be modified. See if we + // can usefully morph it into an expression of the right kind. + assert((!isa<Instruction>(Op) || + cast<Instruction>(Op)->getOpcode() != Opcode) && + "Should have been handled above!"); + assert(Op->hasOneUse() && "Has uses outside the expression tree!"); + + // If this is a multiply expression, turn any internal negations into + // multiplies by -1 so they can be reassociated. + BinaryOperator *BO = dyn_cast<BinaryOperator>(Op); + if (Opcode == Instruction::Mul && BO && BinaryOperator::isNeg(BO)) { + DEBUG(dbgs() << "MORPH LEAF: " << *Op << " (" << Weight << ") TO "); + BO = LowerNegateToMultiply(BO); + DEBUG(dbgs() << *BO << 'n'); + Worklist.push_back(std::make_pair(BO, Weight)); + MadeChange = true; + continue; + } + + // Failed to morph into an expression of the right type. This really is + // a leaf. + DEBUG(dbgs() << "ADD LEAF: " << *Op << " (" << Weight << ")\n"); + assert(!isReassociableOp(Op, Opcode) && "Value was morphed?"); + LeafOrder.push_back(Op); + Leaves[Op] = Weight; } } - if (!LHSBO) { - if (!RHSBO) { - // Neither the LHS or RHS as part of the tree, thus this is a leaf. As - // such, just remember these operands and their rank. - Ops.push_back(ValueEntry(getRank(LHS), LHS)); - Ops.push_back(ValueEntry(getRank(RHS), RHS)); - - // Clear the leaves out. - I->setOperand(0, UndefValue::get(I->getType())); - I->setOperand(1, UndefValue::get(I->getType())); - return; + // The leaves, repeated according to their weights, represent the linearized + // form of the expression. + Constant *Cst = 0; // Accumulate constants here. + for (unsigned i = 0, e = LeafOrder.size(); i != e; ++i) { + Value *V = LeafOrder[i]; + LeafMap::iterator It = Leaves.find(V); + if (It == Leaves.end()) + // Node initially thought to be a leaf wasn't. + continue; + assert(!isReassociableOp(V, Opcode) && "Shouldn't be a leaf!"); + APInt Weight = It->second; + if (Weight.isMinValue()) + // Leaf already output or weight reduction eliminated it. + continue; + // Ensure the leaf is only output once. + It->second = 0; + // Glob all constants together into Cst. + if (Constant *C = dyn_cast<Constant>(V)) { + C = EvaluateRepeatedConstant(Opcode, C, Weight); + Cst = Cst ? 
ConstantExpr::get(Opcode, Cst, C) : C; + continue; } - - // Turn X+(Y+Z) -> (Y+Z)+X - std::swap(LHSBO, RHSBO); - std::swap(LHS, RHS); - bool Success = !I->swapOperands(); - assert(Success && "swapOperands failed"); - (void)Success; - MadeChange = true; - } else if (RHSBO) { - // Turn (A+B)+(C+D) -> (((A+B)+C)+D). This guarantees the RHS is not - // part of the expression tree. - LinearizeExpr(I); - LHS = LHSBO = cast<BinaryOperator>(I->getOperand(0)); - RHS = I->getOperand(1); - RHSBO = 0; + // Add non-constant + Ops.push_back(std::make_pair(V, Weight)); } - // Okay, now we know that the LHS is a nested expression and that the RHS is - // not. Perform reassociation. - assert(!isReassociableOp(RHS, Opcode) && "LinearizeExpr failed!"); - - // Move LHS right before I to make sure that the tree expression dominates all - // values. - LHSBO->moveBefore(I); + // Add any constants back into Ops, all globbed together and reduced to having + // weight 1 for the convenience of users. + Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType()); + if (Cst && Cst != Identity) { + // If combining multiple constants resulted in the absorber then the entire + // expression must evaluate to the absorber. + if (Cst == Absorber) + Ops.clear(); + Ops.push_back(std::make_pair(Cst, APInt(Bitwidth, 1))); + } - // Linearize the expression tree on the LHS. - LinearizeExprTree(LHSBO, Ops); + // For nilpotent operations or addition there may be no operands, for example + // because the expression was "X xor X" or consisted of 2^Bitwidth additions: + // in both cases the weight reduces to 0 causing the value to be skipped. + if (Ops.empty()) { + assert(Identity && "Associative operation without identity!"); + Ops.push_back(std::make_pair(Identity, APInt(Bitwidth, 1))); + } - // Remember the RHS operand and its rank. - Ops.push_back(ValueEntry(getRank(RHS), RHS)); - - // Clear the RHS leaf out. - I->setOperand(1, UndefValue::get(I->getType())); + return MadeChange; } // RewriteExprTree - Now that the operands for this expression tree are -// linearized and optimized, emit them in-order. This function is written to be -// tail recursive. +// linearized and optimized, emit them in-order. void Reassociate::RewriteExprTree(BinaryOperator *I, - SmallVectorImpl<ValueEntry> &Ops, - unsigned i) { - if (i+2 == Ops.size()) { - if (I->getOperand(0) != Ops[i].Op || - I->getOperand(1) != Ops[i+1].Op) { - Value *OldLHS = I->getOperand(0); - DEBUG(dbgs() << "RA: " << *I << '\n'); - I->setOperand(0, Ops[i].Op); - I->setOperand(1, Ops[i+1].Op); - - // Clear all the optional flags, which may not hold after the - // reassociation if the expression involved more than just this operation. - if (Ops.size() != 2) - I->clearSubclassOptionalData(); - - DEBUG(dbgs() << "TO: " << *I << '\n'); + SmallVectorImpl<ValueEntry> &Ops) { + assert(Ops.size() > 1 && "Single values should be used directly!"); + + // Since our optimizations never increase the number of operations, the new + // expression can always be written by reusing the existing binary operators + // from the original expression tree, without creating any new instructions, + // though the rewritten expression may have a completely different topology. + // We take care to not change anything if the new expression will be the same + // as the original. If more than trivial changes (like commuting operands) + // were made then we are obliged to clear out any optional subclass data like + // nsw flags. 
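// Illustrative sketch, not part of the patch: the leaf/weight bookkeeping that
// LinearizeExprTree performs, reduced to a toy expression tree. For the root
// (A*B)*(A*C) the walk reports A with weight 2 and B, C with weight 1, i.e.
// the linearized product A*A*B*C. The root is assumed to be an interior node,
// and each worklist entry carries the number of root-to-node paths it stands for.
#include <cstddef>
#include <map>
#include <string>
#include <utility>
#include <vector>

struct ToyNode {
  std::string Name;                   // non-empty only for leaves
  const ToyNode *LHS, *RHS;           // null for leaves
  bool isLeaf() const { return LHS == 0; }
};

static void collectLeafWeights(const ToyNode *Root,
                               std::map<std::string, unsigned> &Weights) {
  std::vector<std::pair<const ToyNode *, unsigned> > Worklist;
  Worklist.push_back(std::make_pair(Root, 1u));
  while (!Worklist.empty()) {
    std::pair<const ToyNode *, unsigned> P = Worklist.back();
    Worklist.pop_back();
    const ToyNode *Ops[2] = { P.first->LHS, P.first->RHS };
    for (int i = 0; i != 2; ++i) {
      if (Ops[i]->isLeaf())
        Weights[Ops[i]->Name] += P.second;    // P.second more paths reach this leaf
      else
        Worklist.push_back(std::make_pair(Ops[i], P.second));
    }
  }
}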
+ + /// NodesToRewrite - Nodes from the original expression available for writing + /// the new expression into. + SmallVector<BinaryOperator*, 8> NodesToRewrite; + unsigned Opcode = I->getOpcode(); + BinaryOperator *Op = I; + + // ExpressionChanged - Non-null if the rewritten expression differs from the + // original in some non-trivial way, requiring the clearing of optional flags. + // Flags are cleared from the operator in ExpressionChanged up to I inclusive. + BinaryOperator *ExpressionChanged = 0; + for (unsigned i = 0; ; ++i) { + // The last operation (which comes earliest in the IR) is special as both + // operands will come from Ops, rather than just one with the other being + // a subexpression. + if (i+2 == Ops.size()) { + Value *NewLHS = Ops[i].Op; + Value *NewRHS = Ops[i+1].Op; + Value *OldLHS = Op->getOperand(0); + Value *OldRHS = Op->getOperand(1); + + if (NewLHS == OldLHS && NewRHS == OldRHS) + // Nothing changed, leave it alone. + break; + + if (NewLHS == OldRHS && NewRHS == OldLHS) { + // The order of the operands was reversed. Swap them. + DEBUG(dbgs() << "RA: " << *Op << '\n'); + Op->swapOperands(); + DEBUG(dbgs() << "TO: " << *Op << '\n'); + MadeChange = true; + ++NumChanged; + break; + } + + // The new operation differs non-trivially from the original. Overwrite + // the old operands with the new ones. + DEBUG(dbgs() << "RA: " << *Op << '\n'); + if (NewLHS != OldLHS) { + if (BinaryOperator *BO = isReassociableOp(OldLHS, Opcode)) + NodesToRewrite.push_back(BO); + Op->setOperand(0, NewLHS); + } + if (NewRHS != OldRHS) { + if (BinaryOperator *BO = isReassociableOp(OldRHS, Opcode)) + NodesToRewrite.push_back(BO); + Op->setOperand(1, NewRHS); + } + DEBUG(dbgs() << "TO: " << *Op << '\n'); + + ExpressionChanged = Op; + MadeChange = true; + ++NumChanged; + + break; + } + + // Not the last operation. The left-hand side will be a sub-expression + // while the right-hand side will be the current element of Ops. + Value *NewRHS = Ops[i].Op; + if (NewRHS != Op->getOperand(1)) { + DEBUG(dbgs() << "RA: " << *Op << '\n'); + if (NewRHS == Op->getOperand(0)) { + // The new right-hand side was already present as the left operand. If + // we are lucky then swapping the operands will sort out both of them. + Op->swapOperands(); + } else { + // Overwrite with the new right-hand side. + if (BinaryOperator *BO = isReassociableOp(Op->getOperand(1), Opcode)) + NodesToRewrite.push_back(BO); + Op->setOperand(1, NewRHS); + ExpressionChanged = Op; + } + DEBUG(dbgs() << "TO: " << *Op << '\n'); MadeChange = true; ++NumChanged; - - // If we reassociated a tree to fewer operands (e.g. (1+a+2) -> (a+3) - // delete the extra, now dead, nodes. - RemoveDeadBinaryOp(OldLHS); } - return; - } - assert(i+2 < Ops.size() && "Ops index out of range!"); - if (I->getOperand(1) != Ops[i].Op) { - DEBUG(dbgs() << "RA: " << *I << '\n'); - I->setOperand(1, Ops[i].Op); + // Now deal with the left-hand side. If this is already an operation node + // from the original expression then just rewrite the rest of the expression + // into it. + if (BinaryOperator *BO = isReassociableOp(Op->getOperand(0), Opcode)) { + Op = BO; + continue; + } - // Conservatively clear all the optional flags, which may not hold - // after the reassociation. - I->clearSubclassOptionalData(); + // Otherwise, grab a spare node from the original expression and use that as + // the left-hand side. If there are no nodes left then the optimizers made + // an expression with more nodes than the original! 
This usually means that + // they did something stupid but it might mean that the problem was just too + // hard (finding the mimimal number of multiplications needed to realize a + // multiplication expression is NP-complete). Whatever the reason, smart or + // stupid, create a new node if there are none left. + BinaryOperator *NewOp; + if (NodesToRewrite.empty()) { + Constant *Undef = UndefValue::get(I->getType()); + NewOp = BinaryOperator::Create(Instruction::BinaryOps(Opcode), + Undef, Undef, "", I); + } else { + NewOp = NodesToRewrite.pop_back_val(); + } - DEBUG(dbgs() << "TO: " << *I << '\n'); + DEBUG(dbgs() << "RA: " << *Op << '\n'); + Op->setOperand(0, NewOp); + DEBUG(dbgs() << "TO: " << *Op << '\n'); + ExpressionChanged = Op; MadeChange = true; ++NumChanged; + Op = NewOp; } - - BinaryOperator *LHS = cast<BinaryOperator>(I->getOperand(0)); - assert(LHS->getOpcode() == I->getOpcode() && - "Improper expression tree!"); - - // Compactify the tree instructions together with each other to guarantee - // that the expression tree is dominated by all of Ops. - LHS->moveBefore(I); - RewriteExprTree(LHS, Ops, i+1); -} - + // If the expression changed non-trivially then clear out all subclass data + // starting from the operator specified in ExpressionChanged, and compactify + // the operators to just before the expression root to guarantee that the + // expression tree is dominated by all of Ops. + if (ExpressionChanged) + do { + ExpressionChanged->clearSubclassOptionalData(); + if (ExpressionChanged == I) + break; + ExpressionChanged->moveBefore(I); + ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->use_begin()); + } while (1); + + // Throw away any left over nodes from the original expression. + for (unsigned i = 0, e = NodesToRewrite.size(); i != e; ++i) + RedoInsts.insert(NodesToRewrite[i]); +} -// NegateValue - Insert instructions before the instruction pointed to by BI, -// that computes the negative version of the value specified. The negative -// version of the value is returned, and BI is left pointing at the instruction -// that should be processed next by the reassociation pass. -// +/// NegateValue - Insert instructions before the instruction pointed to by BI, +/// that computes the negative version of the value specified. The negative +/// version of the value is returned, and BI is left pointing at the instruction +/// that should be processed next by the reassociation pass. static Value *NegateValue(Value *V, Instruction *BI) { if (Constant *C = dyn_cast<Constant>(V)) return ConstantExpr::getNeg(C); - + // We are trying to expose opportunity for reassociation. One of the things // that we want to do to achieve this is to push a negation as deep into an // expression chain as possible, to expose the add instructions. In practice, @@ -412,22 +810,21 @@ static Value *NegateValue(Value *V, Instruction *BI) { // the constants. We assume that instcombine will clean up the mess later if // we introduce tons of unnecessary negation instructions. // - if (Instruction *I = dyn_cast<Instruction>(V)) - if (I->getOpcode() == Instruction::Add && I->hasOneUse()) { - // Push the negates through the add. - I->setOperand(0, NegateValue(I->getOperand(0), BI)); - I->setOperand(1, NegateValue(I->getOperand(1), BI)); - - // We must move the add instruction here, because the neg instructions do - // not dominate the old add instruction in general. By moving it, we are - // assured that the neg instructions we just inserted dominate the - // instruction we are about to insert after them. 
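The surrounding NegateValue logic pushes a negation through reassociable adds so that -(a + b + c) becomes (-a) + (-b) + (-c), keeping the add chain exposed for reassociation and leaving any clean-up to instcombine. A standalone sketch of that push-through on a toy expression tree (hypothetical leaf/add/negate helpers, not LLVM classes):

// Standalone sketch of pushing negation through additions, as NegateValue
// does for reassociable adds: -(a + b) ==> (-a) + (-b).  Toy AST, not LLVM IR.
#include <iostream>
#include <memory>
#include <string>

struct Expr {
  std::string Leaf;                 // non-empty for leaves
  bool IsAdd = false;               // true when this node is Lhs + Rhs
  std::shared_ptr<Expr> Lhs, Rhs;
};

static std::shared_ptr<Expr> leaf(std::string N) {
  auto E = std::make_shared<Expr>();
  E->Leaf = std::move(N);
  return E;
}

static std::shared_ptr<Expr> add(std::shared_ptr<Expr> L,
                                 std::shared_ptr<Expr> R) {
  auto E = std::make_shared<Expr>();
  E->IsAdd = true;
  E->Lhs = std::move(L);
  E->Rhs = std::move(R);
  return E;
}

// Push the negate as deep as possible so the adds stay exposed.
static std::shared_ptr<Expr> negate(std::shared_ptr<Expr> E) {
  if (E->IsAdd) {                   // -(a + b) -> (-a) + (-b)
    E->Lhs = negate(E->Lhs);
    E->Rhs = negate(E->Rhs);
    return E;
  }
  return leaf("-" + E->Leaf);       // plain leaf: materialize a negation
}

static void print(const std::shared_ptr<Expr> &E) {
  if (!E->IsAdd) { std::cout << E->Leaf; return; }
  std::cout << "("; print(E->Lhs); std::cout << " + "; print(E->Rhs);
  std::cout << ")";
}

int main() {
  auto E = negate(add(leaf("a"), add(leaf("b"), leaf("c"))));
  print(E);                          // prints (-a + (-b + -c))
  std::cout << "\n";
}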
- // - I->moveBefore(BI); - I->setName(I->getName()+".neg"); - return I; - } - + if (BinaryOperator *I = isReassociableOp(V, Instruction::Add)) { + // Push the negates through the add. + I->setOperand(0, NegateValue(I->getOperand(0), BI)); + I->setOperand(1, NegateValue(I->getOperand(1), BI)); + + // We must move the add instruction here, because the neg instructions do + // not dominate the old add instruction in general. By moving it, we are + // assured that the neg instructions we just inserted dominate the + // instruction we are about to insert after them. + // + I->moveBefore(BI); + I->setName(I->getName()+".neg"); + return I; + } + // Okay, we need to materialize a negated version of V with an instruction. // Scan the use lists of V to see if we have one already. for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;++UI){ @@ -443,7 +840,7 @@ static Value *NegateValue(Value *V, Instruction *BI) { // Verify that the negate is in this function, V might be a constant expr. if (TheNeg->getParent()->getParent() != BI->getParent()->getParent()) continue; - + BasicBlock::iterator InsertPt; if (Instruction *InstInput = dyn_cast<Instruction>(V)) { if (InvokeInst *II = dyn_cast<InvokeInst>(InstInput)) { @@ -471,7 +868,7 @@ static bool ShouldBreakUpSubtract(Instruction *Sub) { // If this is a negation, we can't split it up! if (BinaryOperator::isNeg(Sub)) return false; - + // Don't bother to break this up unless either the LHS is an associable add or // subtract or if this is only used by one. if (isReassociableOp(Sub->getOperand(0), Instruction::Add) || @@ -480,19 +877,18 @@ static bool ShouldBreakUpSubtract(Instruction *Sub) { if (isReassociableOp(Sub->getOperand(1), Instruction::Add) || isReassociableOp(Sub->getOperand(1), Instruction::Sub)) return true; - if (Sub->hasOneUse() && + if (Sub->hasOneUse() && (isReassociableOp(Sub->use_back(), Instruction::Add) || isReassociableOp(Sub->use_back(), Instruction::Sub))) return true; - + return false; } /// BreakUpSubtract - If we have (X-Y), and if either X is an add, or if this is /// only used by an add, transform this into (X+(0-Y)) to promote better /// reassociation. -static Instruction *BreakUpSubtract(Instruction *Sub, - DenseMap<AssertingVH<Value>, unsigned> &ValueRankMap) { +static BinaryOperator *BreakUpSubtract(Instruction *Sub) { // Convert a subtract into an add and a neg instruction. This allows sub // instructions to be commuted with other add instructions. // @@ -500,15 +896,15 @@ static Instruction *BreakUpSubtract(Instruction *Sub, // and set it as the RHS of the add instruction we just made. // Value *NegVal = NegateValue(Sub->getOperand(1), Sub); - Instruction *New = + BinaryOperator *New = BinaryOperator::CreateAdd(Sub->getOperand(0), NegVal, "", Sub); + Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op. + Sub->setOperand(1, Constant::getNullValue(Sub->getType())); // Drop use of op. New->takeName(Sub); // Everyone now refers to the add instruction. - ValueRankMap.erase(Sub); Sub->replaceAllUsesWith(New); New->setDebugLoc(Sub->getDebugLoc()); - Sub->eraseFromParent(); DEBUG(dbgs() << "Negated: " << *New << '\n'); return New; @@ -517,32 +913,23 @@ static Instruction *BreakUpSubtract(Instruction *Sub, /// ConvertShiftToMul - If this is a shift of a reassociable multiply or is used /// by one, change this into a multiply by a constant to assist with further /// reassociation. 
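ConvertShiftToMul relies on the identity x << c == x * (1 << c), which is what lets a shift join a multiply tree. A minimal standalone check of that identity, using plain integers rather than IR constants:

// Minimal sketch of the shl -> mul rewrite ConvertShiftToMul performs:
// (X << C) equals X * (1 << C), and the multiply form can then take part
// in multiply reassociation.  Plain integers here, not LLVM IR.
#include <cassert>
#include <cstdint>

static uint64_t shlAsMul(uint64_t X, unsigned C) {
  uint64_t MulCst = uint64_t(1) << C;   // the constant the pass materializes
  return X * MulCst;                    // equivalent to X << C
}

int main() {
  for (unsigned C = 0; C < 8; ++C)
    assert(shlAsMul(5, C) == (uint64_t(5) << C));
  return 0;
}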
-static Instruction *ConvertShiftToMul(Instruction *Shl, - DenseMap<AssertingVH<Value>, unsigned> &ValueRankMap) { - // If an operand of this shift is a reassociable multiply, or if the shift - // is used by a reassociable multiply or add, turn into a multiply. - if (isReassociableOp(Shl->getOperand(0), Instruction::Mul) || - (Shl->hasOneUse() && - (isReassociableOp(Shl->use_back(), Instruction::Mul) || - isReassociableOp(Shl->use_back(), Instruction::Add)))) { - Constant *MulCst = ConstantInt::get(Shl->getType(), 1); - MulCst = ConstantExpr::getShl(MulCst, cast<Constant>(Shl->getOperand(1))); - - Instruction *Mul = - BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl); - ValueRankMap.erase(Shl); - Mul->takeName(Shl); - Shl->replaceAllUsesWith(Mul); - Mul->setDebugLoc(Shl->getDebugLoc()); - Shl->eraseFromParent(); - return Mul; - } - return 0; +static BinaryOperator *ConvertShiftToMul(Instruction *Shl) { + Constant *MulCst = ConstantInt::get(Shl->getType(), 1); + MulCst = ConstantExpr::getShl(MulCst, cast<Constant>(Shl->getOperand(1))); + + BinaryOperator *Mul = + BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl); + Shl->setOperand(0, UndefValue::get(Shl->getType())); // Drop use of op. + Mul->takeName(Shl); + Shl->replaceAllUsesWith(Mul); + Mul->setDebugLoc(Shl->getDebugLoc()); + return Mul; } -// Scan backwards and forwards among values with the same rank as element i to -// see if X exists. If X does not exist, return i. This is useful when -// scanning for 'x' when we see '-x' because they both get the same rank. +/// FindInOperandList - Scan backwards and forwards among values with the same +/// rank as element i to see if X exists. If X does not exist, return i. This +/// is useful when scanning for 'x' when we see '-x' because they both get the +/// same rank. static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i, Value *X) { unsigned XRank = Ops[i].Rank; @@ -562,22 +949,29 @@ static unsigned FindInOperandList(SmallVectorImpl<ValueEntry> &Ops, unsigned i, static Value *EmitAddTreeOfValues(Instruction *I, SmallVectorImpl<WeakVH> &Ops){ if (Ops.size() == 1) return Ops.back(); - + Value *V1 = Ops.back(); Ops.pop_back(); Value *V2 = EmitAddTreeOfValues(I, Ops); return BinaryOperator::CreateAdd(V2, V1, "tmp", I); } -/// RemoveFactorFromExpression - If V is an expression tree that is a +/// RemoveFactorFromExpression - If V is an expression tree that is a /// multiplication sequence, and if this sequence contains a multiply by Factor, /// remove Factor from the tree and return the new tree. Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { BinaryOperator *BO = isReassociableOp(V, Instruction::Mul); if (!BO) return 0; - + + SmallVector<RepeatedValue, 8> Tree; + MadeChange |= LinearizeExprTree(BO, Tree); SmallVector<ValueEntry, 8> Factors; - LinearizeExprTree(BO, Factors); + Factors.reserve(Tree.size()); + for (unsigned i = 0, e = Tree.size(); i != e; ++i) { + RepeatedValue E = Tree[i]; + Factors.append(E.second.getZExtValue(), + ValueEntry(getRank(E.first), E.first)); + } bool FoundFactor = false; bool NeedsNegate = false; @@ -587,7 +981,7 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { Factors.erase(Factors.begin()+i); break; } - + // If this is a negative version of this factor, remove it. 
if (ConstantInt *FC1 = dyn_cast<ConstantInt>(Factor)) if (ConstantInt *FC2 = dyn_cast<ConstantInt>(Factors[i].Op)) @@ -597,29 +991,28 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { break; } } - + if (!FoundFactor) { // Make sure to restore the operands to the expression tree. RewriteExprTree(BO, Factors); return 0; } - + BasicBlock::iterator InsertPt = BO; ++InsertPt; - + // If this was just a single multiply, remove the multiply and return the only // remaining operand. if (Factors.size() == 1) { - ValueRankMap.erase(BO); - DeadInsts.push_back(BO); + RedoInsts.insert(BO); V = Factors[0].Op; } else { RewriteExprTree(BO, Factors); V = BO; } - + if (NeedsNegate) V = BinaryOperator::CreateNeg(V, "neg", InsertPt); - + return V; } @@ -629,31 +1022,16 @@ Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) { /// Ops is the top-level list of add operands we're trying to factor. static void FindSingleUseMultiplyFactors(Value *V, SmallVectorImpl<Value*> &Factors, - const SmallVectorImpl<ValueEntry> &Ops, - bool IsRoot) { - BinaryOperator *BO; - if (!(V->hasOneUse() || V->use_empty()) || // More than one use. - !(BO = dyn_cast<BinaryOperator>(V)) || - BO->getOpcode() != Instruction::Mul) { + const SmallVectorImpl<ValueEntry> &Ops) { + BinaryOperator *BO = isReassociableOp(V, Instruction::Mul); + if (!BO) { Factors.push_back(V); return; } - - // If this value has a single use because it is another input to the add - // tree we're reassociating and we dropped its use, it actually has two - // uses and we can't factor it. - if (!IsRoot) { - for (unsigned i = 0, e = Ops.size(); i != e; ++i) - if (Ops[i].Op == V) { - Factors.push_back(V); - return; - } - } - - + // Otherwise, add the LHS and RHS to the list of factors. - FindSingleUseMultiplyFactors(BO->getOperand(1), Factors, Ops, false); - FindSingleUseMultiplyFactors(BO->getOperand(0), Factors, Ops, false); + FindSingleUseMultiplyFactors(BO->getOperand(1), Factors, Ops); + FindSingleUseMultiplyFactors(BO->getOperand(0), Factors, Ops); } /// OptimizeAndOrXor - Optimize a series of operands to an 'and', 'or', or 'xor' @@ -673,12 +1051,12 @@ static Value *OptimizeAndOrXor(unsigned Opcode, if (FoundX != i) { if (Opcode == Instruction::And) // ...&X&~X = 0 return Constant::getNullValue(X->getType()); - + if (Opcode == Instruction::Or) // ...|X|~X = -1 return Constant::getAllOnesValue(X->getType()); } } - + // Next, check for duplicate pairs of values, which we assume are next to // each other, due to our sorting criteria. assert(i < Ops.size()); @@ -690,12 +1068,12 @@ static Value *OptimizeAndOrXor(unsigned Opcode, ++NumAnnihil; continue; } - + // Drop pairs of values for Xor. assert(Opcode == Instruction::Xor); if (e == 2) return Constant::getNullValue(Ops[0].Op->getType()); - + // Y ^ X^X -> Y Ops.erase(Ops.begin()+i, Ops.begin()+i+2); i -= 1; e -= 2; @@ -728,46 +1106,46 @@ Value *Reassociate::OptimizeAdd(Instruction *I, Ops.erase(Ops.begin()+i); ++NumFound; } while (i != Ops.size() && Ops[i].Op == TheOp); - + DEBUG(errs() << "\nFACTORING [" << NumFound << "]: " << *TheOp << '\n'); ++NumFactor; - + // Insert a new multiply. Value *Mul = ConstantInt::get(cast<IntegerType>(I->getType()), NumFound); Mul = BinaryOperator::CreateMul(TheOp, Mul, "factor", I); - + // Now that we have inserted a multiply, optimize it. 
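The multiply created here is the result of collapsing N identical addends into one multiply by N. The toy below (strings standing in for IR values, hypothetical names, not the pass itself) shows just that collapse; the real code then queues the new multiply so it can be folded further, as the next comment explains:

// Toy version of the duplicate-operand factoring OptimizeAdd performs:
// X + X + X becomes X * 3.  Strings stand in for IR values.
#include <iostream>
#include <map>
#include <string>
#include <vector>

static std::vector<std::string>
factorDuplicates(const std::vector<std::string> &Ops) {
  std::map<std::string, unsigned> Count;
  for (const std::string &Op : Ops)
    ++Count[Op];

  std::vector<std::string> Result;
  for (const auto &P : Count) {
    if (P.second == 1)
      Result.push_back(P.first);                        // no duplicates
    else                                                // N copies -> one multiply
      Result.push_back(P.first + "*" + std::to_string(P.second));
  }
  return Result;
}

int main() {
  // {X, X, X, B} -> {B, X*3}; the real pass would now revisit X*3 in case
  // it can be folded further.
  for (const std::string &Op : factorDuplicates({"X", "X", "X", "B"}))
    std::cout << Op << " ";
  std::cout << "\n";
}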
This allows us to // handle cases that require multiple factoring steps, such as this: // (X*2) + (X*2) + (X*2) -> (X*2)*3 -> X*6 - RedoInsts.push_back(Mul); - + RedoInsts.insert(cast<Instruction>(Mul)); + // If every add operand was a duplicate, return the multiply. if (Ops.empty()) return Mul; - + // Otherwise, we had some input that didn't have the dupe, such as // "A + A + B" -> "A*2 + B". Add the new multiply to the list of // things being added by this operation. Ops.insert(Ops.begin(), ValueEntry(getRank(Mul), Mul)); - + --i; e = Ops.size(); continue; } - + // Check for X and -X in the operand list. if (!BinaryOperator::isNeg(TheOp)) continue; - + Value *X = BinaryOperator::getNegArgument(TheOp); unsigned FoundX = FindInOperandList(Ops, i, X); if (FoundX == i) continue; - + // Remove X and -X from the operand list. if (Ops.size() == 2) return Constant::getNullValue(X->getType()); - + Ops.erase(Ops.begin()+i); if (i < FoundX) --FoundX; @@ -778,37 +1156,37 @@ Value *Reassociate::OptimizeAdd(Instruction *I, --i; // Revisit element. e -= 2; // Removed two elements. } - + // Scan the operand list, checking to see if there are any common factors // between operands. Consider something like A*A+A*B*C+D. We would like to // reassociate this to A*(A+B*C)+D, which reduces the number of multiplies. // To efficiently find this, we count the number of times a factor occurs // for any ADD operands that are MULs. DenseMap<Value*, unsigned> FactorOccurrences; - + // Keep track of each multiply we see, to avoid triggering on (X*4)+(X*4) // where they are actually the same multiply. unsigned MaxOcc = 0; Value *MaxOccVal = 0; for (unsigned i = 0, e = Ops.size(); i != e; ++i) { - BinaryOperator *BOp = dyn_cast<BinaryOperator>(Ops[i].Op); - if (BOp == 0 || BOp->getOpcode() != Instruction::Mul || !BOp->use_empty()) + BinaryOperator *BOp = isReassociableOp(Ops[i].Op, Instruction::Mul); + if (!BOp) continue; - + // Compute all of the factors of this added value. SmallVector<Value*, 8> Factors; - FindSingleUseMultiplyFactors(BOp, Factors, Ops, true); + FindSingleUseMultiplyFactors(BOp, Factors, Ops); assert(Factors.size() > 1 && "Bad linearize!"); - + // Add one to FactorOccurrences for each unique factor in this op. SmallPtrSet<Value*, 8> Duplicates; for (unsigned i = 0, e = Factors.size(); i != e; ++i) { Value *Factor = Factors[i]; if (!Duplicates.insert(Factor)) continue; - + unsigned Occ = ++FactorOccurrences[Factor]; if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factor; } - + // If Factor is a negative constant, add the negated value as a factor // because we can percolate the negate out. Watch for minint, which // cannot be positivified. @@ -817,13 +1195,13 @@ Value *Reassociate::OptimizeAdd(Instruction *I, Factor = ConstantInt::get(CI->getContext(), -CI->getValue()); assert(!Duplicates.count(Factor) && "Shouldn't have two constant factors, missed a canonicalize"); - + unsigned Occ = ++FactorOccurrences[Factor]; if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factor; } } } } - + // If any factor occurred more than one time, we can pull it out. if (MaxOcc > 1) { DEBUG(errs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << '\n'); @@ -831,16 +1209,16 @@ Value *Reassociate::OptimizeAdd(Instruction *I, // Create a new instruction that uses the MaxOccVal twice. 
If we don't do // this, we could otherwise run into situations where removing a factor - // from an expression will drop a use of maxocc, and this can cause + // from an expression will drop a use of maxocc, and this can cause // RemoveFactorFromExpression on successive values to behave differently. Instruction *DummyInst = BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal); SmallVector<WeakVH, 4> NewMulOps; for (unsigned i = 0; i != Ops.size(); ++i) { // Only try to remove factors from expressions we're allowed to. - BinaryOperator *BOp = dyn_cast<BinaryOperator>(Ops[i].Op); - if (BOp == 0 || BOp->getOpcode() != Instruction::Mul || !BOp->use_empty()) + BinaryOperator *BOp = isReassociableOp(Ops[i].Op, Instruction::Mul); + if (!BOp) continue; - + if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal)) { // The factorized operand may occur several times. Convert them all in // one fell swoop. @@ -854,7 +1232,7 @@ Value *Reassociate::OptimizeAdd(Instruction *I, --i; } } - + // No need for extra uses anymore. delete DummyInst; @@ -866,26 +1244,201 @@ Value *Reassociate::OptimizeAdd(Instruction *I, // A*A*B + A*A*C --> A*(A*B+A*C) --> A*(A*(B+C)) assert(NumAddedValues > 1 && "Each occurrence should contribute a value"); (void)NumAddedValues; - V = ReassociateExpression(cast<BinaryOperator>(V)); + if (Instruction *VI = dyn_cast<Instruction>(V)) + RedoInsts.insert(VI); // Create the multiply. - Value *V2 = BinaryOperator::CreateMul(V, MaxOccVal, "tmp", I); + Instruction *V2 = BinaryOperator::CreateMul(V, MaxOccVal, "tmp", I); // Rerun associate on the multiply in case the inner expression turned into // a multiply. We want to make sure that we keep things in canonical form. - V2 = ReassociateExpression(cast<BinaryOperator>(V2)); - + RedoInsts.insert(V2); + // If every add operand included the factor (e.g. "A*B + A*C"), then the // entire result expression is just the multiply "A*(B+C)". if (Ops.empty()) return V2; - + // Otherwise, we had some input that didn't have the factor, such as // "A*B + A*C + D" -> "A*(B+C) + D". Add the new multiply to the list of // things being added by this operation. Ops.insert(Ops.begin(), ValueEntry(getRank(V2), V2)); } - + + return 0; +} + +namespace { + /// \brief Predicate tests whether a ValueEntry's op is in a map. + struct IsValueInMap { + const DenseMap<Value *, unsigned> ⤅ + + IsValueInMap(const DenseMap<Value *, unsigned> &Map) : Map(Map) {} + + bool operator()(const ValueEntry &Entry) { + return Map.find(Entry.Op) != Map.end(); + } + }; +} + +/// \brief Build up a vector of value/power pairs factoring a product. +/// +/// Given a series of multiplication operands, build a vector of factors and +/// the powers each is raised to when forming the final product. Sort them in +/// the order of descending power. +/// +/// (x*x) -> [(x, 2)] +/// ((x*x)*x) -> [(x, 3)] +/// ((((x*y)*x)*y)*x) -> [(x, 3), (y, 2)] +/// +/// \returns Whether any factors have a power greater than one. +bool Reassociate::collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops, + SmallVectorImpl<Factor> &Factors) { + // FIXME: Have Ops be (ValueEntry, Multiplicity) pairs, simplifying this. + // Compute the sum of powers of simplifiable factors. + unsigned FactorPowerSum = 0; + for (unsigned Idx = 1, Size = Ops.size(); Idx < Size; ++Idx) { + Value *Op = Ops[Idx-1].Op; + + // Count the number of occurrences of this value. + unsigned Count = 1; + for (; Idx < Size && Ops[Idx].Op == Op; ++Idx) + ++Count; + // Track for simplification all factors which occur 2 or more times. 
+ if (Count > 1) + FactorPowerSum += Count; + } + + // We can only simplify factors if the sum of the powers of our simplifiable + // factors is 4 or higher. When that is the case, we will *always* have + // a simplification. This is an important invariant to prevent cyclicly + // trying to simplify already minimal formations. + if (FactorPowerSum < 4) + return false; + + // Now gather the simplifiable factors, removing them from Ops. + FactorPowerSum = 0; + for (unsigned Idx = 1; Idx < Ops.size(); ++Idx) { + Value *Op = Ops[Idx-1].Op; + + // Count the number of occurrences of this value. + unsigned Count = 1; + for (; Idx < Ops.size() && Ops[Idx].Op == Op; ++Idx) + ++Count; + if (Count == 1) + continue; + // Move an even number of occurrences to Factors. + Count &= ~1U; + Idx -= Count; + FactorPowerSum += Count; + Factors.push_back(Factor(Op, Count)); + Ops.erase(Ops.begin()+Idx, Ops.begin()+Idx+Count); + } + + // None of the adjustments above should have reduced the sum of factor powers + // below our mininum of '4'. + assert(FactorPowerSum >= 4); + + std::sort(Factors.begin(), Factors.end(), Factor::PowerDescendingSorter()); + return true; +} + +/// \brief Build a tree of multiplies, computing the product of Ops. +static Value *buildMultiplyTree(IRBuilder<> &Builder, + SmallVectorImpl<Value*> &Ops) { + if (Ops.size() == 1) + return Ops.back(); + + Value *LHS = Ops.pop_back_val(); + do { + LHS = Builder.CreateMul(LHS, Ops.pop_back_val()); + } while (!Ops.empty()); + + return LHS; +} + +/// \brief Build a minimal multiplication DAG for (a^x)*(b^y)*(c^z)*... +/// +/// Given a vector of values raised to various powers, where no two values are +/// equal and the powers are sorted in decreasing order, compute the minimal +/// DAG of multiplies to compute the final product, and return that product +/// value. +Value *Reassociate::buildMinimalMultiplyDAG(IRBuilder<> &Builder, + SmallVectorImpl<Factor> &Factors) { + assert(Factors[0].Power); + SmallVector<Value *, 4> OuterProduct; + for (unsigned LastIdx = 0, Idx = 1, Size = Factors.size(); + Idx < Size && Factors[Idx].Power > 0; ++Idx) { + if (Factors[Idx].Power != Factors[LastIdx].Power) { + LastIdx = Idx; + continue; + } + + // We want to multiply across all the factors with the same power so that + // we can raise them to that power as a single entity. Build a mini tree + // for that. + SmallVector<Value *, 4> InnerProduct; + InnerProduct.push_back(Factors[LastIdx].Base); + do { + InnerProduct.push_back(Factors[Idx].Base); + ++Idx; + } while (Idx < Size && Factors[Idx].Power == Factors[LastIdx].Power); + + // Reset the base value of the first factor to the new expression tree. + // We'll remove all the factors with the same power in a second pass. + Value *M = Factors[LastIdx].Base = buildMultiplyTree(Builder, InnerProduct); + if (Instruction *MI = dyn_cast<Instruction>(M)) + RedoInsts.insert(MI); + + LastIdx = Idx; + } + // Unique factors with equal powers -- we've folded them into the first one's + // base. + Factors.erase(std::unique(Factors.begin(), Factors.end(), + Factor::PowerEqual()), + Factors.end()); + + // Iteratively collect the base of each factor with an add power into the + // outer product, and halve each power in preparation for squaring the + // expression. 
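The odd-power/halve/square structure described here is ordinary exponentiation by squaring. The standalone sketch below reproduces it on plain integers (hypothetical buildProduct name, toy Factor struct); the real buildMinimalMultiplyDAG does the same thing on IR values and reuses the computed square-root value twice instead of multiplying numbers:

// Standalone sketch of the repeated-squaring idea behind
// buildMinimalMultiplyDAG.  Plain integers, not LLVM IR.
#include <cassert>
#include <cstdint>
#include <vector>

struct Factor { uint64_t Base; unsigned Power; };

static uint64_t buildProduct(const std::vector<Factor> &Factors) {
  uint64_t Outer = 1;
  std::vector<Factor> Halved;
  for (Factor F : Factors) {
    if (F.Power & 1)        // odd power: one copy goes to the outer product
      Outer *= F.Base;
    F.Power >>= 1;          // halve the power in preparation for squaring
    if (F.Power)
      Halved.push_back(F);
  }
  if (!Halved.empty()) {
    uint64_t Sqrt = buildProduct(Halved);
    Outer *= Sqrt * Sqrt;   // squaring reuses the sub-product once computed
  }
  return Outer;
}

int main() {
  // x^5 * y^3 with x = 2, y = 3:  2^5 * 3^3 = 32 * 27 = 864.
  std::vector<Factor> F = {{2, 5}, {3, 3}};
  assert(buildProduct(F) == 864);
  return 0;
}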
+ for (unsigned Idx = 0, Size = Factors.size(); Idx != Size; ++Idx) { + if (Factors[Idx].Power & 1) + OuterProduct.push_back(Factors[Idx].Base); + Factors[Idx].Power >>= 1; + } + if (Factors[0].Power) { + Value *SquareRoot = buildMinimalMultiplyDAG(Builder, Factors); + OuterProduct.push_back(SquareRoot); + OuterProduct.push_back(SquareRoot); + } + if (OuterProduct.size() == 1) + return OuterProduct.front(); + + Value *V = buildMultiplyTree(Builder, OuterProduct); + return V; +} + +Value *Reassociate::OptimizeMul(BinaryOperator *I, + SmallVectorImpl<ValueEntry> &Ops) { + // We can only optimize the multiplies when there is a chain of more than + // three, such that a balanced tree might require fewer total multiplies. + if (Ops.size() < 4) + return 0; + + // Try to turn linear trees of multiplies without other uses of the + // intermediate stages into minimal multiply DAGs with perfect sub-expression + // re-use. + SmallVector<Factor, 4> Factors; + if (!collectMultiplyFactors(Ops, Factors)) + return 0; // All distinct factors, so nothing left for us to do. + + IRBuilder<> Builder(I); + Value *V = buildMinimalMultiplyDAG(Builder, Factors); + if (Ops.empty()) + return V; + + ValueEntry NewEntry = ValueEntry(getRank(V), V); + Ops.insert(std::lower_bound(Ops.begin(), Ops.end(), NewEntry), NewEntry); return 0; } @@ -893,95 +1446,105 @@ Value *Reassociate::OptimizeExpression(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops) { // Now that we have the linearized expression tree, try to optimize it. // Start by folding any constants that we found. - bool IterateOptimization = false; if (Ops.size() == 1) return Ops[0].Op; unsigned Opcode = I->getOpcode(); - - if (Constant *V1 = dyn_cast<Constant>(Ops[Ops.size()-2].Op)) - if (Constant *V2 = dyn_cast<Constant>(Ops.back().Op)) { - Ops.pop_back(); - Ops.back().Op = ConstantExpr::get(Opcode, V1, V2); - return OptimizeExpression(I, Ops); - } - - // Check for destructive annihilation due to a constant being used. - if (ConstantInt *CstVal = dyn_cast<ConstantInt>(Ops.back().Op)) - switch (Opcode) { - default: break; - case Instruction::And: - if (CstVal->isZero()) // X & 0 -> 0 - return CstVal; - if (CstVal->isAllOnesValue()) // X & -1 -> X - Ops.pop_back(); - break; - case Instruction::Mul: - if (CstVal->isZero()) { // X * 0 -> 0 - ++NumAnnihil; - return CstVal; - } - - if (cast<ConstantInt>(CstVal)->isOne()) - Ops.pop_back(); // X * 1 -> X - break; - case Instruction::Or: - if (CstVal->isAllOnesValue()) // X | -1 -> -1 - return CstVal; - // FALLTHROUGH! - case Instruction::Add: - case Instruction::Xor: - if (CstVal->isZero()) // X [|^+] 0 -> X - Ops.pop_back(); - break; - } - if (Ops.size() == 1) return Ops[0].Op; // Handle destructive annihilation due to identities between elements in the // argument list here. 
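OptimizeExpression dispatches to the opcode-specific helpers below and, whenever the operand list shrank, runs the whole optimization again. A standalone toy of that fixpoint shape, written as an explicit loop rather than the recursive call the pass uses, with adjacent-pair cancellation standing in for the real identities (assumed names, not the LLVM code):

// Sketch of the "re-optimize until nothing changes" structure of
// OptimizeExpression: each sub-optimization may shrink the operand list,
// and a shrinking list triggers another full pass.
#include <cstddef>
#include <cstdio>
#include <vector>

// A stand-in "optimization": drop adjacent equal pairs (think X ^ X -> 0).
static void cancelAdjacentPairs(std::vector<int> &Ops) {
  for (std::size_t i = 0; i + 1 < Ops.size(); )
    if (Ops[i] == Ops[i + 1])
      Ops.erase(Ops.begin() + i, Ops.begin() + i + 2);
    else
      ++i;
}

static void optimize(std::vector<int> &Ops) {
  std::size_t NumOps;
  do {
    NumOps = Ops.size();
    cancelAdjacentPairs(Ops);     // one round of identity-based annihilation
  } while (Ops.size() != NumOps); // iterate while something was removed
}

int main() {
  std::vector<int> Ops = {1, 2, 2, 1, 3}; // 1 ^ 2 ^ 2 ^ 1 ^ 3
  optimize(Ops);
  std::printf("%zu operand(s) left\n", Ops.size()); // only "3" survives
}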
+ unsigned NumOps = Ops.size(); switch (Opcode) { default: break; case Instruction::And: case Instruction::Or: - case Instruction::Xor: { - unsigned NumOps = Ops.size(); + case Instruction::Xor: if (Value *Result = OptimizeAndOrXor(Opcode, Ops)) return Result; - IterateOptimization |= Ops.size() != NumOps; break; - } - case Instruction::Add: { - unsigned NumOps = Ops.size(); + case Instruction::Add: if (Value *Result = OptimizeAdd(I, Ops)) return Result; - IterateOptimization |= Ops.size() != NumOps; - } + break; + case Instruction::Mul: + if (Value *Result = OptimizeMul(I, Ops)) + return Result; break; - //case Instruction::Mul: } - if (IterateOptimization) + if (Ops.size() != NumOps) return OptimizeExpression(I, Ops); return 0; } +/// EraseInst - Zap the given instruction, adding interesting operands to the +/// work list. +void Reassociate::EraseInst(Instruction *I) { + assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!"); + SmallVector<Value*, 8> Ops(I->op_begin(), I->op_end()); + // Erase the dead instruction. + ValueRankMap.erase(I); + RedoInsts.remove(I); + I->eraseFromParent(); + // Optimize its operands. + SmallPtrSet<Instruction *, 8> Visited; // Detect self-referential nodes. + for (unsigned i = 0, e = Ops.size(); i != e; ++i) + if (Instruction *Op = dyn_cast<Instruction>(Ops[i])) { + // If this is a node in an expression tree, climb to the expression root + // and add that since that's where optimization actually happens. + unsigned Opcode = Op->getOpcode(); + while (Op->hasOneUse() && Op->use_back()->getOpcode() == Opcode && + Visited.insert(Op)) + Op = Op->use_back(); + RedoInsts.insert(Op); + } +} + +/// OptimizeInst - Inspect and optimize the given instruction. Note that erasing +/// instructions is not allowed. +void Reassociate::OptimizeInst(Instruction *I) { + // Only consider operations that we understand. + if (!isa<BinaryOperator>(I)) + return; -/// ReassociateInst - Inspect and reassociate the instruction at the -/// given position, post-incrementing the position. -void Reassociate::ReassociateInst(BasicBlock::iterator &BBI) { - Instruction *BI = BBI++; - if (BI->getOpcode() == Instruction::Shl && - isa<ConstantInt>(BI->getOperand(1))) - if (Instruction *NI = ConvertShiftToMul(BI, ValueRankMap)) { + if (I->getOpcode() == Instruction::Shl && + isa<ConstantInt>(I->getOperand(1))) + // If an operand of this shift is a reassociable multiply, or if the shift + // is used by a reassociable multiply or add, turn into a multiply. + if (isReassociableOp(I->getOperand(0), Instruction::Mul) || + (I->hasOneUse() && + (isReassociableOp(I->use_back(), Instruction::Mul) || + isReassociableOp(I->use_back(), Instruction::Add)))) { + Instruction *NI = ConvertShiftToMul(I); + RedoInsts.insert(I); MadeChange = true; - BI = NI; + I = NI; + } + + // Floating point binary operators are not associative, but we can still + // commute (some) of them, to canonicalize the order of their operands. + // This can potentially expose more CSE opportunities, and makes writing + // other transformations simpler. + if ((I->getType()->isFloatingPointTy() || I->getType()->isVectorTy())) { + // FAdd and FMul can be commuted. + if (I->getOpcode() != Instruction::FMul && + I->getOpcode() != Instruction::FAdd) + return; + + Value *LHS = I->getOperand(0); + Value *RHS = I->getOperand(1); + unsigned LHSRank = getRank(LHS); + unsigned RHSRank = getRank(RHS); + + // Sort the operands by rank. 
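The check that follows swaps the operands so the lower-ranked value ends up first, giving every commutative FP operation one canonical operand order. The standalone toy below shows why a fixed order pays off: once a*b and b*a are stored the same way, the second one is recognized as a repeat (a CSE-style win). Names and ranks are illustrative only:

// Sketch of rank-based canonicalization of commutative operands.
#include <cassert>
#include <map>
#include <set>
#include <string>
#include <utility>

int main() {
  // Lower rank ~ "defined earlier" (constants/arguments before instructions).
  std::map<std::string, unsigned> Rank = {{"a", 1}, {"b", 2}};

  auto canonical = [&](std::string L, std::string R) {
    if (Rank[R] < Rank[L])
      std::swap(L, R);              // keep the lower-ranked operand first
    return std::make_pair(L, R);
  };

  std::set<std::pair<std::string, std::string>> Seen;
  Seen.insert(canonical("a", "b"));                        // fmul a, b
  bool Reused = !Seen.insert(canonical("b", "a")).second;  // fmul b, a
  assert(Reused);                    // identical after canonicalization
  return 0;
}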
+ if (RHSRank < LHSRank) { + I->setOperand(0, RHS); + I->setOperand(1, LHS); } - // Reject cases where it is pointless to do this. - if (!isa<BinaryOperator>(BI) || BI->getType()->isFloatingPointTy() || - BI->getType()->isVectorTy()) - return; // Floating point ops are not associative. + return; + } // Do not reassociate boolean (i1) expressions. We want to preserve the // original order of evaluation for short-circuited comparisons that @@ -989,58 +1552,66 @@ void Reassociate::ReassociateInst(BasicBlock::iterator &BBI) { // is not further optimized, it is likely to be transformed back to a // short-circuited form for code gen, and the source order may have been // optimized for the most likely conditions. - if (BI->getType()->isIntegerTy(1)) + if (I->getType()->isIntegerTy(1)) return; // If this is a subtract instruction which is not already in negate form, // see if we can convert it to X+-Y. - if (BI->getOpcode() == Instruction::Sub) { - if (ShouldBreakUpSubtract(BI)) { - BI = BreakUpSubtract(BI, ValueRankMap); - // Reset the BBI iterator in case BreakUpSubtract changed the - // instruction it points to. - BBI = BI; - ++BBI; + if (I->getOpcode() == Instruction::Sub) { + if (ShouldBreakUpSubtract(I)) { + Instruction *NI = BreakUpSubtract(I); + RedoInsts.insert(I); MadeChange = true; - } else if (BinaryOperator::isNeg(BI)) { + I = NI; + } else if (BinaryOperator::isNeg(I)) { // Otherwise, this is a negation. See if the operand is a multiply tree // and if this is not an inner node of a multiply tree. - if (isReassociableOp(BI->getOperand(1), Instruction::Mul) && - (!BI->hasOneUse() || - !isReassociableOp(BI->use_back(), Instruction::Mul))) { - BI = LowerNegateToMultiply(BI, ValueRankMap); + if (isReassociableOp(I->getOperand(1), Instruction::Mul) && + (!I->hasOneUse() || + !isReassociableOp(I->use_back(), Instruction::Mul))) { + Instruction *NI = LowerNegateToMultiply(I); + RedoInsts.insert(I); MadeChange = true; + I = NI; } } } - // If this instruction is a commutative binary operator, process it. - if (!BI->isAssociative()) return; - BinaryOperator *I = cast<BinaryOperator>(BI); + // If this instruction is an associative binary operator, process it. + if (!I->isAssociative()) return; + BinaryOperator *BO = cast<BinaryOperator>(I); // If this is an interior node of a reassociable tree, ignore it until we // get to the root of the tree, to avoid N^2 analysis. - if (I->hasOneUse() && isReassociableOp(I->use_back(), I->getOpcode())) + unsigned Opcode = BO->getOpcode(); + if (BO->hasOneUse() && BO->use_back()->getOpcode() == Opcode) return; - // If this is an add tree that is used by a sub instruction, ignore it + // If this is an add tree that is used by a sub instruction, ignore it // until we process the subtract. - if (I->hasOneUse() && I->getOpcode() == Instruction::Add && - cast<Instruction>(I->use_back())->getOpcode() == Instruction::Sub) + if (BO->hasOneUse() && BO->getOpcode() == Instruction::Add && + cast<Instruction>(BO->use_back())->getOpcode() == Instruction::Sub) return; - ReassociateExpression(I); + ReassociateExpression(BO); } -Value *Reassociate::ReassociateExpression(BinaryOperator *I) { - +void Reassociate::ReassociateExpression(BinaryOperator *I) { + // First, walk the expression tree, linearizing the tree, collecting the // operand information. 
+ SmallVector<RepeatedValue, 8> Tree; + MadeChange |= LinearizeExprTree(I, Tree); SmallVector<ValueEntry, 8> Ops; - LinearizeExprTree(I, Ops); - + Ops.reserve(Tree.size()); + for (unsigned i = 0, e = Tree.size(); i != e; ++i) { + RepeatedValue E = Tree[i]; + Ops.append(E.second.getZExtValue(), + ValueEntry(getRank(E.first), E.first)); + } + DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n'); - + // Now that we have linearized the tree to a list and have gathered all of // the operands and their ranks, sort the operands by their rank. Use a // stable_sort so that values with equal ranks will have their relative @@ -1048,21 +1619,24 @@ Value *Reassociate::ReassociateExpression(BinaryOperator *I) { // this sorts so that the highest ranking values end up at the beginning of // the vector. std::stable_sort(Ops.begin(), Ops.end()); - + // OptimizeExpression - Now that we have the expression tree in a convenient // sorted form, optimize it globally if possible. if (Value *V = OptimizeExpression(I, Ops)) { + if (V == I) + // Self-referential expression in unreachable code. + return; // This expression tree simplified to something that isn't a tree, // eliminate it. DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n'); I->replaceAllUsesWith(V); if (Instruction *VI = dyn_cast<Instruction>(V)) VI->setDebugLoc(I->getDebugLoc()); - RemoveDeadBinaryOp(I); + RedoInsts.insert(I); ++NumAnnihil; - return V; + return; } - + // We want to sink immediates as deeply as possible except in the case where // this is a multiply tree used only by an add, and the immediate is a -1. // In this case we reassociate to put the negation on the outside so that we @@ -1074,51 +1648,57 @@ Value *Reassociate::ReassociateExpression(BinaryOperator *I) { ValueEntry Tmp = Ops.pop_back_val(); Ops.insert(Ops.begin(), Tmp); } - + DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n'); - + if (Ops.size() == 1) { + if (Ops[0].Op == I) + // Self-referential expression in unreachable code. + return; + // This expression tree simplified to something that isn't a tree, // eliminate it. I->replaceAllUsesWith(Ops[0].Op); if (Instruction *OI = dyn_cast<Instruction>(Ops[0].Op)) OI->setDebugLoc(I->getDebugLoc()); - RemoveDeadBinaryOp(I); - return Ops[0].Op; + RedoInsts.insert(I); + return; } - + // Now that we ordered and optimized the expressions, splat them back into // the expression tree, removing any unneeded nodes. RewriteExprTree(I, Ops); - return I; } - bool Reassociate::runOnFunction(Function &F) { - // Recalculate the rank map for F + // Calculate the rank map for F BuildRankMap(F); MadeChange = false; - for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) - for (BasicBlock::iterator BBI = FI->begin(); BBI != FI->end(); ) - ReassociateInst(BBI); - - // Now that we're done, revisit any instructions which are likely to - // have secondary reassociation opportunities. - while (!RedoInsts.empty()) - if (Value *V = RedoInsts.pop_back_val()) { - BasicBlock::iterator BBI = cast<Instruction>(V); - ReassociateInst(BBI); - } + for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { + // Optimize every instruction in the basic block. + for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE; ) + if (isInstructionTriviallyDead(II)) { + EraseInst(II++); + } else { + OptimizeInst(II); + assert(II->getParent() == BI && "Moved to a different block!"); + ++II; + } - // Now that we're done, delete any instructions which are no longer used. 
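The new driver keeps a duplicate-free worklist (RedoInsts is a SetVector of instructions in the real pass) that is drained after each block: popped entries are erased if trivially dead and re-optimized otherwise. A standalone toy of that discipline, with integers standing in for instructions and a hand-rolled container mimicking the insert/pop_back_val interface:

// Sketch of the RedoInsts worklist discipline: no duplicate entries, drain
// until empty.  Toy container; the real pass uses llvm::SetVector.
#include <algorithm>
#include <cstdio>
#include <vector>

struct ToyWorklist {
  std::vector<int> Items;                 // toy stand-in for instructions
  bool insert(int I) {                    // SetVector-style: no duplicates
    if (std::find(Items.begin(), Items.end(), I) != Items.end())
      return false;
    Items.push_back(I);
    return true;
  }
  int pop_back_val() { int I = Items.back(); Items.pop_back(); return I; }
  bool empty() const { return Items.empty(); }
};

int main() {
  ToyWorklist Redo;
  Redo.insert(1); Redo.insert(2); Redo.insert(1);    // duplicate ignored
  while (!Redo.empty()) {
    int I = Redo.pop_back_val();
    std::printf("revisit %d\n", I);  // would erase if dead, else re-optimize
  }
  return 0;
}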
- while (!DeadInsts.empty()) - if (Value *V = DeadInsts.pop_back_val()) - RecursivelyDeleteTriviallyDeadInstructions(V); + // If this produced extra instructions to optimize, handle them now. + while (!RedoInsts.empty()) { + Instruction *I = RedoInsts.pop_back_val(); + if (isInstructionTriviallyDead(I)) + EraseInst(I); + else + OptimizeInst(I); + } + } // We are done with the rank map. RankMap.clear(); ValueRankMap.clear(); + return MadeChange; } - diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp index 47afc77..ea1de63 100644 --- a/lib/Transforms/Scalar/Reg2Mem.cpp +++ b/lib/Transforms/Scalar/Reg2Mem.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This file demotes all registers to memory references. It is intented to be +// This file demotes all registers to memory references. It is intended to be // the inverse of PromoteMemoryToRegister. By converting to loads, the only // values live across basic blocks are allocas and loads before phi nodes. // It is intended that this should make CFG hacking much easier. @@ -59,7 +59,7 @@ namespace { virtual bool runOnFunction(Function &F); }; } - + char RegToMem::ID = 0; INITIALIZE_PASS_BEGIN(RegToMem, "reg2mem", "Demote all values to stack slots", false, false) @@ -68,25 +68,25 @@ INITIALIZE_PASS_END(RegToMem, "reg2mem", "Demote all values to stack slots", false, false) bool RegToMem::runOnFunction(Function &F) { - if (F.isDeclaration()) + if (F.isDeclaration()) return false; - + // Insert all new allocas into entry block. BasicBlock *BBEntry = &F.getEntryBlock(); assert(pred_begin(BBEntry) == pred_end(BBEntry) && "Entry block to function must not have predecessors!"); - + // Find first non-alloca instruction and create insertion point. This is // safe if block is well-formed: it always have terminator, otherwise // we'll get and assertion. BasicBlock::iterator I = BBEntry->begin(); while (isa<AllocaInst>(I)) ++I; - + CastInst *AllocaInsertionPoint = new BitCastInst(Constant::getNullValue(Type::getInt32Ty(F.getContext())), Type::getInt32Ty(F.getContext()), "reg2mem alloca point", I); - + // Find the escaped instructions. But don't create stack slots for // allocas in entry block. std::list<Instruction*> WorkList; @@ -99,15 +99,15 @@ bool RegToMem::runOnFunction(Function &F) { WorkList.push_front(&*iib); } } - + // Demote escaped instructions NumRegsDemoted += WorkList.size(); - for (std::list<Instruction*>::iterator ilb = WorkList.begin(), + for (std::list<Instruction*>::iterator ilb = WorkList.begin(), ile = WorkList.end(); ilb != ile; ++ilb) DemoteRegToStack(**ilb, false, AllocaInsertionPoint); - + WorkList.clear(); - + // Find all phi's for (Function::iterator ibb = F.begin(), ibe = F.end(); ibb != ibe; ++ibb) @@ -115,19 +115,18 @@ bool RegToMem::runOnFunction(Function &F) { iib != iie; ++iib) if (isa<PHINode>(iib)) WorkList.push_front(&*iib); - + // Demote phi nodes NumPhisDemoted += WorkList.size(); - for (std::list<Instruction*>::iterator ilb = WorkList.begin(), + for (std::list<Instruction*>::iterator ilb = WorkList.begin(), ile = WorkList.end(); ilb != ile; ++ilb) DemotePHIToStack(cast<PHINode>(*ilb), AllocaInsertionPoint); - + return true; } // createDemoteRegisterToMemory - Provide an entry point to create this pass. 
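For reference, the demotion this pass performs is the inverse of mem2reg: an SSA value gets a stack slot, its definition becomes a store, and every use reloads from the slot. A purely conceptual standalone sketch, with ordinary C++ variables standing in for IR values (this is not DemoteRegToStack itself):

// Conceptual sketch of demoting a register to a stack slot: the value no
// longer lives in an SSA "register"; it is stored at its definition and
// reloaded at every use.
#include <cassert>

int main() {
  // Before demotion: %v is an SSA value used directly in two "blocks".
  int v = 2 * 21;
  int useA = v + 1;
  int useB = v - 1;

  // After demotion: an alloca-like slot holds the value; each use reloads it.
  int Slot;              // stack slot created in the entry block
  Slot = 2 * 21;         // store right after the definition
  int useA2 = Slot + 1;  // reload before the first use
  int useB2 = Slot - 1;  // reload before the second use

  assert(useA == useA2 && useB == useB2);
  return 0;
}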
-// char &llvm::DemoteRegisterToMemoryID = RegToMem::ID; FunctionPass *llvm::createDemoteRegisterToMemoryPass() { return new RegToMem(); diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 16b64a5..2c39aab 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -409,7 +409,7 @@ private: if (Constant *C = dyn_cast<Constant>(V)) { Constant *Elt = C->getAggregateElement(i); - + if (Elt == 0) LV.markOverdefined(); // Unknown sort of constant. else if (isa<UndefValue>(Elt)) diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index 7d65bcc..48318c8 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This file implements common infrastructure for libLLVMScalarOpts.a, which +// This file implements common infrastructure for libLLVMScalarOpts.a, which // implements several scalar transformations over the LLVM intermediate // representation, including the C bindings for that library. // @@ -24,7 +24,7 @@ using namespace llvm; -/// initializeScalarOptsPasses - Initialize all passes linked into the +/// initializeScalarOptsPasses - Initialize all passes linked into the /// ScalarOpts library. void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeADCEPass(Registry); diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index 026fea1..6637126 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -22,33 +22,34 @@ #define DEBUG_TYPE "scalarrepl" #include "llvm/Transforms/Scalar.h" #include "llvm/Constants.h" +#include "llvm/DIBuilder.h" +#include "llvm/DebugInfo.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" #include "llvm/GlobalVariable.h" +#include "llvm/IRBuilder.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/LLVMContext.h" #include "llvm/Module.h" +#include "llvm/Operator.h" #include "llvm/Pass.h" -#include "llvm/Analysis/DebugInfo.h" -#include "llvm/Analysis/DIBuilder.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Transforms/Utils/PromoteMemToReg.h" -#include "llvm/Transforms/Utils/Local.h" -#include "llvm/Transforms/Utils/SSAUpdater.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/GetElementPtrTypeIterator.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/Statistic.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" using namespace llvm; STATISTIC(NumReplaced, "Number of allocas broken up"); @@ -59,12 +60,25 @@ STATISTIC(NumGlobals, "Number of allocas copied from constant global"); namespace { struct SROA : public FunctionPass { - SROA(int T, bool hasDT, char &ID) + SROA(int T, bool hasDT, char &ID, int ST, int AT, int SLT) : FunctionPass(ID), HasDomTree(hasDT) { if (T == -1) SRThreshold = 128; else SRThreshold = T; + if 
(ST == -1) + StructMemberThreshold = 32; + else + StructMemberThreshold = ST; + if (AT == -1) + ArrayElementThreshold = 8; + else + ArrayElementThreshold = AT; + if (SLT == -1) + // Do not limit the scalar integer load size if no threshold is given. + ScalarLoadThreshold = -1; + else + ScalarLoadThreshold = SLT; } bool runOnFunction(Function &F); @@ -86,11 +100,11 @@ namespace { struct AllocaInfo { /// The alloca to promote. AllocaInst *AI; - + /// CheckedPHIs - This is a set of verified PHI nodes, to prevent infinite /// looping and avoid redundant work. SmallPtrSet<PHINode*, 8> CheckedPHIs; - + /// isUnsafe - This is set to true if the alloca cannot be SROA'd. bool isUnsafe : 1; @@ -104,19 +118,32 @@ namespace { /// ever accessed, or false if the alloca is only accessed with mem /// intrinsics or load/store that only access the entire alloca at once. bool hasSubelementAccess : 1; - + /// hasALoadOrStore - This is true if there are any loads or stores to it. /// The alloca may just be accessed with memcpy, for example, which would /// not set this. bool hasALoadOrStore : 1; - + explicit AllocaInfo(AllocaInst *ai) : AI(ai), isUnsafe(false), isMemCpySrc(false), isMemCpyDst(false), hasSubelementAccess(false), hasALoadOrStore(false) {} }; + /// SRThreshold - The maximum alloca size to considered for SROA. unsigned SRThreshold; + /// StructMemberThreshold - The maximum number of members a struct can + /// contain to be considered for SROA. + unsigned StructMemberThreshold; + + /// ArrayElementThreshold - The maximum number of elements an array can + /// have to be considered for SROA. + unsigned ArrayElementThreshold; + + /// ScalarLoadThreshold - The maximum size in bits of scalars to load when + /// converting to scalar + unsigned ScalarLoadThreshold; + void MarkUnsafe(AllocaInfo &I, Instruction *User) { I.isUnsafe = true; DEBUG(dbgs() << " Transformation preventing inst: " << *User << '\n'); @@ -155,19 +182,21 @@ namespace { SmallVector<AllocaInst*, 32> &NewElts); void RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocaInst *AI, SmallVector<AllocaInst*, 32> &NewElts); + bool ShouldAttemptScalarRepl(AllocaInst *AI); static MemTransferInst *isOnlyCopiedFromConstantGlobal( AllocaInst *AI, SmallVector<Instruction*, 4> &ToDelete); }; - + // SROA_DT - SROA that uses DominatorTree. struct SROA_DT : public SROA { static char ID; public: - SROA_DT(int T = -1) : SROA(T, true, ID) { + SROA_DT(int T = -1, int ST = -1, int AT = -1, int SLT = -1) : + SROA(T, true, ID, ST, AT, SLT) { initializeSROA_DTPass(*PassRegistry::getPassRegistry()); } - + // getAnalysisUsage - This pass does not require any passes, but we know it // will not alter the CFG, so say so. virtual void getAnalysisUsage(AnalysisUsage &AU) const { @@ -175,22 +204,23 @@ namespace { AU.setPreservesCFG(); } }; - + // SROA_SSAUp - SROA that uses SSAUpdater. struct SROA_SSAUp : public SROA { static char ID; public: - SROA_SSAUp(int T = -1) : SROA(T, false, ID) { + SROA_SSAUp(int T = -1, int ST = -1, int AT = -1, int SLT = -1) : + SROA(T, false, ID, ST, AT, SLT) { initializeSROA_SSAUpPass(*PassRegistry::getPassRegistry()); } - + // getAnalysisUsage - This pass does not require any passes, but we know it // will not alter the CFG, so say so. 
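The new SROA knobs all follow the same convention: passing -1 selects the built-in default (128 for the alloca size threshold, 32 struct members, 8 array elements, and an unlimited scalar-load size), while any other value is taken as-is. A standalone sketch of that resolution logic, with the defaults taken from the constructor above (helper name is hypothetical):

// Sketch of the "-1 means use the built-in default" convention the new
// SROA thresholds follow.
#include <cassert>

static unsigned resolveThreshold(int Requested, unsigned Default) {
  return Requested == -1 ? Default : unsigned(Requested);
}

int main() {
  assert(resolveThreshold(-1, 128) == 128);  // alloca size threshold default
  assert(resolveThreshold(-1, 32) == 32);    // struct member count default
  assert(resolveThreshold(-1, 8) == 8);      // array element count default
  assert(resolveThreshold(64, 128) == 64);   // explicit value overrides
  return 0;
}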
virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); } }; - + } char SROA_DT::ID = 0; @@ -209,10 +239,15 @@ INITIALIZE_PASS_END(SROA_SSAUp, "scalarrepl-ssa", // Public interface to the ScalarReplAggregates pass FunctionPass *llvm::createScalarReplAggregatesPass(int Threshold, - bool UseDomTree) { + bool UseDomTree, + int StructMemberThreshold, + int ArrayElementThreshold, + int ScalarLoadThreshold) { if (UseDomTree) - return new SROA_DT(Threshold); - return new SROA_SSAUp(Threshold); + return new SROA_DT(Threshold, StructMemberThreshold, ArrayElementThreshold, + ScalarLoadThreshold); + return new SROA_SSAUp(Threshold, StructMemberThreshold, + ArrayElementThreshold, ScalarLoadThreshold); } @@ -228,6 +263,7 @@ class ConvertToScalarInfo { /// AllocaSize - The size of the alloca being considered in bytes. unsigned AllocaSize; const TargetData &TD; + unsigned ScalarLoadThreshold; /// IsNotTrivial - This is set to true if there is some access to the object /// which means that mem2reg can't promote it. @@ -258,28 +294,38 @@ class ConvertToScalarInfo { /// isn't possible to turn into a vector type, it gets set to VoidTy. VectorType *VectorTy; - /// HadNonMemTransferAccess - True if there is at least one access to the + /// HadNonMemTransferAccess - True if there is at least one access to the /// alloca that is not a MemTransferInst. We don't want to turn structs into /// large integers unless there is some potential for optimization. bool HadNonMemTransferAccess; + /// HadDynamicAccess - True if some element of this alloca was dynamic. + /// We don't yet have support for turning a dynamic access into a large + /// integer. + bool HadDynamicAccess; + public: - explicit ConvertToScalarInfo(unsigned Size, const TargetData &td) - : AllocaSize(Size), TD(td), IsNotTrivial(false), ScalarKind(Unknown), - VectorTy(0), HadNonMemTransferAccess(false) { } + explicit ConvertToScalarInfo(unsigned Size, const TargetData &td, + unsigned SLT) + : AllocaSize(Size), TD(td), ScalarLoadThreshold(SLT), IsNotTrivial(false), + ScalarKind(Unknown), VectorTy(0), HadNonMemTransferAccess(false), + HadDynamicAccess(false) { } AllocaInst *TryConvert(AllocaInst *AI); private: - bool CanConvertToScalar(Value *V, uint64_t Offset); + bool CanConvertToScalar(Value *V, uint64_t Offset, Value* NonConstantIdx); void MergeInTypeForLoadOrStore(Type *In, uint64_t Offset); bool MergeInVectorType(VectorType *VInTy, uint64_t Offset); - void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset); + void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset, + Value *NonConstantIdx); Value *ConvertScalar_ExtractValue(Value *NV, Type *ToType, - uint64_t Offset, IRBuilder<> &Builder); + uint64_t Offset, Value* NonConstantIdx, + IRBuilder<> &Builder); Value *ConvertScalar_InsertValue(Value *StoredVal, Value *ExistingVal, - uint64_t Offset, IRBuilder<> &Builder); + uint64_t Offset, Value* NonConstantIdx, + IRBuilder<> &Builder); }; } // end anonymous namespace. @@ -290,7 +336,7 @@ private: AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { // If we can't convert this scalar, or if mem2reg can trivially do it, bail // out. - if (!CanConvertToScalar(AI, 0) || !IsNotTrivial) + if (!CanConvertToScalar(AI, 0, 0) || !IsNotTrivial) return 0; // If an alloca has only memset / memcpy uses, it may still have an Unknown @@ -315,16 +361,27 @@ AllocaInst *ConvertToScalarInfo::TryConvert(AllocaInst *AI) { NewTy = VectorTy; // Use the vector type. 
} else { unsigned BitWidth = AllocaSize * 8; + + // Do not convert to scalar integer if the alloca size exceeds the + // scalar load threshold. + if (BitWidth > ScalarLoadThreshold) + return 0; + if ((ScalarKind == ImplicitVector || ScalarKind == Integer) && !HadNonMemTransferAccess && !TD.fitsInLegalInteger(BitWidth)) return 0; + // Dynamic accesses on integers aren't yet supported. They need us to shift + // by a dynamic amount which could be difficult to work out as we might not + // know whether to use a left or right shift. + if (ScalarKind == Integer && HadDynamicAccess) + return 0; DEBUG(dbgs() << "CONVERT TO SCALAR INTEGER: " << *AI << "\n"); // Create and insert the integer alloca. NewTy = IntegerType::get(AI->getContext(), BitWidth); } AllocaInst *NewAI = new AllocaInst(NewTy, 0, "", AI->getParent()->begin()); - ConvertUsesToScalar(AI, NewAI, 0); + ConvertUsesToScalar(AI, NewAI, 0, 0); return NewAI; } @@ -411,7 +468,8 @@ bool ConvertToScalarInfo::MergeInVectorType(VectorType *VInTy, /// /// If we see at least one access to the value that is as a vector type, set the /// SawVec flag. -bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { +bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset, + Value* NonConstantIdx) { for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) { Instruction *User = cast<Instruction>(*UI); @@ -441,24 +499,35 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) { if (!onlyUsedByLifetimeMarkers(BCI)) IsNotTrivial = true; // Can't be mem2reg'd. - if (!CanConvertToScalar(BCI, Offset)) + if (!CanConvertToScalar(BCI, Offset, NonConstantIdx)) return false; continue; } if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) { // If this is a GEP with a variable indices, we can't handle it. - if (!GEP->hasAllConstantIndices()) + PointerType* PtrTy = dyn_cast<PointerType>(GEP->getPointerOperandType()); + if (!PtrTy) return false; // Compute the offset that this GEP adds to the pointer. SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); - if (!GEP->getPointerOperandType()->isPointerTy()) - return false; - uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(), + Value *GEPNonConstantIdx = 0; + if (!GEP->hasAllConstantIndices()) { + if (!isa<VectorType>(PtrTy->getElementType())) + return false; + if (NonConstantIdx) + return false; + GEPNonConstantIdx = Indices.pop_back_val(); + if (!GEPNonConstantIdx->getType()->isIntegerTy(32)) + return false; + HadDynamicAccess = true; + } else + GEPNonConstantIdx = NonConstantIdx; + uint64_t GEPOffset = TD.getIndexedOffset(PtrTy, Indices); // See if all uses can be converted. - if (!CanConvertToScalar(GEP, Offset+GEPOffset)) + if (!CanConvertToScalar(GEP, Offset+GEPOffset, GEPNonConstantIdx)) return false; IsNotTrivial = true; // Can't be mem2reg'd. HadNonMemTransferAccess = true; @@ -468,6 +537,9 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { // If this is a constant sized memset of a constant value (e.g. 0) we can // handle it. if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) { + // Store to dynamic index. + if (NonConstantIdx) + return false; // Store of constant value. 
if (!isa<ConstantInt>(MSI->getValue())) return false; @@ -492,6 +564,9 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { // If this is a memcpy or memmove into or out of the whole allocation, we // can handle it like a load or store of the scalar type. if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) { + // Store to dynamic index. + if (NonConstantIdx) + return false; ConstantInt *Len = dyn_cast<ConstantInt>(MTI->getLength()); if (Len == 0 || Len->getZExtValue() != AllocaSize || Offset != 0) return false; @@ -523,12 +598,13 @@ bool ConvertToScalarInfo::CanConvertToScalar(Value *V, uint64_t Offset) { /// Offset is an offset from the original alloca, in bits that need to be /// shifted to the right. By the end of this, there should be no uses of Ptr. void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, - uint64_t Offset) { + uint64_t Offset, + Value* NonConstantIdx) { while (!Ptr->use_empty()) { Instruction *User = cast<Instruction>(Ptr->use_back()); if (BitCastInst *CI = dyn_cast<BitCastInst>(User)) { - ConvertUsesToScalar(CI, NewAI, Offset); + ConvertUsesToScalar(CI, NewAI, Offset, NonConstantIdx); CI->eraseFromParent(); continue; } @@ -536,9 +612,16 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) { // Compute the offset that this GEP adds to the pointer. SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end()); + Value* GEPNonConstantIdx = 0; + if (!GEP->hasAllConstantIndices()) { + assert(!NonConstantIdx && + "Dynamic GEP reading from dynamic GEP unsupported"); + GEPNonConstantIdx = Indices.pop_back_val(); + } else + GEPNonConstantIdx = NonConstantIdx; uint64_t GEPOffset = TD.getIndexedOffset(GEP->getPointerOperandType(), Indices); - ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8); + ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8, GEPNonConstantIdx); GEP->eraseFromParent(); continue; } @@ -549,7 +632,8 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, // The load is a bit extract from NewAI shifted right by Offset bits. Value *LoadedVal = Builder.CreateLoad(NewAI); Value *NewLoadVal - = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset, Builder); + = ConvertScalar_ExtractValue(LoadedVal, LI->getType(), Offset, + NonConstantIdx, Builder); LI->replaceAllUsesWith(NewLoadVal); LI->eraseFromParent(); continue; @@ -559,7 +643,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, assert(SI->getOperand(0) != Ptr && "Consistency error!"); Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in"); Value *New = ConvertScalar_InsertValue(SI->getOperand(0), Old, Offset, - Builder); + NonConstantIdx, Builder); Builder.CreateStore(New, NewAI); SI->eraseFromParent(); @@ -574,6 +658,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, // transform it into a store of the expanded constant value. 
if (MemSetInst *MSI = dyn_cast<MemSetInst>(User)) { assert(MSI->getRawDest() == Ptr && "Consistency error!"); + assert(!NonConstantIdx && "Cannot replace dynamic memset with insert"); int64_t SNumBytes = cast<ConstantInt>(MSI->getLength())->getSExtValue(); if (SNumBytes > 0 && (SNumBytes >> 32) == 0) { unsigned NumBytes = static_cast<unsigned>(SNumBytes); @@ -590,7 +675,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, Instruction *Old = Builder.CreateLoad(NewAI, NewAI->getName()+".in"); Value *New = ConvertScalar_InsertValue( ConstantInt::get(User->getContext(), APVal), - Old, Offset, Builder); + Old, Offset, 0, Builder); Builder.CreateStore(New, NewAI); // If the load we just inserted is now dead, then the memset overwrote @@ -606,6 +691,7 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, // can handle it like a load or store of the scalar type. if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(User)) { assert(Offset == 0 && "must be store to start of alloca"); + assert(!NonConstantIdx && "Cannot replace dynamic transfer with insert"); // If the source and destination are both to the same alloca, then this is // a noop copy-to-self, just delete it. Otherwise, emit a load and store @@ -678,7 +764,8 @@ void ConvertToScalarInfo::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, /// shifted to the right. Value *ConvertToScalarInfo:: ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, - uint64_t Offset, IRBuilder<> &Builder) { + uint64_t Offset, Value* NonConstantIdx, + IRBuilder<> &Builder) { // If the load is of the whole new alloca, no conversion is needed. Type *FromType = FromVal->getType(); if (FromType == ToType && Offset == 0) @@ -700,7 +787,17 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, assert(EltSize*Elt == Offset && "Invalid modulus in validity checking"); } // Return the element extracted out of it. - Value *V = Builder.CreateExtractElement(FromVal, Builder.getInt32(Elt)); + Value *Idx; + if (NonConstantIdx) { + if (Elt) + Idx = Builder.CreateAdd(NonConstantIdx, + Builder.getInt32(Elt), + "dyn.offset"); + else + Idx = NonConstantIdx; + } else + Idx = Builder.getInt32(Elt); + Value *V = Builder.CreateExtractElement(FromVal, Idx); if (V->getType() != ToType) V = Builder.CreateBitCast(V, ToType); return V; @@ -709,23 +806,27 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, // If ToType is a first class aggregate, extract out each of the pieces and // use insertvalue's to form the FCA. 
if (StructType *ST = dyn_cast<StructType>(ToType)) { + assert(!NonConstantIdx && + "Dynamic indexing into struct types not supported"); const StructLayout &Layout = *TD.getStructLayout(ST); Value *Res = UndefValue::get(ST); for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { Value *Elt = ConvertScalar_ExtractValue(FromVal, ST->getElementType(i), Offset+Layout.getElementOffsetInBits(i), - Builder); + 0, Builder); Res = Builder.CreateInsertValue(Res, Elt, i); } return Res; } if (ArrayType *AT = dyn_cast<ArrayType>(ToType)) { + assert(!NonConstantIdx && + "Dynamic indexing into array types not supported"); uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType()); Value *Res = UndefValue::get(AT); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { Value *Elt = ConvertScalar_ExtractValue(FromVal, AT->getElementType(), - Offset+i*EltSize, Builder); + Offset+i*EltSize, 0, Builder); Res = Builder.CreateInsertValue(Res, Elt, i); } return Res; @@ -791,9 +892,14 @@ ConvertScalar_ExtractValue(Value *FromVal, Type *ToType, /// /// Offset is an offset from the original alloca, in bits that need to be /// shifted to the right. +/// +/// NonConstantIdx is an index value if there was a GEP with a non-constant +/// index value. If this is 0 then all GEPs used to find this insert address +/// are constant. Value *ConvertToScalarInfo:: ConvertScalar_InsertValue(Value *SV, Value *Old, - uint64_t Offset, IRBuilder<> &Builder) { + uint64_t Offset, Value* NonConstantIdx, + IRBuilder<> &Builder) { // Convert the stored type to the actual type, shift it left to insert // then 'or' into place. Type *AllocaType = Old->getType(); @@ -814,26 +920,40 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, SV = Builder.CreateBitCast(SV, EltTy); uint64_t EltSize = TD.getTypeAllocSizeInBits(EltTy); unsigned Elt = Offset/EltSize; - return Builder.CreateInsertElement(Old, SV, Builder.getInt32(Elt)); + Value *Idx; + if (NonConstantIdx) { + if (Elt) + Idx = Builder.CreateAdd(NonConstantIdx, + Builder.getInt32(Elt), + "dyn.offset"); + else + Idx = NonConstantIdx; + } else + Idx = Builder.getInt32(Elt); + return Builder.CreateInsertElement(Old, SV, Idx); } // If SV is a first-class aggregate value, insert each value recursively. if (StructType *ST = dyn_cast<StructType>(SV->getType())) { + assert(!NonConstantIdx && + "Dynamic indexing into struct types not supported"); const StructLayout &Layout = *TD.getStructLayout(ST); for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) { Value *Elt = Builder.CreateExtractValue(SV, i); Old = ConvertScalar_InsertValue(Elt, Old, Offset+Layout.getElementOffsetInBits(i), - Builder); + 0, Builder); } return Old; } if (ArrayType *AT = dyn_cast<ArrayType>(SV->getType())) { + assert(!NonConstantIdx && + "Dynamic indexing into array types not supported"); uint64_t EltSize = TD.getTypeAllocSizeInBits(AT->getElementType()); for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) { Value *Elt = Builder.CreateExtractValue(SV, i); - Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, Builder); + Old = ConvertScalar_InsertValue(Elt, Old, Offset+i*EltSize, 0, Builder); } return Old; } @@ -935,7 +1055,7 @@ public: AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S, DIBuilder *DB) : LoadAndStorePromoter(Insts, S), AI(0), DIB(DB) {} - + void run(AllocaInst *AI, const SmallVectorImpl<Instruction*> &Insts) { // Remember which alloca we're promoting (for isInstInList). 
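Both dyn.offset blocks above follow the same recipe: bias the caller's dynamic index by the constant element offset and feed the result to extractelement/insertelement. A condensed restatement using the same IRBuilder calls (sketch only, helper name assumed):

#include "llvm/IRBuilder.h"

// Index used by ConvertScalar_ExtractValue / ConvertScalar_InsertValue: a
// plain constant when no dynamic index is present, otherwise NonConstantIdx,
// biased by Elt when the constant part of the offset is non-zero.
static llvm::Value *vectorEltIndex(llvm::IRBuilder<> &B,
                                   llvm::Value *NonConstantIdx, unsigned Elt) {
  if (!NonConstantIdx)
    return B.getInt32(Elt);
  if (Elt)
    return B.CreateAdd(NonConstantIdx, B.getInt32(Elt), "dyn.offset");
  return NonConstantIdx;
}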
this->AI = AI; @@ -950,18 +1070,18 @@ public: LoadAndStorePromoter::run(Insts); AI->eraseFromParent(); - for (SmallVector<DbgDeclareInst *, 4>::iterator I = DDIs.begin(), + for (SmallVector<DbgDeclareInst *, 4>::iterator I = DDIs.begin(), E = DDIs.end(); I != E; ++I) { DbgDeclareInst *DDI = *I; DDI->eraseFromParent(); } - for (SmallVector<DbgValueInst *, 4>::iterator I = DVIs.begin(), + for (SmallVector<DbgValueInst *, 4>::iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; DVI->eraseFromParent(); } } - + virtual bool isInstInList(Instruction *I, const SmallVectorImpl<Instruction*> &Insts) const { if (LoadInst *LI = dyn_cast<LoadInst>(I)) @@ -970,7 +1090,7 @@ public: } virtual void updateDebugInfo(Instruction *Inst) const { - for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(), + for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(), E = DDIs.end(); I != E; ++I) { DbgDeclareInst *DDI = *I; if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) @@ -978,7 +1098,7 @@ public: else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) ConvertDebugDeclareToDebugValue(DDI, LI, *DIB); } - for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(), + for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(), E = DVIs.end(); I != E; ++I) { DbgValueInst *DVI = *I; Value *Arg = NULL; @@ -1021,12 +1141,12 @@ public: static bool isSafeSelectToSpeculate(SelectInst *SI, const TargetData *TD) { bool TDerefable = SI->getTrueValue()->isDereferenceablePointer(); bool FDerefable = SI->getFalseValue()->isDereferenceablePointer(); - + for (Value::use_iterator UI = SI->use_begin(), UE = SI->use_end(); UI != UE; ++UI) { LoadInst *LI = dyn_cast<LoadInst>(*UI); if (LI == 0 || !LI->isSimple()) return false; - + // Both operands to the select need to be dereferencable, either absolutely // (e.g. allocas) or at this point because we can see other accesses to it. if (!TDerefable && !isSafeToLoadUnconditionally(SI->getTrueValue(), LI, @@ -1036,7 +1156,7 @@ static bool isSafeSelectToSpeculate(SelectInst *SI, const TargetData *TD) { LI->getAlignment(), TD)) return false; } - + return true; } @@ -1067,20 +1187,20 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) { UI != UE; ++UI) { LoadInst *LI = dyn_cast<LoadInst>(*UI); if (LI == 0 || !LI->isSimple()) return false; - + // For now we only allow loads in the same block as the PHI. This is a // common case that happens when instcombine merges two loads through a PHI. if (LI->getParent() != BB) return false; - + // Ensure that there are no instructions between the PHI and the load that // could store. for (BasicBlock::iterator BBI = PN; &*BBI != LI; ++BBI) if (BBI->mayWriteToMemory()) return false; - + MaxAlign = std::max(MaxAlign, LI->getAlignment()); } - + // Okay, we know that we have one or more loads in the same block as the PHI. // We can transform this if it is safe to push the loads into the predecessor // blocks. 
The only thing to watch out for is that we can't put a possibly @@ -1108,10 +1228,10 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) { if (InVal->isDereferenceablePointer() || isSafeToLoadUnconditionally(InVal, Pred->getTerminator(), MaxAlign, TD)) continue; - + return false; } - + return true; } @@ -1123,7 +1243,7 @@ static bool isSafePHIToSpeculate(PHINode *PN, const TargetData *TD) { static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { SetVector<Instruction*, SmallVector<Instruction*, 4>, SmallPtrSet<Instruction*, 4> > InstsToRewrite; - + for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end(); UI != UE; ++UI) { User *U = *UI; @@ -1132,7 +1252,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { return false; continue; } - + if (StoreInst *SI = dyn_cast<StoreInst>(U)) { if (SI->getOperand(0) == AI || !SI->isSimple()) return false; // Don't allow a store OF the AI, only INTO the AI. @@ -1146,7 +1266,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { Value *Result = SI->getOperand(1+CI->isZero()); SI->replaceAllUsesWith(Result); SI->eraseFromParent(); - + // This is very rare and we just scrambled the use list of AI, start // over completely. return tryToMakeAllocaBePromotable(AI, TD); @@ -1156,33 +1276,33 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { // loads, then we can transform this by rewriting the select. if (!isSafeSelectToSpeculate(SI, TD)) return false; - + InstsToRewrite.insert(SI); continue; } - + if (PHINode *PN = dyn_cast<PHINode>(U)) { if (PN->use_empty()) { // Dead PHIs can be stripped. InstsToRewrite.insert(PN); continue; } - + // If it is safe to turn "load (phi [AI, ptr, ...])" into a PHI of loads // in the pred blocks, then we can transform this by rewriting the PHI. if (!isSafePHIToSpeculate(PN, TD)) return false; - + InstsToRewrite.insert(PN); continue; } - + if (BitCastInst *BCI = dyn_cast<BitCastInst>(U)) { if (onlyUsedByLifetimeMarkers(BCI)) { InstsToRewrite.insert(BCI); continue; } } - + return false; } @@ -1190,7 +1310,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { // we're done! if (InstsToRewrite.empty()) return true; - + // If we have instructions that need to be rewritten for this to be promotable // take care of it now. for (unsigned i = 0, e = InstsToRewrite.size(); i != e; ++i) { @@ -1211,13 +1331,13 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { // loads with a new select. while (!SI->use_empty()) { LoadInst *LI = cast<LoadInst>(SI->use_back()); - + IRBuilder<> Builder(LI); - LoadInst *TrueLoad = + LoadInst *TrueLoad = Builder.CreateLoad(SI->getTrueValue(), LI->getName()+".t"); - LoadInst *FalseLoad = + LoadInst *FalseLoad = Builder.CreateLoad(SI->getFalseValue(), LI->getName()+".f"); - + // Transfer alignment and TBAA info if present. TrueLoad->setAlignment(LI->getAlignment()); FalseLoad->setAlignment(LI->getAlignment()); @@ -1225,18 +1345,18 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { TrueLoad->setMetadata(LLVMContext::MD_tbaa, Tag); FalseLoad->setMetadata(LLVMContext::MD_tbaa, Tag); } - + Value *V = Builder.CreateSelect(SI->getCondition(), TrueLoad, FalseLoad); V->takeName(LI); LI->replaceAllUsesWith(V); LI->eraseFromParent(); } - + // Now that all the loads are gone, the select is gone too. 
SI->eraseFromParent(); continue; } - + // Otherwise, we have a PHI node which allows us to push the loads into the // predecessors. PHINode *PN = cast<PHINode>(InstsToRewrite[i]); @@ -1244,7 +1364,7 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { PN->eraseFromParent(); continue; } - + Type *LoadTy = cast<PointerType>(PN->getType())->getElementType(); PHINode *NewPN = PHINode::Create(LoadTy, PN->getNumIncomingValues(), PN->getName()+".ld", PN); @@ -1254,18 +1374,18 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { LoadInst *SomeLoad = cast<LoadInst>(PN->use_back()); MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa); unsigned Align = SomeLoad->getAlignment(); - + // Rewrite all loads of the PN to use the new PHI. while (!PN->use_empty()) { LoadInst *LI = cast<LoadInst>(PN->use_back()); LI->replaceAllUsesWith(NewPN); LI->eraseFromParent(); } - + // Inject loads into all of the pred blocks. Keep track of which blocks we // insert them into in case we have multiple edges from the same block. DenseMap<BasicBlock*, LoadInst*> InsertedLoads; - + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { BasicBlock *Pred = PN->getIncomingBlock(i); LoadInst *&Load = InsertedLoads[Pred]; @@ -1276,13 +1396,13 @@ static bool tryToMakeAllocaBePromotable(AllocaInst *AI, const TargetData *TD) { Load->setAlignment(Align); if (TBAATag) Load->setMetadata(LLVMContext::MD_tbaa, TBAATag); } - + NewPN->addIncoming(Load, Pred); } - + PN->eraseFromParent(); } - + ++NumAdjusted; return true; } @@ -1315,7 +1435,7 @@ bool SROA::performPromotion(Function &F) { SSAUpdater SSA; for (unsigned i = 0, e = Allocas.size(); i != e; ++i) { AllocaInst *AI = Allocas[i]; - + // Build list of instructions to promote. for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E; ++UI) @@ -1334,18 +1454,36 @@ bool SROA::performPromotion(Function &F) { /// ShouldAttemptScalarRepl - Decide if an alloca is a good candidate for /// SROA. It must be a struct or array type with a small number of elements. -static bool ShouldAttemptScalarRepl(AllocaInst *AI) { +bool SROA::ShouldAttemptScalarRepl(AllocaInst *AI) { Type *T = AI->getAllocatedType(); - // Do not promote any struct into more than 32 separate vars. + // Do not promote any struct that has too many members. if (StructType *ST = dyn_cast<StructType>(T)) - return ST->getNumElements() <= 32; - // Arrays are much less likely to be safe for SROA; only consider - // them if they are very small. + return ST->getNumElements() <= StructMemberThreshold; + // Do not promote any array that has too many elements. if (ArrayType *AT = dyn_cast<ArrayType>(T)) - return AT->getNumElements() <= 8; + return AT->getNumElements() <= ArrayElementThreshold; return false; } +/// getPointeeAlignment - Compute the minimum alignment of the value pointed +/// to by the given pointer. 
+static unsigned getPointeeAlignment(Value *V, const TargetData &TD) { + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) + if (CE->getOpcode() == Instruction::BitCast || + (CE->getOpcode() == Instruction::GetElementPtr && + cast<GEPOperator>(CE)->hasAllZeroIndices())) + return getPointeeAlignment(CE->getOperand(0), TD); + + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) + if (!GV->isDeclaration()) + return TD.getPreferredAlignment(GV); + + if (PointerType *PT = dyn_cast<PointerType>(V->getType())) + return TD.getABITypeAlignment(PT->getElementType()); + + return 0; +} + // performScalarRepl - This algorithm is a simple worklist driven algorithm, // which runs on all of the alloca instructions in the function, removing them @@ -1379,23 +1517,26 @@ bool SROA::performScalarRepl(Function &F) { continue; // Check to see if this allocation is only modified by a memcpy/memmove from - // a constant global. If this is the case, we can change all users to use + // a constant global whose alignment is equal to or exceeds that of the + // allocation. If this is the case, we can change all users to use // the constant global instead. This is commonly produced by the CFE by // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A' // is only subsequently read. SmallVector<Instruction *, 4> ToDelete; if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(AI, ToDelete)) { - DEBUG(dbgs() << "Found alloca equal to global: " << *AI << '\n'); - DEBUG(dbgs() << " memcpy = " << *Copy << '\n'); - for (unsigned i = 0, e = ToDelete.size(); i != e; ++i) - ToDelete[i]->eraseFromParent(); - Constant *TheSrc = cast<Constant>(Copy->getSource()); - AI->replaceAllUsesWith(ConstantExpr::getBitCast(TheSrc, AI->getType())); - Copy->eraseFromParent(); // Don't mutate the global. - AI->eraseFromParent(); - ++NumGlobals; - Changed = true; - continue; + if (AI->getAlignment() <= getPointeeAlignment(Copy->getSource(), *TD)) { + DEBUG(dbgs() << "Found alloca equal to global: " << *AI << '\n'); + DEBUG(dbgs() << " memcpy = " << *Copy << '\n'); + for (unsigned i = 0, e = ToDelete.size(); i != e; ++i) + ToDelete[i]->eraseFromParent(); + Constant *TheSrc = cast<Constant>(Copy->getSource()); + AI->replaceAllUsesWith(ConstantExpr::getBitCast(TheSrc, AI->getType())); + Copy->eraseFromParent(); // Don't mutate the global. + AI->eraseFromParent(); + ++NumGlobals; + Changed = true; + continue; + } } // Check to see if we can perform the core SROA transformation. We cannot @@ -1425,8 +1566,8 @@ bool SROA::performScalarRepl(Function &F) { // promoted itself. If so, we don't want to transform it needlessly. Note // that we can't just check based on the type: the alloca may be of an i32 // but that has pointer arithmetic to set byte 3 of it or something. 
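The new guard only forwards the alloca to the constant global when the global is at least as aligned as the alloca, so code relying on the alloca's larger alignment keeps it. Restated as a predicate over the same values (a sketch assuming it sits next to the file-static getPointeeAlignment above; the helper name is invented):

#include "llvm/Instructions.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/Target/TargetData.h"

// True when replacing AI with the memcpy source cannot weaken an alignment
// guarantee that users of the alloca may depend on.
static bool globalAlignmentSuffices(llvm::AllocaInst *AI,
                                    llvm::MemTransferInst *Copy,
                                    const llvm::TargetData &TD) {
  return AI->getAlignment() <= getPointeeAlignment(Copy->getSource(), TD);
}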
- if (AllocaInst *NewAI = - ConvertToScalarInfo((unsigned)AllocaSize, *TD).TryConvert(AI)) { + if (AllocaInst *NewAI = ConvertToScalarInfo( + (unsigned)AllocaSize, *TD, ScalarLoadThreshold).TryConvert(AI)) { NewAI->takeName(AI); AI->eraseFromParent(); ++NumConverted; @@ -1531,12 +1672,12 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType), LIType, false, Info, LI, true /*AllowWholeAccess*/); Info.hasALoadOrStore = true; - + } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) { // Store is ok if storing INTO the pointer, not storing the pointer if (!SI->isSimple() || SI->getOperand(0) == I) return MarkUnsafe(Info, User); - + Type *SIType = SI->getOperand(0)->getType(); isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType), SIType, true, Info, SI, true /*AllowWholeAccess*/); @@ -1553,7 +1694,7 @@ void SROA::isSafeForScalarRepl(Instruction *I, uint64_t Offset, if (Info.isUnsafe) return; } } - + /// isSafePHIUseForScalarRepl - If we see a PHI node or select using a pointer /// derived from the alloca, we can often still split the alloca into elements. @@ -1570,10 +1711,10 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset, if (PHINode *PN = dyn_cast<PHINode>(I)) if (!Info.CheckedPHIs.insert(PN)) return; - + for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI!=E; ++UI) { Instruction *User = cast<Instruction>(*UI); - + if (BitCastInst *BC = dyn_cast<BitCastInst>(User)) { isSafePHISelectUseForScalarRepl(BC, Offset, Info); } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) { @@ -1590,12 +1731,12 @@ void SROA::isSafePHISelectUseForScalarRepl(Instruction *I, uint64_t Offset, isSafeMemAccess(Offset, TD->getTypeAllocSize(LIType), LIType, false, Info, LI, false /*AllowWholeAccess*/); Info.hasALoadOrStore = true; - + } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) { // Store is ok if storing INTO the pointer, not storing the pointer if (!SI->isSimple() || SI->getOperand(0) == I) return MarkUnsafe(Info, User); - + Type *SIType = SI->getOperand(0)->getType(); isSafeMemAccess(Offset, TD->getTypeAllocSize(SIType), SIType, true, Info, SI, false /*AllowWholeAccess*/); @@ -1619,6 +1760,8 @@ void SROA::isSafeGEP(GetElementPtrInst *GEPI, gep_type_iterator GEPIt = gep_type_begin(GEPI), E = gep_type_end(GEPI); if (GEPIt == E) return; + bool NonConstant = false; + unsigned NonConstantIdxSize = 0; // Walk through the GEP type indices, checking the types that this indexes // into. @@ -1628,15 +1771,30 @@ void SROA::isSafeGEP(GetElementPtrInst *GEPI, continue; ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPIt.getOperand()); - if (!IdxVal) - return MarkUnsafe(Info, GEPI); + if (!IdxVal) { + // Non constant GEPs are only a problem on arrays, structs, and pointers + // Vectors can be dynamically indexed. + // FIXME: Add support for dynamic indexing on arrays. This should be + // ok on any subarrays of the alloca array, eg, a[0][i] is ok, but a[i][0] + // isn't. + if (!(*GEPIt)->isVectorTy()) + return MarkUnsafe(Info, GEPI); + NonConstant = true; + NonConstantIdxSize = TD->getTypeAllocSize(*GEPIt); + } } // Compute the offset due to this GEP and check if the alloca has a // component element at that offset. SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end()); + // If this GEP is non constant then the last operand must have been a + // dynamic index into a vector. Pop this now as it has no impact on the + // constant part of the offset. 
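isSafeGEP above now allows a single non-constant index, and only when it steps into a vector; everything else still marks the alloca unsafe. The core shape test, pulled out for clarity (sketch, helper name assumed; the patch additionally requires the dynamic index to be the trailing operand and of i32 type):

#include "llvm/Constants.h"
#include "llvm/Instructions.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"

static bool onlyDynamicVectorIndices(llvm::GetElementPtrInst *GEP) {
  for (llvm::gep_type_iterator GTI = llvm::gep_type_begin(GEP),
                               E = llvm::gep_type_end(GEP);
       GTI != E; ++GTI) {
    if (llvm::isa<llvm::ConstantInt>(GTI.getOperand()))
      continue;
    // A non-constant index is tolerated only while indexing a vector.
    if (!(*GTI)->isVectorTy())
      return false;
  }
  return true;
}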
+ if (NonConstant) + Indices.pop_back(); Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), Indices); - if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, 0)) + if (!TypeHasComponent(Info.AI->getAllocatedType(), Offset, + NonConstantIdxSize)) MarkUnsafe(Info, GEPI); } @@ -1741,6 +1899,12 @@ bool SROA::TypeHasComponent(Type *T, uint64_t Offset, uint64_t Size) { if (Offset >= AT->getNumElements() * EltSize) return false; Offset %= EltSize; + } else if (VectorType *VT = dyn_cast<VectorType>(T)) { + EltTy = VT->getElementType(); + EltSize = TD->getTypeAllocSize(EltTy); + if (Offset >= VT->getNumElements() * EltSize) + return false; + Offset %= EltSize; } else { return false; } @@ -1766,12 +1930,12 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, RewriteBitCast(BC, AI, Offset, NewElts); continue; } - + if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User)) { RewriteGEP(GEPI, AI, Offset, NewElts); continue; } - + if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) { ConstantInt *Length = dyn_cast<ConstantInt>(MI->getLength()); uint64_t MemSize = Length->getZExtValue(); @@ -1790,10 +1954,10 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, } continue; } - + if (LoadInst *LI = dyn_cast<LoadInst>(User)) { Type *LIType = LI->getType(); - + if (isCompatibleAggregate(LIType, AI->getAllocatedType())) { // Replace: // %res = load { i32, i32 }* %alloc @@ -1819,7 +1983,7 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, } continue; } - + if (StoreInst *SI = dyn_cast<StoreInst>(User)) { Value *Val = SI->getOperand(0); Type *SIType = Val->getType(); @@ -1846,16 +2010,16 @@ void SROA::RewriteForScalarRepl(Instruction *I, AllocaInst *AI, uint64_t Offset, } continue; } - + if (isa<SelectInst>(User) || isa<PHINode>(User)) { - // If we have a PHI user of the alloca itself (as opposed to a GEP or + // If we have a PHI user of the alloca itself (as opposed to a GEP or // bitcast) we have to rewrite it. GEP and bitcast uses will be RAUW'd to // the new pointer. if (!isa<AllocaInst>(I)) continue; - + assert(Offset == 0 && NewElts[0] && "Direct alloca use should have a zero offset"); - + // If we have a use of the alloca, we know the derived uses will be // utilizing just the first element of the scalarized result. Insert a // bitcast of the first alloca before the user as required. @@ -1908,9 +2072,16 @@ uint64_t SROA::FindElementAndOffset(Type *&T, uint64_t &Offset, Offset -= Layout->getElementOffset(Idx); IdxTy = Type::getInt32Ty(T->getContext()); return Idx; + } else if (ArrayType *AT = dyn_cast<ArrayType>(T)) { + T = AT->getElementType(); + uint64_t EltSize = TD->getTypeAllocSize(T); + Idx = Offset / EltSize; + Offset -= Idx * EltSize; + IdxTy = Type::getInt64Ty(T->getContext()); + return Idx; } - ArrayType *AT = cast<ArrayType>(T); - T = AT->getElementType(); + VectorType *VT = cast<VectorType>(T); + T = VT->getElementType(); uint64_t EltSize = TD->getTypeAllocSize(T); Idx = Offset / EltSize; Offset -= Idx * EltSize; @@ -1925,6 +2096,13 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, SmallVector<AllocaInst*, 32> &NewElts) { uint64_t OldOffset = Offset; SmallVector<Value*, 8> Indices(GEPI->op_begin() + 1, GEPI->op_end()); + // If the GEP was dynamic then it must have been a dynamic vector lookup. 
+ // In this case, it must be the last GEP operand which is dynamic so keep that + // aside until we've found the constant GEP offset then add it back in at the + // end. + Value* NonConstantIdx = 0; + if (!GEPI->hasAllConstantIndices()) + NonConstantIdx = Indices.pop_back_val(); Offset += TD->getIndexedOffset(GEPI->getPointerOperandType(), Indices); RewriteForScalarRepl(GEPI, AI, Offset, NewElts); @@ -1951,6 +2129,17 @@ void SROA::RewriteGEP(GetElementPtrInst *GEPI, AllocaInst *AI, uint64_t Offset, uint64_t EltIdx = FindElementAndOffset(T, EltOffset, IdxTy); NewArgs.push_back(ConstantInt::get(IdxTy, EltIdx)); } + if (NonConstantIdx) { + Type* GepTy = T; + // This GEP has a dynamic index. We need to add "i32 0" to index through + // any structs or arrays in the original type until we get to the vector + // to index. + while (!isa<VectorType>(GepTy)) { + NewArgs.push_back(Constant::getNullValue(i32Ty)); + GepTy = cast<CompositeType>(GepTy)->getTypeAtIndex(0U); + } + NewArgs.push_back(NonConstantIdx); + } Instruction *Val = NewElts[Idx]; if (NewArgs.size() > 1) { Val = GetElementPtrInst::CreateInBounds(Val, NewArgs, "", GEPI); @@ -2202,7 +2391,7 @@ void SROA::RewriteStoreUserOfWholeAlloca(StoreInst *SI, AllocaInst *AI, uint64_t AllocaSizeBits = TD->getTypeAllocSizeInBits(AllocaEltTy); IRBuilder<> Builder(SI); - + // Handle tail padding by extending the operand if (TD->getTypeSizeInBits(SrcVal->getType()) != AllocaSizeBits) SrcVal = Builder.CreateZExt(SrcVal, @@ -2464,7 +2653,7 @@ bool SROA::isSafeAllocaToScalarRepl(AllocaInst *AI) { return false; } } - + return true; } diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp index a66b3e3..d13e4ab 100644 --- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -67,7 +67,7 @@ static void ChangeToUnreachable(Instruction *I, bool UseLLVMTrap) { // nodes. for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) (*SI)->removePredecessor(BB); - + // Insert a call to llvm.trap right before this. This turns the undefined // behavior into a hard fail instead of falling through into random code. if (UseLLVMTrap) { @@ -77,7 +77,7 @@ static void ChangeToUnreachable(Instruction *I, bool UseLLVMTrap) { CallTrap->setDebugLoc(I->getDebugLoc()); } new UnreachableInst(I->getContext(), I); - + // All instructions after this are dead. BasicBlock::iterator BBI = I, BBE = BB->end(); while (BBI != BBE) { @@ -89,7 +89,6 @@ static void ChangeToUnreachable(Instruction *I, bool UseLLVMTrap) { /// ChangeToCall - Convert the specified invoke into a normal call. 
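Concretely, when an alloca of { i32, <4 x float> } has been split into per-field allocas, a dynamic access is re-derived against the vector field's new alloca: leading i32 0 indices step through any aggregates still wrapping the vector, and the dynamic index is re-appended last. A worked example (illustrative IR, not taken from the patch's tests):

  ; before scalar replacement
  %p = getelementptr { i32, <4 x float> }* %agg, i32 0, i32 1, i32 %i
  ; after, with %agg.1 standing in for NewElts[1]
  %p = getelementptr inbounds <4 x float>* %agg.1, i32 0, i32 %i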
static void ChangeToCall(InvokeInst *II) { - BasicBlock *BB = II->getParent(); SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); CallInst *NewCall = CallInst::Create(II->getCalledValue(), Args, "", II); NewCall->takeName(II); @@ -102,19 +101,19 @@ static void ChangeToCall(InvokeInst *II) { BranchInst::Create(II->getNormalDest(), II); // Update PHI nodes in the unwind destination - II->getUnwindDest()->removePredecessor(BB); - BB->getInstList().erase(II); + II->getUnwindDest()->removePredecessor(II->getParent()); + II->eraseFromParent(); } static bool MarkAliveBlocks(BasicBlock *BB, SmallPtrSet<BasicBlock*, 128> &Reachable) { - + SmallVector<BasicBlock*, 128> Worklist; Worklist.push_back(BB); bool Changed = false; do { BB = Worklist.pop_back_val(); - + if (!Reachable.insert(BB)) continue; @@ -136,7 +135,7 @@ static bool MarkAliveBlocks(BasicBlock *BB, break; } } - + // Store to undef and store to null are undefined and used to signal that // they should be changed to unreachable by passes that can't modify the // CFG. @@ -145,7 +144,7 @@ static bool MarkAliveBlocks(BasicBlock *BB, if (SI->isVolatile()) continue; Value *Ptr = SI->getOperand(1); - + if (isa<UndefValue>(Ptr) || (isa<ConstantPointerNull>(Ptr) && SI->getPointerAddressSpace() == 0)) { @@ -157,11 +156,22 @@ static bool MarkAliveBlocks(BasicBlock *BB, } // Turn invokes that call 'nounwind' functions into ordinary calls. - if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) - if (II->doesNotThrow()) { - ChangeToCall(II); + if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) { + Value *Callee = II->getCalledValue(); + if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) { + ChangeToUnreachable(II, true); + Changed = true; + } else if (II->doesNotThrow()) { + if (II->use_empty() && II->onlyReadsMemory()) { + // jump to the normal destination branch. + BranchInst::Create(II->getNormalDest(), II); + II->getUnwindDest()->removePredecessor(II->getParent()); + II->eraseFromParent(); + } else + ChangeToCall(II); Changed = true; } + } Changed |= ConstantFoldTerminator(BB, true); for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) @@ -170,38 +180,38 @@ static bool MarkAliveBlocks(BasicBlock *BB, return Changed; } -/// RemoveUnreachableBlocksFromFn - Remove blocks that are not reachable, even -/// if they are in a dead cycle. Return true if a change was made, false +/// RemoveUnreachableBlocksFromFn - Remove blocks that are not reachable, even +/// if they are in a dead cycle. Return true if a change was made, false /// otherwise. static bool RemoveUnreachableBlocksFromFn(Function &F) { SmallPtrSet<BasicBlock*, 128> Reachable; bool Changed = MarkAliveBlocks(F.begin(), Reachable); - + // If there are unreachable blocks in the CFG... if (Reachable.size() == F.size()) return Changed; - + assert(Reachable.size() < F.size()); NumSimpl += F.size()-Reachable.size(); - + // Loop over all of the basic blocks that are not reachable, dropping all of // their internal references... 
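MarkAliveBlocks above picks up two invoke folds: an invoke of a null or undef callee becomes unreachable, and a nounwind invoke whose result is unused and that only reads memory is dropped in favour of a plain branch rather than first being turned into a call. The second fold, separated out as a helper for illustration (sketch using the same calls as the patch):

#include "llvm/Instructions.h"

// Replace a provably dead invoke (nounwind, readonly, no uses) with an
// unconditional branch to its normal destination, fixing up PHI nodes in the
// now-unreachable landing pad block.
static void dropDeadReadOnlyInvoke(llvm::InvokeInst *II) {
  llvm::BranchInst::Create(II->getNormalDest(), II);
  II->getUnwindDest()->removePredecessor(II->getParent());
  II->eraseFromParent();
}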
for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB) { if (Reachable.count(BB)) continue; - + for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) if (Reachable.count(*SI)) (*SI)->removePredecessor(BB); BB->dropAllReferences(); } - + for (Function::iterator I = ++F.begin(); I != F.end();) if (!Reachable.count(I)) I = F.getBasicBlockList().erase(I); else ++I; - + return true; } @@ -209,17 +219,17 @@ static bool RemoveUnreachableBlocksFromFn(Function &F) { /// node) return blocks, merge them together to promote recursive block merging. static bool MergeEmptyReturnBlocks(Function &F) { bool Changed = false; - + BasicBlock *RetBlock = 0; - + // Scan all the blocks in the function, looking for empty return blocks. for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; ) { BasicBlock &BB = *BBI++; - + // Only look at return blocks. ReturnInst *Ret = dyn_cast<ReturnInst>(BB.getTerminator()); if (Ret == 0) continue; - + // Only look at the block if it is empty or the only other thing in it is a // single PHI node that is the operand to the return. if (Ret != &BB.front()) { @@ -241,21 +251,21 @@ static bool MergeEmptyReturnBlocks(Function &F) { RetBlock = &BB; continue; } - + // Otherwise, we found a duplicate return block. Merge the two. Changed = true; - + // Case when there is no input to the return or when the returned values // agree is trivial. Note that they can't agree if there are phis in the // blocks. if (Ret->getNumOperands() == 0 || - Ret->getOperand(0) == + Ret->getOperand(0) == cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0)) { BB.replaceAllUsesWith(RetBlock); BB.eraseFromParent(); continue; } - + // If the canonical return block has no PHI node, create one now. PHINode *RetBlockPHI = dyn_cast<PHINode>(RetBlock->begin()); if (RetBlockPHI == 0) { @@ -264,12 +274,12 @@ static bool MergeEmptyReturnBlocks(Function &F) { RetBlockPHI = PHINode::Create(Ret->getOperand(0)->getType(), std::distance(PB, PE), "merge", &RetBlock->front()); - + for (pred_iterator PI = PB; PI != PE; ++PI) RetBlockPHI->addIncoming(InVal, *PI); RetBlock->getTerminator()->setOperand(0, RetBlockPHI); } - + // Turn BB into a block that just unconditionally branches to the return // block. This handles the case when the two return blocks have a common // predecessor but that return different things. @@ -277,7 +287,7 @@ static bool MergeEmptyReturnBlocks(Function &F) { BB.getTerminator()->eraseFromParent(); BranchInst::Create(RetBlock, &BB); } - + return Changed; } @@ -288,7 +298,7 @@ static bool IterativeSimplifyCFG(Function &F, const TargetData *TD) { bool LocalChange = true; while (LocalChange) { LocalChange = false; - + // Loop over all of the basic blocks and remove them if they are unneeded... // for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) { @@ -317,7 +327,7 @@ bool CFGSimplifyPass::runOnFunction(Function &F) { // IterativeSimplifyCFG can (rarely) make some loops dead. If this happens, // RemoveUnreachableBlocksFromFn is needed to nuke them, which means we should // iterate between the two optimizations. We structure the code like this to - // avoid reruning IterativeSimplifyCFG if the second pass of + // avoid reruning IterativeSimplifyCFG if the second pass of // RemoveUnreachableBlocksFromFn doesn't do anything. 
if (!RemoveUnreachableBlocksFromFn(F)) return true; diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp index f7b6941..f110320 100644 --- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -18,20 +18,20 @@ #define DEBUG_TYPE "simplify-libcalls" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" +#include "llvm/IRBuilder.h" #include "llvm/Intrinsics.h" #include "llvm/LLVMContext.h" #include "llvm/Module.h" #include "llvm/Pass.h" -#include "llvm/Support/IRBuilder.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/StringMap.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Config/config.h" // FIXME: Shouldn't depend on host! using namespace llvm; @@ -100,7 +100,7 @@ static bool IsOnlyUsedInZeroEqualityComparison(Value *V) { } return true; } - + static bool CallHasFloatingPointArgument(const CallInst *CI) { for (CallInst::const_op_iterator it = CI->op_begin(), e = CI->op_end(); it != e; ++it) { @@ -157,14 +157,15 @@ struct StrCatOpt : public LibCallOptimization { // These optimizations require TargetData. if (!TD) return 0; - EmitStrLenMemCpy(Src, Dst, Len, B); - return Dst; + return EmitStrLenMemCpy(Src, Dst, Len, B); } - void EmitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, IRBuilder<> &B) { + Value *EmitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len, IRBuilder<> &B) { // We need to find the end of the destination string. That's where the // memory is to be moved to. We just generate a call to strlen. - Value *DstLen = EmitStrLen(Dst, B, TD); + Value *DstLen = EmitStrLen(Dst, B, TD, TLI); + if (!DstLen) + return 0; // Now that we have the destination's length, we must index into the // destination's pointer to get the actual memcpy destination (end of @@ -175,6 +176,7 @@ struct StrCatOpt : public LibCallOptimization { // concatenation for us. Make a memcpy to copy the nul byte with align = 1. B.CreateMemCpy(CpyDst, Src, ConstantInt::get(TD->getIntPtrType(*Context), Len + 1), 1); + return Dst; } }; @@ -221,8 +223,7 @@ struct StrNCatOpt : public StrCatOpt { // strncat(x, s, c) -> strcat(x, s) // s is constant so the strcat can be optimized further - EmitStrLenMemCpy(Src, Dst, SrcLen, B); - return Dst; + return EmitStrLenMemCpy(Src, Dst, SrcLen, B); } }; @@ -254,9 +255,9 @@ struct StrChrOpt : public LibCallOptimization { return EmitMemChr(SrcStr, CI->getArgOperand(1), // include nul. ConstantInt::get(TD->getIntPtrType(*Context), Len), - B, TD); + B, TD, TLI); } - + // Otherwise, the character is a constant, see if the first argument is // a string literal. If so, we can constant fold. 
StringRef Str; @@ -299,7 +300,7 @@ struct StrRChrOpt : public LibCallOptimization { if (!getConstantStringInfo(SrcStr, Str)) { // strrchr(s, 0) -> strchr(s, 0) if (TD && CharC->isZero()) - return EmitStrChr(SrcStr, '\0', B, TD); + return EmitStrChr(SrcStr, '\0', B, TD, TLI); return 0; } @@ -355,7 +356,7 @@ struct StrCmpOpt : public LibCallOptimization { return EmitMemCmp(Str1P, Str2P, ConstantInt::get(TD->getIntPtrType(*Context), - std::min(Len1, Len2)), B, TD); + std::min(Len1, Len2)), B, TD, TLI); } return 0; @@ -391,7 +392,7 @@ struct StrNCmpOpt : public LibCallOptimization { return ConstantInt::get(CI->getType(), 0); if (TD && Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1) - return EmitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, TD); + return EmitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, TD, TLI); StringRef Str1, Str2; bool HasStr1 = getConstantStringInfo(Str1P, Str1); @@ -447,11 +448,10 @@ struct StrCpyOpt : public LibCallOptimization { // We have enough information to now generate the memcpy call to do the // concatenation for us. Make a memcpy to copy the nul byte with align = 1. - if (OptChkCall) - EmitMemCpyChk(Dst, Src, - ConstantInt::get(TD->getIntPtrType(*Context), Len), - CI->getArgOperand(2), B, TD); - else + if (!OptChkCall || + !EmitMemCpyChk(Dst, Src, + ConstantInt::get(TD->getIntPtrType(*Context), Len), + CI->getArgOperand(2), B, TD, TLI)) B.CreateMemCpy(Dst, Src, ConstantInt::get(TD->getIntPtrType(*Context), Len), 1); return Dst; @@ -459,6 +459,51 @@ struct StrCpyOpt : public LibCallOptimization { }; //===---------------------------------------===// +// 'stpcpy' Optimizations + +struct StpCpyOpt: public LibCallOptimization { + bool OptChkCall; // True if it's optimizing a __stpcpy_chk libcall. + + StpCpyOpt(bool c) : OptChkCall(c) {} + + virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { + // Verify the "stpcpy" function prototype. + unsigned NumParams = OptChkCall ? 3 : 2; + FunctionType *FT = Callee->getFunctionType(); + if (FT->getNumParams() != NumParams || + FT->getReturnType() != FT->getParamType(0) || + FT->getParamType(0) != FT->getParamType(1) || + FT->getParamType(0) != B.getInt8PtrTy()) + return 0; + + // These optimizations require TargetData. + if (!TD) return 0; + + Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); + if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x) + Value *StrLen = EmitStrLen(Src, B, TD, TLI); + return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : 0; + } + + // See if we can get the length of the input string. + uint64_t Len = GetStringLength(Src); + if (Len == 0) return 0; + + Value *LenV = ConstantInt::get(TD->getIntPtrType(*Context), Len); + Value *DstEnd = B.CreateGEP(Dst, + ConstantInt::get(TD->getIntPtrType(*Context), + Len - 1)); + + // We have enough information to now generate the memcpy call to do the + // copy for us. Make a memcpy to copy the nul byte with align = 1. 
+ if (!OptChkCall || !EmitMemCpyChk(Dst, Src, LenV, CI->getArgOperand(2), B, + TD, TLI)) + B.CreateMemCpy(Dst, Src, LenV, 1); + return DstEnd; + } +}; + +//===---------------------------------------===// // 'strncpy' Optimizations struct StrNCpyOpt : public LibCallOptimization { @@ -565,7 +610,7 @@ struct StrPBrkOpt : public LibCallOptimization { // strpbrk(s, "a") -> strchr(s, 'a') if (TD && HasS2 && S2.size() == 1) - return EmitStrChr(CI->getArgOperand(0), S2[0], B, TD); + return EmitStrChr(CI->getArgOperand(0), S2[0], B, TD, TLI); return 0; } @@ -654,7 +699,7 @@ struct StrCSpnOpt : public LibCallOptimization { // strcspn(s, "") -> strlen(s) if (TD && HasS2 && S2.empty()) - return EmitStrLen(CI->getArgOperand(0), B, TD); + return EmitStrLen(CI->getArgOperand(0), B, TD, TLI); return 0; } @@ -678,9 +723,13 @@ struct StrStrOpt : public LibCallOptimization { // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0 if (TD && IsOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) { - Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, TD); + Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, TD, TLI); + if (!StrLen) + return 0; Value *StrNCmp = EmitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1), - StrLen, B, TD); + StrLen, B, TD, TLI); + if (!StrNCmp) + return 0; for (Value::use_iterator UI = CI->use_begin(), UE = CI->use_end(); UI != UE; ) { ICmpInst *Old = cast<ICmpInst>(*UI++); @@ -716,9 +765,10 @@ struct StrStrOpt : public LibCallOptimization { } // fold strstr(x, "y") -> strchr(x, 'y'). - if (HasStr2 && ToFindStr.size() == 1) - return B.CreateBitCast(EmitStrChr(CI->getArgOperand(0), - ToFindStr[0], B, TD), CI->getType()); + if (HasStr2 && ToFindStr.size() == 1) { + Value *StrChr= EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TD, TLI); + return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : 0; + } return 0; } }; @@ -1135,8 +1185,8 @@ struct PrintFOpt : public LibCallOptimization { // printf("x") -> putchar('x'), even for '%'. if (FormatStr.size() == 1) { - Value *Res = EmitPutChar(B.getInt32(FormatStr[0]), B, TD); - if (CI->use_empty()) return CI; + Value *Res = EmitPutChar(B.getInt32(FormatStr[0]), B, TD, TLI); + if (CI->use_empty() || !Res) return Res; return B.CreateIntCast(Res, CI->getType(), true); } @@ -1147,26 +1197,26 @@ struct PrintFOpt : public LibCallOptimization { // pass to be run after this pass, to merge duplicate strings. FormatStr = FormatStr.drop_back(); Value *GV = B.CreateGlobalString(FormatStr, "str"); - EmitPutS(GV, B, TD); - return CI->use_empty() ? (Value*)CI : - ConstantInt::get(CI->getType(), FormatStr.size()+1); + Value *NewCI = EmitPutS(GV, B, TD, TLI); + return (CI->use_empty() || !NewCI) ? + NewCI : + ConstantInt::get(CI->getType(), FormatStr.size()+1); } // Optimize specific format strings. 
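The StpCpyOpt added above rewrites stpcpy of a known-length source into a memcpy plus a GEP to the terminating NUL, and stpcpy(x,x) into x + strlen(x); the return value is the whole point of stpcpy, since it enables chained concatenation. A hypothetical source-level illustration, assuming POSIX stpcpy is available:

#include <string.h>

// With constant sources, both calls below can become straight memcpys, and
// the returned pointers become simple offsets from buf.
char *greet(char *buf) {
  char *p = stpcpy(buf, "hello, "); // memcpy of 8 bytes, p == buf + 7
  return stpcpy(p, "world");        // memcpy of 6 bytes, result == p + 5
}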
// printf("%c", chr) --> putchar(chr) if (FormatStr == "%c" && CI->getNumArgOperands() > 1 && CI->getArgOperand(1)->getType()->isIntegerTy()) { - Value *Res = EmitPutChar(CI->getArgOperand(1), B, TD); + Value *Res = EmitPutChar(CI->getArgOperand(1), B, TD, TLI); - if (CI->use_empty()) return CI; + if (CI->use_empty() || !Res) return Res; return B.CreateIntCast(Res, CI->getType(), true); } // printf("%s\n", str) --> puts(str) if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 && CI->getArgOperand(1)->getType()->isPointerTy()) { - EmitPutS(CI->getArgOperand(1), B, TD); - return CI; + return EmitPutS(CI->getArgOperand(1), B, TD, TLI); } return 0; } @@ -1253,7 +1303,9 @@ struct SPrintFOpt : public LibCallOptimization { // sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1) if (!CI->getArgOperand(2)->getType()->isPointerTy()) return 0; - Value *Len = EmitStrLen(CI->getArgOperand(2), B, TD); + Value *Len = EmitStrLen(CI->getArgOperand(2), B, TD, TLI); + if (!Len) + return 0; Value *IncLen = B.CreateAdd(Len, ConstantInt::get(Len->getType(), 1), "leninc"); @@ -1320,8 +1372,8 @@ struct FWriteOpt : public LibCallOptimization { // This optimisation is only valid, if the return value is unused. if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F) Value *Char = B.CreateLoad(CastToCStr(CI->getArgOperand(0), B), "char"); - EmitFPutC(Char, CI->getArgOperand(3), B, TD); - return ConstantInt::get(CI->getType(), 1); + Value *NewCI = EmitFPutC(Char, CI->getArgOperand(3), B, TD, TLI); + return NewCI ? ConstantInt::get(CI->getType(), 1) : 0; } return 0; @@ -1346,10 +1398,10 @@ struct FPutsOpt : public LibCallOptimization { // fputs(s,F) --> fwrite(s,1,strlen(s),F) uint64_t Len = GetStringLength(CI->getArgOperand(0)); if (!Len) return 0; - EmitFWrite(CI->getArgOperand(0), - ConstantInt::get(TD->getIntPtrType(*Context), Len-1), - CI->getArgOperand(1), B, TD, TLI); - return CI; // Known to have no uses (see above). + // Known to have no uses (see above). + return EmitFWrite(CI->getArgOperand(0), + ConstantInt::get(TD->getIntPtrType(*Context), Len-1), + CI->getArgOperand(1), B, TD, TLI); } }; @@ -1373,11 +1425,11 @@ struct FPrintFOpt : public LibCallOptimization { // These optimizations require TargetData. if (!TD) return 0; - EmitFWrite(CI->getArgOperand(1), - ConstantInt::get(TD->getIntPtrType(*Context), - FormatStr.size()), - CI->getArgOperand(0), B, TD, TLI); - return ConstantInt::get(CI->getType(), FormatStr.size()); + Value *NewCI = EmitFWrite(CI->getArgOperand(1), + ConstantInt::get(TD->getIntPtrType(*Context), + FormatStr.size()), + CI->getArgOperand(0), B, TD, TLI); + return NewCI ? ConstantInt::get(CI->getType(), FormatStr.size()) : 0; } // The remaining optimizations require the format string to be "%s" or "%c" @@ -1390,16 +1442,16 @@ struct FPrintFOpt : public LibCallOptimization { if (FormatStr[1] == 'c') { // fprintf(F, "%c", chr) --> fputc(chr, F) if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0; - EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TD); - return ConstantInt::get(CI->getType(), 1); + Value *NewCI = EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, + TD, TLI); + return NewCI ? 
ConstantInt::get(CI->getType(), 1) : 0; } if (FormatStr[1] == 's') { // fprintf(F, "%s", str) --> fputs(str, F) if (!CI->getArgOperand(2)->getType()->isPointerTy() || !CI->use_empty()) return 0; - EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TD, TLI); - return CI; + return EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TD, TLI); } return 0; } @@ -1450,8 +1502,8 @@ struct PutsOpt : public LibCallOptimization { if (Str.empty() && CI->use_empty()) { // puts("") -> putchar('\n') - Value *Res = EmitPutChar(B.getInt32('\n'), B, TD); - if (CI->use_empty()) return CI; + Value *Res = EmitPutChar(B.getInt32('\n'), B, TD, TLI); + if (CI->use_empty() || !Res) return Res; return B.CreateIntCast(Res, CI->getType(), true); } @@ -1470,12 +1522,15 @@ namespace { /// class SimplifyLibCalls : public FunctionPass { TargetLibraryInfo *TLI; - + StringMap<LibCallOptimization*> Optimizations; // String and Memory LibCall Optimizations StrCatOpt StrCat; StrNCatOpt StrNCat; StrChrOpt StrChr; StrRChrOpt StrRChr; - StrCmpOpt StrCmp; StrNCmpOpt StrNCmp; StrCpyOpt StrCpy; StrCpyOpt StrCpyChk; - StrNCpyOpt StrNCpy; StrLenOpt StrLen; StrPBrkOpt StrPBrk; + StrCmpOpt StrCmp; StrNCmpOpt StrNCmp; + StrCpyOpt StrCpy; StrCpyOpt StrCpyChk; + StpCpyOpt StpCpy; StpCpyOpt StpCpyChk; + StrNCpyOpt StrNCpy; + StrLenOpt StrLen; StrPBrkOpt StrPBrk; StrToOpt StrTo; StrSpnOpt StrSpn; StrCSpnOpt StrCSpn; StrStrOpt StrStr; MemCmpOpt MemCmp; MemCpyOpt MemCpy; MemMoveOpt MemMove; MemSetOpt MemSet; // Math Library Optimizations @@ -1487,11 +1542,12 @@ namespace { SPrintFOpt SPrintF; PrintFOpt PrintF; FWriteOpt FWrite; FPutsOpt FPuts; FPrintFOpt FPrintF; PutsOpt Puts; - + bool Modified; // This is only used by doInitialization. public: static char ID; // Pass identification - SimplifyLibCalls() : FunctionPass(ID), StrCpy(false), StrCpyChk(true) { + SimplifyLibCalls() : FunctionPass(ID), StrCpy(false), StrCpyChk(true), + StpCpy(false), StpCpyChk(true) { initializeSimplifyLibCallsPass(*PassRegistry::getPassRegistry()); } void AddOpt(LibFunc::Func F, LibCallOptimization* Opt); @@ -1542,6 +1598,7 @@ void SimplifyLibCalls::InitOptimizations() { Optimizations["strncmp"] = &StrNCmp; Optimizations["strcpy"] = &StrCpy; Optimizations["strncpy"] = &StrNCpy; + Optimizations["stpcpy"] = &StpCpy; Optimizations["strlen"] = &StrLen; Optimizations["strpbrk"] = &StrPBrk; Optimizations["strtol"] = &StrTo; @@ -1561,6 +1618,7 @@ void SimplifyLibCalls::InitOptimizations() { // _chk variants of String and Memory LibCall Optimizations. 
Optimizations["__strcpy_chk"] = &StrCpyChk; + Optimizations["__stpcpy_chk"] = &StpCpyChk; // Math Library Optimizations Optimizations["cosf"] = &Cos; @@ -1717,7 +1775,7 @@ void SimplifyLibCalls::setDoesNotAlias(Function &F, unsigned n) { void SimplifyLibCalls::inferPrototypeAttributes(Function &F) { FunctionType *FTy = F.getFunctionType(); - + StringRef Name = F.getName(); switch (Name[0]) { case 's': @@ -1746,6 +1804,7 @@ void SimplifyLibCalls::inferPrototypeAttributes(Function &F) { Name == "strtold" || Name == "strncat" || Name == "strncpy" || + Name == "stpncpy" || Name == "strtoull") { if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) @@ -2406,10 +2465,6 @@ bool SimplifyLibCalls::doInitialization(Module &M) { // * sqrt(Nroot(x)) -> pow(x,1/(2*N)) // * sqrt(pow(x,y)) -> pow(|x|,y*0.5) // -// stpcpy: -// * stpcpy(str, "literal") -> -// llvm.memcpy(str,"literal",strlen("literal")+1,1) -// // strchr: // * strchr(p, 0) -> strlen(p) // tan, tanf, tanl: diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp index ef65c0a..34f1d6c 100644 --- a/lib/Transforms/Scalar/Sink.cpp +++ b/lib/Transforms/Scalar/Sink.cpp @@ -27,6 +27,7 @@ using namespace llvm; STATISTIC(NumSunk, "Number of instructions sunk"); +STATISTIC(NumSinkIter, "Number of sinking iterations"); namespace { class Sinking : public FunctionPass { @@ -39,9 +40,9 @@ namespace { Sinking() : FunctionPass(ID) { initializeSinkingPass(*PassRegistry::getPassRegistry()); } - + virtual bool runOnFunction(Function &F); - + virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); FunctionPass::getAnalysisUsage(AU); @@ -55,9 +56,10 @@ namespace { bool ProcessBlock(BasicBlock &BB); bool SinkInstruction(Instruction *I, SmallPtrSet<Instruction *, 8> &Stores); bool AllUsesDominatedByBlock(Instruction *Inst, BasicBlock *BB) const; + bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo) const; }; } // end anonymous namespace - + char Sinking::ID = 0; INITIALIZE_PASS_BEGIN(Sinking, "sink", "Code sinking", false, false) INITIALIZE_PASS_DEPENDENCY(LoopInfo) @@ -69,7 +71,7 @@ FunctionPass *llvm::createSinkingPass() { return new Sinking(); } /// AllUsesDominatedByBlock - Return true if all uses of the specified value /// occur in blocks dominated by the specified block. -bool Sinking::AllUsesDominatedByBlock(Instruction *Inst, +bool Sinking::AllUsesDominatedByBlock(Instruction *Inst, BasicBlock *BB) const { // Ignoring debug uses is necessary so debug info doesn't affect the code. // This may leave a referencing dbg_value in the original block, before @@ -98,20 +100,19 @@ bool Sinking::runOnFunction(Function &F) { LI = &getAnalysis<LoopInfo>(); AA = &getAnalysis<AliasAnalysis>(); - bool EverMadeChange = false; - - while (1) { - bool MadeChange = false; + bool MadeChange, EverMadeChange = false; + do { + MadeChange = false; + DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n"); // Process all basic blocks. - for (Function::iterator I = F.begin(), E = F.end(); + for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) MadeChange |= ProcessBlock(*I); - - // If this iteration over the code changed anything, keep iterating. - if (!MadeChange) break; - EverMadeChange = true; - } + EverMadeChange |= MadeChange; + NumSinkIter++; + } while (MadeChange); + return EverMadeChange; } @@ -120,8 +121,8 @@ bool Sinking::ProcessBlock(BasicBlock &BB) { if (BB.getTerminator()->getNumSuccessors() <= 1 || BB.empty()) return false; // Don't bother sinking code out of unreachable blocks. 
In addition to being - // unprofitable, it can also lead to infinite looping, because in an unreachable - // loop there may be nowhere to stop. + // unprofitable, it can also lead to infinite looping, because in an + // unreachable loop there may be nowhere to stop. if (!DT->isReachableFromEntry(&BB)) return false; bool MadeChange = false; @@ -133,7 +134,7 @@ bool Sinking::ProcessBlock(BasicBlock &BB) { SmallPtrSet<Instruction *, 8> Stores; do { Instruction *Inst = I; // The instruction to sink. - + // Predecrement I (if it's not begin) so that it isn't invalidated by // sinking. ProcessedBegin = I == BB.begin(); @@ -145,10 +146,10 @@ bool Sinking::ProcessBlock(BasicBlock &BB) { if (SinkInstruction(Inst, Stores)) ++NumSunk, MadeChange = true; - + // If we just processed the first instruction in the block, we're done. } while (!ProcessedBegin); - + return MadeChange; } @@ -174,6 +175,45 @@ static bool isSafeToMove(Instruction *Inst, AliasAnalysis *AA, return true; } +/// IsAcceptableTarget - Return true if it is possible to sink the instruction +/// in the specified basic block. +bool Sinking::IsAcceptableTarget(Instruction *Inst, + BasicBlock *SuccToSinkTo) const { + assert(Inst && "Instruction to be sunk is null"); + assert(SuccToSinkTo && "Candidate sink target is null"); + + // It is not possible to sink an instruction into its own block. This can + // happen with loops. + if (Inst->getParent() == SuccToSinkTo) + return false; + + // If the block has multiple predecessors, this would introduce computation + // on different code paths. We could split the critical edge, but for now we + // just punt. + // FIXME: Split critical edges if not backedges. + if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) { + // We cannot sink a load across a critical edge - there may be stores in + // other code paths. + if (!isSafeToSpeculativelyExecute(Inst)) + return false; + + // We don't want to sink across a critical edge if we don't dominate the + // successor. We could be introducing calculations to new code paths. + if (!DT->dominates(Inst->getParent(), SuccToSinkTo)) + return false; + + // Don't sink instructions into a loop. + Loop *succ = LI->getLoopFor(SuccToSinkTo); + Loop *cur = LI->getLoopFor(Inst->getParent()); + if (succ != 0 && succ != cur) + return false; + } + + // Finally, check that all the uses of the instruction are actually + // dominated by the candidate + return AllUsesDominatedByBlock(Inst, SuccToSinkTo); +} + /// SinkInstruction - Determine whether it is safe to sink the specified machine /// instruction out of its current block into a successor. bool Sinking::SinkInstruction(Instruction *Inst, @@ -181,7 +221,7 @@ bool Sinking::SinkInstruction(Instruction *Inst, // Check if it's safe to move the instruction. if (!isSafeToMove(Inst, AA, Stores)) return false; - + // FIXME: This should include support for sinking instructions within the // block they are currently in to shorten the live ranges. We often get // instructions sunk into the top of a large block, but it would be better to @@ -189,86 +229,42 @@ bool Sinking::SinkInstruction(Instruction *Inst, // be careful not to *increase* register pressure though, e.g. sinking // "x = y + z" down if it kills y and z would increase the live ranges of y // and z and only shrink the live range of x. - - // Loop over all the operands of the specified instruction. If there is - // anything we can't handle, bail out. 
- BasicBlock *ParentBlock = Inst->getParent(); - + // SuccToSinkTo - This is the successor to sink this instruction to, once we // decide. BasicBlock *SuccToSinkTo = 0; - - // FIXME: This picks a successor to sink into based on having one - // successor that dominates all the uses. However, there are cases where - // sinking can happen but where the sink point isn't a successor. For - // example: - // x = computation - // if () {} else {} - // use x - // the instruction could be sunk over the whole diamond for the - // if/then/else (or loop, etc), allowing it to be sunk into other blocks - // after that. - + // Instructions can only be sunk if all their uses are in blocks // dominated by one of the successors. - // Look at all the successors and decide which one - // we should sink to. - for (succ_iterator SI = succ_begin(ParentBlock), - E = succ_end(ParentBlock); SI != E; ++SI) { - if (AllUsesDominatedByBlock(Inst, *SI)) { - SuccToSinkTo = *SI; - break; - } + // Look at all the postdominators and see if we can sink it in one. + DomTreeNode *DTN = DT->getNode(Inst->getParent()); + for (DomTreeNode::iterator I = DTN->begin(), E = DTN->end(); + I != E && SuccToSinkTo == 0; ++I) { + BasicBlock *Candidate = (*I)->getBlock(); + if ((*I)->getIDom()->getBlock() == Inst->getParent() && + IsAcceptableTarget(Inst, Candidate)) + SuccToSinkTo = Candidate; + } + + // If no suitable postdominator was found, look at all the successors and + // decide which one we should sink to, if any. + for (succ_iterator I = succ_begin(Inst->getParent()), + E = succ_end(Inst->getParent()); I != E && SuccToSinkTo == 0; ++I) { + if (IsAcceptableTarget(Inst, *I)) + SuccToSinkTo = *I; } - + // If we couldn't find a block to sink to, ignore this instruction. if (SuccToSinkTo == 0) return false; - - // It is not possible to sink an instruction into its own block. This can - // happen with loops. - if (Inst->getParent() == SuccToSinkTo) - return false; - - DEBUG(dbgs() << "Sink instr " << *Inst); - DEBUG(dbgs() << "to block "; - WriteAsOperand(dbgs(), SuccToSinkTo, false)); - - // If the block has multiple predecessors, this would introduce computation on - // a path that it doesn't already exist. We could split the critical edge, - // but for now we just punt. - // FIXME: Split critical edges if not backedges. - if (SuccToSinkTo->getUniquePredecessor() != ParentBlock) { - // We cannot sink a load across a critical edge - there may be stores in - // other code paths. - if (!isSafeToSpeculativelyExecute(Inst)) { - DEBUG(dbgs() << " *** PUNTING: Wont sink load along critical edge.\n"); - return false; - } - // We don't want to sink across a critical edge if we don't dominate the - // successor. We could be introducing calculations to new code paths. - if (!DT->dominates(ParentBlock, SuccToSinkTo)) { - DEBUG(dbgs() << " *** PUNTING: Critical edge found\n"); - return false; - } - - // Don't sink instructions into a loop. - if (LI->isLoopHeader(SuccToSinkTo)) { - DEBUG(dbgs() << " *** PUNTING: Loop header found\n"); - return false; - } + DEBUG(dbgs() << "Sink" << *Inst << " ("; + WriteAsOperand(dbgs(), Inst->getParent(), false); + dbgs() << " -> "; + WriteAsOperand(dbgs(), SuccToSinkTo, false); + dbgs() << ")\n"); - // Otherwise we are OK with sinking along a critical edge. - DEBUG(dbgs() << "Sinking along critical edge.\n"); - } - - // Determine where to insert into. Skip phi nodes. 
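
Given the createSinkingPass() factory above, the rewritten pass can be driven with the legacy FunctionPassManager of this era. A hedged sketch, assuming M is a loaded Module and the host tool has done the usual pass-registry initialization (the helper name is illustrative):

    #include "llvm/Module.h"
    #include "llvm/PassManager.h"
    #include "llvm/Transforms/Scalar.h"

    static bool runSinking(llvm::Module &M) {
      llvm::FunctionPassManager FPM(&M);
      FPM.add(llvm::createSinkingPass());  // DominatorTree/LoopInfo/AA are
                                           // scheduled as required analyses
      bool Changed = FPM.doInitialization();
      for (llvm::Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
        if (!F->isDeclaration())
          Changed |= FPM.run(*F);
      Changed |= FPM.doFinalization();
      return Changed;
    }
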
- BasicBlock::iterator InsertPos = SuccToSinkTo->begin(); - while (InsertPos != SuccToSinkTo->end() && isa<PHINode>(InsertPos)) - ++InsertPos; - // Move the instruction. - Inst->moveBefore(InsertPos); + Inst->moveBefore(SuccToSinkTo->getFirstInsertionPt()); return true; } diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp index e21eb9d..6557d63 100644 --- a/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -172,7 +172,7 @@ bool TailCallElim::runOnFunction(Function &F) { FunctionContainsEscapingAllocas |= CheckForEscapingAllocas(BB, CannotTCETailMarkedCall); } - + /// FIXME: The code generator produces really bad code when an 'escaping /// alloca' is changed from being a static alloca to being a dynamic alloca. /// Until this is resolved, disable this transformation if that would ever @@ -234,7 +234,7 @@ bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) { // call does not mod/ref the memory location being processed. if (I->mayHaveSideEffects()) // This also handles volatile loads. return false; - + if (LoadInst *L = dyn_cast<LoadInst>(I)) { // Loads may always be moved above calls without side effects. if (CI->mayHaveSideEffects()) { @@ -364,7 +364,7 @@ TailCallElim::FindTRECandidate(Instruction *TI, if (&BB->front() == TI) // Make sure there is something before the terminator. return 0; - + // Scan backwards from the return, checking to see if there is a tail call in // this block. If so, set CI to it. CallInst *CI = 0; @@ -388,10 +388,10 @@ TailCallElim::FindTRECandidate(Instruction *TI, // double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call // and disable this xform in this case, because the code generator will // lower the call to fabs into inline code. - if (BB == &F->getEntryBlock() && + if (BB == &F->getEntryBlock() && FirstNonDbg(BB->front()) == CI && FirstNonDbg(llvm::next(BB->begin())) == TI && - callIsSmall(F)) { + callIsSmall(CI)) { // A single-block function with just a call and a return. Check that // the arguments match. CallSite::arg_iterator I = CallSite(CI).arg_begin(), @@ -432,7 +432,7 @@ bool TailCallElim::EliminateRecursiveTailCall(CallInst *CI, ReturnInst *Ret, BasicBlock::iterator BBI = CI; for (++BBI; &*BBI != Ret; ++BBI) { if (CanMoveAboveCall(BBI, CI)) continue; - + // If we can't move the instruction above the call, it might be because it // is an associative and commutative operation that could be transformed // using accumulator recursion elimination. Check to see if this is the diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp index 3859a1a..2679b93 100644 --- a/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -659,10 +659,26 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB, // If the return instruction returns a value, and if the value was a // PHI node in "BB", propagate the right value into the return. for (User::op_iterator i = NewRet->op_begin(), e = NewRet->op_end(); - i != e; ++i) - if (PHINode *PN = dyn_cast<PHINode>(*i)) - if (PN->getParent() == BB) - *i = PN->getIncomingValueForBlock(Pred); + i != e; ++i) { + Value *V = *i; + Instruction *NewBC = 0; + if (BitCastInst *BCI = dyn_cast<BitCastInst>(V)) { + // Return value might be bitcasted. Clone and insert it before the + // return instruction. 
+ V = BCI->getOperand(0); + NewBC = BCI->clone(); + Pred->getInstList().insert(NewRet, NewBC); + *i = NewBC; + } + if (PHINode *PN = dyn_cast<PHINode>(V)) { + if (PN->getParent() == BB) { + if (NewBC) + NewBC->setOperand(0, PN->getIncomingValueForBlock(Pred)); + else + *i = PN->getIncomingValueForBlock(Pred); + } + } + } // Update any PHI nodes in the returning block to realize that we no // longer branch to them. @@ -671,12 +687,3 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB, return cast<ReturnInst>(NewRet); } -/// GetFirstDebugLocInBasicBlock - Return first valid DebugLoc entry in a -/// given basic block. -DebugLoc llvm::GetFirstDebugLocInBasicBlock(const BasicBlock *BB) { - if (const Instruction *I = BB->getFirstNonPHI()) - return I->getDebugLoc(); - // Scanning entire block may be too expensive, if the first instruction - // does not have valid location info. - return DebugLoc(); -} diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp index 2a8e9b8..6b04e3d 100644 --- a/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -122,7 +122,7 @@ bool llvm::isCriticalEdge(const TerminatorInst *TI, unsigned SuccNum, /// new PHIs, as needed. Preds is a list of preds inside the loop, SplitBB /// is the new loop exit block, and DestBB is the old loop exit, now the /// successor of SplitBB. -static void createPHIsForSplitLoopExit(SmallVectorImpl<BasicBlock *> &Preds, +static void createPHIsForSplitLoopExit(ArrayRef<BasicBlock *> Preds, BasicBlock *SplitBB, BasicBlock *DestBB) { // SplitBB shouldn't have anything non-trivial in it yet. @@ -341,11 +341,8 @@ BasicBlock *llvm::SplitCriticalEdge(TerminatorInst *TI, unsigned SuccNum, "Split point for loop exit is contained in loop!"); // Update LCSSA form in the newly created exit block. - if (P->mustPreserveAnalysisID(LCSSAID)) { - SmallVector<BasicBlock *, 1> OrigPred; - OrigPred.push_back(TIBB); - createPHIsForSplitLoopExit(OrigPred, NewBB, DestBB); - } + if (P->mustPreserveAnalysisID(LCSSAID)) + createPHIsForSplitLoopExit(TIBB, NewBB, DestBB); // For each unique exit block... // FIXME: This code is functionally equivalent to the corresponding diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp index a808303..e13fd71 100644 --- a/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/lib/Transforms/Utils/BuildLibCalls.cpp @@ -12,18 +12,18 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/BuildLibCalls.h" -#include "llvm/Type.h" #include "llvm/Constants.h" #include "llvm/Function.h" +#include "llvm/IRBuilder.h" +#include "llvm/Intrinsics.h" #include "llvm/Intrinsics.h" #include "llvm/LLVMContext.h" +#include "llvm/LLVMContext.h" #include "llvm/Module.h" -#include "llvm/Support/IRBuilder.h" +#include "llvm/Type.h" +#include "llvm/ADT/SmallString.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLibraryInfo.h" -#include "llvm/LLVMContext.h" -#include "llvm/Intrinsics.h" -#include "llvm/ADT/SmallString.h" using namespace llvm; @@ -34,7 +34,11 @@ Value *llvm::CastToCStr(Value *V, IRBuilder<> &B) { /// EmitStrLen - Emit a call to the strlen function to the builder, for the /// specified pointer. This always returns an integer value of size intptr_t. 
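
A recurring change in this patch is switching helpers such as createPHIsForSplitLoopExit and AttrListPtr::get from begin/end or pointer/length pairs to ArrayRef, which converts implicitly from a single element, a C array, or a SmallVector. A small sketch of the new call-site idiom, mirroring the strlen attributes used above:

    #include "llvm/Attributes.h"
    #include "llvm/ADT/ArrayRef.h"

    static llvm::AttrListPtr makeStrlenAttrs() {
      llvm::AttributeWithIndex AWI[2];
      AWI[0] = llvm::AttributeWithIndex::get(1, llvm::Attribute::NoCapture);
      AWI[1] = llvm::AttributeWithIndex::get(~0u, llvm::Attribute::ReadOnly |
                                                  llvm::Attribute::NoUnwind);
      // The C array converts implicitly to ArrayRef<AttributeWithIndex>, so
      // no explicit element count is passed any more.
      return llvm::AttrListPtr::get(AWI);
    }
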
-Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const TargetData *TD) { +Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const TargetData *TD, + const TargetLibraryInfo *TLI) { + if (!TLI->has(LibFunc::strlen)) + return 0; + Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI[2]; AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture); @@ -42,7 +46,7 @@ Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const TargetData *TD) { Attribute::NoUnwind); LLVMContext &Context = B.GetInsertBlock()->getContext(); - Constant *StrLen = M->getOrInsertFunction("strlen", AttrListPtr::get(AWI, 2), + Constant *StrLen = M->getOrInsertFunction("strlen", AttrListPtr::get(AWI), TD->getIntPtrType(Context), B.getInt8PtrTy(), NULL); @@ -53,18 +57,48 @@ Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const TargetData *TD) { return CI; } +/// EmitStrNLen - Emit a call to the strnlen function to the builder, for the +/// specified pointer. Ptr is required to be some pointer type, MaxLen must +/// be of size_t type, and the return value has 'intptr_t' type. +Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B, + const TargetData *TD, const TargetLibraryInfo *TLI) { + if (!TLI->has(LibFunc::strnlen)) + return 0; + + Module *M = B.GetInsertBlock()->getParent()->getParent(); + AttributeWithIndex AWI[2]; + AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture); + AWI[1] = AttributeWithIndex::get(~0u, Attribute::ReadOnly | + Attribute::NoUnwind); + + LLVMContext &Context = B.GetInsertBlock()->getContext(); + Constant *StrNLen = M->getOrInsertFunction("strnlen", AttrListPtr::get(AWI), + TD->getIntPtrType(Context), + B.getInt8PtrTy(), + TD->getIntPtrType(Context), + NULL); + CallInst *CI = B.CreateCall2(StrNLen, CastToCStr(Ptr, B), MaxLen, "strnlen"); + if (const Function *F = dyn_cast<Function>(StrNLen->stripPointerCasts())) + CI->setCallingConv(F->getCallingConv()); + + return CI; +} + /// EmitStrChr - Emit a call to the strchr function to the builder, for the /// specified pointer and character. Ptr is required to be some pointer type, /// and the return value has 'i8*' type. Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B, - const TargetData *TD) { + const TargetData *TD, const TargetLibraryInfo *TLI) { + if (!TLI->has(LibFunc::strchr)) + return 0; + Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI = AttributeWithIndex::get(~0u, Attribute::ReadOnly | Attribute::NoUnwind); Type *I8Ptr = B.getInt8PtrTy(); Type *I32Ty = B.getInt32Ty(); - Constant *StrChr = M->getOrInsertFunction("strchr", AttrListPtr::get(&AWI, 1), + Constant *StrChr = M->getOrInsertFunction("strchr", AttrListPtr::get(AWI), I8Ptr, I8Ptr, I32Ty, NULL); CallInst *CI = B.CreateCall2(StrChr, CastToCStr(Ptr, B), ConstantInt::get(I32Ty, C), "strchr"); @@ -75,7 +109,11 @@ Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B, /// EmitStrNCmp - Emit a call to the strncmp function to the builder. 
Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, - IRBuilder<> &B, const TargetData *TD) { + IRBuilder<> &B, const TargetData *TD, + const TargetLibraryInfo *TLI) { + if (!TLI->has(LibFunc::strncmp)) + return 0; + Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI[3]; AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture); @@ -84,7 +122,7 @@ Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, Attribute::NoUnwind); LLVMContext &Context = B.GetInsertBlock()->getContext(); - Value *StrNCmp = M->getOrInsertFunction("strncmp", AttrListPtr::get(AWI, 3), + Value *StrNCmp = M->getOrInsertFunction("strncmp", AttrListPtr::get(AWI), B.getInt32Ty(), B.getInt8PtrTy(), B.getInt8PtrTy(), @@ -101,13 +139,17 @@ Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, /// EmitStrCpy - Emit a call to the strcpy function to the builder, for the /// specified pointer arguments. Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B, - const TargetData *TD, StringRef Name) { + const TargetData *TD, const TargetLibraryInfo *TLI, + StringRef Name) { + if (!TLI->has(LibFunc::strcpy)) + return 0; + Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI[2]; AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture); AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); Type *I8Ptr = B.getInt8PtrTy(); - Value *StrCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI, 2), + Value *StrCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI), I8Ptr, I8Ptr, I8Ptr, NULL); CallInst *CI = B.CreateCall2(StrCpy, CastToCStr(Dst, B), CastToCStr(Src, B), Name); @@ -119,13 +161,17 @@ Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B, /// EmitStrNCpy - Emit a call to the strncpy function to the builder, for the /// specified pointer arguments. Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len, - IRBuilder<> &B, const TargetData *TD, StringRef Name) { + IRBuilder<> &B, const TargetData *TD, + const TargetLibraryInfo *TLI, StringRef Name) { + if (!TLI->has(LibFunc::strncpy)) + return 0; + Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI[2]; AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture); AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); Type *I8Ptr = B.getInt8PtrTy(); - Value *StrNCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI, 2), + Value *StrNCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI), I8Ptr, I8Ptr, I8Ptr, Len->getType(), NULL); CallInst *CI = B.CreateCall3(StrNCpy, CastToCStr(Dst, B), CastToCStr(Src, B), @@ -139,13 +185,17 @@ Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len, /// This expects that the Len and ObjSize have type 'intptr_t' and Dst/Src /// are pointers. Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize, - IRBuilder<> &B, const TargetData *TD) { + IRBuilder<> &B, const TargetData *TD, + const TargetLibraryInfo *TLI) { + if (!TLI->has(LibFunc::memcpy_chk)) + return 0; + Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI; AWI = AttributeWithIndex::get(~0u, Attribute::NoUnwind); LLVMContext &Context = B.GetInsertBlock()->getContext(); Value *MemCpy = M->getOrInsertFunction("__memcpy_chk", - AttrListPtr::get(&AWI, 1), + AttrListPtr::get(AWI), B.getInt8PtrTy(), B.getInt8PtrTy(), B.getInt8PtrTy(), @@ -162,12 +212,16 @@ Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize, /// EmitMemChr - Emit a call to the memchr function. 
This assumes that Ptr is /// a pointer, Val is an i32 value, and Len is an 'intptr_t' value. Value *llvm::EmitMemChr(Value *Ptr, Value *Val, - Value *Len, IRBuilder<> &B, const TargetData *TD) { + Value *Len, IRBuilder<> &B, const TargetData *TD, + const TargetLibraryInfo *TLI) { + if (!TLI->has(LibFunc::memchr)) + return 0; + Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI; AWI = AttributeWithIndex::get(~0u, Attribute::ReadOnly | Attribute::NoUnwind); LLVMContext &Context = B.GetInsertBlock()->getContext(); - Value *MemChr = M->getOrInsertFunction("memchr", AttrListPtr::get(&AWI, 1), + Value *MemChr = M->getOrInsertFunction("memchr", AttrListPtr::get(AWI), B.getInt8PtrTy(), B.getInt8PtrTy(), B.getInt32Ty(), @@ -183,7 +237,11 @@ Value *llvm::EmitMemChr(Value *Ptr, Value *Val, /// EmitMemCmp - Emit a call to the memcmp function. Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2, - Value *Len, IRBuilder<> &B, const TargetData *TD) { + Value *Len, IRBuilder<> &B, const TargetData *TD, + const TargetLibraryInfo *TLI) { + if (!TLI->has(LibFunc::memcmp)) + return 0; + Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI[3]; AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture); @@ -192,7 +250,7 @@ Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2, Attribute::NoUnwind); LLVMContext &Context = B.GetInsertBlock()->getContext(); - Value *MemCmp = M->getOrInsertFunction("memcmp", AttrListPtr::get(AWI, 3), + Value *MemCmp = M->getOrInsertFunction("memcmp", AttrListPtr::get(AWI), B.getInt32Ty(), B.getInt8PtrTy(), B.getInt8PtrTy(), @@ -236,7 +294,11 @@ Value *llvm::EmitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B, /// EmitPutChar - Emit a call to the putchar function. This assumes that Char /// is an integer. -Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B, const TargetData *TD) { +Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B, const TargetData *TD, + const TargetLibraryInfo *TLI) { + if (!TLI->has(LibFunc::putchar)) + return 0; + Module *M = B.GetInsertBlock()->getParent()->getParent(); Value *PutChar = M->getOrInsertFunction("putchar", B.getInt32Ty(), B.getInt32Ty(), NULL); @@ -254,33 +316,40 @@ Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B, const TargetData *TD) { /// EmitPutS - Emit a call to the puts function. This assumes that Str is /// some pointer. -void llvm::EmitPutS(Value *Str, IRBuilder<> &B, const TargetData *TD) { +Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B, const TargetData *TD, + const TargetLibraryInfo *TLI) { + if (!TLI->has(LibFunc::puts)) + return 0; + Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI[2]; AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture); AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); - Value *PutS = M->getOrInsertFunction("puts", AttrListPtr::get(AWI, 2), + Value *PutS = M->getOrInsertFunction("puts", AttrListPtr::get(AWI), B.getInt32Ty(), B.getInt8PtrTy(), NULL); CallInst *CI = B.CreateCall(PutS, CastToCStr(Str, B), "puts"); if (const Function *F = dyn_cast<Function>(PutS->stripPointerCasts())) CI->setCallingConv(F->getCallingConv()); - + return CI; } /// EmitFPutC - Emit a call to the fputc function. This assumes that Char is /// an integer and File is a pointer to FILE. 
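
The emitters above now consult TargetLibraryInfo and return 0 when the corresponding C library function is unavailable (and the fputc/fputs/fwrite helpers return the call instead of void), so callers must be prepared to bail out. A hedged sketch of the calling convention; the wrapper name is illustrative:

    #include "llvm/IRBuilder.h"
    #include "llvm/Target/TargetData.h"
    #include "llvm/Target/TargetLibraryInfo.h"
    #include "llvm/Transforms/Utils/BuildLibCalls.h"

    // Emit strlen(SrcStr) if the target's runtime provides strlen().
    static llvm::Value *tryEmitStrLen(llvm::Value *SrcStr, llvm::IRBuilder<> &B,
                                      const llvm::TargetData *TD,
                                      const llvm::TargetLibraryInfo *TLI) {
      llvm::Value *Len = llvm::EmitStrLen(SrcStr, B, TD, TLI);
      if (!Len)
        return 0;        // strlen() unavailable; leave the IR untouched
      return Len;        // an intptr_t-sized length value
    }
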
-void llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B, - const TargetData *TD) { +Value *llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B, + const TargetData *TD, const TargetLibraryInfo *TLI) { + if (!TLI->has(LibFunc::fputc)) + return 0; + Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI[2]; AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture); AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); Constant *F; if (File->getType()->isPointerTy()) - F = M->getOrInsertFunction("fputc", AttrListPtr::get(AWI, 2), + F = M->getOrInsertFunction("fputc", AttrListPtr::get(AWI), B.getInt32Ty(), B.getInt32Ty(), File->getType(), NULL); @@ -295,12 +364,16 @@ void llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B, if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts())) CI->setCallingConv(Fn->getCallingConv()); + return CI; } /// EmitFPutS - Emit a call to the puts function. Str is required to be a /// pointer and File is a pointer to FILE. -void llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B, - const TargetData *TD, const TargetLibraryInfo *TLI) { +Value *llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B, + const TargetData *TD, const TargetLibraryInfo *TLI) { + if (!TLI->has(LibFunc::fputs)) + return 0; + Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI[3]; AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture); @@ -309,7 +382,7 @@ void llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B, StringRef FPutsName = TLI->getName(LibFunc::fputs); Constant *F; if (File->getType()->isPointerTy()) - F = M->getOrInsertFunction(FPutsName, AttrListPtr::get(AWI, 3), + F = M->getOrInsertFunction(FPutsName, AttrListPtr::get(AWI), B.getInt32Ty(), B.getInt8PtrTy(), File->getType(), NULL); @@ -321,13 +394,17 @@ void llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B, if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts())) CI->setCallingConv(Fn->getCallingConv()); + return CI; } /// EmitFWrite - Emit a call to the fwrite function. This assumes that Ptr is /// a pointer, Size is an 'intptr_t', and File is a pointer to FILE. -void llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File, - IRBuilder<> &B, const TargetData *TD, - const TargetLibraryInfo *TLI) { +Value *llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File, + IRBuilder<> &B, const TargetData *TD, + const TargetLibraryInfo *TLI) { + if (!TLI->has(LibFunc::fwrite)) + return 0; + Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI[3]; AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture); @@ -337,7 +414,7 @@ void llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File, StringRef FWriteName = TLI->getName(LibFunc::fwrite); Constant *F; if (File->getType()->isPointerTy()) - F = M->getOrInsertFunction(FWriteName, AttrListPtr::get(AWI, 3), + F = M->getOrInsertFunction(FWriteName, AttrListPtr::get(AWI), TD->getIntPtrType(Context), B.getInt8PtrTy(), TD->getIntPtrType(Context), @@ -354,11 +431,13 @@ void llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File, if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts())) CI->setCallingConv(Fn->getCallingConv()); + return CI; } SimplifyFortifiedLibCalls::~SimplifyFortifiedLibCalls() { } -bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD) { +bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD, + const TargetLibraryInfo *TLI) { // We really need TargetData for later. 
if (!TD) return false; @@ -446,7 +525,9 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD) { // string lengths for varying. if (isFoldable(2, 1, true)) { Value *Ret = EmitStrCpy(CI->getArgOperand(0), CI->getArgOperand(1), B, TD, - Name.substr(2, 6)); + TLI, Name.substr(2, 6)); + if (!Ret) + return false; replaceCall(Ret); return true; } @@ -464,7 +545,10 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD) { if (isFoldable(3, 2, false)) { Value *Ret = EmitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1), - CI->getArgOperand(2), B, TD, Name.substr(2, 7)); + CI->getArgOperand(2), B, TD, TLI, + Name.substr(2, 7)); + if (!Ret) + return false; replaceCall(Ret); return true; } diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt index 7f5cb5e..4ff31ca 100644 --- a/lib/Transforms/Utils/CMakeLists.txt +++ b/lib/Transforms/Utils/CMakeLists.txt @@ -29,3 +29,5 @@ add_llvm_library(LLVMTransformUtils Utils.cpp ValueMapper.cpp ) + +add_dependencies(LLVMTransformUtils intrinsics_gen) diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index 20052a4..99237b8 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -15,6 +15,7 @@ #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/DerivedTypes.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" @@ -28,7 +29,6 @@ #include "llvm/Transforms/Utils/ValueMapper.h" #include "llvm/Analysis/ConstantFolding.h" #include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/ADT/SmallVector.h" #include <map> using namespace llvm; diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp index a0e027b..1dac6b5 100644 --- a/lib/Transforms/Utils/CloneModule.cpp +++ b/lib/Transforms/Utils/CloneModule.cpp @@ -53,7 +53,7 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) { I->isConstant(), I->getLinkage(), (Constant*) 0, I->getName(), (GlobalVariable*) 0, - I->isThreadLocal(), + I->getThreadLocalMode(), I->getType()->getAddressSpace()); GV->copyAttributesFrom(I); VMap[I] = GV; diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp index e8c0b80..c545cd6 100644 --- a/lib/Transforms/Utils/CodeExtractor.cpp +++ b/lib/Transforms/Utils/CodeExtractor.cpp @@ -13,7 +13,7 @@ // //===----------------------------------------------------------------------===// -#include "llvm/Transforms/Utils/FunctionUtils.h" +#include "llvm/Transforms/Utils/CodeExtractor.h" #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/Instructions.h" @@ -23,6 +23,8 @@ #include "llvm/Pass.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/RegionInfo.h" +#include "llvm/Analysis/RegionIterator.h" #include "llvm/Analysis/Verifier.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Support/CommandLine.h" @@ -43,61 +45,139 @@ static cl::opt<bool> AggregateArgsOpt("aggregate-extracted-args", cl::Hidden, cl::desc("Aggregate arguments to code-extracted functions")); -namespace { - class CodeExtractor { - typedef SetVector<Value*> Values; - SetVector<BasicBlock*> BlocksToExtract; - DominatorTree* DT; - bool AggregateArgs; - unsigned NumExitBlocks; - Type *RetTy; - public: - CodeExtractor(DominatorTree* dt = 0, bool AggArgs = false) - : DT(dt), 
AggregateArgs(AggArgs||AggregateArgsOpt), NumExitBlocks(~0U) {} - - Function *ExtractCodeRegion(ArrayRef<BasicBlock*> code); - - bool isEligible(ArrayRef<BasicBlock*> code); - - private: - /// definedInRegion - Return true if the specified value is defined in the - /// extracted region. - bool definedInRegion(Value *V) const { - if (Instruction *I = dyn_cast<Instruction>(V)) - if (BlocksToExtract.count(I->getParent())) - return true; - return false; - } +/// \brief Test whether a block is valid for extraction. +static bool isBlockValidForExtraction(const BasicBlock &BB) { + // Landing pads must be in the function where they were inserted for cleanup. + if (BB.isLandingPad()) + return false; - /// definedInCaller - Return true if the specified value is defined in the - /// function being code extracted, but not in the region being extracted. - /// These values must be passed in as live-ins to the function. - bool definedInCaller(Value *V) const { - if (isa<Argument>(V)) return true; - if (Instruction *I = dyn_cast<Instruction>(V)) - if (!BlocksToExtract.count(I->getParent())) - return true; + // Don't hoist code containing allocas, invokes, or vastarts. + for (BasicBlock::const_iterator I = BB.begin(), E = BB.end(); I != E; ++I) { + if (isa<AllocaInst>(I) || isa<InvokeInst>(I)) return false; + if (const CallInst *CI = dyn_cast<CallInst>(I)) + if (const Function *F = CI->getCalledFunction()) + if (F->getIntrinsicID() == Intrinsic::vastart) + return false; + } + + return true; +} + +/// \brief Build a set of blocks to extract if the input blocks are viable. +template <typename IteratorT> +static SetVector<BasicBlock *> buildExtractionBlockSet(IteratorT BBBegin, + IteratorT BBEnd) { + SetVector<BasicBlock *> Result; + + assert(BBBegin != BBEnd); + + // Loop over the blocks, adding them to our set-vector, and aborting with an + // empty set if we encounter invalid blocks. + for (IteratorT I = BBBegin, E = BBEnd; I != E; ++I) { + if (!Result.insert(*I)) + llvm_unreachable("Repeated basic blocks in extraction input"); + + if (!isBlockValidForExtraction(**I)) { + Result.clear(); + return Result; } + } + +#ifndef NDEBUG + for (SetVector<BasicBlock *>::iterator I = llvm::next(Result.begin()), + E = Result.end(); + I != E; ++I) + for (pred_iterator PI = pred_begin(*I), PE = pred_end(*I); + PI != PE; ++PI) + assert(Result.count(*PI) && + "No blocks in this region may have entries from outside the region" + " except for the first block!"); +#endif + + return Result; +} + +/// \brief Helper to call buildExtractionBlockSet with an ArrayRef. +static SetVector<BasicBlock *> +buildExtractionBlockSet(ArrayRef<BasicBlock *> BBs) { + return buildExtractionBlockSet(BBs.begin(), BBs.end()); +} + +/// \brief Helper to call buildExtractionBlockSet with a RegionNode. +static SetVector<BasicBlock *> +buildExtractionBlockSet(const RegionNode &RN) { + if (!RN.isSubRegion()) + // Just a single BasicBlock. 
+ return buildExtractionBlockSet(RN.getNodeAs<BasicBlock>()); - void severSplitPHINodes(BasicBlock *&Header); - void splitReturnBlocks(); - void findInputsOutputs(Values &inputs, Values &outputs); + const Region &R = *RN.getNodeAs<Region>(); - Function *constructFunction(const Values &inputs, - const Values &outputs, - BasicBlock *header, - BasicBlock *newRootNode, BasicBlock *newHeader, - Function *oldFunction, Module *M); + return buildExtractionBlockSet(R.block_begin(), R.block_end()); +} - void moveCodeToFunction(Function *newFunction); +CodeExtractor::CodeExtractor(BasicBlock *BB, bool AggregateArgs) + : DT(0), AggregateArgs(AggregateArgs||AggregateArgsOpt), + Blocks(buildExtractionBlockSet(BB)), NumExitBlocks(~0U) {} + +CodeExtractor::CodeExtractor(ArrayRef<BasicBlock *> BBs, DominatorTree *DT, + bool AggregateArgs) + : DT(DT), AggregateArgs(AggregateArgs||AggregateArgsOpt), + Blocks(buildExtractionBlockSet(BBs)), NumExitBlocks(~0U) {} + +CodeExtractor::CodeExtractor(DominatorTree &DT, Loop &L, bool AggregateArgs) + : DT(&DT), AggregateArgs(AggregateArgs||AggregateArgsOpt), + Blocks(buildExtractionBlockSet(L.getBlocks())), NumExitBlocks(~0U) {} + +CodeExtractor::CodeExtractor(DominatorTree &DT, const RegionNode &RN, + bool AggregateArgs) + : DT(&DT), AggregateArgs(AggregateArgs||AggregateArgsOpt), + Blocks(buildExtractionBlockSet(RN)), NumExitBlocks(~0U) {} + +/// definedInRegion - Return true if the specified value is defined in the +/// extracted region. +static bool definedInRegion(const SetVector<BasicBlock *> &Blocks, Value *V) { + if (Instruction *I = dyn_cast<Instruction>(V)) + if (Blocks.count(I->getParent())) + return true; + return false; +} - void emitCallAndSwitchStatement(Function *newFunction, - BasicBlock *newHeader, - Values &inputs, - Values &outputs); +/// definedInCaller - Return true if the specified value is defined in the +/// function being code extracted, but not in the region being extracted. +/// These values must be passed in as live-ins to the function. +static bool definedInCaller(const SetVector<BasicBlock *> &Blocks, Value *V) { + if (isa<Argument>(V)) return true; + if (Instruction *I = dyn_cast<Instruction>(V)) + if (!Blocks.count(I->getParent())) + return true; + return false; +} - }; +void CodeExtractor::findInputsOutputs(ValueSet &Inputs, + ValueSet &Outputs) const { + for (SetVector<BasicBlock *>::const_iterator I = Blocks.begin(), + E = Blocks.end(); + I != E; ++I) { + BasicBlock *BB = *I; + + // If a used value is defined outside the region, it's an input. If an + // instruction is used outside the region, it's an output. + for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); + II != IE; ++II) { + for (User::op_iterator OI = II->op_begin(), OE = II->op_end(); + OI != OE; ++OI) + if (definedInCaller(Blocks, *OI)) + Inputs.insert(*OI); + + for (Value::use_iterator UI = II->use_begin(), UE = II->use_end(); + UI != UE; ++UI) + if (!definedInRegion(Blocks, *UI)) { + Outputs.insert(II); + break; + } + } + } } /// severSplitPHINodes - If a PHI node has multiple inputs from outside of the @@ -115,7 +195,7 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { // than one entry from outside the region. If so, we need to sever the // header block into two. 
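
The block set built by buildExtractionBlockSet above (and the ValueSet used by findInputsOutputs) is a SetVector, which gives the extractor uniqueness plus deterministic, insertion-ordered iteration; the "Repeated basic blocks" check relies on insert() returning false for duplicates. A small sketch of that container behaviour with illustrative values:

    #include "llvm/ADT/SetVector.h"
    #include "llvm/BasicBlock.h"

    static void setVectorDemo(llvm::BasicBlock *A, llvm::BasicBlock *B) {
      llvm::SetVector<llvm::BasicBlock *> Blocks;
      bool InsertedA = Blocks.insert(A);   // true: newly inserted
      bool InsertedB = Blocks.insert(B);   // true
      bool Again     = Blocks.insert(A);   // false: duplicate, order unchanged
      (void)InsertedA; (void)InsertedB; (void)Again;
      // Iteration visits A then B (insertion order), unlike std::set.
      for (llvm::SetVector<llvm::BasicBlock *>::iterator I = Blocks.begin(),
             E = Blocks.end(); I != E; ++I)
        (void)*I;
    }
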
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - if (BlocksToExtract.count(PN->getIncomingBlock(i))) + if (Blocks.count(PN->getIncomingBlock(i))) ++NumPredsFromRegion; else ++NumPredsOutsideRegion; @@ -136,8 +216,8 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { // We only want to code extract the second block now, and it becomes the new // header of the region. BasicBlock *OldPred = Header; - BlocksToExtract.remove(OldPred); - BlocksToExtract.insert(NewBB); + Blocks.remove(OldPred); + Blocks.insert(NewBB); Header = NewBB; // Okay, update dominator sets. The blocks that dominate the new one are the @@ -152,7 +232,7 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { // Loop over all of the predecessors of OldPred that are in the region, // changing them to branch to NewBB instead. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - if (BlocksToExtract.count(PN->getIncomingBlock(i))) { + if (Blocks.count(PN->getIncomingBlock(i))) { TerminatorInst *TI = PN->getIncomingBlock(i)->getTerminator(); TI->replaceUsesOfWith(OldPred, NewBB); } @@ -170,7 +250,7 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { // Loop over all of the incoming value in PN, moving them to NewPN if they // are from the extracted region. for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) { - if (BlocksToExtract.count(PN->getIncomingBlock(i))) { + if (Blocks.count(PN->getIncomingBlock(i))) { NewPN->addIncoming(PN->getIncomingValue(i), PN->getIncomingBlock(i)); PN->removeIncomingValue(i); --i; @@ -181,8 +261,8 @@ void CodeExtractor::severSplitPHINodes(BasicBlock *&Header) { } void CodeExtractor::splitReturnBlocks() { - for (SetVector<BasicBlock*>::iterator I = BlocksToExtract.begin(), - E = BlocksToExtract.end(); I != E; ++I) + for (SetVector<BasicBlock *>::iterator I = Blocks.begin(), E = Blocks.end(); + I != E; ++I) if (ReturnInst *RI = dyn_cast<ReturnInst>((*I)->getTerminator())) { BasicBlock *New = (*I)->splitBasicBlock(RI, (*I)->getName()+".ret"); if (DT) { @@ -203,45 +283,11 @@ void CodeExtractor::splitReturnBlocks() { } } -// findInputsOutputs - Find inputs to, outputs from the code region. -// -void CodeExtractor::findInputsOutputs(Values &inputs, Values &outputs) { - std::set<BasicBlock*> ExitBlocks; - for (SetVector<BasicBlock*>::const_iterator ci = BlocksToExtract.begin(), - ce = BlocksToExtract.end(); ci != ce; ++ci) { - BasicBlock *BB = *ci; - - for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { - // If a used value is defined outside the region, it's an input. If an - // instruction is used outside the region, it's an output. - for (User::op_iterator O = I->op_begin(), E = I->op_end(); O != E; ++O) - if (definedInCaller(*O)) - inputs.insert(*O); - - // Consider uses of this instruction (outputs). - for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); - UI != E; ++UI) - if (!definedInRegion(*UI)) { - outputs.insert(I); - break; - } - } // for: insts - - // Keep track of the exit blocks from the region. 
- TerminatorInst *TI = BB->getTerminator(); - for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) - if (!BlocksToExtract.count(TI->getSuccessor(i))) - ExitBlocks.insert(TI->getSuccessor(i)); - } // for: basic blocks - - NumExitBlocks = ExitBlocks.size(); -} - /// constructFunction - make a function based on inputs and outputs, as follows: /// f(in0, ..., inN, out0, ..., outN) /// -Function *CodeExtractor::constructFunction(const Values &inputs, - const Values &outputs, +Function *CodeExtractor::constructFunction(const ValueSet &inputs, + const ValueSet &outputs, BasicBlock *header, BasicBlock *newRootNode, BasicBlock *newHeader, @@ -261,15 +307,15 @@ Function *CodeExtractor::constructFunction(const Values &inputs, std::vector<Type*> paramTy; // Add the types of the input values to the function's argument list - for (Values::const_iterator i = inputs.begin(), - e = inputs.end(); i != e; ++i) { + for (ValueSet::const_iterator i = inputs.begin(), e = inputs.end(); + i != e; ++i) { const Value *value = *i; DEBUG(dbgs() << "value used in func: " << *value << "\n"); paramTy.push_back(value->getType()); } // Add the types of the output values to the function's argument list. - for (Values::const_iterator I = outputs.begin(), E = outputs.end(); + for (ValueSet::const_iterator I = outputs.begin(), E = outputs.end(); I != E; ++I) { DEBUG(dbgs() << "instr used in func: " << **I << "\n"); if (AggregateArgs) @@ -326,7 +372,7 @@ Function *CodeExtractor::constructFunction(const Values &inputs, for (std::vector<User*>::iterator use = Users.begin(), useE = Users.end(); use != useE; ++use) if (Instruction* inst = dyn_cast<Instruction>(*use)) - if (BlocksToExtract.count(inst->getParent())) + if (Blocks.count(inst->getParent())) inst->replaceUsesOfWith(inputs[i], RewriteVal); } @@ -347,7 +393,7 @@ Function *CodeExtractor::constructFunction(const Values &inputs, // The BasicBlock which contains the branch is not in the region // modify the branch target to a new block if (TerminatorInst *TI = dyn_cast<TerminatorInst>(Users[i])) - if (!BlocksToExtract.count(TI->getParent()) && + if (!Blocks.count(TI->getParent()) && TI->getParent()->getParent() == oldFunction) TI->replaceUsesOfWith(header, newHeader); @@ -373,7 +419,7 @@ static BasicBlock* FindPhiPredForUseInBlock(Value* Used, BasicBlock* BB) { /// necessary. 
void CodeExtractor:: emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, - Values &inputs, Values &outputs) { + ValueSet &inputs, ValueSet &outputs) { // Emit a call to the new function, passing in: *pointer to struct (if // aggregating parameters), or plan inputs and allocated memory for outputs std::vector<Value*> params, StructValues, ReloadOutputs, Reloads; @@ -381,14 +427,14 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, LLVMContext &Context = newFunction->getContext(); // Add inputs as params, or to be filled into the struct - for (Values::iterator i = inputs.begin(), e = inputs.end(); i != e; ++i) + for (ValueSet::iterator i = inputs.begin(), e = inputs.end(); i != e; ++i) if (AggregateArgs) StructValues.push_back(*i); else params.push_back(*i); // Create allocas for the outputs - for (Values::iterator i = outputs.begin(), e = outputs.end(); i != e; ++i) { + for (ValueSet::iterator i = outputs.begin(), e = outputs.end(); i != e; ++i) { if (AggregateArgs) { StructValues.push_back(*i); } else { @@ -403,7 +449,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, AllocaInst *Struct = 0; if (AggregateArgs && (inputs.size() + outputs.size() > 0)) { std::vector<Type*> ArgTypes; - for (Values::iterator v = StructValues.begin(), + for (ValueSet::iterator v = StructValues.begin(), ve = StructValues.end(); v != ve; ++v) ArgTypes.push_back((*v)->getType()); @@ -458,7 +504,7 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, std::vector<User*> Users(outputs[i]->use_begin(), outputs[i]->use_end()); for (unsigned u = 0, e = Users.size(); u != e; ++u) { Instruction *inst = cast<Instruction>(Users[u]); - if (!BlocksToExtract.count(inst->getParent())) + if (!Blocks.count(inst->getParent())) inst->replaceUsesOfWith(outputs[i], load); } } @@ -476,11 +522,11 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, std::map<BasicBlock*, BasicBlock*> ExitBlockMap; unsigned switchVal = 0; - for (SetVector<BasicBlock*>::const_iterator i = BlocksToExtract.begin(), - e = BlocksToExtract.end(); i != e; ++i) { + for (SetVector<BasicBlock*>::const_iterator i = Blocks.begin(), + e = Blocks.end(); i != e; ++i) { TerminatorInst *TI = (*i)->getTerminator(); for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) - if (!BlocksToExtract.count(TI->getSuccessor(i))) { + if (!Blocks.count(TI->getSuccessor(i))) { BasicBlock *OldTarget = TI->getSuccessor(i); // add a new basic block which returns the appropriate value BasicBlock *&NewTarget = ExitBlockMap[OldTarget]; @@ -618,18 +664,19 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer, TheSwitch->setCondition(call); TheSwitch->setDefaultDest(TheSwitch->getSuccessor(NumExitBlocks)); // Remove redundant case - TheSwitch->removeCase(SwitchInst::CaseIt(TheSwitch, NumExitBlocks-1)); + SwitchInst::CaseIt ToBeRemoved(TheSwitch, NumExitBlocks-1); + TheSwitch->removeCase(ToBeRemoved); break; } } void CodeExtractor::moveCodeToFunction(Function *newFunction) { - Function *oldFunc = (*BlocksToExtract.begin())->getParent(); + Function *oldFunc = (*Blocks.begin())->getParent(); Function::BasicBlockListType &oldBlocks = oldFunc->getBasicBlockList(); Function::BasicBlockListType &newBlocks = newFunction->getBasicBlockList(); - for (SetVector<BasicBlock*>::const_iterator i = BlocksToExtract.begin(), - e = BlocksToExtract.end(); i != e; ++i) { + for (SetVector<BasicBlock*>::const_iterator i = Blocks.begin(), + e = 
Blocks.end(); i != e; ++i) { // Delete the basic block from the old function, and the list of blocks oldBlocks.remove(*i); @@ -638,47 +685,15 @@ void CodeExtractor::moveCodeToFunction(Function *newFunction) { } } -/// ExtractRegion - Removes a loop from a function, replaces it with a call to -/// new function. Returns pointer to the new function. -/// -/// algorithm: -/// -/// find inputs and outputs for the region -/// -/// for inputs: add to function as args, map input instr* to arg# -/// for outputs: add allocas for scalars, -/// add to func as args, map output instr* to arg# -/// -/// rewrite func to use argument #s instead of instr* -/// -/// for each scalar output in the function: at every exit, store intermediate -/// computed result back into memory. -/// -Function *CodeExtractor:: -ExtractCodeRegion(ArrayRef<BasicBlock*> code) { - if (!isEligible(code)) +Function *CodeExtractor::extractCodeRegion() { + if (!isEligible()) return 0; - // 1) Find inputs, outputs - // 2) Construct new function - // * Add allocas for defs, pass as args by reference - // * Pass in uses as args - // 3) Move code region, add call instr to func - // - BlocksToExtract.insert(code.begin(), code.end()); - - Values inputs, outputs; + ValueSet inputs, outputs; // Assumption: this is a single-entry code region, and the header is the first // block in the region. - BasicBlock *header = code[0]; - - for (unsigned i = 1, e = code.size(); i != e; ++i) - for (pred_iterator PI = pred_begin(code[i]), E = pred_end(code[i]); - PI != E; ++PI) - assert(BlocksToExtract.count(*PI) && - "No blocks in this region may have entries from outside the region" - " except for the first block!"); + BasicBlock *header = *Blocks.begin(); // If we have to split PHI nodes or the entry block, do so now. severSplitPHINodes(header); @@ -703,6 +718,14 @@ ExtractCodeRegion(ArrayRef<BasicBlock*> code) { // Find inputs to, outputs from the code region. findInputsOutputs(inputs, outputs); + SmallPtrSet<BasicBlock *, 1> ExitBlocks; + for (SetVector<BasicBlock *>::iterator I = Blocks.begin(), E = Blocks.end(); + I != E; ++I) + for (succ_iterator SI = succ_begin(*I), SE = succ_end(*I); SI != SE; ++SI) + if (!Blocks.count(*SI)) + ExitBlocks.insert(*SI); + NumExitBlocks = ExitBlocks.size(); + // Construct new function based on inputs/outputs & add allocas for all defs. Function *newFunction = constructFunction(inputs, outputs, header, newFuncRoot, @@ -718,7 +741,7 @@ ExtractCodeRegion(ArrayRef<BasicBlock*> code) { for (BasicBlock::iterator I = header->begin(); isa<PHINode>(I); ++I) { PHINode *PN = cast<PHINode>(I); for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - if (!BlocksToExtract.count(PN->getIncomingBlock(i))) + if (!Blocks.count(PN->getIncomingBlock(i))) PN->setIncomingBlock(i, newFuncRoot); } @@ -732,7 +755,7 @@ ExtractCodeRegion(ArrayRef<BasicBlock*> code) { PHINode *PN = cast<PHINode>(I); std::set<BasicBlock*> ProcessedPreds; for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) - if (BlocksToExtract.count(PN->getIncomingBlock(i))) { + if (Blocks.count(PN->getIncomingBlock(i))) { if (ProcessedPreds.insert(PN->getIncomingBlock(i)).second) PN->setIncomingBlock(i, codeReplacer); else { @@ -754,44 +777,3 @@ ExtractCodeRegion(ArrayRef<BasicBlock*> code) { report_fatal_error("verifyFunction failed!")); return newFunction; } - -bool CodeExtractor::isEligible(ArrayRef<BasicBlock*> code) { - // Deny a single basic block that's a landing pad block. 
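
With the free functions ExtractCodeRegion/ExtractLoop/ExtractBasicBlock removed just below, clients now construct a CodeExtractor directly and call extractCodeRegion(). A hedged usage sketch for outlining a natural loop; DT and L are assumed to be a valid DominatorTree and Loop of the function being transformed, and the wrapper name is illustrative:

    #include "llvm/Analysis/Dominators.h"
    #include "llvm/Analysis/LoopInfo.h"
    #include "llvm/Transforms/Utils/CodeExtractor.h"

    static llvm::Function *outlineLoop(llvm::DominatorTree &DT, llvm::Loop &L) {
      llvm::CodeExtractor Extractor(DT, L, /*AggregateArgs=*/false);
      // Returns 0 when the region is rejected, e.g. blocks containing landing
      // pads, allocas, invokes or va_start (see isBlockValidForExtraction).
      return Extractor.extractCodeRegion();
    }
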
- if (code.size() == 1 && code[0]->isLandingPad()) - return false; - - // Deny code region if it contains allocas or vastarts. - for (ArrayRef<BasicBlock*>::iterator BB = code.begin(), e=code.end(); - BB != e; ++BB) - for (BasicBlock::const_iterator I = (*BB)->begin(), Ie = (*BB)->end(); - I != Ie; ++I) - if (isa<AllocaInst>(*I)) - return false; - else if (const CallInst *CI = dyn_cast<CallInst>(I)) - if (const Function *F = CI->getCalledFunction()) - if (F->getIntrinsicID() == Intrinsic::vastart) - return false; - return true; -} - - -/// ExtractCodeRegion - Slurp a sequence of basic blocks into a brand new -/// function. -/// -Function* llvm::ExtractCodeRegion(DominatorTree &DT, - ArrayRef<BasicBlock*> code, - bool AggregateArgs) { - return CodeExtractor(&DT, AggregateArgs).ExtractCodeRegion(code); -} - -/// ExtractLoop - Slurp a natural loop into a brand new function. -/// -Function* llvm::ExtractLoop(DominatorTree &DT, Loop *L, bool AggregateArgs) { - return CodeExtractor(&DT, AggregateArgs).ExtractCodeRegion(L->getBlocks()); -} - -/// ExtractBasicBlock - Slurp a basic block into a brand new function. -/// -Function* llvm::ExtractBasicBlock(ArrayRef<BasicBlock*> BBs, bool AggregateArgs){ - return CodeExtractor(0, AggregateArgs).ExtractCodeRegion(BBs); -} diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index d2b167a..89e89e7 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -13,22 +13,22 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Attributes.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/DerivedTypes.h" -#include "llvm/Module.h" +#include "llvm/IRBuilder.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/Intrinsics.h" -#include "llvm/Attributes.h" +#include "llvm/Module.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/CallGraph.h" -#include "llvm/Analysis/DebugInfo.h" #include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Support/CallSite.h" #include "llvm/Target/TargetData.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/Support/CallSite.h" -#include "llvm/Support/IRBuilder.h" using namespace llvm; bool llvm::InlineFunction(CallInst *CI, InlineFunctionInfo &IFI, @@ -43,10 +43,10 @@ bool llvm::InlineFunction(InvokeInst *II, InlineFunctionInfo &IFI, namespace { /// A class for recording information about inlining through an invoke. class InvokeInliningInfo { - BasicBlock *OuterResumeDest; //< Destination of the invoke's unwind. - BasicBlock *InnerResumeDest; //< Destination for the callee's resume. - LandingPadInst *CallerLPad; //< LandingPadInst associated with the invoke. - PHINode *InnerEHValuesPHI; //< PHI for EH values from landingpad insts. + BasicBlock *OuterResumeDest; ///< Destination of the invoke's unwind. + BasicBlock *InnerResumeDest; ///< Destination for the callee's resume. + LandingPadInst *CallerLPad; ///< LandingPadInst associated with the invoke. + PHINode *InnerEHValuesPHI; ///< PHI for EH values from landingpad insts. 
SmallVector<Value*, 8> UnwindDestPHIValues; public: diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index d1c4d59..bed7d72 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -14,31 +14,31 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/Constants.h" +#include "llvm/DIBuilder.h" +#include "llvm/DebugInfo.h" +#include "llvm/DerivedTypes.h" #include "llvm/GlobalAlias.h" #include "llvm/GlobalVariable.h" -#include "llvm/DerivedTypes.h" +#include "llvm/IRBuilder.h" #include "llvm/Instructions.h" -#include "llvm/Intrinsics.h" #include "llvm/IntrinsicInst.h" +#include "llvm/Intrinsics.h" #include "llvm/Metadata.h" #include "llvm/Operator.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/Analysis/DebugInfo.h" -#include "llvm/Analysis/DIBuilder.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ProfileInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Target/TargetData.h" #include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GetElementPtrTypeIterator.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" using namespace llvm; //===----------------------------------------------------------------------===// @@ -169,16 +169,21 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions) { // Otherwise, we can fold this switch into a conditional branch // instruction if it has only one non-default destination. SwitchInst::CaseIt FirstCase = SI->case_begin(); - Value *Cond = Builder.CreateICmpEQ(SI->getCondition(), - FirstCase.getCaseValue(), "cond"); - - // Insert the new branch. - Builder.CreateCondBr(Cond, FirstCase.getCaseSuccessor(), - SI->getDefaultDest()); - - // Delete the old switch. - SI->eraseFromParent(); - return true; + IntegersSubset& Case = FirstCase.getCaseValueEx(); + if (Case.isSingleNumber()) { + // FIXME: Currently work with ConstantInt based numbers. + Value *Cond = Builder.CreateICmpEQ(SI->getCondition(), + Case.getSingleNumber(0).toConstantInt(), + "cond"); + + // Insert the new branch. + Builder.CreateCondBr(Cond, FirstCase.getCaseSuccessor(), + SI->getDefaultDest()); + + // Delete the old switch. + SI->eraseFromParent(); + return true; + } } return false; } @@ -260,7 +265,7 @@ bool llvm::isInstructionTriviallyDead(Instruction *I) { return isa<UndefValue>(II->getArgOperand(1)); } - if (extractMallocCall(I)) return true; + if (isAllocLikeFn(I)) return true; if (CallInst *CI = isFreeCall(I)) if (Constant *C = dyn_cast<Constant>(CI->getArgOperand(0))) @@ -700,7 +705,7 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) { CollisionMap[PN] = Old; break; } - // Procede to the next PHI in the list. + // Proceed to the next PHI in the list. OtherPN = I->second; } } diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp index e15497a..2023750 100644 --- a/lib/Transforms/Utils/LoopUnroll.cpp +++ b/lib/Transforms/Utils/LoopUnroll.cpp @@ -95,9 +95,11 @@ static BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB, LoopInfo* LI, // Erase basic block from the function... // ScalarEvolution holds references to loop exit blocks. 
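
ConstantFoldTerminator, updated above to go through the IntegersSubset-based switch API, is typically driven over every block of a function; a minimal sketch of such a driver (the helper name is illustrative):

    #include "llvm/Function.h"
    #include "llvm/Transforms/Utils/Local.h"

    static bool foldAllTerminators(llvm::Function &F) {
      bool Changed = false;
      for (llvm::Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
        Changed |= llvm::ConstantFoldTerminator(BB,
                                                /*DeleteDeadConditions=*/true);
      return Changed;
    }
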
- if (ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>()) { - if (Loop *L = LI->getLoopFor(BB)) - SE->forgetLoop(L); + if (LPM) { + if (ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>()) { + if (Loop *L = LI->getLoopFor(BB)) + SE->forgetLoop(L); + } } LI->removeBlock(BB); BB->eraseFromParent(); @@ -204,9 +206,11 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, // Notify ScalarEvolution that the loop will be substantially changed, // if not outright eliminated. - ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); - if (SE) - SE->forgetLoop(L); + if (LPM) { + ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); + if (SE) + SE->forgetLoop(L); + } // If we know the trip count, we know the multiple... unsigned BreakoutTrip = 0; @@ -405,24 +409,26 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, } } - // FIXME: Reconstruct dom info, because it is not preserved properly. - // Incrementally updating domtree after loop unrolling would be easy. - if (DominatorTree *DT = LPM->getAnalysisIfAvailable<DominatorTree>()) - DT->runOnFunction(*L->getHeader()->getParent()); - - // Simplify any new induction variables in the partially unrolled loop. - if (SE && !CompletelyUnroll) { - SmallVector<WeakVH, 16> DeadInsts; - simplifyLoopIVs(L, SE, LPM, DeadInsts); - - // Aggressively clean up dead instructions that simplifyLoopIVs already - // identified. Any remaining should be cleaned up below. - while (!DeadInsts.empty()) - if (Instruction *Inst = - dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) - RecursivelyDeleteTriviallyDeadInstructions(Inst); + if (LPM) { + // FIXME: Reconstruct dom info, because it is not preserved properly. + // Incrementally updating domtree after loop unrolling would be easy. + if (DominatorTree *DT = LPM->getAnalysisIfAvailable<DominatorTree>()) + DT->runOnFunction(*L->getHeader()->getParent()); + + // Simplify any new induction variables in the partially unrolled loop. + ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); + if (SE && !CompletelyUnroll) { + SmallVector<WeakVH, 16> DeadInsts; + simplifyLoopIVs(L, SE, LPM, DeadInsts); + + // Aggressively clean up dead instructions that simplifyLoopIVs already + // identified. Any remaining should be cleaned up below. + while (!DeadInsts.empty()) + if (Instruction *Inst = + dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val())) + RecursivelyDeleteTriviallyDeadInstructions(Inst); + } } - // At this point, the code is well formed. We now do a quick sweep over the // inserted code, doing constant propagation and dead code elimination as we // go. diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp index 3aa6bef..67e17f4 100644 --- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -131,7 +131,7 @@ static void ConnectProlog(Loop *L, Value *TripCount, unsigned Count, /// There are two value maps that are defined and used. VMap is /// for the values in the current loop instance. LVMap contains /// the values from the last loop instance. We need the LVMap values -/// to update the inital values for the current loop instance. +/// to update the initial values for the current loop instance. /// static void CloneLoopBlocks(Loop *L, bool FirstCopy, @@ -237,6 +237,8 @@ bool llvm::UnrollRuntimeLoopProlog(Loop *L, unsigned Count, LoopInfo *LI, // Use Scalar Evolution to compute the trip count. 
This allows more // loops to be unrolled than relying on induction var simplification + if (!LPM) + return false; ScalarEvolution *SE = LPM->getAnalysisIfAvailable<ScalarEvolution>(); if (SE == 0) return false; diff --git a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp index c70ced1..02bdcda 100644 --- a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp +++ b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp @@ -12,18 +12,19 @@ //===----------------------------------------------------------------------===// #define DEBUG_TYPE "lower-expect-intrinsic" +#include "llvm/BasicBlock.h" #include "llvm/Constants.h" #include "llvm/Function.h" -#include "llvm/BasicBlock.h" -#include "llvm/LLVMContext.h" #include "llvm/Instructions.h" #include "llvm/Intrinsics.h" +#include "llvm/LLVMContext.h" +#include "llvm/MDBuilder.h" #include "llvm/Metadata.h" #include "llvm/Pass.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include "llvm/ADT/Statistic.h" #include <vector> using namespace llvm; @@ -70,24 +71,18 @@ bool LowerExpectIntrinsic::HandleSwitchExpect(SwitchInst *SI) { if (!ExpectedValue) return false; - LLVMContext &Context = CI->getContext(); - Type *Int32Ty = Type::getInt32Ty(Context); - SwitchInst::CaseIt Case = SI->findCaseValue(ExpectedValue); - std::vector<Value *> Vec; - unsigned n = SI->getNumCases(); - Vec.resize(n + 1 + 1); // +1 for MDString and +1 for default case - - Vec[0] = MDString::get(Context, "branch_weights"); - Vec[1] = ConstantInt::get(Int32Ty, Case == SI->case_default() ? - LikelyBranchWeight : UnlikelyBranchWeight); - for (unsigned i = 0; i < n; ++i) { - Vec[i + 1 + 1] = ConstantInt::get(Int32Ty, i == Case.getCaseIndex() ? - LikelyBranchWeight : UnlikelyBranchWeight); - } + unsigned n = SI->getNumCases(); // +1 for default case. + std::vector<uint32_t> Weights(n + 1); - MDNode *WeightsNode = llvm::MDNode::get(Context, Vec); - SI->setMetadata(LLVMContext::MD_prof, WeightsNode); + Weights[0] = Case == SI->case_default() ? LikelyBranchWeight + : UnlikelyBranchWeight; + for (unsigned i = 0; i != n; ++i) + Weights[i + 1] = i == Case.getCaseIndex() ? LikelyBranchWeight + : UnlikelyBranchWeight; + + SI->setMetadata(LLVMContext::MD_prof, + MDBuilder(CI->getContext()).createBranchWeights(Weights)); SI->setCondition(ArgValue); return true; @@ -120,20 +115,17 @@ bool LowerExpectIntrinsic::HandleIfExpect(BranchInst *BI) { if (!ExpectedValue) return false; - LLVMContext &Context = CI->getContext(); - Type *Int32Ty = Type::getInt32Ty(Context); - bool Likely = ExpectedValue->isOne(); + MDBuilder MDB(CI->getContext()); + MDNode *Node; // If expect value is equal to 1 it means that we are more likely to take // branch 0, in other case more likely is branch 1. - Value *Ops[] = { - MDString::get(Context, "branch_weights"), - ConstantInt::get(Int32Ty, Likely ? LikelyBranchWeight : UnlikelyBranchWeight), - ConstantInt::get(Int32Ty, Likely ? 
UnlikelyBranchWeight : LikelyBranchWeight) - }; + if (ExpectedValue->isOne()) + Node = MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight); + else + Node = MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight); - MDNode *WeightsNode = MDNode::get(Context, Ops); - BI->setMetadata(LLVMContext::MD_prof, WeightsNode); + BI->setMetadata(LLVMContext::MD_prof, Node); CmpI->setOperand(0, ArgValue); return true; diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp index a16130d..1547439 100644 --- a/lib/Transforms/Utils/LowerSwitch.cpp +++ b/lib/Transforms/Utils/LowerSwitch.cpp @@ -66,18 +66,6 @@ namespace { BasicBlock* OrigBlock, BasicBlock* Default); unsigned Clusterify(CaseVector& Cases, SwitchInst *SI); }; - - /// The comparison function for sorting the switch case values in the vector. - /// WARNING: Case ranges should be disjoint! - struct CaseCmp { - bool operator () (const LowerSwitch::CaseRange& C1, - const LowerSwitch::CaseRange& C2) { - - const ConstantInt* CI1 = cast<const ConstantInt>(C1.Low); - const ConstantInt* CI2 = cast<const ConstantInt>(C2.High); - return CI1->getValue().slt(CI2->getValue()); - } - }; } char LowerSwitch::ID = 0; @@ -159,7 +147,7 @@ BasicBlock* LowerSwitch::switchConvert(CaseItr Begin, CaseItr End, Function::iterator FI = OrigBlock; F->getBasicBlockList().insert(++FI, NewNode); - ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_SLT, + ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_ULT, Val, Pivot.Low, "Pivot"); NewNode->getInstList().push_back(Comp); BranchInst::Create(LBranch, RBranch, Comp, NewNode); @@ -234,40 +222,34 @@ BasicBlock* LowerSwitch::newLeafBlock(CaseRange& Leaf, Value* Val, // Clusterify - Transform simple list of Cases into list of CaseRange's unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) { - unsigned numCmps = 0; + + IntegersSubsetToBB TheClusterifier; // Start with "simple" cases - for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) - Cases.push_back(CaseRange(i.getCaseValue(), i.getCaseValue(), - i.getCaseSuccessor())); + for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); + i != e; ++i) { + BasicBlock *SuccBB = i.getCaseSuccessor(); + IntegersSubset CaseRanges = i.getCaseValueEx(); + TheClusterifier.add(CaseRanges, SuccBB); + } - std::sort(Cases.begin(), Cases.end(), CaseCmp()); - - // Merge case into clusters - if (Cases.size()>=2) - for (CaseItr I=Cases.begin(), J=llvm::next(Cases.begin()); J!=Cases.end(); ) { - int64_t nextValue = cast<ConstantInt>(J->Low)->getSExtValue(); - int64_t currentValue = cast<ConstantInt>(I->High)->getSExtValue(); - BasicBlock* nextBB = J->BB; - BasicBlock* currentBB = I->BB; - - // If the two neighboring cases go to the same destination, merge them - // into a single case. - if ((nextValue-currentValue==1) && (currentBB == nextBB)) { - I->High = J->High; - J = Cases.erase(J); - } else { - I = J++; - } - } - - for (CaseItr I=Cases.begin(), E=Cases.end(); I!=E; ++I, ++numCmps) { - if (I->Low != I->High) + TheClusterifier.optimize(); + + size_t numCmps = 0; + for (IntegersSubsetToBB::RangeIterator i = TheClusterifier.begin(), + e = TheClusterifier.end(); i != e; ++i, ++numCmps) { + IntegersSubsetToBB::Cluster &C = *i; + + // FIXME: Currently work with ConstantInt based numbers. + // Changing it to APInt based is a pretty heavy for this commit. 
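The LowerExpectIntrinsic hunks above drop the hand-built "branch_weights" MDNode in favor of MDBuilder. As a rough illustration of that API (a minimal sketch, not taken from this commit; the 64/4 weights are arbitrary stand-ins for the pass's Likely/UnlikelyBranchWeight options):

#include "llvm/Instructions.h"
#include "llvm/LLVMContext.h"
#include "llvm/MDBuilder.h"
using namespace llvm;

// Mark a conditional branch as likely taken by attaching !prof metadata.
// The concrete weights are placeholders, not the pass's cl::opt values.
static void markLikelyTaken(BranchInst *BI) {
  MDBuilder MDB(BI->getContext());
  BI->setMetadata(LLVMContext::MD_prof,
                  MDB.createBranchWeights(/*TrueWeight=*/64,
                                          /*FalseWeight=*/4));
}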
+ Cases.push_back(CaseRange(C.first.getLow().toConstantInt(), + C.first.getHigh().toConstantInt(), C.second)); + if (C.first.isSingleNumber()) // A range counts double, since it requires two compares. ++numCmps; } - return numCmps; + return numCmps; } // processSwitchInst - Replace the specified switch instruction with a sequence diff --git a/lib/Transforms/Utils/ModuleUtils.cpp b/lib/Transforms/Utils/ModuleUtils.cpp index 8491c55..dbcf3b2 100644 --- a/lib/Transforms/Utils/ModuleUtils.cpp +++ b/lib/Transforms/Utils/ModuleUtils.cpp @@ -14,8 +14,8 @@ #include "llvm/Transforms/Utils/ModuleUtils.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" +#include "llvm/IRBuilder.h" #include "llvm/Module.h" -#include "llvm/Support/IRBuilder.h" using namespace llvm; diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 2357d81..dd5e20e 100644 --- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -28,14 +28,14 @@ #define DEBUG_TYPE "mem2reg" #include "llvm/Transforms/Utils/PromoteMemToReg.h" #include "llvm/Constants.h" +#include "llvm/DebugInfo.h" #include "llvm/DerivedTypes.h" +#include "llvm/DIBuilder.h" #include "llvm/Function.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/Metadata.h" #include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/DebugInfo.h" -#include "llvm/Analysis/DIBuilder.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/ValueTracking.h" diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp index e60a41b..e568a61 100644 --- a/lib/Transforms/Utils/SSAUpdater.cpp +++ b/lib/Transforms/Utils/SSAUpdater.cpp @@ -190,8 +190,11 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { return V; } - // Set DebugLoc. - InsertedPHI->setDebugLoc(GetFirstDebugLocInBasicBlock(BB)); + // Set the DebugLoc of the inserted PHI, if available. + DebugLoc DL; + if (const Instruction *I = BB->getFirstNonPHI()) + DL = I->getDebugLoc(); + InsertedPHI->setDebugLoc(DL); // If the client wants to know about all new instructions, tell it. if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI); @@ -211,6 +214,11 @@ void SSAUpdater::RewriteUse(Use &U) { else V = GetValueInMiddleOfBlock(User->getParent()); + // Notify that users of the existing value that it is being replaced. + Value *OldVal = U.get(); + if (OldVal != V && OldVal->hasValueHandle()) + ValueHandleBase::ValueIsRAUWd(OldVal, V); + U.set(V); } @@ -230,28 +238,6 @@ void SSAUpdater::RewriteUseAfterInsertions(Use &U) { U.set(V); } -/// PHIiter - Iterator for PHI operands. This is used for the PHI_iterator -/// in the SSAUpdaterImpl template. -namespace { - class PHIiter { - private: - PHINode *PHI; - unsigned idx; - - public: - explicit PHIiter(PHINode *P) // begin iterator - : PHI(P), idx(0) {} - PHIiter(PHINode *P, bool) // end iterator - : PHI(P), idx(PHI->getNumIncomingValues()) {} - - PHIiter &operator++() { ++idx; return *this; } - bool operator==(const PHIiter& x) const { return idx == x.idx; } - bool operator!=(const PHIiter& x) const { return !operator==(x); } - Value *getIncomingValue() { return PHI->getIncomingValue(idx); } - BasicBlock *getIncomingBlock() { return PHI->getIncomingBlock(idx); } - }; -} - /// SSAUpdaterTraits<SSAUpdater> - Traits for the SSAUpdaterImpl template, /// specialized for SSAUpdater. 
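For context on the SSAUpdater hunks above (the PHI DebugLoc fix and the new ValueHandleBase::ValueIsRAUWd notification in RewriteUse), here is a sketch of how a client typically drives SSAUpdater; the helper function and its arguments are illustrative, not code from this tree:

#include "llvm/BasicBlock.h"
#include "llvm/Instructions.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;

// OrigI was duplicated into CloneBB as CloneV; rewrite OrigI's uses outside
// its defining block so each one sees whichever definition reaches it.
static void rewriteUsesWithSSAUpdater(Instruction *OrigI, BasicBlock *CloneBB,
                                      Value *CloneV) {
  SSAUpdater SSA;
  SSA.Initialize(OrigI->getType(), OrigI->getName());
  SSA.AddAvailableValue(OrigI->getParent(), OrigI);
  SSA.AddAvailableValue(CloneBB, CloneV);

  // Collect the interesting uses first; RewriteUse mutates the use list.
  SmallVector<Use*, 8> Uses;
  for (Value::use_iterator UI = OrigI->use_begin(), UE = OrigI->use_end();
       UI != UE; ++UI)
    if (cast<Instruction>(*UI)->getParent() != OrigI->getParent())
      Uses.push_back(&UI.getUse());

  // RewriteUse inserts PHIs where needed; with the change above it now also
  // notifies value handles watching the value being replaced.
  for (unsigned i = 0, e = Uses.size(); i != e; ++i)
    SSA.RewriteUse(*Uses[i]);
}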
namespace llvm { @@ -266,9 +252,26 @@ public: static BlkSucc_iterator BlkSucc_begin(BlkT *BB) { return succ_begin(BB); } static BlkSucc_iterator BlkSucc_end(BlkT *BB) { return succ_end(BB); } - typedef PHIiter PHI_iterator; - static inline PHI_iterator PHI_begin(PhiT *PHI) { return PHI_iterator(PHI); } - static inline PHI_iterator PHI_end(PhiT *PHI) { + class PHI_iterator { + private: + PHINode *PHI; + unsigned idx; + + public: + explicit PHI_iterator(PHINode *P) // begin iterator + : PHI(P), idx(0) {} + PHI_iterator(PHINode *P, bool) // end iterator + : PHI(P), idx(PHI->getNumIncomingValues()) {} + + PHI_iterator &operator++() { ++idx; return *this; } + bool operator==(const PHI_iterator& x) const { return idx == x.idx; } + bool operator!=(const PHI_iterator& x) const { return !operator==(x); } + Value *getIncomingValue() { return PHI->getIncomingValue(idx); } + BasicBlock *getIncomingBlock() { return PHI->getIncomingBlock(idx); } + }; + + static PHI_iterator PHI_begin(PhiT *PHI) { return PHI_iterator(PHI); } + static PHI_iterator PHI_end(PhiT *PHI) { return PHI_iterator(PHI, true); } diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 66dd2c9..518df7c 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -16,29 +16,30 @@ #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" #include "llvm/GlobalVariable.h" +#include "llvm/IRBuilder.h" #include "llvm/Instructions.h" #include "llvm/IntrinsicInst.h" #include "llvm/LLVMContext.h" +#include "llvm/MDBuilder.h" #include "llvm/Metadata.h" #include "llvm/Operator.h" #include "llvm/Type.h" -#include "llvm/Analysis/InstructionSimplify.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/Target/TargetData.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/Analysis/InstructionSimplify.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/Support/CFG.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ConstantRange.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/IRBuilder.h" #include "llvm/Support/NoFolder.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include <algorithm> #include <set> #include <map> @@ -55,12 +56,26 @@ DupRet("simplifycfg-dup-ret", cl::Hidden, cl::init(false), STATISTIC(NumSpeculations, "Number of speculative executed instructions"); namespace { + /// ValueEqualityComparisonCase - Represents a case of a switch. + struct ValueEqualityComparisonCase { + ConstantInt *Value; + BasicBlock *Dest; + + ValueEqualityComparisonCase(ConstantInt *Value, BasicBlock *Dest) + : Value(Value), Dest(Dest) {} + + bool operator<(ValueEqualityComparisonCase RHS) const { + // Comparing pointers is ok as we only rely on the order for uniquing. 
+ return Value < RHS.Value; + } + }; + class SimplifyCFGOpt { const TargetData *const TD; Value *isValueEqualityComparison(TerminatorInst *TI); BasicBlock *GetValueEqualityComparisonCases(TerminatorInst *TI, - std::vector<std::pair<ConstantInt*, BasicBlock*> > &Cases); + std::vector<ValueEqualityComparisonCase> &Cases); bool SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, BasicBlock *Pred, IRBuilder<> &Builder); @@ -107,6 +122,47 @@ static bool SafeToMergeTerminators(TerminatorInst *SI1, TerminatorInst *SI2) { return true; } +/// isProfitableToFoldUnconditional - Return true if it is safe and profitable +/// to merge these two terminator instructions together, where SI1 is an +/// unconditional branch. PhiNodes will store all PHI nodes in common +/// successors. +/// +static bool isProfitableToFoldUnconditional(BranchInst *SI1, + BranchInst *SI2, + Instruction *Cond, + SmallVectorImpl<PHINode*> &PhiNodes) { + if (SI1 == SI2) return false; // Can't merge with self! + assert(SI1->isUnconditional() && SI2->isConditional()); + + // We fold the unconditional branch if we can easily update all PHI nodes in + // common successors: + // 1> We have a constant incoming value for the conditional branch; + // 2> We have "Cond" as the incoming value for the unconditional branch; + // 3> SI2->getCondition() and Cond have same operands. + CmpInst *Ci2 = dyn_cast<CmpInst>(SI2->getCondition()); + if (!Ci2) return false; + if (!(Cond->getOperand(0) == Ci2->getOperand(0) && + Cond->getOperand(1) == Ci2->getOperand(1)) && + !(Cond->getOperand(0) == Ci2->getOperand(1) && + Cond->getOperand(1) == Ci2->getOperand(0))) + return false; + + BasicBlock *SI1BB = SI1->getParent(); + BasicBlock *SI2BB = SI2->getParent(); + SmallPtrSet<BasicBlock*, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB)); + for (succ_iterator I = succ_begin(SI2BB), E = succ_end(SI2BB); I != E; ++I) + if (SI1Succs.count(*I)) + for (BasicBlock::iterator BBI = (*I)->begin(); + isa<PHINode>(BBI); ++BBI) { + PHINode *PN = cast<PHINode>(BBI); + if (PN->getIncomingValueForBlock(SI1BB) != Cond || + !isa<ConstantInt>(PN->getIncomingValueForBlock(SI2BB))) + return false; + PhiNodes.push_back(PN); + } + return true; +} + /// AddPredecessorToBlock - Update PHI nodes in Succ to indicate that there will /// now be entries in it from the 'NewPred' block. The values that will be /// flowing into the PHI nodes will be the same as those coming in from @@ -476,21 +532,22 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) { /// decode all of the 'cases' that it represents and return the 'default' block. 
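The ValueEqualityComparisonCase struct introduced above gives the old ConstantInt*/BasicBlock* pairs named fields, and its operator< deliberately compares the ConstantInt pointers: it only has to give some stable order for array_pod_sort so duplicate keys land next to each other, never a numeric order. A minimal sketch of that pattern (the struct shape mirrors the patch; the sorting helper is illustrative):

#include "llvm/BasicBlock.h"
#include "llvm/Constants.h"
#include "llvm/ADT/STLExtras.h"
#include <vector>
using namespace llvm;

namespace {
  // Same shape as the patch's ValueEqualityComparisonCase: a named pair.
  struct Case {
    ConstantInt *Value;
    BasicBlock *Dest;
    Case(ConstantInt *V, BasicBlock *D) : Value(V), Dest(D) {}
    // Pointer order is enough here: we only need a consistent order so that
    // equal keys end up adjacent; the order is never read numerically.
    bool operator<(const Case &RHS) const { return Value < RHS.Value; }
  };
}

// array_pod_sort treats elements as memcpy-able PODs and, in this overload,
// orders them with operator<; afterwards duplicate ConstantInt keys touch.
static void sortCases(std::vector<Case> &Cases) {
  array_pod_sort(Cases.begin(), Cases.end());
}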
BasicBlock *SimplifyCFGOpt:: GetValueEqualityComparisonCases(TerminatorInst *TI, - std::vector<std::pair<ConstantInt*, - BasicBlock*> > &Cases) { + std::vector<ValueEqualityComparisonCase> + &Cases) { if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) { Cases.reserve(SI->getNumCases()); for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i) - Cases.push_back(std::make_pair(i.getCaseValue(), - i.getCaseSuccessor())); + Cases.push_back(ValueEqualityComparisonCase(i.getCaseValue(), + i.getCaseSuccessor())); return SI->getDefaultDest(); } BranchInst *BI = cast<BranchInst>(TI); ICmpInst *ICI = cast<ICmpInst>(BI->getCondition()); - Cases.push_back(std::make_pair(GetConstantInt(ICI->getOperand(1), TD), - BI->getSuccessor(ICI->getPredicate() == - ICmpInst::ICMP_NE))); + BasicBlock *Succ = BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_NE); + Cases.push_back(ValueEqualityComparisonCase(GetConstantInt(ICI->getOperand(1), + TD), + Succ)); return BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_EQ); } @@ -498,9 +555,9 @@ GetValueEqualityComparisonCases(TerminatorInst *TI, /// EliminateBlockCases - Given a vector of bb/value pairs, remove any entries /// in the list that match the specified block. static void EliminateBlockCases(BasicBlock *BB, - std::vector<std::pair<ConstantInt*, BasicBlock*> > &Cases) { + std::vector<ValueEqualityComparisonCase> &Cases) { for (unsigned i = 0, e = Cases.size(); i != e; ++i) - if (Cases[i].second == BB) { + if (Cases[i].Dest == BB) { Cases.erase(Cases.begin()+i); --i; --e; } @@ -509,9 +566,9 @@ static void EliminateBlockCases(BasicBlock *BB, /// ValuesOverlap - Return true if there are any keys in C1 that exist in C2 as /// well. static bool -ValuesOverlap(std::vector<std::pair<ConstantInt*, BasicBlock*> > &C1, - std::vector<std::pair<ConstantInt*, BasicBlock*> > &C2) { - std::vector<std::pair<ConstantInt*, BasicBlock*> > *V1 = &C1, *V2 = &C2; +ValuesOverlap(std::vector<ValueEqualityComparisonCase> &C1, + std::vector<ValueEqualityComparisonCase > &C2) { + std::vector<ValueEqualityComparisonCase> *V1 = &C1, *V2 = &C2; // Make V1 be smaller than V2. if (V1->size() > V2->size()) @@ -520,9 +577,9 @@ ValuesOverlap(std::vector<std::pair<ConstantInt*, BasicBlock*> > &C1, if (V1->size() == 0) return false; if (V1->size() == 1) { // Just scan V2. - ConstantInt *TheVal = (*V1)[0].first; + ConstantInt *TheVal = (*V1)[0].Value; for (unsigned i = 0, e = V2->size(); i != e; ++i) - if (TheVal == (*V2)[i].first) + if (TheVal == (*V2)[i].Value) return true; } @@ -531,9 +588,9 @@ ValuesOverlap(std::vector<std::pair<ConstantInt*, BasicBlock*> > &C1, array_pod_sort(V2->begin(), V2->end()); unsigned i1 = 0, i2 = 0, e1 = V1->size(), e2 = V2->size(); while (i1 != e1 && i2 != e2) { - if ((*V1)[i1].first == (*V2)[i2].first) + if ((*V1)[i1].Value == (*V2)[i2].Value) return true; - if ((*V1)[i1].first < (*V2)[i2].first) + if ((*V1)[i1].Value < (*V2)[i2].Value) ++i1; else ++i2; @@ -559,13 +616,13 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, if (ThisVal != PredVal) return false; // Different predicates. // Find out information about when control will move from Pred to TI's block. - std::vector<std::pair<ConstantInt*, BasicBlock*> > PredCases; + std::vector<ValueEqualityComparisonCase> PredCases; BasicBlock *PredDef = GetValueEqualityComparisonCases(Pred->getTerminator(), PredCases); EliminateBlockCases(PredDef, PredCases); // Remove default from cases. // Find information about how control leaves this block. 
- std::vector<std::pair<ConstantInt*, BasicBlock*> > ThisCases; + std::vector<ValueEqualityComparisonCase> ThisCases; BasicBlock *ThisDef = GetValueEqualityComparisonCases(TI, ThisCases); EliminateBlockCases(ThisDef, ThisCases); // Remove default from cases. @@ -587,7 +644,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, (void) NI; // Remove PHI node entries for the dead edge. - ThisCases[0].second->removePredecessor(TI->getParent()); + ThisCases[0].Dest->removePredecessor(TI->getParent()); DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator() << "Through successor TI: " << *TI << "Leaving: " << *NI << "\n"); @@ -600,7 +657,7 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, // Okay, TI has cases that are statically dead, prune them away. SmallPtrSet<Constant*, 16> DeadCases; for (unsigned i = 0, e = PredCases.size(); i != e; ++i) - DeadCases.insert(PredCases[i].first); + DeadCases.insert(PredCases[i].Value); DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator() << "Through successor TI: " << *TI); @@ -622,10 +679,10 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, ConstantInt *TIV = 0; BasicBlock *TIBB = TI->getParent(); for (unsigned i = 0, e = PredCases.size(); i != e; ++i) - if (PredCases[i].second == TIBB) { + if (PredCases[i].Dest == TIBB) { if (TIV != 0) return false; // Cannot handle multiple values coming to this block. - TIV = PredCases[i].first; + TIV = PredCases[i].Value; } assert(TIV && "No edge from pred to succ?"); @@ -633,8 +690,8 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI, // BB. Find out which successor will unconditionally be branched to. BasicBlock *TheRealDest = 0; for (unsigned i = 0, e = ThisCases.size(); i != e; ++i) - if (ThisCases[i].first == TIV) { - TheRealDest = ThisCases[i].second; + if (ThisCases[i].Value == TIV) { + TheRealDest = ThisCases[i].Dest; break; } @@ -702,10 +759,10 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, if (PCV == CV && SafeToMergeTerminators(TI, PTI)) { // Figure out which 'cases' to copy from SI to PSI. - std::vector<std::pair<ConstantInt*, BasicBlock*> > BBCases; + std::vector<ValueEqualityComparisonCase> BBCases; BasicBlock *BBDefault = GetValueEqualityComparisonCases(TI, BBCases); - std::vector<std::pair<ConstantInt*, BasicBlock*> > PredCases; + std::vector<ValueEqualityComparisonCase> PredCases; BasicBlock *PredDefault = GetValueEqualityComparisonCases(PTI, PredCases); // Based on whether the default edge from PTI goes to BB or not, fill in @@ -718,8 +775,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, // that don't occur in PTI, or that branch to BB will be activated. std::set<ConstantInt*, ConstantIntOrdering> PTIHandled; for (unsigned i = 0, e = PredCases.size(); i != e; ++i) - if (PredCases[i].second != BB) - PTIHandled.insert(PredCases[i].first); + if (PredCases[i].Dest != BB) + PTIHandled.insert(PredCases[i].Value); else { // The default destination is BB, we don't need explicit targets. 
std::swap(PredCases[i], PredCases.back()); @@ -734,10 +791,10 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, NewSuccessors.push_back(BBDefault); } for (unsigned i = 0, e = BBCases.size(); i != e; ++i) - if (!PTIHandled.count(BBCases[i].first) && - BBCases[i].second != BBDefault) { + if (!PTIHandled.count(BBCases[i].Value) && + BBCases[i].Dest != BBDefault) { PredCases.push_back(BBCases[i]); - NewSuccessors.push_back(BBCases[i].second); + NewSuccessors.push_back(BBCases[i].Dest); } } else { @@ -746,8 +803,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, // activated. std::set<ConstantInt*, ConstantIntOrdering> PTIHandled; for (unsigned i = 0, e = PredCases.size(); i != e; ++i) - if (PredCases[i].second == BB) { - PTIHandled.insert(PredCases[i].first); + if (PredCases[i].Dest == BB) { + PTIHandled.insert(PredCases[i].Value); std::swap(PredCases[i], PredCases.back()); PredCases.pop_back(); --i; --e; @@ -756,11 +813,11 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, // Okay, now we know which constants were sent to BB from the // predecessor. Figure out where they will all go now. for (unsigned i = 0, e = BBCases.size(); i != e; ++i) - if (PTIHandled.count(BBCases[i].first)) { + if (PTIHandled.count(BBCases[i].Value)) { // If this is one we are capable of getting... PredCases.push_back(BBCases[i]); - NewSuccessors.push_back(BBCases[i].second); - PTIHandled.erase(BBCases[i].first);// This constant is taken care of + NewSuccessors.push_back(BBCases[i].Dest); + PTIHandled.erase(BBCases[i].Value);// This constant is taken care of } // If there are any constants vectored to BB that TI doesn't handle, @@ -768,7 +825,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, for (std::set<ConstantInt*, ConstantIntOrdering>::iterator I = PTIHandled.begin(), E = PTIHandled.end(); I != E; ++I) { - PredCases.push_back(std::make_pair(*I, BBDefault)); + PredCases.push_back(ValueEqualityComparisonCase(*I, BBDefault)); NewSuccessors.push_back(BBDefault); } } @@ -792,7 +849,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, PredCases.size()); NewSI->setDebugLoc(PTI->getDebugLoc()); for (unsigned i = 0, e = PredCases.size(); i != e; ++i) - NewSI->addCase(PredCases[i].first, PredCases[i].second); + NewSI->addCase(PredCases[i].Value, PredCases[i].Dest); EraseTerminatorInstAndDCECond(PTI); @@ -1273,7 +1330,7 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) { return false; } - // If we folded the the first phi, PN dangles at this point. Refresh it. If + // If we folded the first phi, PN dangles at this point. Refresh it. If // we ran out of PHIs then we simplified them all. PN = dyn_cast<PHINode>(BB->begin()); if (PN == 0) return true; @@ -1490,6 +1547,23 @@ static APInt MultiplyAndLosePrecision(APInt &A, APInt &B, APInt &C, APInt &D, return Result; } +/// checkCSEInPredecessor - Return true if the given instruction is available +/// in its predecessor block. If yes, the instruction will be removed. +/// +static bool checkCSEInPredecessor(Instruction *Inst, BasicBlock *PB) { + if (!isa<BinaryOperator>(Inst) && !isa<CmpInst>(Inst)) + return false; + for (BasicBlock::iterator I = PB->begin(), E = PB->end(); I != E; I++) { + Instruction *PBI = &*I; + // Check whether Inst and PBI generate the same value. 
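The checkCSEInPredecessor helper added in this hunk scans the predecessor block for an instruction identical to Inst and, if found, reuses the earlier result and deletes the duplicate. A self-contained distillation of that replace-and-erase step (simplified; it keeps the patch's restriction to side-effect-free binary ops and compares):

#include "llvm/BasicBlock.h"
#include "llvm/Instructions.h"
using namespace llvm;

// If Inst recomputes a value already produced in Pred, reuse the earlier
// result and delete the duplicate.
static bool foldDuplicateIntoPred(Instruction *Inst, BasicBlock *Pred) {
  if (!isa<BinaryOperator>(Inst) && !isa<CmpInst>(Inst))
    return false;
  for (BasicBlock::iterator I = Pred->begin(), E = Pred->end(); I != E; ++I)
    if (Inst->isIdenticalTo(&*I)) {
      Inst->replaceAllUsesWith(&*I);
      Inst->eraseFromParent();
      return true;
    }
  return false;
}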
+ if (Inst->isIdenticalTo(PBI)) { + Inst->replaceAllUsesWith(PBI); + Inst->eraseFromParent(); + return true; + } + } + return false; +} /// FoldBranchToCommonDest - If this basic block is simple enough, and if a /// predecessor branches to us and one of our successors, fold the block into @@ -1497,7 +1571,36 @@ static APInt MultiplyAndLosePrecision(APInt &A, APInt &B, APInt &C, APInt &D, bool llvm::FoldBranchToCommonDest(BranchInst *BI) { BasicBlock *BB = BI->getParent(); - Instruction *Cond = dyn_cast<Instruction>(BI->getCondition()); + Instruction *Cond = 0; + if (BI->isConditional()) + Cond = dyn_cast<Instruction>(BI->getCondition()); + else { + // For unconditional branch, check for a simple CFG pattern, where + // BB has a single predecessor and BB's successor is also its predecessor's + // successor. If such pattern exisits, check for CSE between BB and its + // predecessor. + if (BasicBlock *PB = BB->getSinglePredecessor()) + if (BranchInst *PBI = dyn_cast<BranchInst>(PB->getTerminator())) + if (PBI->isConditional() && + (BI->getSuccessor(0) == PBI->getSuccessor(0) || + BI->getSuccessor(0) == PBI->getSuccessor(1))) { + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); + I != E; ) { + Instruction *Curr = I++; + if (isa<CmpInst>(Curr)) { + Cond = Curr; + break; + } + // Quit if we can't remove this instruction. + if (!checkCSEInPredecessor(Curr, PB)) + return false; + } + } + + if (Cond == 0) + return false; + } + if (Cond == 0 || (!isa<CmpInst>(Cond) && !isa<BinaryOperator>(Cond)) || Cond->getParent() != BB || !Cond->hasOneUse()) return false; @@ -1549,7 +1652,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { // Finally, don't infinitely unroll conditional loops. BasicBlock *TrueDest = BI->getSuccessor(0); - BasicBlock *FalseDest = BI->getSuccessor(1); + BasicBlock *FalseDest = (BI->isConditional()) ? BI->getSuccessor(1) : 0; if (TrueDest == BB || FalseDest == BB) return false; @@ -1560,23 +1663,33 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { // Check that we have two conditional branches. If there is a PHI node in // the common successor, verify that the same value flows in from both // blocks. - if (PBI == 0 || PBI->isUnconditional() || !SafeToMergeTerminators(BI, PBI)) + SmallVector<PHINode*, 4> PHIs; + if (PBI == 0 || PBI->isUnconditional() || + (BI->isConditional() && + !SafeToMergeTerminators(BI, PBI)) || + (!BI->isConditional() && + !isProfitableToFoldUnconditional(BI, PBI, Cond, PHIs))) continue; // Determine if the two branches share a common destination. Instruction::BinaryOps Opc; bool InvertPredCond = false; - if (PBI->getSuccessor(0) == TrueDest) - Opc = Instruction::Or; - else if (PBI->getSuccessor(1) == FalseDest) - Opc = Instruction::And; - else if (PBI->getSuccessor(0) == FalseDest) - Opc = Instruction::And, InvertPredCond = true; - else if (PBI->getSuccessor(1) == TrueDest) - Opc = Instruction::Or, InvertPredCond = true; - else - continue; + if (BI->isConditional()) { + if (PBI->getSuccessor(0) == TrueDest) + Opc = Instruction::Or; + else if (PBI->getSuccessor(1) == FalseDest) + Opc = Instruction::And; + else if (PBI->getSuccessor(0) == FalseDest) + Opc = Instruction::And, InvertPredCond = true; + else if (PBI->getSuccessor(1) == TrueDest) + Opc = Instruction::Or, InvertPredCond = true; + else + continue; + } else { + if (PBI->getSuccessor(0) != TrueDest && PBI->getSuccessor(1) != TrueDest) + continue; + } // Ensure that any values used in the bonus instruction are also used // by the terminator of the predecessor. 
This means that those values @@ -1652,17 +1765,69 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { New->takeName(Cond); Cond->setName(New->getName()+".old"); - Instruction *NewCond = - cast<Instruction>(Builder.CreateBinOp(Opc, PBI->getCondition(), + if (BI->isConditional()) { + Instruction *NewCond = + cast<Instruction>(Builder.CreateBinOp(Opc, PBI->getCondition(), New, "or.cond")); - PBI->setCondition(NewCond); - if (PBI->getSuccessor(0) == BB) { - AddPredecessorToBlock(TrueDest, PredBlock, BB); - PBI->setSuccessor(0, TrueDest); - } - if (PBI->getSuccessor(1) == BB) { - AddPredecessorToBlock(FalseDest, PredBlock, BB); - PBI->setSuccessor(1, FalseDest); + PBI->setCondition(NewCond); + + if (PBI->getSuccessor(0) == BB) { + AddPredecessorToBlock(TrueDest, PredBlock, BB); + PBI->setSuccessor(0, TrueDest); + } + if (PBI->getSuccessor(1) == BB) { + AddPredecessorToBlock(FalseDest, PredBlock, BB); + PBI->setSuccessor(1, FalseDest); + } + } else { + // Update PHI nodes in the common successors. + for (unsigned i = 0, e = PHIs.size(); i != e; ++i) { + ConstantInt *PBI_C = cast<ConstantInt>( + PHIs[i]->getIncomingValueForBlock(PBI->getParent())); + assert(PBI_C->getType()->isIntegerTy(1)); + Instruction *MergedCond = 0; + if (PBI->getSuccessor(0) == TrueDest) { + // Create (PBI_Cond and PBI_C) or (!PBI_Cond and BI_Value) + // PBI_C is true: PBI_Cond or (!PBI_Cond and BI_Value) + // is false: !PBI_Cond and BI_Value + Instruction *NotCond = + cast<Instruction>(Builder.CreateNot(PBI->getCondition(), + "not.cond")); + MergedCond = + cast<Instruction>(Builder.CreateBinOp(Instruction::And, + NotCond, New, + "and.cond")); + if (PBI_C->isOne()) + MergedCond = + cast<Instruction>(Builder.CreateBinOp(Instruction::Or, + PBI->getCondition(), MergedCond, + "or.cond")); + } else { + // Create (PBI_Cond and BI_Value) or (!PBI_Cond and PBI_C) + // PBI_C is true: (PBI_Cond and BI_Value) or (!PBI_Cond) + // is false: PBI_Cond and BI_Value + MergedCond = + cast<Instruction>(Builder.CreateBinOp(Instruction::And, + PBI->getCondition(), New, + "and.cond")); + if (PBI_C->isOne()) { + Instruction *NotCond = + cast<Instruction>(Builder.CreateNot(PBI->getCondition(), + "not.cond")); + MergedCond = + cast<Instruction>(Builder.CreateBinOp(Instruction::Or, + NotCond, MergedCond, + "or.cond")); + } + } + // Update PHI Node. + PHIs[i]->setIncomingValue(PHIs[i]->getBasicBlockIndex(PBI->getParent()), + MergedCond); + } + // Change PBI from Conditional to Unconditional. + BranchInst *New_PBI = BranchInst::Create(TrueDest, PBI); + EraseTerminatorInstAndDCECond(PBI); + PBI = New_PBI; } // TODO: If BB is reachable from all paths through PredBlock, then we @@ -1670,7 +1835,8 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { // Merge probability data into PredBlock's branch. 
APInt A, B, C, D; - if (ExtractBranchMetadata(PBI, C, D) && ExtractBranchMetadata(BI, A, B)) { + if (PBI->isConditional() && BI->isConditional() && + ExtractBranchMetadata(PBI, C, D) && ExtractBranchMetadata(BI, A, B)) { // Given IR which does: // bbA: // br i1 %x, label %bbB, label %bbC @@ -1740,12 +1906,10 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) { ProbTrue = ProbTrue.udiv(GCD); ProbFalse = ProbFalse.udiv(GCD); - LLVMContext &Context = BI->getContext(); - Value *Ops[3]; - Ops[0] = BI->getMetadata(LLVMContext::MD_prof)->getOperand(0); - Ops[1] = ConstantInt::get(Context, ProbTrue); - Ops[2] = ConstantInt::get(Context, ProbFalse); - PBI->setMetadata(LLVMContext::MD_prof, MDNode::get(Context, Ops)); + MDBuilder MDB(BI->getContext()); + MDNode *N = MDB.createBranchWeights(ProbTrue.getZExtValue(), + ProbFalse.getZExtValue()); + PBI->setMetadata(LLVMContext::MD_prof, N); } else { PBI->setMetadata(LLVMContext::MD_prof, NULL); } @@ -2758,6 +2922,12 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){ return true; } + // If this basic block is ONLY a compare and a branch, and if a predecessor + // branches to us and our successor, fold the comparison into the + // predecessor and use logical operations to update the incoming value + // for PHI nodes in common successor. + if (FoldBranchToCommonDest(BI)) + return SimplifyCFG(BB) | true; return false; } diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp index 4030bef..5d673f1 100644 --- a/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -16,7 +16,6 @@ #define DEBUG_TYPE "indvars" #include "llvm/Instructions.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/IVUsers.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" @@ -44,7 +43,6 @@ namespace { class SimplifyIndvar { Loop *L; LoopInfo *LI; - DominatorTree *DT; ScalarEvolution *SE; const TargetData *TD; // May be NULL diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp index 9d62306..62d23cb 100644 --- a/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/lib/Transforms/Vectorize/BBVectorize.cpp @@ -23,6 +23,7 @@ #include "llvm/IntrinsicInst.h" #include "llvm/Intrinsics.h" #include "llvm/LLVMContext.h" +#include "llvm/Metadata.h" #include "llvm/Pass.h" #include "llvm/Type.h" #include "llvm/ADT/DenseMap.h" @@ -41,6 +42,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Vectorize.h" #include <algorithm> #include <map> @@ -66,6 +68,10 @@ static cl::opt<unsigned> MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden, cl::desc("The maximum number of pairing iterations")); +static cl::opt<bool> +Pow2LenOnly("bb-vectorize-pow2-len-only", cl::init(false), cl::Hidden, + cl::desc("Don't try to form non-2^n-length vectors")); + static cl::opt<unsigned> MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden, cl::desc("The maximum number of pairable instructions per group")); @@ -76,6 +82,10 @@ MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200), " a full cycle check")); static cl::opt<bool> +NoBools("bb-vectorize-no-bools", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize boolean (i1) values")); + +static cl::opt<bool> NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize integer 
values")); @@ -104,6 +114,10 @@ NoSelect("bb-vectorize-no-select", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize select instructions")); static cl::opt<bool> +NoCmp("bb-vectorize-no-cmp", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize comparison instructions")); + +static cl::opt<bool> NoGEP("bb-vectorize-no-gep", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize getelementptr instructions")); @@ -182,12 +196,12 @@ namespace { // FIXME: const correct? - bool vectorizePairs(BasicBlock &BB); + bool vectorizePairs(BasicBlock &BB, bool NonPow2Len = false); bool getCandidatePairs(BasicBlock &BB, BasicBlock::iterator &Start, std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts); + std::vector<Value *> &PairableInsts, bool NonPow2Len); void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs, std::vector<Value *> &PairableInsts, @@ -211,7 +225,7 @@ namespace { bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); bool areInstsCompatible(Instruction *I, Instruction *J, - bool IsSimpleLoadStore); + bool IsSimpleLoadStore, bool NonPow2Len); bool trackUsesOfI(DenseSet<Value *> &Users, AliasSetTracker &WriteSet, Instruction *I, @@ -263,26 +277,32 @@ namespace { bool UseCycleCheck); Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, - Instruction *J, unsigned o, bool &FlipMemInputs); + Instruction *J, unsigned o, bool FlipMemInputs); void fillNewShuffleMask(LLVMContext& Context, Instruction *J, - unsigned NumElem, unsigned MaskOffset, unsigned NumInElem, - unsigned IdxOffset, std::vector<Constant*> &Mask); + unsigned MaskOffset, unsigned NumInElem, + unsigned NumInElem1, unsigned IdxOffset, + std::vector<Constant*> &Mask); Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I, Instruction *J); + bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J, + unsigned o, Value *&LOp, unsigned numElemL, + Type *ArgTypeL, Type *ArgTypeR, + unsigned IdxOff = 0); + Value *getReplacementInput(LLVMContext& Context, Instruction *I, Instruction *J, unsigned o, bool FlipMemInputs); void getReplacementInputsForPair(LLVMContext& Context, Instruction *I, Instruction *J, SmallVector<Value *, 3> &ReplacedOperands, - bool &FlipMemInputs); + bool FlipMemInputs); void replaceOutputsOfPair(LLVMContext& Context, Instruction *I, Instruction *J, Instruction *K, Instruction *&InsertionPt, Instruction *&K1, - Instruction *&K2, bool &FlipMemInputs); + Instruction *&K2, bool FlipMemInputs); void collectPairLoadMoveSet(BasicBlock &BB, DenseMap<Value *, Value *> &ChosenPairs, @@ -294,6 +314,10 @@ namespace { DenseMap<Value *, Value *> &ChosenPairs, std::multimap<Value *, Value *> &LoadMoveSet); + void collectPtrInfo(std::vector<Value *> &PairableInsts, + DenseMap<Value *, Value *> &ChosenPairs, + DenseSet<Value *> &LowPtrInsts); + bool canMoveUsesOfIAfterJ(BasicBlock &BB, std::multimap<Value *, Value *> &LoadMoveSet, Instruction *I, Instruction *J); @@ -303,12 +327,15 @@ namespace { Instruction *&InsertionPt, Instruction *I, Instruction *J); + void combineMetadata(Instruction *K, const Instruction *J); + bool vectorizeBB(BasicBlock &BB) { bool changed = false; // Iterate a sufficient number of times to merge types of size 1 bit, // then 2 bits, then 4, etc. up to half of the target vector width of the // target vector register. 
- for (unsigned v = 2, n = 1; + unsigned n = 1; + for (unsigned v = 2; v <= Config.VectorBits && (!Config.MaxIter || n <= Config.MaxIter); v *= 2, ++n) { DEBUG(dbgs() << "BBV: fusing loop #" << n << @@ -320,6 +347,16 @@ namespace { break; } + if (changed && !Pow2LenOnly) { + ++n; + for (; !Config.MaxIter || n <= Config.MaxIter; ++n) { + DEBUG(dbgs() << "BBV: fusing for non-2^n-length vectors loop #: " << + n << " for " << BB.getName() << " in " << + BB.getParent()->getName() << "...\n"); + if (!vectorizePairs(BB, true)) break; + } + } + DEBUG(dbgs() << "BBV: done!\n"); return changed; } @@ -341,15 +378,43 @@ namespace { AU.setPreservesCFG(); } - // This returns the vector type that holds a pair of the provided type. - // If the provided type is already a vector, then its length is doubled. - static inline VectorType *getVecTypeForPair(Type *ElemTy) { + static inline VectorType *getVecTypeForPair(Type *ElemTy, Type *Elem2Ty) { + assert(ElemTy->getScalarType() == Elem2Ty->getScalarType() && + "Cannot form vector from incompatible scalar types"); + Type *STy = ElemTy->getScalarType(); + + unsigned numElem; if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) { - unsigned numElem = VTy->getNumElements(); - return VectorType::get(ElemTy->getScalarType(), numElem*2); + numElem = VTy->getNumElements(); + } else { + numElem = 1; } - return VectorType::get(ElemTy, 2); + if (VectorType *VTy = dyn_cast<VectorType>(Elem2Ty)) { + numElem += VTy->getNumElements(); + } else { + numElem += 1; + } + + return VectorType::get(STy, numElem); + } + + static inline void getInstructionTypes(Instruction *I, + Type *&T1, Type *&T2) { + if (isa<StoreInst>(I)) { + // For stores, it is the value type, not the pointer type that matters + // because the value is what will come from a vector register. + + Value *IVal = cast<StoreInst>(I)->getValueOperand(); + T1 = IVal->getType(); + } else { + T1 = I->getType(); + } + + if (I->isCast()) + T2 = cast<CastInst>(I)->getSrcTy(); + else + T2 = T1; } // Returns the weight associated with the provided value. A chain of @@ -385,8 +450,7 @@ namespace { // true if the offset could be determined to be some constant value. // For example, if OffsetInElmts == 1, then J accesses the memory directly // after I; if OffsetInElmts == -1 then I accesses the memory - // directly after J. This function assumes that both instructions - // have the same type. + // directly after J. bool getPairPtrInfo(Instruction *I, Instruction *J, Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment, int64_t &OffsetInElmts) { @@ -418,7 +482,12 @@ namespace { Type *VTy = cast<PointerType>(IPtr->getType())->getElementType(); int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy); - assert(VTy == cast<PointerType>(JPtr->getType())->getElementType()); + Type *VTy2 = cast<PointerType>(JPtr->getType())->getElementType(); + if (VTy != VTy2 && Offset < 0) { + int64_t VTy2TSS = (int64_t) TD->getTypeStoreSize(VTy2); + OffsetInElmts = Offset/VTy2TSS; + return (abs64(Offset) % VTy2TSS) == 0; + } OffsetInElmts = Offset/VTyTSS; return (abs64(Offset) % VTyTSS) == 0; @@ -471,7 +540,7 @@ namespace { // This function implements one vectorization iteration on the provided // basic block. It returns true if the block is changed. 
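getVecTypeForPair above now fuses two operands of possibly different vector lengths into a single vector whose element count is the sum of the two. A standalone restatement of that rule (illustrative helper names, same API calls as the patch):

#include "llvm/DerivedTypes.h"
#include <cassert>
using namespace llvm;

// Number of scalar lanes contributed by Ty: N for <N x T>, 1 for a scalar T.
static unsigned numLanes(Type *Ty) {
  if (VectorType *VTy = dyn_cast<VectorType>(Ty))
    return VTy->getNumElements();
  return 1;
}

// E.g. (<2 x float>, <3 x float>) -> <5 x float>, and (i32, i32) -> <2 x i32>.
static VectorType *fusedTypeForPair(Type *T1, Type *T2) {
  assert(T1->getScalarType() == T2->getScalarType() &&
         "pair members must share a scalar type");
  return VectorType::get(T1->getScalarType(), numLanes(T1) + numLanes(T2));
}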
- bool BBVectorize::vectorizePairs(BasicBlock &BB) { + bool BBVectorize::vectorizePairs(BasicBlock &BB, bool NonPow2Len) { bool ShouldContinue; BasicBlock::iterator Start = BB.getFirstInsertionPt(); @@ -482,7 +551,7 @@ namespace { std::vector<Value *> PairableInsts; std::multimap<Value *, Value *> CandidatePairs; ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs, - PairableInsts); + PairableInsts, NonPow2Len); if (PairableInsts.empty()) continue; // Now we have a map of all of the pairable instructions and we need to @@ -529,6 +598,10 @@ namespace { // passes should coalesce the build/extract combinations. fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs); + + // It is important to cleanup here so that future iterations of this + // function have less work to do. + (void) SimplifyInstructionsInBlock(&BB, TD); return true; } @@ -567,6 +640,9 @@ namespace { } else if (isa<SelectInst>(I)) { if (!Config.VectorizeSelect) return false; + } else if (isa<CmpInst>(I)) { + if (!Config.VectorizeCmp) + return false; } else if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(I)) { if (!Config.VectorizeGEP) return false; @@ -584,41 +660,39 @@ namespace { return false; Type *T1, *T2; - if (isa<StoreInst>(I)) { - // For stores, it is the value type, not the pointer type that matters - // because the value is what will come from a vector register. - - Value *IVal = cast<StoreInst>(I)->getValueOperand(); - T1 = IVal->getType(); - } else { - T1 = I->getType(); - } - - if (I->isCast()) - T2 = cast<CastInst>(I)->getSrcTy(); - else - T2 = T1; + getInstructionTypes(I, T1, T2); // Not every type can be vectorized... if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) || !(VectorType::isValidElementType(T2) || T2->isVectorTy())) return false; - if (!Config.VectorizeInts - && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy())) - return false; - + if (T1->getScalarSizeInBits() == 1 && T2->getScalarSizeInBits() == 1) { + if (!Config.VectorizeBools) + return false; + } else { + if (!Config.VectorizeInts + && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy())) + return false; + } + if (!Config.VectorizeFloats && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy())) return false; + // Don't vectorize target-specific types. + if (T1->isX86_FP80Ty() || T1->isPPC_FP128Ty() || T1->isX86_MMXTy()) + return false; + if (T2->isX86_FP80Ty() || T2->isPPC_FP128Ty() || T2->isX86_MMXTy()) + return false; + if ((!Config.VectorizePointers || TD == 0) && (T1->getScalarType()->isPointerTy() || T2->getScalarType()->isPointerTy())) return false; - if (T1->getPrimitiveSizeInBits() > Config.VectorBits/2 || - T2->getPrimitiveSizeInBits() > Config.VectorBits/2) + if (T1->getPrimitiveSizeInBits() >= Config.VectorBits || + T2->getPrimitiveSizeInBits() >= Config.VectorBits) return false; return true; @@ -629,36 +703,25 @@ namespace { // that I has already been determined to be vectorizable and that J is not // in the use tree of I. bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J, - bool IsSimpleLoadStore) { + bool IsSimpleLoadStore, bool NonPow2Len) { DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I << " <-> " << *J << "\n"); // Loads and stores can be merged if they have different alignments, // but are otherwise the same. 
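The hunk below collapses the old field-by-field load/store comparison into a single isSameOperationAs query with equivalence flags. Roughly, in isolation (the flag names are taken from the patch itself; the wrapper is illustrative):

#include "llvm/Instructions.h"
using namespace llvm;

// Two instructions can be paired if they perform the same operation;
// alignment never has to match, and in non-power-of-2 mode only the scalar
// element types have to match (the vector lengths may differ).
static bool sameOperationForPairing(Instruction *I, Instruction *J,
                                    bool NonPow2Len) {
  unsigned Flags = Instruction::CompareIgnoringAlignment;
  if (NonPow2Len)
    Flags |= Instruction::CompareUsingScalarTypes;
  return J->isSameOperationAs(I, Flags);
}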
- LoadInst *LI, *LJ; - StoreInst *SI, *SJ; - if ((LI = dyn_cast<LoadInst>(I)) && (LJ = dyn_cast<LoadInst>(J))) { - if (I->getType() != J->getType()) - return false; + if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment | + (NonPow2Len ? Instruction::CompareUsingScalarTypes : 0))) + return false; - if (LI->getPointerOperand()->getType() != - LJ->getPointerOperand()->getType() || - LI->isVolatile() != LJ->isVolatile() || - LI->getOrdering() != LJ->getOrdering() || - LI->getSynchScope() != LJ->getSynchScope()) - return false; - } else if ((SI = dyn_cast<StoreInst>(I)) && (SJ = dyn_cast<StoreInst>(J))) { - if (SI->getValueOperand()->getType() != - SJ->getValueOperand()->getType() || - SI->getPointerOperand()->getType() != - SJ->getPointerOperand()->getType() || - SI->isVolatile() != SJ->isVolatile() || - SI->getOrdering() != SJ->getOrdering() || - SI->getSynchScope() != SJ->getSynchScope()) - return false; - } else if (!J->isSameOperationAs(I)) { + Type *IT1, *IT2, *JT1, *JT2; + getInstructionTypes(I, IT1, IT2); + getInstructionTypes(J, JT1, JT2); + unsigned MaxTypeBits = std::max( + IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(), + IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits()); + if (MaxTypeBits > Config.VectorBits) return false; - } + // FIXME: handle addsub-type operations! if (IsSimpleLoadStore) { @@ -668,8 +731,11 @@ namespace { if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, OffsetInElmts) && abs64(OffsetInElmts) == 1) { if (Config.AlignedOnly) { - Type *aType = isa<StoreInst>(I) ? + Type *aTypeI = isa<StoreInst>(I) ? cast<StoreInst>(I)->getValueOperand()->getType() : I->getType(); + Type *aTypeJ = isa<StoreInst>(J) ? + cast<StoreInst>(J)->getValueOperand()->getType() : J->getType(); + // An aligned load or store is possible only if the instruction // with the lower offset has an alignment suitable for the // vector type. @@ -677,7 +743,7 @@ namespace { unsigned BottomAlignment = IAlignment; if (OffsetInElmts < 0) BottomAlignment = JAlignment; - Type *VType = getVecTypeForPair(aType); + Type *VType = getVecTypeForPair(aTypeI, aTypeJ); unsigned VecAlignment = TD->getPrefTypeAlignment(VType); if (BottomAlignment < VecAlignment) return false; @@ -685,11 +751,6 @@ namespace { } else { return false; } - } else if (isa<ShuffleVectorInst>(I)) { - // Only merge two shuffles if they're both constant - return isa<Constant>(I->getOperand(2)) && - isa<Constant>(J->getOperand(2)); - // FIXME: We may want to vectorize non-constant shuffles also. } // The powi intrinsic is special because only the first argument is @@ -772,7 +833,7 @@ namespace { bool BBVectorize::getCandidatePairs(BasicBlock &BB, BasicBlock::iterator &Start, std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts) { + std::vector<Value *> &PairableInsts, bool NonPow2Len) { BasicBlock::iterator E = BB.end(); if (Start == E) return false; @@ -808,7 +869,7 @@ namespace { // J does not use I, and comes before the first use of I, so it can be // merged with I if the instructions are compatible. - if (!areInstsCompatible(I, J, IsSimpleLoadStore)) continue; + if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len)) continue; // J is a candidate for merging with I. if (!PairableInsts.size() || @@ -1430,24 +1491,27 @@ namespace { // instruction that fuses I with J. 
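The function that follows picks the lower-addressed pointer of a load/store pair and bitcasts it to a pointer to the fused vector type. A reduced sketch of just that cast (address-space handling as in the patch, everything else simplified):

#include "llvm/DerivedTypes.h"
#include "llvm/Instructions.h"
using namespace llvm;

// Reinterpret Ptr (a T*) as a pointer to the fused vector type VecTy, in the
// same address space, so one wide access can replace the original pair.
static Value *castToVecPtr(Value *Ptr, VectorType *VecTy,
                           Instruction *InsertBefore) {
  unsigned AS = cast<PointerType>(Ptr->getType())->getAddressSpace();
  return new BitCastInst(Ptr, PointerType::get(VecTy, AS),
                         Ptr->getName() + ".vptr", InsertBefore);
}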
Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context, Instruction *I, Instruction *J, unsigned o, - bool &FlipMemInputs) { + bool FlipMemInputs) { Value *IPtr, *JPtr; unsigned IAlignment, JAlignment; int64_t OffsetInElmts; + + // Note: the analysis might fail here, that is why FlipMemInputs has + // been precomputed (OffsetInElmts must be unused here). (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, OffsetInElmts); // The pointer value is taken to be the one with the lowest offset. Value *VPtr; - if (OffsetInElmts > 0) { + if (!FlipMemInputs) { VPtr = IPtr; } else { - FlipMemInputs = true; VPtr = JPtr; } - Type *ArgType = cast<PointerType>(IPtr->getType())->getElementType(); - Type *VArgType = getVecTypeForPair(ArgType); + Type *ArgTypeI = cast<PointerType>(IPtr->getType())->getElementType(); + Type *ArgTypeJ = cast<PointerType>(JPtr->getType())->getElementType(); + Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); Type *VArgPtrType = PointerType::get(VArgType, cast<PointerType>(IPtr->getType())->getAddressSpace()); return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o), @@ -1455,15 +1519,17 @@ namespace { } void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J, - unsigned NumElem, unsigned MaskOffset, unsigned NumInElem, - unsigned IdxOffset, std::vector<Constant*> &Mask) { - for (unsigned v = 0; v < NumElem/2; ++v) { + unsigned MaskOffset, unsigned NumInElem, + unsigned NumInElem1, unsigned IdxOffset, + std::vector<Constant*> &Mask) { + unsigned NumElem1 = cast<VectorType>(J->getType())->getNumElements(); + for (unsigned v = 0; v < NumElem1; ++v) { int m = cast<ShuffleVectorInst>(J)->getMaskValue(v); if (m < 0) { Mask[v+MaskOffset] = UndefValue::get(Type::getInt32Ty(Context)); } else { unsigned mm = m + (int) IdxOffset; - if (m >= (int) NumInElem) + if (m >= (int) NumInElem1) mm += (int) NumInElem; Mask[v+MaskOffset] = @@ -1479,8 +1545,11 @@ namespace { // This is the shuffle mask. We need to append the second // mask to the first, and the numbers need to be adjusted. - Type *ArgType = I->getType(); - Type *VArgType = getVecTypeForPair(ArgType); + Type *ArgTypeI = I->getType(); + Type *ArgTypeJ = J->getType(); + Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); + + unsigned NumElemI = cast<VectorType>(ArgTypeI)->getNumElements(); // Get the total number of elements in the fused vector type. // By definition, this must equal the number of elements in @@ -1488,19 +1557,81 @@ namespace { unsigned NumElem = cast<VectorType>(VArgType)->getNumElements(); std::vector<Constant*> Mask(NumElem); - Type *OpType = I->getOperand(0)->getType(); - unsigned NumInElem = cast<VectorType>(OpType)->getNumElements(); + Type *OpTypeI = I->getOperand(0)->getType(); + unsigned NumInElemI = cast<VectorType>(OpTypeI)->getNumElements(); + Type *OpTypeJ = J->getOperand(0)->getType(); + unsigned NumInElemJ = cast<VectorType>(OpTypeJ)->getNumElements(); + + // The fused vector will be: + // ----------------------------------------------------- + // | NumInElemI | NumInElemJ | NumInElemI | NumInElemJ | + // ----------------------------------------------------- + // from which we'll extract NumElem total elements (where the first NumElemI + // of them come from the mask in I and the remainder come from the mask + // in J. // For the mask from the first pair... - fillNewShuffleMask(Context, I, NumElem, 0, NumInElem, 0, Mask); + fillNewShuffleMask(Context, I, 0, NumInElemJ, NumInElemI, + 0, Mask); // For the mask from the second pair... 
- fillNewShuffleMask(Context, J, NumElem, NumElem/2, NumInElem, NumInElem, - Mask); + fillNewShuffleMask(Context, J, NumElemI, NumInElemI, NumInElemJ, + NumInElemI, Mask); return ConstantVector::get(Mask); } + bool BBVectorize::expandIEChain(LLVMContext& Context, Instruction *I, + Instruction *J, unsigned o, Value *&LOp, + unsigned numElemL, + Type *ArgTypeL, Type *ArgTypeH, + unsigned IdxOff) { + bool ExpandedIEChain = false; + if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) { + // If we have a pure insertelement chain, then this can be rewritten + // into a chain that directly builds the larger type. + bool PureChain = true; + InsertElementInst *LIENext = LIE; + do { + if (!isa<UndefValue>(LIENext->getOperand(0)) && + !isa<InsertElementInst>(LIENext->getOperand(0))) { + PureChain = false; + break; + } + } while ((LIENext = + dyn_cast<InsertElementInst>(LIENext->getOperand(0)))); + + if (PureChain) { + SmallVector<Value *, 8> VectElemts(numElemL, + UndefValue::get(ArgTypeL->getScalarType())); + InsertElementInst *LIENext = LIE; + do { + unsigned Idx = + cast<ConstantInt>(LIENext->getOperand(2))->getSExtValue(); + VectElemts[Idx] = LIENext->getOperand(1); + } while ((LIENext = + dyn_cast<InsertElementInst>(LIENext->getOperand(0)))); + + LIENext = 0; + Value *LIEPrev = UndefValue::get(ArgTypeH); + for (unsigned i = 0; i < numElemL; ++i) { + if (isa<UndefValue>(VectElemts[i])) continue; + LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i], + ConstantInt::get(Type::getInt32Ty(Context), + i + IdxOff), + getReplacementName(I, true, o, i+1)); + LIENext->insertBefore(J); + LIEPrev = LIENext; + } + + LOp = LIENext ? (Value*) LIENext : UndefValue::get(ArgTypeH); + ExpandedIEChain = true; + } + } + + return ExpandedIEChain; + } + // Returns the value to be used as the specified operand of the vector // instruction that fuses I with J. Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I, @@ -1508,84 +1639,333 @@ namespace { Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); - // Compute the fused vector type for this operand - Type *ArgType = I->getOperand(o)->getType(); - VectorType *VArgType = getVecTypeForPair(ArgType); + // Compute the fused vector type for this operand + Type *ArgTypeI = I->getOperand(o)->getType(); + Type *ArgTypeJ = J->getOperand(o)->getType(); + VectorType *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); Instruction *L = I, *H = J; + Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ; if (FlipMemInputs) { L = J; H = I; + ArgTypeL = ArgTypeJ; + ArgTypeH = ArgTypeI; } - if (ArgType->isVectorTy()) { - unsigned numElem = cast<VectorType>(VArgType)->getNumElements(); - std::vector<Constant*> Mask(numElem); - for (unsigned v = 0; v < numElem; ++v) - Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + unsigned numElemL; + if (ArgTypeL->isVectorTy()) + numElemL = cast<VectorType>(ArgTypeL)->getNumElements(); + else + numElemL = 1; - Instruction *BV = new ShuffleVectorInst(L->getOperand(o), - H->getOperand(o), - ConstantVector::get(Mask), - getReplacementName(I, true, o)); - BV->insertBefore(J); - return BV; + unsigned numElemH; + if (ArgTypeH->isVectorTy()) + numElemH = cast<VectorType>(ArgTypeH)->getNumElements(); + else + numElemH = 1; + + Value *LOp = L->getOperand(o); + Value *HOp = H->getOperand(o); + unsigned numElem = VArgType->getNumElements(); + + // First, we check if we can reuse the "original" vector outputs (if these + // exist). 
We might need a shuffle. + ExtractElementInst *LEE = dyn_cast<ExtractElementInst>(LOp); + ExtractElementInst *HEE = dyn_cast<ExtractElementInst>(HOp); + ShuffleVectorInst *LSV = dyn_cast<ShuffleVectorInst>(LOp); + ShuffleVectorInst *HSV = dyn_cast<ShuffleVectorInst>(HOp); + + // FIXME: If we're fusing shuffle instructions, then we can't apply this + // optimization. The input vectors to the shuffle might be a different + // length from the shuffle outputs. Unfortunately, the replacement + // shuffle mask has already been formed, and the mask entries are sensitive + // to the sizes of the inputs. + bool IsSizeChangeShuffle = + isa<ShuffleVectorInst>(L) && + (LOp->getType() != L->getType() || HOp->getType() != H->getType()); + + if ((LEE || LSV) && (HEE || HSV) && !IsSizeChangeShuffle) { + // We can have at most two unique vector inputs. + bool CanUseInputs = true; + Value *I1, *I2 = 0; + if (LEE) { + I1 = LEE->getOperand(0); + } else { + I1 = LSV->getOperand(0); + I2 = LSV->getOperand(1); + if (I2 == I1 || isa<UndefValue>(I2)) + I2 = 0; + } + + if (HEE) { + Value *I3 = HEE->getOperand(0); + if (!I2 && I3 != I1) + I2 = I3; + else if (I3 != I1 && I3 != I2) + CanUseInputs = false; + } else { + Value *I3 = HSV->getOperand(0); + if (!I2 && I3 != I1) + I2 = I3; + else if (I3 != I1 && I3 != I2) + CanUseInputs = false; + + if (CanUseInputs) { + Value *I4 = HSV->getOperand(1); + if (!isa<UndefValue>(I4)) { + if (!I2 && I4 != I1) + I2 = I4; + else if (I4 != I1 && I4 != I2) + CanUseInputs = false; + } + } + } + + if (CanUseInputs) { + unsigned LOpElem = + cast<VectorType>(cast<Instruction>(LOp)->getOperand(0)->getType()) + ->getNumElements(); + unsigned HOpElem = + cast<VectorType>(cast<Instruction>(HOp)->getOperand(0)->getType()) + ->getNumElements(); + + // We have one or two input vectors. We need to map each index of the + // operands to the index of the original vector. + SmallVector<std::pair<int, int>, 8> II(numElem); + for (unsigned i = 0; i < numElemL; ++i) { + int Idx, INum; + if (LEE) { + Idx = + cast<ConstantInt>(LEE->getOperand(1))->getSExtValue(); + INum = LEE->getOperand(0) == I1 ? 0 : 1; + } else { + Idx = LSV->getMaskValue(i); + if (Idx < (int) LOpElem) { + INum = LSV->getOperand(0) == I1 ? 0 : 1; + } else { + Idx -= LOpElem; + INum = LSV->getOperand(1) == I1 ? 0 : 1; + } + } + + II[i] = std::pair<int, int>(Idx, INum); + } + for (unsigned i = 0; i < numElemH; ++i) { + int Idx, INum; + if (HEE) { + Idx = + cast<ConstantInt>(HEE->getOperand(1))->getSExtValue(); + INum = HEE->getOperand(0) == I1 ? 0 : 1; + } else { + Idx = HSV->getMaskValue(i); + if (Idx < (int) HOpElem) { + INum = HSV->getOperand(0) == I1 ? 0 : 1; + } else { + Idx -= HOpElem; + INum = HSV->getOperand(1) == I1 ? 0 : 1; + } + } + + II[i + numElemL] = std::pair<int, int>(Idx, INum); + } + + // We now have an array which tells us from which index of which + // input vector each element of the operand comes. + VectorType *I1T = cast<VectorType>(I1->getType()); + unsigned I1Elem = I1T->getNumElements(); + + if (!I2) { + // In this case there is only one underlying vector input. Check for + // the trivial case where we can use the input directly. + if (I1Elem == numElem) { + bool ElemInOrder = true; + for (unsigned i = 0; i < numElem; ++i) { + if (II[i].first != (int) i && II[i].first != -1) { + ElemInOrder = false; + break; + } + } + + if (ElemInOrder) + return I1; + } + + // A shuffle is needed. 
+ std::vector<Constant *> Mask(numElem); + for (unsigned i = 0; i < numElem; ++i) { + int Idx = II[i].first; + if (Idx == -1) + Mask[i] = UndefValue::get(Type::getInt32Ty(Context)); + else + Mask[i] = ConstantInt::get(Type::getInt32Ty(Context), Idx); + } + + Instruction *S = + new ShuffleVectorInst(I1, UndefValue::get(I1T), + ConstantVector::get(Mask), + getReplacementName(I, true, o)); + S->insertBefore(J); + return S; + } + + VectorType *I2T = cast<VectorType>(I2->getType()); + unsigned I2Elem = I2T->getNumElements(); + + // This input comes from two distinct vectors. The first step is to + // make sure that both vectors are the same length. If not, the + // smaller one will need to grow before they can be shuffled together. + if (I1Elem < I2Elem) { + std::vector<Constant *> Mask(I2Elem); + unsigned v = 0; + for (; v < I1Elem; ++v) + Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + for (; v < I2Elem; ++v) + Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); + + Instruction *NewI1 = + new ShuffleVectorInst(I1, UndefValue::get(I1T), + ConstantVector::get(Mask), + getReplacementName(I, true, o, 1)); + NewI1->insertBefore(J); + I1 = NewI1; + I1T = I2T; + I1Elem = I2Elem; + } else if (I1Elem > I2Elem) { + std::vector<Constant *> Mask(I1Elem); + unsigned v = 0; + for (; v < I2Elem; ++v) + Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + for (; v < I1Elem; ++v) + Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); + + Instruction *NewI2 = + new ShuffleVectorInst(I2, UndefValue::get(I2T), + ConstantVector::get(Mask), + getReplacementName(I, true, o, 1)); + NewI2->insertBefore(J); + I2 = NewI2; + I2T = I1T; + I2Elem = I1Elem; + } + + // Now that both I1 and I2 are the same length we can shuffle them + // together (and use the result). + std::vector<Constant *> Mask(numElem); + for (unsigned v = 0; v < numElem; ++v) { + if (II[v].first == -1) { + Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); + } else { + int Idx = II[v].first + II[v].second * I1Elem; + Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx); + } + } + + Instruction *NewOp = + new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask), + getReplacementName(I, true, o)); + NewOp->insertBefore(J); + return NewOp; + } } - // If these two inputs are the output of another vector instruction, - // then we should use that output directly. It might be necessary to - // permute it first. [When pairings are fused recursively, you can - // end up with cases where a large vector is decomposed into scalars - // using extractelement instructions, then built into size-2 - // vectors using insertelement and the into larger vectors using - // shuffles. InstCombine does not simplify all of these cases well, - // and so we make sure that shuffles are generated here when possible. 
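Much of the surrounding rewrite builds shufflevector masks that concatenate or re-align the two inputs. The basic building block looks like this on its own (a minimal sketch; the pass's real mask logic also has to cope with undef lanes and mismatched lengths):

#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Instructions.h"
#include "llvm/LLVMContext.h"
#include <cassert>
#include <vector>
using namespace llvm;

// Concatenate two equal-typed vectors L and H into one twice as wide, e.g.
// <2 x i32> ++ <2 x i32> -> <4 x i32> via the identity mask <0, 1, 2, 3>.
static Instruction *concatVectors(Value *L, Value *H, Instruction *InsertBefore) {
  assert(L->getType() == H->getType() && "inputs must have the same type");
  unsigned NumElem = cast<VectorType>(L->getType())->getNumElements();
  LLVMContext &Ctx = L->getContext();

  std::vector<Constant*> Mask(2 * NumElem);
  for (unsigned i = 0; i != 2 * NumElem; ++i)
    Mask[i] = ConstantInt::get(Type::getInt32Ty(Ctx), i);

  return new ShuffleVectorInst(L, H, ConstantVector::get(Mask),
                               "concat", InsertBefore);
}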
- ExtractElementInst *LEE - = dyn_cast<ExtractElementInst>(L->getOperand(o)); - ExtractElementInst *HEE - = dyn_cast<ExtractElementInst>(H->getOperand(o)); - - if (LEE && HEE && - LEE->getOperand(0)->getType() == HEE->getOperand(0)->getType()) { - VectorType *EEType = cast<VectorType>(LEE->getOperand(0)->getType()); - unsigned LowIndx = cast<ConstantInt>(LEE->getOperand(1))->getZExtValue(); - unsigned HighIndx = cast<ConstantInt>(HEE->getOperand(1))->getZExtValue(); - if (LEE->getOperand(0) == HEE->getOperand(0)) { - if (LowIndx == 0 && HighIndx == 1) - return LEE->getOperand(0); - - std::vector<Constant*> Mask(2); - Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx); - Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx); - - Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0), - UndefValue::get(EEType), - ConstantVector::get(Mask), - getReplacementName(I, true, o)); - BV->insertBefore(J); - return BV; + Type *ArgType = ArgTypeL; + if (numElemL < numElemH) { + if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH, + ArgTypeL, VArgType, 1)) { + // This is another short-circuit case: we're combining a scalar into + // a vector that is formed by an IE chain. We've just expanded the IE + // chain, now insert the scalar and we're done. + + Instruction *S = InsertElementInst::Create(HOp, LOp, CV0, + getReplacementName(I, true, o)); + S->insertBefore(J); + return S; + } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL, + ArgTypeH)) { + // The two vector inputs to the shuffle must be the same length, + // so extend the smaller vector to be the same length as the larger one. + Instruction *NLOp; + if (numElemL > 1) { + + std::vector<Constant *> Mask(numElemH); + unsigned v = 0; + for (; v < numElemL; ++v) + Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + for (; v < numElemH; ++v) + Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); + + NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL), + ConstantVector::get(Mask), + getReplacementName(I, true, o, 1)); + } else { + NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0, + getReplacementName(I, true, o, 1)); + } + + NLOp->insertBefore(J); + LOp = NLOp; } - std::vector<Constant*> Mask(2); - HighIndx += EEType->getNumElements(); - Mask[0] = ConstantInt::get(Type::getInt32Ty(Context), LowIndx); - Mask[1] = ConstantInt::get(Type::getInt32Ty(Context), HighIndx); + ArgType = ArgTypeH; + } else if (numElemL > numElemH) { + if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL, + ArgTypeH, VArgType)) { + Instruction *S = + InsertElementInst::Create(LOp, HOp, + ConstantInt::get(Type::getInt32Ty(Context), + numElemL), + getReplacementName(I, true, o)); + S->insertBefore(J); + return S; + } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH, + ArgTypeL)) { + Instruction *NHOp; + if (numElemH > 1) { + std::vector<Constant *> Mask(numElemL); + unsigned v = 0; + for (; v < numElemH; ++v) + Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + for (; v < numElemL; ++v) + Mask[v] = UndefValue::get(Type::getInt32Ty(Context)); + + NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH), + ConstantVector::get(Mask), + getReplacementName(I, true, o, 1)); + } else { + NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0, + getReplacementName(I, true, o, 1)); + } + + NHOp->insertBefore(J); + HOp = NHOp; + } + } - Instruction *BV = new ShuffleVectorInst(LEE->getOperand(0), - HEE->getOperand(0), - 
ConstantVector::get(Mask), - getReplacementName(I, true, o)); + if (ArgType->isVectorTy()) { + unsigned numElem = cast<VectorType>(VArgType)->getNumElements(); + std::vector<Constant*> Mask(numElem); + for (unsigned v = 0; v < numElem; ++v) { + unsigned Idx = v; + // If the low vector was expanded, we need to skip the extra + // undefined entries. + if (v >= numElemL && numElemH > numElemL) + Idx += (numElemH - numElemL); + Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), Idx); + } + + Instruction *BV = new ShuffleVectorInst(LOp, HOp, + ConstantVector::get(Mask), + getReplacementName(I, true, o)); BV->insertBefore(J); return BV; } Instruction *BV1 = InsertElementInst::Create( - UndefValue::get(VArgType), - L->getOperand(o), CV0, + UndefValue::get(VArgType), LOp, CV0, getReplacementName(I, true, o, 1)); BV1->insertBefore(I); - Instruction *BV2 = InsertElementInst::Create(BV1, H->getOperand(o), - CV1, + Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1, getReplacementName(I, true, o, 2)); BV2->insertBefore(J); return BV2; @@ -1596,8 +1976,7 @@ namespace { void BBVectorize::getReplacementInputsForPair(LLVMContext& Context, Instruction *I, Instruction *J, SmallVector<Value *, 3> &ReplacedOperands, - bool &FlipMemInputs) { - FlipMemInputs = false; + bool FlipMemInputs) { unsigned NumOperands = I->getNumOperands(); for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) { @@ -1616,10 +1995,10 @@ namespace { BasicBlock &BB = *I->getParent(); Module *M = BB.getParent()->getParent(); - Type *ArgType = I->getType(); - Type *VArgType = getVecTypeForPair(ArgType); + Type *ArgTypeI = I->getType(); + Type *ArgTypeJ = J->getType(); + Type *VArgType = getVecTypeForPair(ArgTypeI, ArgTypeJ); - // FIXME: is it safe to do this here? ReplacedOperands[o] = Intrinsic::getDeclaration(M, (Intrinsic::ID) IID, VArgType); continue; @@ -1648,36 +2027,60 @@ namespace { Instruction *J, Instruction *K, Instruction *&InsertionPt, Instruction *&K1, Instruction *&K2, - bool &FlipMemInputs) { - Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); - Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); - + bool FlipMemInputs) { if (isa<StoreInst>(I)) { AA->replaceWithNewValue(I, K); AA->replaceWithNewValue(J, K); } else { Type *IType = I->getType(); - Type *VType = getVecTypeForPair(IType); + Type *JType = J->getType(); + + VectorType *VType = getVecTypeForPair(IType, JType); + unsigned numElem = VType->getNumElements(); + + unsigned numElemI, numElemJ; + if (IType->isVectorTy()) + numElemI = cast<VectorType>(IType)->getNumElements(); + else + numElemI = 1; + + if (JType->isVectorTy()) + numElemJ = cast<VectorType>(JType)->getNumElements(); + else + numElemJ = 1; if (IType->isVectorTy()) { - unsigned numElem = cast<VectorType>(IType)->getNumElements(); - std::vector<Constant*> Mask1(numElem), Mask2(numElem); - for (unsigned v = 0; v < numElem; ++v) { - Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); - Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElem+v); - } + std::vector<Constant*> Mask1(numElemI), Mask2(numElemI); + for (unsigned v = 0; v < numElemI; ++v) { + Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemJ+v); + } - K1 = new ShuffleVectorInst(K, UndefValue::get(VType), - ConstantVector::get( - FlipMemInputs ? Mask2 : Mask1), - getReplacementName(K, false, 1)); - K2 = new ShuffleVectorInst(K, UndefValue::get(VType), - ConstantVector::get( - FlipMemInputs ? 
Mask1 : Mask2), - getReplacementName(K, false, 2)); + K1 = new ShuffleVectorInst(K, UndefValue::get(VType), + ConstantVector::get( + FlipMemInputs ? Mask2 : Mask1), + getReplacementName(K, false, 1)); } else { + Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); + Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1); K1 = ExtractElementInst::Create(K, FlipMemInputs ? CV1 : CV0, getReplacementName(K, false, 1)); + } + + if (JType->isVectorTy()) { + std::vector<Constant*> Mask1(numElemJ), Mask2(numElemJ); + for (unsigned v = 0; v < numElemJ; ++v) { + Mask1[v] = ConstantInt::get(Type::getInt32Ty(Context), v); + Mask2[v] = ConstantInt::get(Type::getInt32Ty(Context), numElemI+v); + } + + K2 = new ShuffleVectorInst(K, UndefValue::get(VType), + ConstantVector::get( + FlipMemInputs ? Mask1 : Mask2), + getReplacementName(K, false, 2)); + } else { + Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); + Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1); K2 = ExtractElementInst::Create(K, FlipMemInputs ? CV0 : CV1, getReplacementName(K, false, 2)); } @@ -1778,6 +2181,61 @@ namespace { } } + // As with the aliasing information, SCEV can also change because of + // vectorization. This information is used to compute relative pointer + // offsets; the necessary information will be cached here prior to + // fusion. + void BBVectorize::collectPtrInfo(std::vector<Value *> &PairableInsts, + DenseMap<Value *, Value *> &ChosenPairs, + DenseSet<Value *> &LowPtrInsts) { + for (std::vector<Value *>::iterator PI = PairableInsts.begin(), + PIE = PairableInsts.end(); PI != PIE; ++PI) { + DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI); + if (P == ChosenPairs.end()) continue; + + Instruction *I = cast<Instruction>(P->first); + Instruction *J = cast<Instruction>(P->second); + + if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) + continue; + + Value *IPtr, *JPtr; + unsigned IAlignment, JAlignment; + int64_t OffsetInElmts; + if (!getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, + OffsetInElmts) || abs64(OffsetInElmts) != 1) + llvm_unreachable("Pre-fusion pointer analysis failed"); + + Value *LowPI = (OffsetInElmts > 0) ? I : J; + LowPtrInsts.insert(LowPI); + } + } + + // When the first instruction in each pair is cloned, it will inherit its + // parent's metadata. This metadata must be combined with that of the other + // instruction in a safe way. 
+ void BBVectorize::combineMetadata(Instruction *K, const Instruction *J) { + SmallVector<std::pair<unsigned, MDNode*>, 4> Metadata; + K->getAllMetadataOtherThanDebugLoc(Metadata); + for (unsigned i = 0, n = Metadata.size(); i < n; ++i) { + unsigned Kind = Metadata[i].first; + MDNode *JMD = J->getMetadata(Kind); + MDNode *KMD = Metadata[i].second; + + switch (Kind) { + default: + K->setMetadata(Kind, 0); // Remove unknown metadata + break; + case LLVMContext::MD_tbaa: + K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD)); + break; + case LLVMContext::MD_fpmath: + K->setMetadata(Kind, MDNode::getMostGenericFPMath(JMD, KMD)); + break; + } + } + } + // This function fuses the chosen instruction pairs into vector instructions, // taking care preserve any needed scalar outputs and, then, it reorders the // remaining instructions as needed (users of the first member of the pair @@ -1804,6 +2262,9 @@ namespace { std::multimap<Value *, Value *> LoadMoveSet; collectLoadMoveSet(BB, PairableInsts, ChosenPairs, LoadMoveSet); + DenseSet<Value *> LowPtrInsts; + collectPtrInfo(PairableInsts, ChosenPairs, LowPtrInsts); + DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n"); for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) { @@ -1843,7 +2304,10 @@ namespace { continue; } - bool FlipMemInputs; + bool FlipMemInputs = false; + if (isa<LoadInst>(I) || isa<StoreInst>(I)) + FlipMemInputs = (LowPtrInsts.find(I) == LowPtrInsts.end()); + unsigned NumOperands = I->getNumOperands(); SmallVector<Value *, 3> ReplacedOperands(NumOperands); getReplacementInputsForPair(Context, I, J, ReplacedOperands, @@ -1855,7 +2319,9 @@ namespace { if (I->hasName()) K->takeName(I); if (!isa<StoreInst>(K)) - K->mutateType(getVecTypeForPair(I->getType())); + K->mutateType(getVecTypeForPair(I->getType(), J->getType())); + + combineMetadata(K, J); for (unsigned o = 0; o < NumOperands; ++o) K->setOperand(o, ReplacedOperands[o]); @@ -1947,6 +2413,7 @@ llvm::vectorizeBasicBlock(Pass *P, BasicBlock &BB, const VectorizeConfig &C) { //===----------------------------------------------------------------------===// VectorizeConfig::VectorizeConfig() { VectorBits = ::VectorBits; + VectorizeBools = !::NoBools; VectorizeInts = !::NoInts; VectorizeFloats = !::NoFloats; VectorizePointers = !::NoPointers; @@ -1954,6 +2421,7 @@ VectorizeConfig::VectorizeConfig() { VectorizeMath = !::NoMath; VectorizeFMA = !::NoFMA; VectorizeSelect = !::NoSelect; + VectorizeCmp = !::NoCmp; VectorizeGEP = !::NoGEP; VectorizeMemOps = !::NoMemOps; AlignedOnly = ::AlignedOnly; @@ -1963,6 +2431,7 @@ VectorizeConfig::VectorizeConfig() { SplatBreaksChain = ::SplatBreaksChain; MaxInsts = ::MaxInsts; MaxIter = ::MaxIter; + Pow2LenOnly = ::Pow2LenOnly; NoMemOpBoost = ::NoMemOpBoost; FastDep = ::FastDep; } diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt index 4b66930..06cf1e4 100644 --- a/lib/Transforms/Vectorize/CMakeLists.txt +++ b/lib/Transforms/Vectorize/CMakeLists.txt @@ -2,3 +2,5 @@ add_llvm_library(LLVMVectorize BBVectorize.cpp Vectorize.cpp ) + +add_dependencies(LLVMVectorize intrinsics_gen) |
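
The widening-then-merging idiom that recurs in the BBVectorize changes above (pad the shorter operand with undef lanes, then build a combined shuffle mask in which the second operand's lanes are numbered starting at the widened length) can be read in isolation as a small helper. The sketch below is illustrative only and is not part of the patch: the names widenWithUndef and mergeUnequalVectors are invented here, and the includes assume the pre-3.3 flat header layout used elsewhere in this tree.

    // Minimal sketch of the lane-widening/merging idiom used by the fusion
    // code above; not part of the patch. Helper names are hypothetical.
    #include "llvm/ADT/Twine.h"
    #include "llvm/Constants.h"
    #include "llvm/DerivedTypes.h"
    #include "llvm/Instructions.h"
    #include "llvm/LLVMContext.h"
    #include "llvm/Type.h"
    #include <algorithm>
    #include <cassert>
    #include <vector>
    using namespace llvm;

    // Pad V (a vector with SmallLen elements) out to WideLen elements: keep
    // the original lanes in order and fill the tail with undef lanes, as the
    // fusion code does before it can merge operands of unequal length.
    static Value *widenWithUndef(LLVMContext &Context, Value *V,
                                 unsigned SmallLen, unsigned WideLen,
                                 Instruction *InsertBefore) {
      assert(SmallLen < WideLen && "nothing to widen");
      std::vector<Constant *> Mask(WideLen);
      unsigned v = 0;
      for (; v < SmallLen; ++v)
        Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
      for (; v < WideLen; ++v)
        Mask[v] = UndefValue::get(Type::getInt32Ty(Context));

      Instruction *Wide =
        new ShuffleVectorInst(V, UndefValue::get(V->getType()),
                              ConstantVector::get(Mask), "widen");
      Wide->insertBefore(InsertBefore);
      return Wide;
    }

    // Merge the LLen live lanes of L with the HLen live lanes of H into one
    // (LLen + HLen)-element vector, widening the shorter input first so that
    // both shufflevector operands have the same type.
    static Value *mergeUnequalVectors(LLVMContext &Context,
                                      Value *L, unsigned LLen,
                                      Value *H, unsigned HLen,
                                      Instruction *InsertBefore) {
      unsigned WideLen = std::max(LLen, HLen);

      // Step 1: pad whichever input is shorter out to WideLen lanes.
      if (LLen < WideLen)
        L = widenWithUndef(Context, L, LLen, WideLen, InsertBefore);
      else if (HLen < WideLen)
        H = widenWithUndef(Context, H, HLen, WideLen, InsertBefore);

      // Step 2: select the LLen live lanes of L followed by the HLen live
      // lanes of H; lanes of the second shuffle operand are numbered from
      // WideLen, which is why the mask skips any undef padding.
      std::vector<Constant *> Mask(LLen + HLen);
      for (unsigned v = 0; v < LLen; ++v)
        Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
      for (unsigned v = 0; v < HLen; ++v)
        Mask[LLen + v] = ConstantInt::get(Type::getInt32Ty(Context),
                                          WideLen + v);

      Instruction *Merged =
        new ShuffleVectorInst(L, H, ConstantVector::get(Mask), "merge");
      Merged->insertBefore(InsertBefore);
      return Merged;
    }

For example, merging a two-lane value with a four-lane value first widens the former to four lanes (two of them undef) and then emits a shuffle with the six-entry mask <0, 1, 4, 5, 6, 7>, yielding a six-element vector whose lanes are the two live lanes of the first input followed by the four lanes of the second.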